In [2]:
%load_ext autoreload
%autoreload 2

In [3]:
from models.adam import ADAM
from models.irls import IRLS
from models.sgd import SGD
from datasets.read import read_all_datasets

In [4]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier


Compare the classification performance of logistic regression (try all 3 methods: IWLS, SGD and ADAM) and 4 popular classification methods: LDA (Linear Discriminant analysis), QDA (Quadratic Discriminant Analysis), Decision tree and Random Forest. 
Use available implementations, e.g. from scikit-learn library.

In [5]:
# Load saved datasets
import cloudpickle

try:
    with open('datasets.pkl', 'rb') as f:
        datasets = cloudpickle.load(f)
        print("Loaded datasets from file")
except FileNotFoundError:
    datasets = None

if datasets is None:
    datasets = read_all_datasets('datasets/')

Loaded datasets from file


In [6]:
# Save datasets
import cloudpickle

with open('datasets.pkl', 'wb') as f:
    cloudpickle.dump(datasets, f)

In [10]:
# Print shapes of datasets
for dataset_name, dataset in datasets.items():
    X, y = dataset
    print(f"{dataset_name}: {X.shape}, {y.shape}")

Rice_Cammeo_Osmancik: (3810, 7), (3810,)
Online_Shoppers_intention: (12330, 17), (12330,)
Dataset_for_Link_Phishing: (19431, 68), (19431,)
Banknote_Authentication: (1372, 4), (1372,)
Optdigits: (5620, 38), (5620,)
EEG_Eye_State: (14980, 4), (14980,)
Web_Page_Phishing: (100077, 19), (100077,)
Statlog_Shuttle: (58000, 6), (58000,)
Airline_Passenger_Satisfaction: (64743, 14), (64743,)


In [None]:
import traceback
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

iterations = 1
test_size = 0.3
methods = [
    ("LDA", lambda: LinearDiscriminantAnalysis()),
    ("QDA", lambda: QuadraticDiscriminantAnalysis()),
    ("Decision Tree", lambda: DecisionTreeClassifier()),
    ("Random Forest", lambda: RandomForestClassifier()),
    ("IWLS", lambda: IRLS(iter_limit=10)),
    ("SGD", lambda: SGD(iter_limit=10)),
    ("ADAM", lambda: ADAM(iter_limit=10))
]

results_by_dataset_and_method = {}

# Load saved results
import json

try:
    with open('results.json', 'r') as f:
        results_by_dataset_and_method = json.load(f)
        print("Loaded results from file")
except FileNotFoundError:
    pass

# First collect results 
for dataset_name, dataset in datasets.items():
    if dataset_name in results_by_dataset_and_method:
        print(f"Skipping {dataset_name}")
        continue
    
    results_by_method = {}
    for method_name, method in methods:
        results = {
            "accuracy": [],
            "f1": [],
            "precision": [],
            "recall": []
        }
        
        tqdm._instances.clear()
        
        bar = tqdm(total=iterations)
        bar.set_description(f"Processing {method_name} on {dataset_name}")
        max_attempts = 10
        current_iteration = 0
        while current_iteration < iterations:
            try:
                X, y = dataset
                method_instance = method()
    
                X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size)
                method_instance.fit(X_train, y_train)
                y_pred = method_instance.predict(X_test)
                results["accuracy"].append(accuracy_score(y_test, y_pred))
                results["f1"].append(f1_score(y_test, y_pred))
                results["precision"].append(precision_score(y_test, y_pred))
                results["recall"].append(recall_score(y_test, y_pred))

                bar.update(1)
                current_iteration += 1
            except Exception as e:
                print(f"Error in {method_name} on {dataset_name} iteration {current_iteration}: {e}, attempts left: {max_attempts}")
                traceback.print_exc()
                max_attempts -= 1
                
                if max_attempts == 0:
                    break
            
        bar.close()
        results_by_method[method_name] = results
    results_by_dataset_and_method[dataset_name] = results_by_method
    
    # Save results
    with open('results.json', 'w') as f:
        json.dump(results_by_dataset_and_method, f)

Loaded results from file
Skipping Rice_Cammeo_Osmancik
Skipping Online_Shoppers_intention
Skipping Dataset_for_Link_Phishing
Skipping Banknote_Authentication
Skipping Optdigits
Skipping EEG_Eye_State
Skipping Web_Page_Phishing
Skipping Statlog_Shuttle


Processing IWLS on Airline_Passenger_Satisfaction:   0%|          | 0/1 [06:15<?, ?it/s]
Processing LDA on Airline_Passenger_Satisfaction: 100%|██████████| 1/1 [00:00<00:00,  7.94it/s]
Processing QDA on Airline_Passenger_Satisfaction: 100%|██████████| 1/1 [00:00<00:00, 11.09it/s]
Processing Decision Tree on Airline_Passenger_Satisfaction: 100%|██████████| 1/1 [00:00<00:00,  6.20it/s]
Processing Random Forest on Airline_Passenger_Satisfaction: 100%|██████████| 1/1 [00:02<00:00,  2.80s/it]
Processing IWLS on Airline_Passenger_Satisfaction:   0%|          | 0/1 [00:00<?, ?it/s]

In [1]:
# Save results
import json

with open('results.json', 'w') as f:
    json.dump(results_by_dataset_and_method, f)


NameError: name 'results_by_dataset_and_method' is not defined

In [13]:
results_by_dataset_and_method

{}