## Load libraries and list training datasets

In [1]:
# Load libraries
import os
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.base import clone
from xgboost import XGBClassifier
from modules.utils import load_object, load_dataset
from modules.training import fit_model

In [2]:
# Create dataset paths and names
training_dir = '../data/processed/train'
# os.listdir returns entries in arbitrary order and also returns hidden
# entries (e.g. .ipynb_checkpoints, .DS_Store). Skip dot-entries and sort so
# the training order is reproducible and only dataset files are picked up.
dataset_paths = sorted(
    path for path in os.listdir(training_dir) if not path.startswith('.')
)
# The dataset name is whatever precedes the first '-' in the file name
# (e.g. 'uci_mushroom-train.csv.gz' -> 'uci_mushroom').
dataset_names = [path.split('-')[0] for path in dataset_paths]
# Pair each dataset name with its file name for the training loop
dataset_name_paths = list(zip(dataset_names, dataset_paths))

In [3]:
# Display the (dataset name, file name) pairs built above as a sanity check
dataset_name_paths

[('kaggle_credit_card_fraud', 'kaggle_credit_card_fraud-train.csv.gz'),
 ('kaggle_patient_survival', 'kaggle_patient_survival-train.csv.gz'),
 ('uci_android_permissions', 'uci_android_permissions-train.csv.gz'),
 ('uci_breast_cancer', 'uci_breast_cancer-train.csv.gz'),
 ('uci_heart_disease', 'uci_heart_disease-train.csv.gz'),
 ('uci_indian_liver', 'uci_indian_liver-train.csv.gz'),
 ('uci_mushroom', 'uci_mushroom-train.csv.gz'),
 ('uci_phishing_url', 'uci_phishing_url-train.csv.gz'),
 ('uci_secondary_mushroom', 'uci_secondary_mushroom-train.csv.gz'),
 ('uci_spect_heart', 'uci_spect_heart-train.csv.gz')]

## Define classifiers and grids

In [4]:
# Untuned template estimators, one per model family.
# Every estimator is seeded with random_state = 42 so repeated runs match.

# 'saga' is used with a raised iteration cap of 3000
base_logreg = LogisticRegression(
    solver = 'saga',
    max_iter = 3000,
    random_state = 42
)

# Tree-based models keep library defaults apart from the seed
base_dtree = DecisionTreeClassifier(random_state = 42)
base_rf = RandomForestClassifier(random_state = 42)
base_xgb = XGBClassifier(random_state = 42)

# probability = True is requested for the support vector classifier
base_svc = SVC(
    probability = True,
    random_state = 42
)

In [5]:
# Define grids
# Each key is '<model name>__<hyperparameter>'. The prefix matches the short
# model name handed to fit_model ('logreg', 'dt', 'rf', 'xgb', 'svc').
# NOTE(review): fit_model is not visible here; this assumes it uses model_name
# as the prefix for grid parameters -- confirm against modules.training.

# Logistic regression: penalty type and inverse regularisation strength
logreg_grid = {
    'logreg__penalty': ['l2', 'l1'],
    'logreg__C': [10**x for x in range(-2, 3)]
}

# Decision tree: the prefix is 'dt' (not 'dtree') because the model is
# registered under the name 'dt'; a mismatched prefix would not line up with
# the name passed to fit_model.
dtree_grid = {
    'dt__max_depth': [None] + list(range(3, 11, 2)),
    'dt__min_samples_split': list(range(2, 14, 4)),
    'dt__min_samples_leaf': [1, 10, 20]
}

# Random forest: same tree-shape parameters plus the number of trees
rf_grid = {
    'rf__n_estimators': [100, 200, 400, 600],
    'rf__max_depth': [None] + list(range(3, 11, 2)),
    'rf__min_samples_split': list(range(2, 14, 4)),
    'rf__min_samples_leaf': [1, 10, 20]
}

# XGBoost: tree count/shape plus row and column subsampling fractions
xgb_grid = {
    'xgb__n_estimators': [100, 200, 400],
    'xgb__max_depth': [3, 4, 5],
    'xgb__min_child_weight': [1, 3, 5],
    'xgb__subsample': [0.7, 0.8, 0.9, 1],
    'xgb__colsample_bytree': [0.7, 0.8, 0.9, 1]
}

# SVC: C and gamma, each over five powers of ten from 10**-2 to 10**2
svc_grid = {
    'svc__C': [10**x for x in range(-2, 3)],
    'svc__gamma': [10**x for x in range(-2, 3)]
}

In [6]:
# Bundle each model's short name with its base estimator and search grid.
# The short name is passed to fit_model and also appears in the saved file name.
model_grids = [
    ('logreg', base_logreg, logreg_grid),
    ('dt', base_dtree, dtree_grid),
    ('rf', base_rf, rf_grid),
    ('xgb', base_xgb, xgb_grid),
    ('svc', base_svc, svc_grid),
]

# Parallel lists kept for any code that wants the columns individually
models = [entry[0] for entry in model_grids]
base_estimators = [entry[1] for entry in model_grids]
grids = [entry[2] for entry in model_grids]

## Train models

In [7]:
# For every dataset, fit and hyperparameter-tune each model, saving the result
for name, dataset_path in dataset_name_paths:
    print(f'Loading {name} dataset.')
    train_dataset = load_dataset(os.path.join(training_dir, dataset_path))
    # Features are every column but the last; the last column is the label.
    # NOTE(review): assumes the processed files put the target last -- confirm.
    X_train = train_dataset.iloc[:, :-1]
    y_train = train_dataset.iloc[:, -1]

    # One output directory per dataset, named with hyphens instead of underscores
    model_dir = os.path.join('../models', name.replace('_', '-'))
    # Make sure the directory exists before anything is saved into it;
    # exist_ok keeps re-runs from failing when it is already there.
    os.makedirs(model_dir, exist_ok = True)

    for model_name, base_estimator, grid in model_grids:
        print(f'Training {model_name} on {name}.')
        # No extension here: the logged output shows files ending in '.pickle'
        save_path = os.path.join(model_dir, f'{name}-{model_name}-full')

        fit_model(
            X_train = X_train,
            y_train = y_train,
            model_name = model_name,
            # clone() gives each dataset a fresh, unfitted copy of the template
            model = clone(base_estimator),
            param_grid = grid,
            save = True,
            save_path = save_path
        )

Loading kaggle_credit_card_fraud dataset.
Training xgb on kaggle_credit_card_fraud.
Successfully saved object to ../models/kaggle-credit-card-fraud/kaggle_credit_card_fraud-xgb-full.pickle
Loading kaggle_patient_survival dataset.
Training xgb on kaggle_patient_survival.
Successfully saved object to ../models/kaggle-patient-survival/kaggle_patient_survival-xgb-full.pickle
Loading uci_android_permissions dataset.
Training xgb on uci_android_permissions.
Successfully saved object to ../models/uci-android-permissions/uci_android_permissions-xgb-full.pickle
Loading uci_breast_cancer dataset.
Training xgb on uci_breast_cancer.
Successfully saved object to ../models/uci-breast-cancer/uci_breast_cancer-xgb-full.pickle
Loading uci_heart_disease dataset.
Training xgb on uci_heart_disease.
Successfully saved object to ../models/uci-heart-disease/uci_heart_disease-xgb-full.pickle
Loading uci_indian_liver dataset.
Training xgb on uci_indian_liver.
Successfully saved object to ../models/uci-indian-l