## Load data, define models and search grids

In [None]:
# Load libraries
import os
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC, LinearSVC
from sklearn.base import clone
from xgboost import XGBClassifier
from modules.utils import load_object, load_dataset
from modules.training import fit_model

In [None]:
# Create dataset paths and names
training_dir = '../data/processed/train'

# os.listdir returns entries in arbitrary order and also lists hidden entries
# (names starting with '.'). Sort for a reproducible training order and skip
# hidden entries so they are not mistaken for datasets.
dataset_paths = sorted(
    entry for entry in os.listdir(training_dir) if not entry.startswith('.')
)

# The dataset name is the part of the file name before the first hyphen
dataset_names = [path.split('-')[0] for path in dataset_paths]

# (name, file name) pairs consumed by the training loop
dataset_name_paths = list(zip(dataset_names, dataset_paths))

## Define classifiers and grids

In [None]:
# Base classifiers, all seeded with the same random_state for reproducibility.
# LogisticRegression uses the saga solver so that both the L1 and L2 penalties
# in the search grid are supported.
base_logreg = LogisticRegression(
    random_state=42,
    solver='saga',
    max_iter=3000,
)
base_dtree = DecisionTreeClassifier(random_state=42)
base_rf = RandomForestClassifier(random_state=42)
base_xgb = XGBClassifier(random_state=42)
base_svc = SVC(random_state=42)

In [None]:
# Hyperparameter search grids, one per base classifier.
# Keys follow the '<step>__<parameter>' naming, with the step matching the
# model identifier ('logreg', 'dt', 'rf', 'xgb', 'svc').

# Logistic Regression: penalty type and inverse regularization strength
# (C spans 10^-2 .. 10^2)
logreg_grid = {
    'logreg__penalty': ['l2', 'l1'],
    'logreg__C': [10**exp for exp in range(-2, 3)],
}

# Decision Tree: unlimited depth or 3/5/7/9, plus split and leaf size limits
dtree_grid = {
    'dt__max_depth': [None, *range(3, 11, 2)],
    'dt__min_samples_split': list(range(2, 14, 4)),
    'dt__min_samples_leaf': [1, 10, 20],
}

# Random Forest: ensemble size on top of the same tree constraints
rf_grid = {
    'rf__n_estimators': [100, 200, 400, 600],
    'rf__max_depth': [None, *range(3, 11, 2)],
    'rf__min_samples_split': list(range(2, 14, 4)),
    'rf__min_samples_leaf': [1, 10, 20],
}

# XGBoost: number of trees, tree depth, child weight, and row/column sampling
xgb_grid = {
    'xgb__n_estimators': [100, 200, 400],
    'xgb__max_depth': [3, 4, 5],
    'xgb__min_child_weight': [1, 3, 5],
    'xgb__subsample': [0.7, 0.8, 0.9, 1],
    'xgb__colsample_bytree': [0.7, 0.8, 0.9, 1],
}

# SVC: regularization strength and kernel coefficient (both 10^-2 .. 10^2)
svc_grid = {
    'svc__C': [10**exp for exp in range(-2, 3)],
    'svc__gamma': [10**exp for exp in range(-2, 3)],
}

In [None]:
# Bundle each model identifier with its base classifier and search grid

# One (name, estimator, grid) triple per model, kept side by side so the
# three pieces cannot drift out of alignment
model_grids = [
    ('logreg', base_logreg, logreg_grid),
    ('dt', base_dtree, dtree_grid),
    ('rf', base_rf, rf_grid),
    ('xgb', base_xgb, xgb_grid),
    ('svc', base_svc, svc_grid),
]

# Model identifiers, in the same order as the triples
models = [model_name for model_name, _, _ in model_grids]

# Base classifier instances (aligned with model names)
base_estimators = [estimator for _, estimator, _ in model_grids]

# Hyperparameter grids (aligned with model names)
grids = [grid for _, _, grid in model_grids]

In [None]:
# Base classifiers used for the larger datasets, seeded like the regular ones
base_logreg_big = LogisticRegression(
    random_state=42,
    solver='saga',
    max_iter=3000,
)
base_dtree_big = DecisionTreeClassifier(random_state=42)
base_rf_big = RandomForestClassifier(random_state=42)
base_xgb_big = XGBClassifier(random_state=42)

# LinearSVC replaces the rbf-kernel SVC here, because the rbf kernel scales
# poorly with dataset size
base_svc_big = LinearSVC(random_state=42)

In [None]:
# Hyperparameter search grids for the larger datasets.
# The Random Forest and XGBoost grids are cut down to keep tuning time
# manageable; keys follow the same '<step>__<parameter>' naming.

# Logistic Regression: penalty type and inverse regularization strength
# (C spans 10^-2 .. 10^2)
logreg_grid_big = {
    'logreg__penalty': ['l2', 'l1'],
    'logreg__C': [10**exp for exp in range(-2, 3)],
}

# Decision Tree: unlimited depth or 3/5/7/9, plus split and leaf size limits
dtree_grid_big = {
    'dt__max_depth': [None, *range(3, 11, 2)],
    'dt__min_samples_split': list(range(2, 14, 4)),
    'dt__min_samples_leaf': [1, 10, 20],
}

# Random Forest: fixed ensemble size, fewer depth and split options
rf_grid_big = {
    'rf__n_estimators': [400],
    'rf__max_depth': [None, *range(3, 7, 2)],
    'rf__min_samples_split': list(range(2, 10, 4)),
}

# XGBoost: fixed number of trees, fewer sampling options
xgb_grid_big = {
    'xgb__n_estimators': [400],
    'xgb__max_depth': [3, 4, 5],
    'xgb__min_child_weight': [1, 3, 5],
    'xgb__subsample': [0.8, 1],
    'xgb__colsample_bytree': [0.8, 1],
}

# Linear SVC: regularization strength only (C spans 10^-2 .. 10^5)
svc_grid_big = {
    'svc__C': [10**exp for exp in range(-2, 6)],
}

In [None]:
# Large datasets and the estimator/grid bundles used to tune on them

# Names of the datasets handled with the big-dataset configuration
big_datasets = [
    'kaggle-credit-card-fraud',
    'kaggle-patient-survival',
    'secondary-mushroom',
    'uci-android-permissions',
    'uci-phishing-url',
]

# Base classifiers for big datasets, in the same order as `models`
base_estimators_big = [
    base_logreg_big,
    base_dtree_big,
    base_rf_big,
    base_xgb_big,
    base_svc_big,
]

# Search grids for big datasets, in the same order as `models`
grids_big = [
    logreg_grid_big,
    dtree_grid_big,
    rf_grid_big,
    xgb_grid_big,
    svc_grid_big,
]

# (model name, estimator, grid) triples for big dataset runs
model_grids_big = [*zip(models, base_estimators_big, grids_big)]

## Train models

In [None]:
# Train and hyperparameter-tune every model on every dataset
for name, dataset_path in dataset_name_paths:
    print(f'Loading {name} dataset.')

    # Load training data; the last column is the target, the rest are features
    train_dataset = load_dataset(os.path.join(training_dir, dataset_path))
    X_train = train_dataset.iloc[:, :-1]
    y_train = train_dataset.iloc[:, -1]

    # Hyphenated form of the dataset name. `name` is the file-name prefix
    # before the first '-', so it never contains a hyphen itself, whereas the
    # entries of big_datasets are hyphenated. Comparing the raw name against
    # big_datasets could therefore never match; compare the hyphenated form.
    # NOTE(review): assumes file names separate words with underscores
    # (e.g. 'uci_phishing_url-...'); confirm against the data directory.
    dataset_key = name.replace('_', '-')

    # Directory for saving trained models
    model_dir = os.path.join('../models', dataset_key)

    # Big datasets: reduced grids with 3-fold CV.
    # Regular datasets: full grids with 5-fold CV.
    if dataset_key in big_datasets:
        selected_model_grids, n_folds = model_grids_big, 3
    else:
        selected_model_grids, n_folds = model_grids, 5

    for model_name, base_estimator, grid in selected_model_grids:
        print(f'Training {model_name} on {name}.')
        save_path = os.path.join(model_dir, f'{name}-{model_name}-full')

        # Fit a fresh clone so the shared base estimator is never mutated,
        # run the grid search, and save the result
        fit_model(
            X_train=X_train,
            y_train=y_train,
            model_name=model_name,
            model=clone(base_estimator),
            grid_search=True,
            param_grid=grid,
            cv=n_folds,
            save=True,
            save_path=save_path,
        )