In [1]:
%load_ext jupyter_black
%load_ext autoreload
%autoreload 2

In [2]:
import sys

sys.path.append("..")

## Modeling

In [3]:
import pandas as pd
from utils.utils import set_seed

set_seed()

### Load data

In [4]:
ds_train = pd.read_csv("../data/avila/avila-tr.txt", header=None)
ds_test = pd.read_csv("../data/avila/avila-ts.txt", header=None)

### Models

In [8]:
# -- Required models --

# KNN
from sklearn.neighbors import KNeighborsClassifier

# LVQ
from sklvq import GLVQ

# Decision Tree
from sklearn.tree import DecisionTreeClassifier

# SVM
from sklearn.svm import SVC

# Random Forest
from sklearn.ensemble import RandomForestClassifier

# MLP
from sklearn.neural_network import MLPClassifier

# Ensemble
from sklearn.ensemble import VotingClassifier

# -- Optional models --

# XGBoost
from xgboost import XGBClassifier

In [9]:
# -- Auxiliar functions --

from time import time


# Compute Training Time
class Timer:
    def __init__(self):
        self.start = time()

    def __call__(self):
        return time() - self.start

### Training

In [18]:
# Metrics
from sklearn.metrics import accuracy_score, precision_score, f1_score, recall_score

# import label encoder
from sklearn.preprocessing import LabelEncoder


# Compute metrics
def compute_metrics(y_true, y_pred):
    return {
        "accuracy": accuracy_score(y_true, y_pred),
        "precision": precision_score(y_true, y_pred, average="macro", zero_division=1),
        "recall": recall_score(y_true, y_pred, average="macro", zero_division=1),
        "f1": f1_score(y_true, y_pred, average="macro", zero_division=1),
    }


# Split data
X_train = ds_train.iloc[:, :-1]
y_train = ds_train.iloc[:, -1]
X_test = ds_test.iloc[:, :-1]
y_test = ds_test.iloc[:, -1]

# label_encoder = LabelEncoder().fit(y_train)
# y_train = label_encoder.transform(y_train)
# y_test = label_encoder.transform(y_test)

#### Training - Baseline

In [19]:
%%time

models_results = {model.__class__.__name__: {} for model in models}

models = [
    KNeighborsClassifier(),
    GLVQ(),
    SVC(),
    DecisionTreeClassifier(),
    RandomForestClassifier(),
    XGBClassifier(),
    MLPClassifier(),
]

for model in models:
    print("Training model: ", model.__class__.__name__)

    timer = Timer()
    model.fit(X_train, y_train)
    metrics = compute_metrics(y_test, model.predict(X_test))
    metrics["fit_time"] = timer()
    models_results[model.__class__.__name__] = metrics

Training model:  KNeighborsClassifier
Training model:  GLVQ
Training model:  SVC
Training model:  DecisionTreeClassifier
Training model:  RandomForestClassifier
Training model:  XGBClassifier
Training model:  MLPClassifier
CPU times: total: 34.7 s
Wall time: 34.7 s




In [20]:
models_results = pd.DataFrame(models_results).T
models_results

Unnamed: 0,accuracy,precision,recall,f1,fit_time
KNeighborsClassifier,0.749545,0.809206,0.665667,0.718285,0.968159
GLVQ,0.544122,0.402878,0.431919,0.373511,14.328001
SVC,0.674523,0.820989,0.532343,0.569366,7.431895
DecisionTreeClassifier,0.968382,0.951021,0.962684,0.956413,0.182029
RandomForestClassifier,0.985915,0.99104,0.982427,0.986676,2.039851
XGBClassifier,0.881958,0.943424,0.904685,0.921704,5.209744
MLPClassifier,0.784038,0.835033,0.761998,0.786684,4.582967


### Stratified KFold

In [21]:
from sklearn.model_selection import StratifiedKFold, cross_validate
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler


def stratified_cross_val(model_list, data, n_folds=30, n_splits=10):
    model_results = {type(model).__name__: [] for model in model_list}

    cv = StratifiedKFold(n_splits=n_splits, shuffle=True)

    X, y = data.iloc[:, :-1], data.iloc[:, -1]

    for model in model_list:
        pipeline = Pipeline(
            [
                # ("scaler", StandardScaler()), # uncomment to scale data
                ("model", model)
            ]
        )

        for _ in range(n_folds):
            iter_scores = cross_validate(
                pipeline,
                X,
                y,
                cv=cv,
                scoring={
                    "f1_score": "f1_macro",
                    "precision_score": "precision_macro",
                    "accuracy_score": "accuracy",
                    "recall_score": "recall_macro",
                },
                n_jobs=-1,
            )
            model_results[type(model).__name__].append(iter_scores)

    return model_results


def model_results_to_df(model_results):
    res_final = pd.DataFrame(
        columns=[
            "fit_time",
            "score_time",
            "test_f1_score",
            "test_precision_score",
            "test_accuracy_score",
            "test_recall_score",
            "split",
            "model",
        ]
    )

    for model in model_results.keys():
        for i in range(len(model_results[model])):
            _res = pd.DataFrame(
                model_results[model][i].values(),
                index=model_results[model][i].keys(),
            ).T
            _res["split"] = i
            _res["model"] = model
            res_final = pd.concat([res_final, _res])

    return res_final

In [22]:
models = [
    KNeighborsClassifier(),
    GLVQ(),
    SVC(),
    DecisionTreeClassifier(),
    RandomForestClassifier(),
    XGBClassifier(),
    MLPClassifier(),
]

stratified_crossval_results = model_results_to_df(
    stratified_cross_val(models, ds_train, n_folds=10, n_splits=5)
)

In [24]:
stratified_crossval_results.groupby("model").mean()

Unnamed: 0_level_0,fit_time,score_time,test_f1_score,test_precision_score,test_accuracy_score,test_recall_score,split
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
DecisionTreeClassifier,0.052518,0.030884,0.93278,0.932548,0.963087,0.93508,4.5
GLVQ,13.622147,0.027879,0.37058,0.409298,0.54559,0.433158,4.5
KNeighborsClassifier,0.022398,0.206566,0.66603,0.75518,0.721333,0.619631,4.5
MLPClassifier,4.583317,0.021257,0.762875,0.811616,0.771802,0.739916,4.5
RandomForestClassifier,1.717905,0.069142,0.970916,0.981709,0.979904,0.961276,4.5
SVC,1.576511,1.071704,0.558065,0.754996,0.670662,0.519976,4.5
XGBClassifier,4.662992,0.099553,0.913161,0.926449,0.871927,0.907041,4.5
