In [None]:
%load_ext jupyter_black
%load_ext autoreload
%autoreload 2

In [None]:
import sys

sys.path.append("..")

## Modeling

In [None]:
import pandas as pd
from utils.utils import set_seed

set_seed()

### Load data

In [None]:
ds_train = pd.read_csv("../data/avila/avila-tr.txt", header=None)
ds_test = pd.read_csv("../data/avila/avila-ts.txt", header=None)

### Models

In [None]:
# KNN
from sklearn.neighbors import KNeighborsClassifier

# LVQ
from sklvq import GLVQ

# SVM
from sklearn.svm import SVC

# Decision Tree
from sklearn.tree import DecisionTreeClassifier

# Random Forest
from sklearn.ensemble import RandomForestClassifier

# XGBoost
from xgboost import XGBClassifier

# MLP
from sklearn.neural_network import MLPClassifier

### Training

In [None]:
!jupyter nbextension enable --py widgetsnbextension

In [None]:
# Metrics
from sklearn.metrics import accuracy_score, precision_score, f1_score, recall_score

# import label encoder
from sklearn.preprocessing import LabelEncoder


# Compute metrics
def compute_metrics(y_true, y_pred):
    return {
        "accuracy": accuracy_score(y_true, y_pred),
        "precision": precision_score(y_true, y_pred, average="macro", zero_division=1),
        "recall": recall_score(y_true, y_pred, average="macro", zero_division=1),
        "f1": f1_score(y_true, y_pred, average="macro", zero_division=1),
    }


# Split data
X_train = ds_train.iloc[:, :-1]
y_train = ds_train.iloc[:, -1]
X_test = ds_test.iloc[:, :-1]
y_test = ds_test.iloc[:, -1]

label_encoder = LabelEncoder().fit(y_train)
# y_train = label_encoder.transform(y_train)
# y_test = label_encoder.transform(y_test)

# Define models to train
models = [
    KNeighborsClassifier(),
    GLVQ(),
    SVC(),
    DecisionTreeClassifier(),
    RandomForestClassifier(),
    XGBClassifier(),
    MLPClassifier(),
]

In [11]:
%%time

models_results = {model.__class__.__name__: {} for model in models}

for model in models:
    print("Training model: ", model.__class__.__name__)
    model.fit(X_train, y_train)
    metrics = compute_metrics(y_test, model.predict(X_test))
    models_results[model.__class__.__name__] = metrics

Training model:  KNeighborsClassifier
Training model:  GLVQ
Training model:  SVC
Training model:  DecisionTreeClassifier
Training model:  RandomForestClassifier
Training model:  XGBClassifier
Training model:  MLPClassifier
CPU times: total: 34 s
Wall time: 34 s




In [12]:
models_results = pd.DataFrame(models_results).T
models_results

Unnamed: 0,accuracy,precision,recall,f1
KNeighborsClassifier,0.749545,0.809206,0.665667,0.718285
GLVQ,0.529462,0.3893,0.408313,0.353025
SVC,0.674523,0.820989,0.532343,0.569366
DecisionTreeClassifier,0.969436,0.951906,0.964925,0.957916
RandomForestClassifier,0.982658,0.989757,0.979907,0.984751
XGBClassifier,0.881958,0.943424,0.904685,0.921704
MLPClassifier,0.791607,0.836781,0.778383,0.798748
