In [6]:
import pandas as pd


# Empty results table that the model cells below add rows to.
# Each key is a column name; each value is the dtype the column is cast to.
result_column_dtypes = {
    'Model': str,
    'Accuracy': float,
    'Recall': float,
    'ROC-AUC': float,
    'PR-AUC': float,
}

# Start every column from an empty list, then apply the dtypes in one cast.
empty_columns = {column: [] for column in result_column_dtypes}
results_df = pd.DataFrame(empty_columns).astype(result_column_dtypes)

In [None]:
from module import (
    evaluate_and_append,
    X,
    y,
    skf
)

from imblearn.pipeline import Pipeline as ImbPipeline
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV


# Standardise the features, then classify with k-nearest neighbours.
# NOTE(review): this is imblearn's Pipeline, but no resampling step is
# configured in it -- confirm whether a sampler was intended.
pipeline_knn = ImbPipeline([
    ('scaler', StandardScaler()),
    ('knn', KNeighborsClassifier())
])

# 8 neighbour counts x 2 weightings x 2 metrics = 32 candidate settings.
# NOTE(review): the saved output of this cell reports 64 candidates and a
# 'knn__p' entry, so it was produced by an earlier version of this grid;
# re-run the cell to refresh it.
param_grid_knn = {
    'knn__n_neighbors': [3, 5, 7, 11, 15, 21, 31, 51],
    'knn__weights': ['uniform', 'distance'],
    'knn__metric': ['euclidean', 'manhattan']
}

# Exhaustive search over the grid; candidates are ranked by recall, with
# `skf` passed as the cross-validation splitter.
grid_knn = GridSearchCV(
    estimator=pipeline_knn,
    param_grid=param_grid_knn,
    cv=skf,
    scoring='recall',
    n_jobs=-1,
    verbose=1,
)

grid_knn.fit(X, y)

print('KNN best params:', grid_knn.best_params_)
print('KNN best recall:', grid_knn.best_score_)

# Keep the cell idempotent: if an earlier run already stored a row for this
# model, drop it first so re-running replaces the row instead of appending a
# duplicate. On a fresh kernel the filter removes nothing.
knn_model_name = 'KNeighborsClassifier'
results_without_knn = results_df.loc[
    results_df['Model'] != knn_model_name
].reset_index(drop=True)

results_df = evaluate_and_append(
    model_name=knn_model_name,
    best_estimator=grid_knn.best_estimator_,
    X=X, y=y, cv=skf,
    results_df=results_without_knn,
)

print(results_df)

Fitting 5 folds for each of 64 candidates, totalling 320 fits
KNN best params: {'knn__metric': 'euclidean', 'knn__n_neighbors': 3, 'knn__p': 1, 'knn__weights': 'distance'}
KNN best recall: 0.053342816500711245
                  Model  Accuracy    Recall   ROC-AUC   PR-AUC
0  KNeighborsClassifier  0.933699  0.053476  0.565712  0.06636


In [8]:
from module import (
    evaluate_and_append,
    X,
    y,
    skf
)

from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV


# Decision tree with scikit-learn's 'balanced' class weighting and a fixed
# seed so repeated runs build the same trees.
dt = DecisionTreeClassifier(class_weight='balanced', random_state=42)

# 5 depths x 4 leaf sizes x 3 split sizes x 2 criteria x 2 feature rules
# = 240 candidate settings.
param_grid_dt = {
    'max_depth': [6, 8, 10, 12, 15],
    'min_samples_leaf': [1, 2, 5, 10],
    'min_samples_split': [2, 5, 10],
    'criterion': ['gini', 'entropy'],
    'max_features': ['sqrt', 'log2']
}

# Exhaustive search over the grid; candidates are ranked by recall, with
# `skf` passed as the cross-validation splitter.
grid_dt = GridSearchCV(
    estimator=dt,
    param_grid=param_grid_dt,
    cv=skf,
    scoring='recall',
    n_jobs=-1,
    verbose=1
)

grid_dt.fit(X, y)

print('DecisionTree best params:', grid_dt.best_params_)
print('DecisionTree recall:', grid_dt.best_score_)

# Keep the cell idempotent: if an earlier run already stored a row for this
# model, drop it first so re-running replaces the row instead of appending a
# duplicate. On a fresh kernel the filter removes nothing.
dt_model_name = 'DecisionTreeClassifier'
results_without_dt = results_df.loc[
    results_df['Model'] != dt_model_name
].reset_index(drop=True)

results_df = evaluate_and_append(
    model_name=dt_model_name,
    best_estimator=grid_dt.best_estimator_,
    X=X, y=y, cv=skf,
    results_df=results_without_dt,
)

print(results_df)

Fitting 5 folds for each of 240 candidates, totalling 1200 fits
DecisionTree best params: {'criterion': 'gini', 'max_depth': 6, 'max_features': 'sqrt', 'min_samples_leaf': 10, 'min_samples_split': 2}
DecisionTree recall: 0.8660028449502134
                    Model  Accuracy    Recall   ROC-AUC    PR-AUC
0    KNeighborsClassifier  0.933699  0.053476  0.565712  0.066360
1  DecisionTreeClassifier  0.621248  0.866310  0.774375  0.123078


In [9]:
from module import (
    evaluate_and_append,
    X,
    y,
    skf
)

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV


# Random forest with a fixed seed and 'balanced_subsample' class weighting.
# NOTE(review): n_jobs=-1 is set here and again on GridSearchCV below, so the
# two levels of parallelism may oversubscribe the CPU -- confirm this is
# intended. Left as is because the tuned estimator (with its n_jobs) is also
# what gets passed to evaluate_and_append.
rf = RandomForestClassifier(
    random_state=42,
    class_weight='balanced_subsample',
    n_jobs=-1
)

# 2 forest sizes x 3 depths x 2 leaf sizes x 3 split sizes x 2 feature rules
# x 2 criteria = 144 candidate settings.
param_grid_rf = {
    'n_estimators': [300, 500],
    'max_depth': [8, 12, None],
    'min_samples_leaf': [2, 5],
    'min_samples_split': [2, 5, 10],
    'max_features': ['sqrt', 'log2'],
    'criterion': ['gini', 'entropy']
}

# Exhaustive search over the grid; candidates are ranked by recall, with
# `skf` passed as the cross-validation splitter.
grid_rf = GridSearchCV(
    estimator=rf,
    param_grid=param_grid_rf,
    cv=skf,
    scoring='recall',
    n_jobs=-1,
    verbose=1
)

grid_rf.fit(X, y)

print('RandomForest best params:', grid_rf.best_params_)
print('RandomForest best recall:', grid_rf.best_score_)

# Keep the cell idempotent: if an earlier run already stored a row for this
# model, drop it first so re-running replaces the row instead of appending a
# duplicate. On a fresh kernel the filter removes nothing.
rf_model_name = 'RandomForestClassifier'
results_without_rf = results_df.loc[
    results_df['Model'] != rf_model_name
].reset_index(drop=True)

results_df = evaluate_and_append(
    model_name=rf_model_name,
    best_estimator=grid_rf.best_estimator_,
    X=X, y=y, cv=skf,
    results_df=results_without_rf,
)

print(results_df)

Fitting 5 folds for each of 144 candidates, totalling 720 fits
RandomForest best params: {'criterion': 'entropy', 'max_depth': 8, 'max_features': 'sqrt', 'min_samples_leaf': 5, 'min_samples_split': 2, 'n_estimators': 300}
RandomForest best recall: 0.5766714082503556
                    Model  Accuracy    Recall   ROC-AUC    PR-AUC
0    KNeighborsClassifier  0.933699  0.053476  0.565712  0.066360
1  DecisionTreeClassifier  0.621248  0.866310  0.774375  0.123078
2  RandomForestClassifier  0.818585  0.577540  0.815964  0.147802
