In [1]:
import pandas as pd


results_df = pd.DataFrame(
    {
        'Model': [],
        'Accuracy': [],
        'Recall': [],
        'ROC-AUC': [],
        'PR-AUC': [],
    }
).astype(
    {
        'Model': str,
        'Accuracy': float,
        'Recall': float,
        'ROC-AUC': float,
        'PR-AUC': float,
    }
)

In [2]:
from module import (
    evaluate_and_append,
    X,
    y,
    skf
)

from imblearn.pipeline import Pipeline as ImbPipeline
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV
from imblearn.under_sampling import RandomUnderSampler

pipeline_knn = ImbPipeline([
    ('undersampler', RandomUnderSampler(random_state=42)),
    ('scaler', StandardScaler()),
    ('knn', KNeighborsClassifier())
])

param_grid_knn = {
    'undersampler__sampling_strategy': [0.7, 1.0],
    'knn__n_neighbors': [3, 5, 7, 11, 15, 21, 31, 51],
    'knn__weights': ['uniform', 'distance'],
    'knn__metric': ['euclidean', 'manhattan']
}

grid_knn = GridSearchCV(
    estimator=pipeline_knn,
    param_grid=param_grid_knn,
    cv=skf,
    scoring='recall',
    n_jobs=-1,
    verbose=1
)

grid_knn.fit(X, y)

print('KNN best params:', grid_knn.best_params_)
print('KNN best recall:', grid_knn.best_score_)

results_df = evaluate_and_append(
    model_name='KNeighborsClassifier+RUS',
    best_estimator=grid_knn.best_estimator_,
    X=X, y=y, cv=skf,
    results_df=results_df
)

print(results_df)

Fitting 5 folds for each of 64 candidates, totalling 320 fits
KNN best params: {'knn__metric': 'euclidean', 'knn__n_neighbors': 31, 'knn__weights': 'uniform', 'undersampler__sampling_strategy': 1.0}
KNN best recall: 0.8927453769559033
                      Model  Accuracy    Recall   ROC-AUC    PR-AUC
0  KNeighborsClassifier+RUS  0.659619  0.893048  0.808198  0.137083


In [3]:
from module import (
    evaluate_and_append,
    X,
    y,
    skf,
)

from imblearn.pipeline import Pipeline as ImbPipeline
from imblearn.under_sampling import RandomUnderSampler
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV


pipeline_dt = ImbPipeline([
    ('undersampler', RandomUnderSampler(random_state=42)),
    ('dt', DecisionTreeClassifier(random_state=42))
])

param_grid_dt = {
    'undersampler__sampling_strategy': [0.7, 1.0],
    'dt__max_depth': [6, 8, 10, 12, 15],
    'dt__min_samples_leaf': [1, 2, 5, 10],
    'dt__min_samples_split': [2, 5, 10],
    'dt__criterion': ['gini', 'entropy'],
    'dt__max_features': ['sqrt', 'log2']
}

grid_dt = GridSearchCV(
    estimator=pipeline_dt,
    param_grid=param_grid_dt,
    cv=skf,
    scoring='recall',
    n_jobs=-1,
    verbose=1,
)

grid_dt.fit(X, y)

print('DecisionTree best params:', grid_dt.best_params_)
print('DecisionTree best recall:', grid_dt.best_score_)

results_df = evaluate_and_append(
    model_name='DecisionTreeClassifier+RUS',
    best_estimator=grid_dt.best_estimator_,
    X=X, y=y, cv=skf,
    results_df=results_df
)

print(results_df)

Fitting 5 folds for each of 480 candidates, totalling 2400 fits
DecisionTree best params: {'dt__criterion': 'entropy', 'dt__max_depth': 6, 'dt__max_features': 'sqrt', 'dt__min_samples_leaf': 10, 'dt__min_samples_split': 2, 'undersampler__sampling_strategy': 1.0}
DecisionTree best recall: 0.9089615931721194
                        Model  Accuracy    Recall   ROC-AUC    PR-AUC
0    KNeighborsClassifier+RUS  0.659619  0.893048  0.808198  0.137083
1  DecisionTreeClassifier+RUS  0.635082  0.909091  0.790960  0.127185


In [4]:
from module import (
    evaluate_and_append,
    X,
    y,
    skf
)

from imblearn.pipeline import Pipeline as ImbPipeline
from imblearn.under_sampling import RandomUnderSampler
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV


pipeline_rf = ImbPipeline([
    ('rus', RandomUnderSampler(random_state=42)),
    ('rf', RandomForestClassifier(random_state=42, n_jobs=-1))
])

param_grid_rf = {
    'rus__sampling_strategy': [0.7, 1.0],
    'rf__n_estimators': [300, 500],
    'rf__max_depth': [8, 12, None],
    'rf__min_samples_leaf': [2, 5],
    'rf__min_samples_split': [2, 5, 10],
    'rf__max_features': ['sqrt', 'log2'],
    'rf__criterion': ['gini', 'entropy']
}

grid_rf = GridSearchCV(
    estimator=pipeline_rf,
    param_grid=param_grid_rf,
    cv=skf,
    scoring='recall',
    n_jobs=-1,
    verbose=1,
)

grid_rf.fit(X, y)

print('RandomForest best params:', grid_rf.best_params_)
print('RandomForest best recall:', grid_rf.best_score_)

results_df = evaluate_and_append(
    model_name='RandomForestClassifier+RUS',
    best_estimator=grid_rf.best_estimator_,
    X=X, y=y, cv=skf,
    results_df=results_df
)

print(results_df)

Fitting 5 folds for each of 288 candidates, totalling 1440 fits
RandomForest best params: {'rf__criterion': 'gini', 'rf__max_depth': 8, 'rf__max_features': 'sqrt', 'rf__min_samples_leaf': 5, 'rf__min_samples_split': 2, 'rf__n_estimators': 500, 'rus__sampling_strategy': 1.0}
RandomForest best recall: 0.9142247510668563
                        Model  Accuracy    Recall   ROC-AUC    PR-AUC
0    KNeighborsClassifier+RUS  0.659619  0.893048  0.808198  0.137083
1  DecisionTreeClassifier+RUS  0.635082  0.909091  0.790960  0.127185
2  RandomForestClassifier+RUS  0.652049  0.914439  0.827888  0.148221


In [5]:
from module import (
    evaluate_and_append,
    X,
    y,
    skf,
    CAT_FEATURES
)

from imblearn.pipeline import Pipeline as ImbPipeline
from imblearn.under_sampling import RandomUnderSampler
from catboost import CatBoostClassifier
from sklearn.model_selection import GridSearchCV


pipeline_cb = ImbPipeline([
    ('rus', RandomUnderSampler(random_state=42)),
    ('cb', CatBoostClassifier(
        random_state=42,
        thread_count=-1,
        verbose=False,
        od_type='Iter',
        od_wait=50,
        eval_metric='Recall',
        cat_features=CAT_FEATURES
    ))
])

param_grid_cb = {
    'rus__sampling_strategy': [0.7, 1.0],
    'cb__iterations': [1000],
    'cb__learning_rate': [0.03, 0.06, 0.1],
    'cb__depth': [6, 8],
    'cb__l2_leaf_reg': [3, 7]
}

grid_cb = GridSearchCV(
    estimator=pipeline_cb,
    param_grid=param_grid_cb,
    cv=skf,
    scoring='recall',
    n_jobs=1,
    verbose=1
)

grid_cb.fit(X, y)

print('CatBoost best params:', grid_cb.best_params_)
print('CatBoost best recall:', grid_cb.best_score_)

results_df = evaluate_and_append(
    model_name='CatBoostClassifier+RUS',
    best_estimator=grid_cb.best_estimator_,
    X=X, y=y, cv=skf,
    results_df=results_df
)

print(results_df)

Fitting 5 folds for each of 24 candidates, totalling 120 fits
CatBoost best params: {'cb__depth': 6, 'cb__iterations': 1000, 'cb__l2_leaf_reg': 7, 'cb__learning_rate': 0.1, 'rus__sampling_strategy': 1.0}
CatBoost best recall: 0.8500711237553343
                        Model  Accuracy    Recall   ROC-AUC    PR-AUC
0    KNeighborsClassifier+RUS  0.659619  0.893048  0.808198  0.137083
1  DecisionTreeClassifier+RUS  0.635082  0.909091  0.790960  0.127185
2  RandomForestClassifier+RUS  0.652049  0.914439  0.827888  0.148221
3      CatBoostClassifier+RUS  0.694597  0.850267  0.809535  0.137357


In [6]:
from module import (
    evaluate_and_append,
    X,
    y,
    skf
)

from imblearn.pipeline import Pipeline as ImbPipeline
from imblearn.under_sampling import RandomUnderSampler
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV


pipeline_xgb = ImbPipeline([
    ('rus', RandomUnderSampler(random_state=42)),
    ('xgb', XGBClassifier(
        random_state=42,
        n_jobs=-1,
        verbosity=0,
        eval_metric='aucpr',
        tree_method='hist'
    ))
])

param_grid_xgb = {
    'rus__sampling_strategy': [0.7, 1.0],
    'xgb__n_estimators': [1000],
    'xgb__learning_rate': [0.03, 0.06, 0.1],
    'xgb__max_depth': [6, 8],
    'xgb__reg_lambda': [3, 7],
    'xgb__min_child_weight': [1, 3],
    'xgb__gamma': [0, 0.1, 0.3]
}

grid_xgb = GridSearchCV(
    estimator=pipeline_xgb,
    param_grid=param_grid_xgb,
    cv=skf,
    scoring='recall',
    n_jobs=1,
    verbose=1,
    return_train_score=False
)

grid_xgb.fit(X, y)

print('XGBoost best params:', grid_xgb.best_params_)
print('XGBoost best recall:', grid_xgb.best_score_)

results_df = evaluate_and_append(
    model_name='XGBoostClassifier+RUS',
    best_estimator=grid_xgb.best_estimator_,
    X=X, y=y, cv=skf,
    results_df=results_df
)

print(results_df)

Fitting 5 folds for each of 144 candidates, totalling 720 fits
XGBoost best params: {'rus__sampling_strategy': 1.0, 'xgb__gamma': 0.3, 'xgb__learning_rate': 0.1, 'xgb__max_depth': 6, 'xgb__min_child_weight': 1, 'xgb__n_estimators': 1000, 'xgb__reg_lambda': 7}
XGBoost best recall: 0.8930298719772404




                        Model  Accuracy    Recall   ROC-AUC    PR-AUC
0    KNeighborsClassifier+RUS  0.659619  0.893048  0.808198  0.137083
1  DecisionTreeClassifier+RUS  0.635082  0.909091  0.790960  0.127185
2  RandomForestClassifier+RUS  0.652049  0.914439  0.827888  0.148221
3      CatBoostClassifier+RUS  0.694597  0.850267  0.809535  0.137357
4       XGBoostClassifier+RUS  0.691464  0.893048  0.820657  0.139393


In [7]:
from module import (
    evaluate_and_append,
    X,
    y,
    skf
)

from imblearn.pipeline import Pipeline as ImbPipeline
from imblearn.under_sampling import RandomUnderSampler
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV


pipeline_mlp = ImbPipeline([
    ('rus', RandomUnderSampler(random_state=42)),
    ('scaler', StandardScaler()),
    ('mlp', MLPClassifier(
        random_state=42,
        max_iter=1000,
        early_stopping=True,
        n_iter_no_change=50,
        validation_fraction=0.1,
        verbose=False
    ))
])

param_grid_mlp = {
    'rus__sampling_strategy': [0.7, 1.0],
    'mlp__hidden_layer_sizes': [(100,), (150,), (100, 50)],
    'mlp__activation': ['relu', 'tanh'],
    'mlp__solver': ['adam', 'lbfgs'],
    'mlp__alpha': [0.001, 0.01],
    'mlp__learning_rate_init': [0.001, 0.005],
    'mlp__batch_size': ['auto', 256]
}

grid_mlp = GridSearchCV(
    estimator=pipeline_mlp,
    param_grid=param_grid_mlp,
    cv=skf,
    scoring='recall',
    n_jobs=-1,
    verbose=1
)

grid_mlp.fit(X, y)

print('MLP best params:', grid_mlp.best_params_)
print('MLP best recall:', grid_mlp.best_score_)

results_df = evaluate_and_append(
    model_name='MLPClassifier+RUS',
    best_estimator=grid_mlp.best_estimator_,
    X=X, y=y, cv=skf,
    results_df=results_df
)

print(results_df)

Fitting 5 folds for each of 192 candidates, totalling 960 fits
MLP best params: {'mlp__activation': 'tanh', 'mlp__alpha': 0.001, 'mlp__batch_size': 'auto', 'mlp__hidden_layer_sizes': (100,), 'mlp__learning_rate_init': 0.001, 'mlp__solver': 'adam', 'rus__sampling_strategy': 1.0}
MLP best recall: 0.8980085348506401
                        Model  Accuracy    Recall   ROC-AUC    PR-AUC
0    KNeighborsClassifier+RUS  0.659619  0.893048  0.808198  0.137083
1  DecisionTreeClassifier+RUS  0.635082  0.909091  0.790960  0.127185
2  RandomForestClassifier+RUS  0.652049  0.914439  0.827888  0.148221
3      CatBoostClassifier+RUS  0.694597  0.850267  0.809535  0.137357
4       XGBoostClassifier+RUS  0.691464  0.893048  0.820657  0.139393
5           MLPClassifier+RUS  0.619159  0.898396  0.804261  0.132759
