In [1]:
import pandas as pd


# Column name -> dtype of the model-comparison table that later cells append to.
_results_schema = {
    'Model': str,
    'Accuracy': float,
    'Recall': float,
    'ROC-AUC': float,
    'PR-AUC': float,
}

# Build one empty column per schema entry, then pin the dtypes so the table is
# typed before the first row is appended.
results_df = pd.DataFrame(
    {column: [] for column in _results_schema}
).astype(_results_schema)

In [2]:
from module import (
    evaluate_and_append,
    X,
    y,
    skf
)

from imblearn.pipeline import Pipeline as ImbPipeline
from imblearn.under_sampling import RandomUnderSampler
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV


# Pipeline under test: random under-sampling step followed by a random forest.
# NOTE(review): both the forest and the search below request n_jobs=-1;
# confirm this does not oversubscribe the CPU on the target machine.
pipeline_rf = ImbPipeline([
    ('rus', RandomUnderSampler(random_state=42)),
    ('rf', RandomForestClassifier(random_state=42, n_jobs=-1))
])

# Discrete candidate values the randomized search samples from. Keys use the
# '<step>__<param>' form so they are routed to the matching pipeline step.
param_grid_rf_expanded = {
    'rus__sampling_strategy': [0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0],
    'rf__n_estimators': [300, 500, 800, 1200, 2000],
    'rf__max_depth': [6, 8, 10, 12, 15, 20, None],
    'rf__min_samples_leaf': [1, 2, 5, 10, 20, 50],
    'rf__min_samples_split': [2, 5, 10, 20, 50],
    'rf__max_features': ['sqrt', 'log2', 0.3, 0.5, 0.7, None],
    'rf__criterion': ['gini', 'entropy'],
    'rf__class_weight': [None, 'balanced', 'balanced_subsample']
}

# 200 sampled candidates, each cross-validated with `skf` and ranked by recall
# only. error_score='raise' lets a failing fit propagate instead of being
# recorded as a score.
search_rf = RandomizedSearchCV(
    estimator=pipeline_rf,
    param_distributions=param_grid_rf_expanded,
    n_iter=200,
    cv=skf,
    scoring='recall',
    n_jobs=-1,
    random_state=42,
    verbose=1,
    error_score='raise'
)

search_rf.fit(X, y)

print('RandomForest+RUS best params:', search_rf.best_params_)
print('RandomForest+RUS best recall:', search_rf.best_score_)

# Drop any row this cell appended on a previous run before appending again, so
# re-running the cell replaces its row instead of duplicating it. The index is
# reset so the helper always receives a contiguous 0..n-1 index.
# NOTE(review): the evaluation reuses the same X / y / skf the search was tuned
# on, so the reported metrics may be optimistic - confirm against the
# implementation of evaluate_and_append.
rf_model_name = 'RandomForestClassifier+RUS'
results_df = evaluate_and_append(
    model_name=rf_model_name,
    best_estimator=search_rf.best_estimator_,
    X=X, y=y, cv=skf,
    results_df=results_df[results_df['Model'] != rf_model_name].reset_index(drop=True)
)

# Last expression of the cell: rendered by the notebook as a table.
results_df

Fitting 5 folds for each of 200 candidates, totalling 1000 fits
RandomForest+RUS best params: {'rus__sampling_strategy': 0.7, 'rf__n_estimators': 500, 'rf__min_samples_split': 20, 'rf__min_samples_leaf': 50, 'rf__max_features': 'log2', 'rf__max_depth': 15, 'rf__criterion': 'gini', 'rf__class_weight': 'balanced'}
RandomForest+RUS best recall: 0.9355618776671408
                        Model  Accuracy    Recall   ROC-AUC    PR-AUC
0  RandomForestClassifier+RUS  0.617332  0.935829  0.815808  0.148762


In [None]:
from module import (
    evaluate_and_append,
    X,
    y,
    skf
)
from imblearn.pipeline import Pipeline as ImbPipeline
from imblearn.under_sampling import RandomUnderSampler
from xgboost import XGBClassifier
from sklearn.model_selection import RandomizedSearchCV
import numpy as np

# Pipeline under test: random under-sampling step followed by XGBoost.
# NOTE(review): eval_metric='aucpr' is set, but no evaluation set is passed
# anywhere in this cell - confirm whether the metric has any effect here.
pipeline_xgb = ImbPipeline([
    ('rus', RandomUnderSampler(random_state=42)),
    ('xgb', XGBClassifier(
        random_state=42,
        n_jobs=-1,
        verbosity=0,
        eval_metric='aucpr',
        tree_method='hist'
    ))
])

# Discrete candidate values the randomized search samples from. Keys use the
# '<step>__<param>' form so they are routed to the matching pipeline step.
# 'xgb__scale_pos_weight' has a single candidate, i.e. it is held fixed at 1.0.
param_dist_xgb_next = {
    'rus__sampling_strategy': [0.25, 0.3, 0.35, 0.4, 0.45, 0.5, 0.55, 0.6],
    'xgb__n_estimators': [800, 1200, 1500, 2000, 2500, 3000],
    'xgb__learning_rate': [0.005, 0.008, 0.01, 0.015, 0.02, 0.03],
    'xgb__max_depth': [4, 5, 6, 7, 8],
    'xgb__min_child_weight': [5, 8, 10, 15, 20, 30],
    'xgb__gamma': [0.1, 0.2, 0.3, 0.5],
    'xgb__reg_lambda': [5, 10, 15, 20, 30, 50],
    'xgb__reg_alpha': [0.5, 1, 2, 5],
    'xgb__subsample': [0.5, 0.6, 0.7, 0.8],
    'xgb__colsample_bytree': [0.5, 0.6, 0.7, 0.8],
    'xgb__scale_pos_weight': [1.0]
}

# 300 sampled candidates, each cross-validated with `skf` and ranked by recall
# only. The search itself runs with n_jobs=1; the XGBoost estimator above is
# the one configured with n_jobs=-1.
search_xgb = RandomizedSearchCV(
    estimator=pipeline_xgb,
    param_distributions=param_dist_xgb_next,
    n_iter=300,
    cv=skf,
    scoring='recall',
    n_jobs=1,
    random_state=42,
    verbose=1,
    error_score='raise',
    return_train_score=False
)

search_xgb.fit(X, y)

print('XGBoost + RUS best params:', search_xgb.best_params_)
print('XGBoost + RUS best recall (CV):', search_xgb.best_score_)

# Drop any row this cell appended on a previous run before appending again, so
# re-running the cell replaces its row instead of duplicating it. Rows stored
# under a different (older) model name are not touched by this filter. The
# index is reset so the helper always receives a contiguous 0..n-1 index.
# NOTE(review): the evaluation reuses the same X / y / skf the search was tuned
# on, so the reported metrics may be optimistic - confirm against the
# implementation of evaluate_and_append.
xgb_model_name = 'XGBoostClassifier + RUS (expanded)'
results_df = evaluate_and_append(
    model_name=xgb_model_name,
    best_estimator=search_xgb.best_estimator_,
    X=X,
    y=y,
    cv=skf,
    results_df=results_df[results_df['Model'] != xgb_model_name].reset_index(drop=True)
)

# Show the table once (the cell previously printed it twice). As the last
# expression of the cell it is rendered by the notebook as a table.
results_df

Fitting 5 folds for each of 300 candidates, totalling 1500 fits
XGBoost + RUS best params: {'xgb__subsample': 0.5, 'xgb__scale_pos_weight': 1.0, 'xgb__reg_lambda': 15, 'xgb__reg_alpha': 1, 'xgb__n_estimators': 3000, 'xgb__min_child_weight': 5, 'xgb__max_depth': 8, 'xgb__learning_rate': 0.005, 'xgb__gamma': 0.5, 'xgb__colsample_bytree': 0.7, 'rus__sampling_strategy': 0.6}
XGBoost + RUS best recall (CV): 0.7637268847795163
                                             Model  Accuracy    Recall  \
0                       RandomForestClassifier+RUS  0.617332  0.935829   
1  XGBoostClassifier + RUS (expanded RandomSearch)  0.048812  1.000000   
2               XGBoostClassifier + RUS (expanded)  0.772383  0.759358   

    ROC-AUC    PR-AUC  
0  0.815808  0.148762  
1  0.811442  0.132067  
2  0.832489  0.161071  
                                             Model  Accuracy    Recall  \
0                       RandomForestClassifier+RUS  0.617332  0.935829   
1  XGBoostClassifier + RUS (expand

In [10]:
from module import (
    evaluate_and_append,
    X,
    y,
    skf,
    CAT_FEATURES
)
from imblearn.pipeline import Pipeline as ImbPipeline
from imblearn.under_sampling import RandomUnderSampler
from catboost import CatBoostClassifier
from sklearn.model_selection import RandomizedSearchCV
import numpy as np


# Pipeline under test: random under-sampling step followed by CatBoost.
# NOTE(review): od_type / od_wait configure CatBoost's overfitting detector,
# but no evaluation set is passed anywhere in this cell - confirm whether the
# detector (and eval_metric='Recall') has any effect here.
pipeline_cb = ImbPipeline([
    ('rus', RandomUnderSampler(random_state=42)),
    ('cb', CatBoostClassifier(
        random_state=42,
        thread_count=-1,
        verbose=False,
        od_type='Iter',
        od_wait=50,
        eval_metric='Recall'
    ))
])

# Discrete candidate values the randomized search samples from. Keys use the
# '<step>__<param>' form so they are routed to the matching pipeline step.
param_dist_cb = {
    # 0.25, 0.30, ..., 0.80. Rounded because np.arange with a float step
    # accumulates error (the unrounded grid produced 0.7999999999999998
    # instead of 0.8).
    'rus__sampling_strategy': np.round(np.arange(0.25, 0.81, 0.05), 2).tolist(),
    'cb__iterations': [800, 1000, 1200, 1500, 2000, 2500, 3000],
    'cb__learning_rate': [0.002, 0.005, 0.008, 0.01, 0.015, 0.02, 0.03, 0.05, 0.08],
    'cb__depth': [3, 4, 5, 6, 7, 8, 9, 10],
    'cb__l2_leaf_reg': [1, 3, 5, 7, 10, 15, 20, 30, 50],
    'cb__bagging_temperature': [0, 0.5, 1, 1.5, 2],
    'cb__random_strength': [1, 2, 3, 5, 10],
    'cb__border_count': [32, 64, 128, 254],
    'cb__grow_policy': ['SymmetricTree', 'Depthwise', 'Lossguide'],
    'cb__min_data_in_leaf': [1, 3, 5, 10, 20],
    # Single candidate: the categorical feature list is held fixed and is
    # passed to CatBoost through the search parameters.
    'cb__cat_features': [CAT_FEATURES]
}

# 300 sampled candidates, each cross-validated with `skf` and ranked by recall
# only. The search itself runs with n_jobs=1; CatBoost above is the one
# configured with thread_count=-1.
search_cb = RandomizedSearchCV(
    estimator=pipeline_cb,
    param_distributions=param_dist_cb,
    n_iter=300,
    cv=skf,
    scoring='recall',
    n_jobs=1,
    random_state=42,
    verbose=1,
    error_score='raise',
    return_train_score=False
)

search_cb.fit(X, y)

print('CatBoost + RUS best params (random search):', search_cb.best_params_)
print('CatBoost + RUS best recall (CV):', search_cb.best_score_)

# Drop any row this cell appended on a previous run before appending again, so
# re-running the cell replaces its row instead of duplicating it. The index is
# reset so the helper always receives a contiguous 0..n-1 index.
# NOTE(review): the evaluation reuses the same X / y / skf the search was tuned
# on, so the reported metrics may be optimistic - confirm against the
# implementation of evaluate_and_append.
cb_model_name = 'CatBoostClassifier + RUS (max efficiency RandomSearch)'
results_df = evaluate_and_append(
    model_name=cb_model_name,
    best_estimator=search_cb.best_estimator_,
    X=X,
    y=y,
    cv=skf,
    results_df=results_df[results_df['Model'] != cb_model_name].reset_index(drop=True)
)

# Last expression of the cell: rendered by the notebook as a table.
results_df

Fitting 5 folds for each of 300 candidates, totalling 1500 fits
CatBoost + RUS best params (random search): {'rus__sampling_strategy': 0.7999999999999998, 'cb__random_strength': 5, 'cb__min_data_in_leaf': 20, 'cb__learning_rate': 0.002, 'cb__l2_leaf_reg': 7, 'cb__iterations': 1000, 'cb__grow_policy': 'Depthwise', 'cb__depth': 9, 'cb__cat_features': ['gender', 'hypertension', 'heart_disease', 'ever_married', 'Residence_type', 'age_old', 'age_child', 'bmi_fat'], 'cb__border_count': 254, 'cb__bagging_temperature': 1}
CatBoost + RUS best recall (CV): 0.8765291607396872
                                               Model  Accuracy    Recall  \
0                         RandomForestClassifier+RUS  0.617332  0.935829   
1    XGBoostClassifier + RUS (expanded RandomSearch)  0.048812  1.000000   
2                 XGBoostClassifier + RUS (expanded)  0.772383  0.759358   
3  CatBoostClassifier + RUS (max efficiency Rando...  0.697468  0.877005   

    ROC-AUC    PR-AUC  
0  0.815808  0.148762  