In [1]:
import pandas as pd


df = pd.read_csv('health_train.csv')

X = df.drop(columns=['stroke'])
y = df['stroke']

In [2]:
from sklearn.model_selection import StratifiedKFold


results_df = pd.DataFrame(
    {
        'Model': [],
        'Accuracy': [],
        'Recall': [],
        'ROC-AUC': [],
        'PR-AUC': [],
    }
).astype(
    {
        'Model': str,
        'Accuracy': float,
        'Recall': float,
        'ROC-AUC': float,
        'PR-AUC': float,
    }
)

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

In [3]:
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import accuracy_score, recall_score, roc_auc_score, average_precision_score
import pandas as pd

def evaluate_and_append(model_name, best_estimator, X, y, cv, results_df):

    y_pred = cross_val_predict(best_estimator, X, y, cv=cv, method='predict', n_jobs=-1)
    y_proba = cross_val_predict(best_estimator, X, y, cv=cv, method='predict_proba', n_jobs=-1)[:, 1]

    metrics = {
        "Model": model_name,
        "Accuracy": accuracy_score(y, y_pred),
        "Recall": recall_score(y, y_pred),
        "ROC-AUC": roc_auc_score(y, y_proba),
        "PR-AUC": average_precision_score(y, y_proba)
    }

    new_row = pd.DataFrame([metrics])
    results_df = pd.concat([results_df, new_row], ignore_index=True)
    
    return results_df

In [4]:
from imblearn.pipeline import Pipeline as ImbPipeline
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV


pipeline_knn = ImbPipeline([
    ('scaler', StandardScaler()),
    ('smote', SMOTE(random_state=42)),
    ('knn', KNeighborsClassifier())
])

param_grid_knn = {
    'smote__k_neighbors': [3, 5, 7],
    'smote__sampling_strategy': [0.5, 0.7, 1.0],
    'knn__n_neighbors': [3, 5, 7, 11, 15, 21, 31, 51],
    'knn__weights': ['uniform', 'distance'],
    'knn__metric': ['euclidean', 'manhattan'],
    'knn__p': [1, 2]
}

grid_knn = GridSearchCV(
    estimator=pipeline_knn,
    param_grid=param_grid_knn,
    cv=skf,
    scoring='average_precision',
    n_jobs=-1,
    verbose=1,
    error_score='raise'
)

grid_knn.fit(X, y)

print('KNN best params:', grid_knn.best_params_)
print('KNN best PR-AUC (CV mean):', grid_knn.best_score_)

Fitting 5 folds for each of 576 candidates, totalling 2880 fits
KNN best params: {'knn__metric': 'manhattan', 'knn__n_neighbors': 51, 'knn__p': 1, 'knn__weights': 'uniform', 'smote__k_neighbors': 5, 'smote__sampling_strategy': 0.7}
KNN best PR-AUC (CV mean): 0.12778456697178642


In [5]:
results_df = evaluate_and_append(
    model_name='KNeighborsClassifier',
    best_estimator=grid_knn.best_estimator_,
    X=X, y=y, cv=skf,
    results_df=results_df
)

print(results_df)

                  Model  Accuracy    Recall   ROC-AUC    PR-AUC
0  KNeighborsClassifier  0.762986  0.609626  0.776072  0.115975


In [6]:
from imblearn.pipeline import Pipeline as ImbPipeline
from imblearn.over_sampling import SMOTE
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV

pipeline_dt = ImbPipeline([
    ('smote', SMOTE(random_state=42)),
    ('dt', DecisionTreeClassifier(class_weight='balanced', random_state=42))
])

param_grid_dt = {
    'smote__sampling_strategy': [0.7, 1.0],
    'smote__k_neighbors': [3, 5, 7],
    'dt__max_depth': [None, 6, 8, 10, 12, 15],
    'dt__min_samples_leaf': [1, 2, 5, 10],
    'dt__min_samples_split': [2, 5, 10],
    'dt__criterion': ['gini', 'entropy'],
    'dt__max_features': ['sqrt', 'log2', None]
}

grid_dt = GridSearchCV(
    estimator=pipeline_dt,
    param_grid=param_grid_dt,
    cv=skf,
    scoring='average_precision',
    n_jobs=-1,
    verbose=1
)

grid_dt.fit(X, y)

print('DecisionTree best params:', grid_dt.best_params_)
print('DecisionTree best PR-AUC (CV mean):', grid_dt.best_score_)

Fitting 5 folds for each of 2592 candidates, totalling 12960 fits
DecisionTree best params: {'dt__criterion': 'gini', 'dt__max_depth': 8, 'dt__max_features': 'sqrt', 'dt__min_samples_leaf': 2, 'dt__min_samples_split': 5, 'smote__k_neighbors': 7, 'smote__sampling_strategy': 0.7}
DecisionTree best PR-AUC (CV mean): 0.13866880785006128


In [7]:
results_df = evaluate_and_append(
    model_name='DecisionTreeClassifier',
    best_estimator=grid_dt.best_estimator_,
    X=X, y=y, cv=skf,
    results_df=results_df
)

print(results_df)

                    Model  Accuracy    Recall   ROC-AUC    PR-AUC
0    KNeighborsClassifier  0.762986  0.609626  0.776072  0.115975
1  DecisionTreeClassifier  0.786479  0.582888  0.760915  0.121034


In [None]:
from imblearn.pipeline import Pipeline as ImbPipeline
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV


pipeline_rf = ImbPipeline([
    ('smote', SMOTE(random_state=42)),
    ('rf', RandomForestClassifier(
        random_state=42,
        class_weight='balanced',
        n_jobs=-1
    ))
])

param_grid_rf = {
    'smote__sampling_strategy': [0.7, 1.0],
    'smote__k_neighbors': [5],
    'rf__n_estimators': [300, 500],
    'rf__max_depth': [8, 12, None],
    'rf__min_samples_leaf': [2, 5],
    'rf__min_samples_split': [2, 5, 10],
    'rf__max_features': ['sqrt', 'log2'],
    'rf__criterion': ['gini', 'entropy']
}

grid_rf = GridSearchCV(
    estimator=pipeline_rf,
    param_grid=param_grid_rf,
    cv=skf,
    scoring='average_precision',
    n_jobs=-1,
    verbose=1
)

grid_rf.fit(X, y)

print('RandomForest best params:', grid_rf.best_params_)
print('RandomForest best PR-AUC (CV mean):', grid_rf.best_score_)

Fitting 5 folds for each of 972 candidates, totalling 4860 fits
RandomForest best params: {'rf__criterion': 'entropy', 'rf__max_depth': None, 'rf__max_features': 'sqrt', 'rf__min_samples_leaf': 5, 'rf__min_samples_split': 2, 'rf__n_estimators': 300, 'smote__k_neighbors': 5, 'smote__sampling_strategy': 0.7}
RandomForest best PR-AUC (CV mean): 0.1377703932697571


In [9]:
results_df = evaluate_and_append(
    model_name='RandomForestClassifier',
    best_estimator=grid_rf.best_estimator_,
    X=X, y=y, cv=skf,
    results_df=results_df
)

print(results_df)

                    Model  Accuracy    Recall   ROC-AUC    PR-AUC
0    KNeighborsClassifier  0.762986  0.609626  0.776072  0.115975
1  DecisionTreeClassifier  0.786479  0.582888  0.760915  0.121034
2  RandomForestClassifier  0.878100  0.262032  0.797458  0.124628


In [10]:
from imblearn.pipeline import Pipeline as ImbPipeline
from imblearn.over_sampling import SMOTE
from catboost import CatBoostClassifier
from sklearn.model_selection import GridSearchCV


pipeline_cb = ImbPipeline([
    ('smote', SMOTE(random_state=42)),
    ('cb', CatBoostClassifier(
        random_state=42,
        thread_count=-1,
        verbose=False,
        od_type='Iter',
        od_wait=50,
        eval_metric='PRAUC'
    ))
])

param_grid_cb = {
    'smote__sampling_strategy': [0.7, 1.0],
    'smote__k_neighbors': [5],
    'cb__iterations': [1000],
    'cb__learning_rate': [0.03, 0.06, 0.1],
    'cb__depth': [6, 8, 10],
    'cb__l2_leaf_reg': [3, 7]
}

grid_cb = GridSearchCV(
    estimator=pipeline_cb,
    param_grid=param_grid_cb,
    cv=skf,
    scoring='average_precision',
    n_jobs=1,
    verbose=1
)

grid_cb.fit(X, y)

print('CatBoost best params:', grid_cb.best_params_)
print('CatBoost best PR-AUC (CV mean):', grid_cb.best_score_)

Fitting 5 folds for each of 36 candidates, totalling 180 fits
CatBoost best params: {'cb__depth': 10, 'cb__iterations': 1000, 'cb__l2_leaf_reg': 3, 'cb__learning_rate': 0.06, 'smote__k_neighbors': 5, 'smote__sampling_strategy': 1.0}
CatBoost best PR-AUC (CV mean): 0.1260687913053267


In [11]:
results_df = evaluate_and_append(
    model_name='CatBoostClassifier',
    best_estimator=grid_cb.best_estimator_,
    X=X, y=y, cv=skf,
    results_df=results_df
)

print(results_df)

                    Model  Accuracy    Recall   ROC-AUC    PR-AUC
0    KNeighborsClassifier  0.762986  0.609626  0.776072  0.115975
1  DecisionTreeClassifier  0.786479  0.582888  0.760915  0.121034
2  RandomForestClassifier  0.878100  0.262032  0.797458  0.124628
3      CatBoostClassifier  0.906552  0.133690  0.758567  0.109835


In [12]:
from catboost import CatBoostClassifier
from sklearn.model_selection import GridSearchCV


cb = CatBoostClassifier(
    random_state=42,
    thread_count=-1,
    verbose=False,
    od_type='Iter',
    od_wait=50,
    eval_metric='PRAUC',
    auto_class_weights='Balanced'
)

param_grid_cb = {
    'iterations': [1000],
    'learning_rate': [0.03, 0.06, 0.1],
    'depth': [6, 8, 10],
    'l2_leaf_reg': [3, 7],
}

grid_cb = GridSearchCV(
    estimator=cb,
    param_grid=param_grid_cb,
    cv=skf,
    scoring='average_precision',
    n_jobs=1,
    verbose=1
)

grid_cb.fit(X, y)

print('CatBoost no smote best params:', grid_cb.best_params_)
print('CatBoost no smote best PR-AUC (CV mean):', grid_cb.best_score_)

Fitting 5 folds for each of 18 candidates, totalling 90 fits
CatBoost no smote best params: {'depth': 6, 'iterations': 1000, 'l2_leaf_reg': 7, 'learning_rate': 0.03}
CatBoost no smote best PR-AUC (CV mean): 0.15273890505011986


In [13]:
results_df = evaluate_and_append(
    model_name='CatBoostClassifier no smote',
    best_estimator=grid_cb.best_estimator_,
    X=X, y=y, cv=skf,
    results_df=results_df
)

print(results_df)

                         Model  Accuracy    Recall   ROC-AUC    PR-AUC
0         KNeighborsClassifier  0.762986  0.609626  0.776072  0.115975
1       DecisionTreeClassifier  0.786479  0.582888  0.760915  0.121034
2       RandomForestClassifier  0.878100  0.262032  0.797458  0.124628
3           CatBoostClassifier  0.906552  0.133690  0.758567  0.109835
4  CatBoostClassifier no smote  0.902897  0.245989  0.787222  0.135139


In [14]:
results_df.to_csv('results_train.csv', index=False)