In [9]:
import pandas as pd


df = pd.read_csv('health_train.csv')

X = df.drop(columns=['stroke'])
y = df['stroke']

In [10]:
from sklearn.model_selection import StratifiedKFold


results_df = pd.DataFrame(
    {
        'Model': [],
        'Accuracy': [],
        'Recall': [],
        'ROC-AUC': [],
        'PR-AUC': [],
    }
).astype(
    {
        'Model': str,
        'Accuracy': float,
        'Recall': float,
        'ROC-AUC': float,
        'PR-AUC': float,
    }
)

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

In [None]:
from imblearn.pipeline import Pipeline as ImbPipeline
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import RandomizedSearchCV


pipeline_knn = ImbPipeline([
    ('scaler', StandardScaler()),
    ('smote', SMOTE(random_state=42)),
    ('knn', KNeighborsClassifier(weights='distance'))
])

param_grid_knn = {
    'smote__k_neighbors': [3, 5, 7],
    'smote__sampling_strategy': [0.5, 0.7, 1.0],
    'knn__n_neighbors': [3, 5, 7, 11, 15, 21, 31, 51],
    'knn__weights': ['uniform', 'distance'],
    'knn__metric': ['euclidean', 'manhattan'],
    'knn__p': [1, 2]
}

grid_knn = RandomizedSearchCV(
    estimator=pipeline_knn,
    param_distributions=param_grid_knn,
    n_iter=500,
    cv=skf,
    scoring='average_precision',
    n_jobs=-1,
    random_state=42,
    verbose=1
)

grid_knn.fit(X, y)

print('KNN best params:', grid_knn.best_params_)
print('KNN best PR-AUC (CV mean):', grid_knn.best_score_)

Fitting 5 folds for each of 500 candidates, totalling 2500 fits
KNN best params: {'knn__weights': 'uniform', 'knn__p': 6, 'knn__n_neighbors': 89, 'knn__metric': 'manhattan', 'knn__leaf_size': 30, 'knn__algorithm': 'auto'}
KNN best PR-AUC (CV mean): 0.16968093979839832


In [12]:
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import accuracy_score, recall_score, roc_auc_score, average_precision_score


best_knn = grid_knn.best_estimator_

y_pred_cv_knn = cross_val_predict(best_knn, X, y, cv=skf, method='predict', n_jobs=-1)
y_proba_cv_knn = cross_val_predict(best_knn, X, y, cv=skf, method='predict_proba', n_jobs=-1)[:, 1]

accuracy_knn = accuracy_score(y, y_pred_cv_knn)
recall_knn = recall_score(y, y_pred_cv_knn)
roc_auc_knn = roc_auc_score(y, y_proba_cv_knn)
pr_auc_knn = average_precision_score(y, y_proba_cv_knn)


results_df = pd.concat([
    results_df,
    pd.DataFrame([{
        "Model": "KNeighborsClassifier",
        "Accuracy": accuracy_knn,
        "Recall": recall_knn,
        "ROC-AUC": roc_auc_knn,
        "PR-AUC": pr_auc_knn
    }])
], ignore_index=True)

print(results_df)

                  Model  Accuracy  Recall   ROC-AUC    PR-AUC
0  KNeighborsClassifier  0.951188     0.0  0.804757  0.152938


Fitting 5 folds for each of 1200 candidates, totalling 6000 fits
DecisionTree best params: {'criterion': 'gini', 'max_depth': 8, 'max_features': 'sqrt', 'min_samples_leaf': 2, 'min_samples_split': 20}
DecisionTree best PR-AUC (CV mean): 0.15626867236422717
