In [2]:
import pandas as pd


df = pd.read_csv('health_train.csv')

X = df.drop(columns=['stroke'])
y = df['stroke']

In [3]:
from sklearn.model_selection import StratifiedKFold


results_df = pd.DataFrame(
    {
        'Model': [],
        'Accuracy': [],
        'Recall': [],
        'ROC-AUC': [],
        'PR-AUC': [],
    }
).astype(
    {
        'Model': str,
        'Accuracy': float,
        'Recall': float,
        'ROC-AUC': float,
        'PR-AUC': float,
    }
)

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

In [6]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import RandomizedSearchCV


pipeline_knn = Pipeline([
    ('scaler', StandardScaler()),
    ('knn', KNeighborsClassifier(weights='distance'))
])

param_grid_knn = {
    'knn__n_neighbors': list(range(1, 102, 2)),
    'knn__weights': ['uniform', 'distance'],
    'knn__algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute'],
    'knn__leaf_size': [10, 20, 30, 40, 50],
    'knn__metric': ['euclidean', 'manhattan', 'chebyshev', 'minkowski'],
    'knn__p': [1, 2, 3, 4, 5, 6]
}

grid_knn = RandomizedSearchCV(
    estimator=pipeline_knn,
    param_distributions=param_grid_knn,
    n_iter=500,
    cv=skf,
    scoring='average_precision',
    n_jobs=-1,
    random_state=42,
    verbose=1
)

grid_knn.fit(X, y)

print('KNN best params:', grid_knn.best_params_)
print('KNN best ROC-AUC (CV mean):', grid_knn.best_score_)

Fitting 5 folds for each of 500 candidates, totalling 2500 fits
KNN best params: {'knn__weights': 'uniform', 'knn__p': 6, 'knn__n_neighbors': 89, 'knn__metric': 'manhattan', 'knn__leaf_size': 30, 'knn__algorithm': 'auto'}
KNN best ROC-AUC (CV mean): 0.16968093979839832
