# KNN:

Classifying with supervised learning whether diabetic patients are readmitted, and if they are, if it's before or after 30 days.

Using the dataset from here: https://archive.ics.uci.edu/ml/datasets/Diabetes+130-US+hospitals+for+years+1999-2008

In [1]:
from sklearn.metrics import f1_score, accuracy_score
from sklearn.model_selection import cross_val_score, train_test_split, GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
import pickle

In [2]:
with open("x_train_scaled_liv.pkl", 'rb') as picklefile: 
    x_train_scaled = pickle.load(picklefile)

with open("x_test_scaled_liv.pkl", 'rb') as picklefile: 
    x_test_scaled = pickle.load(picklefile)

with open("y_train_liv.pkl", 'rb') as picklefile: 
    y_train = pickle.load(picklefile)

with open("y_test_liv.pkl", 'rb') as picklefile: 
    y_test = pickle.load(picklefile)

## Converting to binary classification:

In [3]:
y_test = y_test.str.replace('>30','NO')
y_train = y_train.str.replace('>30','NO')

## KNN with single test/train split (25% for test):

In [4]:
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(x_train_scaled, y_train)

y_pred = knn.predict(x_test_scaled)

accuracy = accuracy_score(y_test, y_pred)
print(f"accuracy: {accuracy}")
f1_macro = f1_score(y_test, y_pred, average='macro')
print(f"f1 macro: {f1_macro}")

accuracy: 0.8788205681409564
f1 macro: 0.5081601131678783


## KNN Grid Search CV for hyperparameter tuning and evaluation on holdout:

In [5]:
# define the parameter values that should be searched
k_range = list(range(1, 62, 5))
print(f"testing values of k: {k_range}")

knn = KNeighborsClassifier(n_neighbors=5)

# create a parameter grid: map the parameter names to the values that should be searched 
param_grid = dict(n_neighbors=k_range)

# instantiate the grid
grid = GridSearchCV(knn, param_grid, cv=3, scoring='f1_macro', n_jobs=-1)

testing values of k: [1, 6, 11, 16, 21, 26, 31, 36, 41, 46, 51, 56, 61]


In [7]:
# fit the grid with data 
grid.fit(x_train_scaled, y_train)



GridSearchCV(cv=3, error_score=nan,
             estimator=KNeighborsClassifier(algorithm='auto', leaf_size=30,
                                            metric='minkowski',
                                            metric_params=None, n_jobs=None,
                                            n_neighbors=5, p=2,
                                            weights='uniform'),
             iid='deprecated', n_jobs=-1,
             param_grid={'n_neighbors': [1, 6, 11, 16, 21, 26, 31, 36, 41, 46,
                                         51, 56, 61]},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring='f1_macro', verbose=0)

In [10]:
# view the complete results
grid.cv_results_

{'mean_fit_time': array([41.36177286, 41.44307073, 41.72322766, 42.14360547, 42.90666604,
        40.84394455, 41.31592917, 39.66971294, 41.80398949, 42.49943137,
        41.70059292, 42.09776028, 42.1913681 ]),
 'std_fit_time': array([1.00186528, 0.95403458, 0.78378393, 0.05709834, 0.06661481,
        0.09684039, 0.05228167, 0.36893794, 0.16294518, 0.10138036,
        0.35968988, 0.47969003, 0.38996873]),
 'mean_score_time': array([ 951.14902321, 1054.66232697, 1078.06965335, 1085.74959731,
        1111.54440522, 1117.85573141, 1126.85705741, 1129.66322931,
        1139.15513476, 1144.05037634, 1139.34589465, 1137.46966044,
         586.36733524]),
 'std_score_time': array([0.32060522, 1.81554377, 1.81656183, 1.34859421, 0.61600481,
        1.06057291, 2.06410752, 0.76932606, 0.55543495, 0.64043517,
        2.31917657, 1.76203971, 0.83256668]),
 'param_n_neighbors': masked_array(data=[1, 6, 11, 16, 21, 26, 31, 36, 41, 46, 51, 56, 61],
              mask=[False, False, False, False, Fa

In [9]:
# examine the best model
print(f"best score: {grid.best_score_}")
print(f"best params: {grid.best_params_}")
print(f"best estimator: {grid.best_estimator_}")

best score: 0.531263527933702
best params: {'n_neighbors': 1}
best estimator: KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=1, p=2,
                     weights='uniform')


In [13]:
y_pred = grid.predict(x_test_scaled)

In [14]:
accuracy = accuracy_score(y_test, y_pred)
print(f"accuracy: {accuracy}")
f1_macro = f1_score(y_test, y_pred, average='macro')
print(f"f1 macro: {f1_macro}")

accuracy: 0.8175716169243678
f1 macro: 0.5369162277269839


In [None]:
# for accuracy before binning ICD9 codes: 0.5357291183456844