In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score, classification_report, roc_curve, RocCurveDisplay, roc_auc_score, confusion_matrix
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split


In [2]:
diabets_classes = pd.read_csv('../../DATA/diabets_classes.csv', index_col=False)
diabets_x = diabets_classes.drop(['diabets'], axis=1)
diabets = diabets_classes['diabets']
pre_diabets_classes = pd.read_csv('../../DATA/pre_diabets_classes.csv', index_col=False)
pre_diabets_x = pre_diabets_classes.drop(['pre_diabets'], axis=1)
pre_diabets = pre_diabets_classes['pre_diabets']
non_diabets_classes = pd.read_csv('../../DATA/non_diabets_classes.csv', index_col=False)
non_diabets_x = non_diabets_classes.drop(['non_diabets'], axis=1)
non_diabets = non_diabets_classes['non_diabets']


In [3]:
x_diabets_train, x_diabets_test, y_diabets_train, y_diabets_test = train_test_split(diabets_x, diabets, test_size=0.25)
x_pre_diabets_train, x_pre_diabets_test, y_pre_diabets_train, y_pre_diabets_test = train_test_split(pre_diabets_x, pre_diabets, test_size=0.25)
x_non_diabets_train, x_non_diabets_test, y_non_diabets_train, y_non_diabets_test = train_test_split(non_diabets_x, non_diabets, test_size=0.25)

# Diabets KNN

In [4]:
diabets_classifier = GridSearchCV(KNeighborsClassifier(), 
                                  {'n_neighbors': np.arange(5, 21, 2)},
                                  n_jobs=4,
                                  scoring='f1',).fit(x_diabets_train, y_diabets_train)
# 6 min


In [5]:
diabets_predict = diabets_classifier.best_estimator_.predict(x_diabets_test)
# 1 min


In [6]:
print(classification_report(diabets_predict, y_diabets_test))


              precision    recall  f1-score   support

         0.0       0.55      0.62      0.59      1040
         1.0       0.66      0.59      0.62      1276

    accuracy                           0.60      2316
   macro avg       0.60      0.61      0.60      2316
weighted avg       0.61      0.60      0.60      2316



# Prediabets KNN

In [7]:
pre_diabets_classifier = GridSearchCV(KNeighborsClassifier(), 
                                  {'n_neighbors': np.arange(5, 21, 2)},
                                  n_jobs=4,
                                  scoring='f1',).fit(x_pre_diabets_train, y_pre_diabets_train)
# 6 min


In [8]:
pre_diabets_predict = diabets_classifier.best_estimator_.predict(x_pre_diabets_test)
# 1 min


In [9]:
print(classification_report(pre_diabets_predict, y_pre_diabets_test))


              precision    recall  f1-score   support

         0.0       0.63      0.70      0.66      7977
         1.0       0.73      0.66      0.69      9696

    accuracy                           0.68     17673
   macro avg       0.68      0.68      0.68     17673
weighted avg       0.69      0.68      0.68     17673



# Nondiabets KNN

In [10]:
x_non_diabets_train


Unnamed: 0,HighBP,HighChol,CholCheck,BMI,Smoker,Stroke,HeartDiseaseorAttack,PhysActivity,Fruits,Veggies,...,AnyHealthcare,NoDocbcCost,GenHlth,MentHlth,PhysHlth,DiffWalk,Sex,Age,Education,Income
232600,1.0,1.0,1.0,0.220930,1.0,0.0,0.0,1.0,1.0,1.0,...,1.0,0.0,0.50,0.033333,0.033333,0.0,1.0,0.583333,1.0,1.000000
76026,0.0,0.0,1.0,0.081395,0.0,0.0,0.0,1.0,1.0,1.0,...,1.0,0.0,0.25,0.000000,0.000000,0.0,0.0,0.583333,1.0,0.857143
4264,1.0,1.0,1.0,0.267442,0.0,0.0,0.0,1.0,1.0,1.0,...,1.0,0.0,0.25,0.000000,0.000000,0.0,0.0,0.750000,0.8,0.857143
131855,1.0,0.0,0.0,0.139535,1.0,0.0,0.0,1.0,1.0,1.0,...,1.0,0.0,0.50,0.000000,0.100000,0.0,1.0,0.666667,1.0,0.571429
204320,1.0,0.0,1.0,0.174419,1.0,0.0,0.0,1.0,1.0,1.0,...,1.0,0.0,0.25,0.000000,0.000000,0.0,0.0,0.500000,1.0,0.857143
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
157914,1.0,1.0,1.0,0.244186,0.0,0.0,0.0,0.0,1.0,1.0,...,1.0,1.0,0.25,0.000000,0.000000,0.0,1.0,0.666667,1.0,0.857143
84527,1.0,1.0,1.0,0.174419,0.0,0.0,0.0,1.0,1.0,1.0,...,1.0,0.0,0.75,0.000000,0.100000,0.0,1.0,0.833333,0.8,0.857143
239556,1.0,1.0,1.0,0.220930,1.0,1.0,1.0,1.0,0.0,1.0,...,1.0,0.0,0.75,0.066667,0.066667,0.0,1.0,0.666667,0.4,0.857143
75752,0.0,0.0,0.0,0.174419,0.0,0.0,0.0,1.0,1.0,1.0,...,1.0,0.0,0.25,0.000000,0.000000,0.0,0.0,0.666667,1.0,1.000000


In [11]:
non_diabets_classifier = GridSearchCV(KNeighborsClassifier(), 
                                  {'n_neighbors': np.arange(5, 21, 2)},
                                  n_jobs=4,
                                  scoring='roc_auc',).fit(x_non_diabets_train, y_non_diabets_train)
# 6 min


In [12]:
non_diabets_predict = diabets_classifier.best_estimator_.predict(x_non_diabets_test)
# 1 min


In [13]:
confusion_matrix(non_diabets_predict, y_non_diabets_test)


array([[ 2864, 33870],
       [ 7164, 19522]], dtype=int64)

In [14]:
print(classification_report(non_diabets_predict, y_non_diabets_test))


              precision    recall  f1-score   support

         0.0       0.29      0.08      0.12     36734
         1.0       0.37      0.73      0.49     26686

    accuracy                           0.35     63420
   macro avg       0.33      0.40      0.31     63420
weighted avg       0.32      0.35      0.28     63420

