# KNN Model

In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler,StandardScaler
from sklearn.model_selection import GridSearchCV,RandomizedSearchCV
from sklearn.metrics import confusion_matrix,classification_report,accuracy_score,roc_curve
import matplotlib.pyplot as plt
import seaborn as sns
from  sklearn.neighbors import KNeighborsClassifier
import warnings 
warnings.filterwarnings("ignore")

In [3]:
df = pd.read_csv("diabetes.csv")
df.sample(8)

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
374,2,122,52,43,158,36.2,0.816,28,0
638,7,97,76,32,91,40.9,0.871,32,1
680,2,56,56,28,45,24.2,0.332,22,0
292,2,128,78,37,182,43.3,1.224,31,1
105,1,126,56,29,152,28.7,0.801,21,0
70,2,100,66,20,90,32.9,0.867,28,1
508,2,84,50,23,76,30.4,0.968,21,0
117,5,78,48,0,0,33.7,0.654,25,0


In [4]:
df["Outcome"].value_counts()

0    500
1    268
Name: Outcome, dtype: int64

## Model Evaluation 

### Train Test Split

In [5]:
x = df.drop("Outcome",axis = 1)
y = df["Outcome"]

In [6]:
x_train, x_test, y_train, y_test = train_test_split(x,y,test_size=0.2,random_state=50,stratify=y)

# 0.2 = Using 20% data in test model

## Model Training

In [7]:
knn_clf = KNeighborsClassifier()
knn_clf.fit(x_train,y_train)

KNeighborsClassifier()

## Evaluation

### Training Model

In [8]:
y_pred = knn_clf.predict(x_train)
cnf_matrix = confusion_matrix(y_train,y_pred)
print("Confusion Matrix \n",cnf_matrix)
print("*"*50)
accuracy = accuracy_score(y_train,y_pred)
print("Accuracy \n",accuracy)
print("*"*50)
clf_report = classification_report(y_train,y_pred)
print("Classification Report \n",clf_report)

Confusion Matrix 
 [[355  45]
 [ 76 138]]
**************************************************
Accuracy 
 0.8029315960912052
**************************************************
Classification Report 
               precision    recall  f1-score   support

           0       0.82      0.89      0.85       400
           1       0.75      0.64      0.70       214

    accuracy                           0.80       614
   macro avg       0.79      0.77      0.77       614
weighted avg       0.80      0.80      0.80       614



### Testing Model

In [9]:
y_pred = knn_clf.predict(x_test)
cnf_matrix = confusion_matrix(y_test,y_pred)
print("Confusion Matrix \n",cnf_matrix)
print("*"*50)
accuracy = accuracy_score(y_test,y_pred)
print("Accuracy \n",accuracy)
print("*"*50)
clf_report = classification_report(y_test,y_pred)
print("Classification Report \n",clf_report)

Confusion Matrix 
 [[84 16]
 [27 27]]
**************************************************
Accuracy 
 0.7207792207792207
**************************************************
Classification Report 
               precision    recall  f1-score   support

           0       0.76      0.84      0.80       100
           1       0.63      0.50      0.56        54

    accuracy                           0.72       154
   macro avg       0.69      0.67      0.68       154
weighted avg       0.71      0.72      0.71       154



In [10]:
# as the accuracies having High Bais and Low vatriance, it is a case of Overfitting
# so we use Hyperparameter tuning to overcome overfit model

## Hyperparameter Tuning

### 1) GridSearchCV

In [11]:
knn_clf = KNeighborsClassifier()
hyperparameter = {"n_neighbors":np.arange(3,25),
                 "p":[1,2]}
gscv = GridSearchCV(knn_clf,hyperparameter,cv = 5)
gscv.fit(x_train,y_train)
gscv.best_estimator_

KNeighborsClassifier(n_neighbors=18, p=1)

In [12]:
knn_clf = KNeighborsClassifier(n_neighbors=18, p=1)
knn_clf.fit(x_train,y_train)

KNeighborsClassifier(n_neighbors=18, p=1)

### Training

In [13]:
y_pred = knn_clf.predict(x_train)
cnf_matrix = confusion_matrix(y_train,y_pred)
print("Confusion Matrix \n",cnf_matrix)
print("*"*50)
accuracy = accuracy_score(y_train,y_pred)
print("Accuracy \n",accuracy)
print("*"*50)
clf_report = classification_report(y_train,y_pred)
print("Classification Report \n",clf_report)

Confusion Matrix 
 [[367  33]
 [100 114]]
**************************************************
Accuracy 
 0.7833876221498371
**************************************************
Classification Report 
               precision    recall  f1-score   support

           0       0.79      0.92      0.85       400
           1       0.78      0.53      0.63       214

    accuracy                           0.78       614
   macro avg       0.78      0.73      0.74       614
weighted avg       0.78      0.78      0.77       614



### Testing

In [14]:
y_pred = knn_clf.predict(x_test)
cnf_matrix = confusion_matrix(y_test,y_pred)
print("Confusion Matrix \n",cnf_matrix)
print("*"*50)
accuracy = accuracy_score(y_test,y_pred)
print("Accuracy \n",accuracy)
print("*"*50)
clf_report = classification_report(y_test,y_pred)
print("Classification Report \n",clf_report)

Confusion Matrix 
 [[85 15]
 [29 25]]
**************************************************
Accuracy 
 0.7142857142857143
**************************************************
Classification Report 
               precision    recall  f1-score   support

           0       0.75      0.85      0.79       100
           1       0.62      0.46      0.53        54

    accuracy                           0.71       154
   macro avg       0.69      0.66      0.66       154
weighted avg       0.70      0.71      0.70       154



### 2) RandomizedSearchCV

In [15]:
knn_clf = KNeighborsClassifier()
hyperparameter = {"n_neighbors":np.arange(3,25),
                 "p":[1,2]}
rscv = RandomizedSearchCV(knn_clf,hyperparameter,cv = 5)
rscv.fit(x_train,y_train)
rscv.best_estimator_

KNeighborsClassifier(n_neighbors=14, p=1)

In [16]:
# it gives best estimator value of k nieghbour = 14

In [17]:
knn_clf = KNeighborsClassifier(n_neighbors=14)
knn_clf.fit(x_train,y_train)

KNeighborsClassifier(n_neighbors=14)

### Training

In [18]:
y_pred = knn_clf.predict(x_train)
cnf_matrix = confusion_matrix(y_train,y_pred)
print("Confusion Matrix \n",cnf_matrix)
print("*"*50)
accuracy = accuracy_score(y_train,y_pred)
print("Accuracy \n",accuracy)
print("*"*50)
clf_report = classification_report(y_train,y_pred)
print("Classification Report \n",clf_report)

Confusion Matrix 
 [[368  32]
 [ 98 116]]
**************************************************
Accuracy 
 0.7882736156351792
**************************************************
Classification Report 
               precision    recall  f1-score   support

           0       0.79      0.92      0.85       400
           1       0.78      0.54      0.64       214

    accuracy                           0.79       614
   macro avg       0.79      0.73      0.75       614
weighted avg       0.79      0.79      0.78       614



### Testing

In [19]:
y_pred = knn_clf.predict(x_test)
cnf_matrix = confusion_matrix(y_test,y_pred)
print("Confusion Matrix \n",cnf_matrix)
print("*"*50)
accuracy = accuracy_score(y_test,y_pred)
print("Accuracy \n",accuracy)
print("*"*50)
clf_report = classification_report(y_test,y_pred)
print("Classification Report \n",clf_report)

Confusion Matrix 
 [[89 11]
 [30 24]]
**************************************************
Accuracy 
 0.7337662337662337
**************************************************
Classification Report 
               precision    recall  f1-score   support

           0       0.75      0.89      0.81       100
           1       0.69      0.44      0.54        54

    accuracy                           0.73       154
   macro avg       0.72      0.67      0.68       154
weighted avg       0.73      0.73      0.72       154



## Feature Engineering

### 1) Normalization

In [20]:
x = df.drop("Outcome",axis = 1)
y = df["Outcome"]

In [21]:
x.describe()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
count,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0
mean,3.845052,120.894531,69.105469,20.536458,79.799479,31.992578,0.471876,33.240885
std,3.369578,31.972618,19.355807,15.952218,115.244002,7.88416,0.331329,11.760232
min,0.0,0.0,0.0,0.0,0.0,0.0,0.078,21.0
25%,1.0,99.0,62.0,0.0,0.0,27.3,0.24375,24.0
50%,3.0,117.0,72.0,23.0,30.5,32.0,0.3725,29.0
75%,6.0,140.25,80.0,32.0,127.25,36.6,0.62625,41.0
max,17.0,199.0,122.0,99.0,846.0,67.1,2.42,81.0


In [22]:
normal_scalar = MinMaxScaler()
array = normal_scalar.fit_transform(x)
x_df = pd.DataFrame(array,columns = x.columns)

In [23]:
x_df.sample(8)

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
252,0.117647,0.452261,0.655738,0.141414,0.065012,0.363636,0.073015,0.05
690,0.470588,0.537688,0.655738,0.0,0.0,0.366617,0.332195,0.216667
712,0.588235,0.648241,0.508197,0.363636,0.0,0.614009,0.154996,0.283333
551,0.176471,0.422111,0.557377,0.30303,0.125296,0.47541,0.219044,0.066667
721,0.058824,0.572864,0.540984,0.363636,0.236407,0.567809,0.090094,0.0
650,0.058824,0.457286,0.442623,0.252525,0.118203,0.375559,0.06661,0.033333
672,0.588235,0.341709,0.868852,0.232323,0.05792,0.529061,0.088386,0.433333
0,0.352941,0.743719,0.590164,0.353535,0.0,0.500745,0.234415,0.483333


In [24]:
x_train, x_test, y_train, y_test = train_test_split(x,y,test_size=0.2,random_state=50,stratify=y)

In [25]:
knn_clf = KNeighborsClassifier(n_neighbors=14)
knn_clf.fit(x_train,y_train)

KNeighborsClassifier(n_neighbors=14)

### Train

In [26]:
y_pred = knn_clf.predict(x_train)
cnf_matrix = confusion_matrix(y_train,y_pred)
print("Confusion Matrix \n",cnf_matrix)
print("*"*50)
accuracy = accuracy_score(y_train,y_pred)
print("Accuracy \n",accuracy)
print("*"*50)
clf_report = classification_report(y_train,y_pred)
print("Classification Report \n",clf_report)

Confusion Matrix 
 [[368  32]
 [ 98 116]]
**************************************************
Accuracy 
 0.7882736156351792
**************************************************
Classification Report 
               precision    recall  f1-score   support

           0       0.79      0.92      0.85       400
           1       0.78      0.54      0.64       214

    accuracy                           0.79       614
   macro avg       0.79      0.73      0.75       614
weighted avg       0.79      0.79      0.78       614



### Test

In [27]:
y_pred = knn_clf.predict(x_test)
cnf_matrix = confusion_matrix(y_test,y_pred)
print("Confusion Matrix \n",cnf_matrix)
print("*"*50)
accuracy = accuracy_score(y_test,y_pred)
print("Accuracy \n",accuracy)
print("*"*50)
clf_report = classification_report(y_test,y_pred)
print("Classification Report \n",clf_report)

Confusion Matrix 
 [[89 11]
 [30 24]]
**************************************************
Accuracy 
 0.7337662337662337
**************************************************
Classification Report 
               precision    recall  f1-score   support

           0       0.75      0.89      0.81       100
           1       0.69      0.44      0.54        54

    accuracy                           0.73       154
   macro avg       0.72      0.67      0.68       154
weighted avg       0.73      0.73      0.72       154



## 2) Standardization

In [28]:
std_scalar = StandardScaler()
array = std_scalar.fit_transform(x)
x_df = pd.DataFrame(array,columns = x.columns)

In [29]:
x_df

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
0,0.639947,0.848324,0.149641,0.907270,-0.692891,0.204013,0.468492,1.425995
1,-0.844885,-1.123396,-0.160546,0.530902,-0.692891,-0.684422,-0.365061,-0.190672
2,1.233880,1.943724,-0.263941,-1.288212,-0.692891,-1.103255,0.604397,-0.105584
3,-0.844885,-0.998208,-0.160546,0.154533,0.123302,-0.494043,-0.920763,-1.041549
4,-1.141852,0.504055,-1.504687,0.907270,0.765836,1.409746,5.484909,-0.020496
...,...,...,...,...,...,...,...,...
763,1.827813,-0.622642,0.356432,1.722735,0.870031,0.115169,-0.908682,2.532136
764,-0.547919,0.034598,0.046245,0.405445,-0.692891,0.610154,-0.398282,-0.531023
765,0.342981,0.003301,0.149641,0.154533,0.279594,-0.735190,-0.685193,-0.275760
766,-0.844885,0.159787,-0.470732,-1.288212,-0.692891,-0.240205,-0.371101,1.170732


In [30]:
x_train, x_test, y_train, y_test = train_test_split(x,y,test_size=0.2,random_state=50,stratify=y)

In [31]:
knn_clf = KNeighborsClassifier(n_neighbors=14)
knn_clf.fit(x_train,y_train)

KNeighborsClassifier(n_neighbors=14)

### Train

In [32]:
y_pred = knn_clf.predict(x_train)
cnf_matrix = confusion_matrix(y_train,y_pred)
print("Confusion Matrix \n",cnf_matrix)
print("*"*50)
accuracy = accuracy_score(y_train,y_pred)
print("Accuracy \n",accuracy)
print("*"*50)
clf_report = classification_report(y_train,y_pred)
print("Classification Report \n",clf_report)

Confusion Matrix 
 [[368  32]
 [ 98 116]]
**************************************************
Accuracy 
 0.7882736156351792
**************************************************
Classification Report 
               precision    recall  f1-score   support

           0       0.79      0.92      0.85       400
           1       0.78      0.54      0.64       214

    accuracy                           0.79       614
   macro avg       0.79      0.73      0.75       614
weighted avg       0.79      0.79      0.78       614



### Test

In [33]:
y_pred = knn_clf.predict(x_test)
cnf_matrix = confusion_matrix(y_test,y_pred)
print("Confusion Matrix \n",cnf_matrix)
print("*"*50)
accuracy = accuracy_score(y_test,y_pred)
print("Accuracy \n",accuracy)
print("*"*50)
clf_report = classification_report(y_test,y_pred)
print("Classification Report \n",clf_report)

Confusion Matrix 
 [[89 11]
 [30 24]]
**************************************************
Accuracy 
 0.7337662337662337
**************************************************
Classification Report 
               precision    recall  f1-score   support

           0       0.75      0.89      0.81       100
           1       0.69      0.44      0.54        54

    accuracy                           0.73       154
   macro avg       0.72      0.67      0.68       154
weighted avg       0.73      0.73      0.72       154



In [34]:
# Train Accuracy = 0.78  ///  Test Accuracy = 0.73

## user input

In [37]:
df.loc[1]

Pregnancies                  1.000
Glucose                     85.000
BloodPressure               66.000
SkinThickness               29.000
Insulin                      0.000
BMI                         26.600
DiabetesPedigreeFunction     0.351
Age                         31.000
Outcome                      0.000
Name: 1, dtype: float64

In [47]:
Pregnancies                =  1.000
Glucose                    = 85.000
BloodPressure              = 66.000
SkinThickness              = 29.000
Insulin                    =  0.000
BMI                        = 26.600
DiabetesPedigreeFunction   =  0.351
Age                        = 31.000

In [38]:
x.columns

Index(['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin',
       'BMI', 'DiabetesPedigreeFunction', 'Age'],
      dtype='object')

In [39]:
encoded_columns= {"columns":list(x.columns)}

In [41]:
import json
with open ("encoded columns.json","w") as f:
    json.dump(encoded_columns,f)

In [48]:
array = np.zeros(len(x.columns),dtype=int)


array[0] =Pregnancies
array[1] =Glucose
array[2] =BloodPressure
array[3] =SkinThickness
array[4] =Insulin
array[5] =BMI
array[6] =DiabetesPedigreeFunction
array[7] =Age


In [44]:
# x.columns.shape[0]

In [49]:
array

array([ 1, 85, 66, 29,  0, 26,  0, 31])

In [52]:
predicted_class = knn_clf.predict([array])[0]
print("predicted class for diabetes >>",predicted_class)

predicted class for diabetes >> 0


In [53]:
import pickle
with open ("knn model.pkl","wb") as f:
    pickle.dump(knn_clf,f)

In [54]:
with open("std scalar.pkl","wb") as f:
    pickle.dump(std_scalar,f)