# import libraries

In [2]:
import pandas as pd
import numpy as np

from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.preprocessing import MinMaxScaler, StandardScaler

from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
df = pd.read_csv('diabetes.csv')
df

Unnamed: 0,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,148,50,35,0,33.6,0.627,50,1
1,85,66,29,0,26.6,0.351,31,0
2,183,64,0,0,23.3,0.672,102,1
3,150,66,23,94,28.1,0.167,21,0
4,150,40,35,168,43.1,2.288,33,1
...,...,...,...,...,...,...,...,...
763,101,76,48,180,32.9,0.171,63,0
764,122,70,27,0,36.8,0.340,27,0
765,121,72,23,112,26.2,0.245,30,0
766,126,60,0,0,30.1,0.349,47,1


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 768 entries, 0 to 767
Data columns (total 8 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Glucose                   768 non-null    int64  
 1   BloodPressure             768 non-null    int64  
 2   SkinThickness             768 non-null    int64  
 3   Insulin                   768 non-null    int64  
 4   BMI                       768 non-null    float64
 5   DiabetesPedigreeFunction  768 non-null    float64
 6   Age                       768 non-null    int64  
 7   Outcome                   768 non-null    int64  
dtypes: float64(2), int64(6)
memory usage: 48.1 KB


In [5]:
x = df.drop('Outcome', axis = 1)
y = df.Outcome

In [6]:
x_train, x_test, y_train, y_test = train_test_split(x,y, test_size = 0.25, random_state = 23, stratify = y)
x_test

Unnamed: 0,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
549,189,110,31,0,28.5,0.680,37
259,155,76,28,150,33.3,1.353,51
63,141,58,34,128,25.4,0.699,24
566,99,72,30,18,38.6,0.412,21
506,180,90,26,90,36.5,0.314,35
...,...,...,...,...,...,...,...
280,146,70,0,0,37.9,0.334,28
460,120,72,22,56,20.8,0.733,48
373,105,58,40,94,34.9,0.225,25
679,101,58,17,265,24.2,0.614,23


# Model Training

In [7]:
knn_clf = KNeighborsClassifier(n_neighbors=5)  
knn_clf.fit(x_train,y_train)  # model saves training data

KNeighborsClassifier()

In [10]:
# Testing Data Evalutaion

y_pred = knn_clf.predict(x_test)

cnf_matrix = confusion_matrix(y_test,y_pred)
print('Confusuion matrix :\n', cnf_matrix)

acc_score = accuracy_score(y_test,y_pred)
print('Accuracy score :\n', acc_score)

clf_report = classification_report(y_test,y_pred)
print('Classification report :\n', clf_report)

Confusuion matrix :
 [[103  22]
 [ 36  31]]
Accuracy score :
 0.6979166666666666
Classification report :
               precision    recall  f1-score   support

           0       0.74      0.82      0.78       125
           1       0.58      0.46      0.52        67

    accuracy                           0.70       192
   macro avg       0.66      0.64      0.65       192
weighted avg       0.69      0.70      0.69       192



In [11]:
# Training Data Evalutaion

y_pred_train = knn_clf.predict(x_train)

cnf_matrix = confusion_matrix(y_train,y_pred_train)
print('Confusuion matrix :\n', cnf_matrix)

acc_score = accuracy_score(y_train,y_pred_train)
print('Accuracy score :\n', acc_score)

clf_report = classification_report(y_train,y_pred_train)
print('Classification report :\n', clf_report)

Confusuion matrix :
 [[331  44]
 [ 72 129]]
Accuracy score :
 0.7986111111111112
Classification report :
               precision    recall  f1-score   support

           0       0.82      0.88      0.85       375
           1       0.75      0.64      0.69       201

    accuracy                           0.80       576
   macro avg       0.78      0.76      0.77       576
weighted avg       0.79      0.80      0.79       576



# Scaling

# 1.Normalization

In [12]:
x.head() # independent variables

Unnamed: 0,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
0,148,50,35,0,33.6,0.627,50
1,85,66,29,0,26.6,0.351,31
2,183,64,0,0,23.3,0.672,102
3,150,66,23,94,28.1,0.167,21
4,150,40,35,168,43.1,2.288,33


In [13]:
normal_scaler = MinMaxScaler()
normal_scaler.fit(x)  # It saving Minimum and Maximum value

MinMaxScaler()

In [17]:
normalized_array = normal_scaler.transform(x)
# normalized_array

In [22]:
x_df_norm = pd.DataFrame(normalized_array, columns = x.columns)
x_df_norm

Unnamed: 0,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
0,0.743719,0.409836,0.353535,0.000000,0.500745,0.234415,0.358025
1,0.427136,0.540984,0.292929,0.000000,0.396423,0.116567,0.123457
2,0.919598,0.524590,0.000000,0.000000,0.347243,0.253629,1.000000
3,0.753769,0.540984,0.232323,0.111111,0.418778,0.038002,0.000000
4,0.753769,0.327869,0.353535,0.198582,0.642325,0.943638,0.148148
...,...,...,...,...,...,...,...
763,0.507538,0.622951,0.484848,0.212766,0.490313,0.039710,0.518519
764,0.613065,0.573770,0.272727,0.000000,0.548435,0.111870,0.074074
765,0.608040,0.590164,0.232323,0.132388,0.390462,0.071307,0.111111
766,0.633166,0.491803,0.000000,0.000000,0.448584,0.115713,0.320988


# 2.Standardization

In [19]:
std_scaler = StandardScaler()
std_scaler.fit(x)  # Saving mean and std

StandardScaler()

In [21]:
standard_array = std_scaler.transform(x)
x_df_std = pd.DataFrame(standard_array, columns = x.columns)
x_df_std

Unnamed: 0,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
0,0.845787,-0.985618,0.907270,-0.692891,0.204013,0.468492,1.392072
1,-1.136319,-0.158966,0.530902,-0.692891,-0.684422,-0.365061,-0.192538
2,1.946957,-0.262298,-1.288212,-0.692891,-1.103255,0.604397,5.728900
3,0.908711,-0.158966,0.154533,0.123302,-0.494043,-0.920763,-1.026543
4,0.908711,-1.502276,0.907270,0.765836,1.409746,5.484909,-0.025737
...,...,...,...,...,...,...,...
763,-0.632927,0.357691,1.722735,0.870031,0.115169,-0.908682,2.476279
764,0.027775,0.047697,0.405445,-0.692891,0.610154,-0.398282,-0.526140
765,-0.003687,0.151028,0.154533,0.279594,-0.735190,-0.685193,-0.275938
766,0.153623,-0.468961,-1.288212,-0.692891,-0.240205,-0.371101,1.141871


# Model Training After Normalization scaling

In [23]:
# x = x_df_norm.copy()
y = df.Outcome

x_train, x_test, y_train, y_test = train_test_split(x_df_norm, y, test_size=0.25, random_state=23, stratify =y)
x_test.head()

Unnamed: 0,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
549,0.949749,0.901639,0.313131,0.0,0.424739,0.257045,0.197531
259,0.778894,0.622951,0.282828,0.177305,0.496274,0.544406,0.37037
63,0.708543,0.47541,0.343434,0.1513,0.378539,0.265158,0.037037
566,0.497487,0.590164,0.30303,0.021277,0.575261,0.142613,0.0
506,0.904523,0.737705,0.262626,0.106383,0.543964,0.100769,0.17284


In [24]:
knn_clf_norm = KNeighborsClassifier(n_neighbors=5)
knn_clf_norm.fit(x_train,y_train)

KNeighborsClassifier()

In [25]:
# Testing Data Evalutaion

y_pred = knn_clf_norm.predict(x_test)

cnf_matrix = confusion_matrix(y_test,y_pred)
print('Confusuion matrix :\n', cnf_matrix)

acc_score = accuracy_score(y_test,y_pred)
print('Accuracy score :\n', acc_score)

clf_report = classification_report(y_test,y_pred)
print('Classification report :\n', clf_report)

Confusuion matrix :
 [[105  20]
 [ 34  33]]
Accuracy score :
 0.71875
Classification report :
               precision    recall  f1-score   support

           0       0.76      0.84      0.80       125
           1       0.62      0.49      0.55        67

    accuracy                           0.72       192
   macro avg       0.69      0.67      0.67       192
weighted avg       0.71      0.72      0.71       192



In [26]:
# Training Data Evalutaion

y_pred_train = knn_clf_norm.predict(x_train)

cnf_matrix = confusion_matrix(y_train,y_pred_train)
print('Confusuion matrix :\n', cnf_matrix)

acc_score = accuracy_score(y_train,y_pred_train)
print('Accuracy score :\n', acc_score)

clf_report = classification_report(y_train,y_pred_train)
print('Classification report :\n', clf_report)

Confusuion matrix :
 [[338  37]
 [ 59 142]]
Accuracy score :
 0.8333333333333334
Classification report :
               precision    recall  f1-score   support

           0       0.85      0.90      0.88       375
           1       0.79      0.71      0.75       201

    accuracy                           0.83       576
   macro avg       0.82      0.80      0.81       576
weighted avg       0.83      0.83      0.83       576



# Model Training After Standardization scaling

In [27]:
# x = x_df_std.copy()
y = df.Outcome

x_train, x_test, y_train, y_test = train_test_split(x_df_std, y, test_size=0.25, random_state=23, stratify =y)
x_test.head()

Unnamed: 0,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
549,2.135729,2.114327,0.656358,-0.692891,-0.443275,0.628558,0.307865
259,1.066021,0.357691,0.468173,0.609544,0.165937,2.661098,1.475473
63,0.625553,-0.572292,0.844542,0.41852,-0.836725,0.685941,-0.776342
566,-0.695851,0.151028,0.59363,-0.536598,0.838609,-0.180834,-1.026543
506,1.852571,1.081012,0.342717,0.08857,0.572079,-0.476805,0.141064


In [28]:
knn_clf_std = KNeighborsClassifier(n_neighbors=5)
knn_clf_std.fit(x_train,y_train)

KNeighborsClassifier()

In [29]:
# Testing Data Evalutaion

y_pred = knn_clf_std.predict(x_test)

cnf_matrix = confusion_matrix(y_test,y_pred)
print('Confusuion matrix :\n', cnf_matrix)

acc_score = accuracy_score(y_test,y_pred)
print('Accuracy score :\n', acc_score)

clf_report = classification_report(y_test,y_pred)
print('Classification report :\n', clf_report)

Confusuion matrix :
 [[103  22]
 [ 33  34]]
Accuracy score :
 0.7135416666666666
Classification report :
               precision    recall  f1-score   support

           0       0.76      0.82      0.79       125
           1       0.61      0.51      0.55        67

    accuracy                           0.71       192
   macro avg       0.68      0.67      0.67       192
weighted avg       0.70      0.71      0.71       192



In [30]:
# Training Data Evalutaion

y_pred_train = knn_clf_std.predict(x_train)

cnf_matrix = confusion_matrix(y_train,y_pred_train)
print('Confusuion matrix :\n', cnf_matrix)

acc_score = accuracy_score(y_train,y_pred_train)
print('Accuracy score :\n', acc_score)

clf_report = classification_report(y_train,y_pred_train)
print('Classification report :\n', clf_report)

Confusuion matrix :
 [[334  41]
 [ 64 137]]
Accuracy score :
 0.8177083333333334
Classification report :
               precision    recall  f1-score   support

           0       0.84      0.89      0.86       375
           1       0.77      0.68      0.72       201

    accuracy                           0.82       576
   macro avg       0.80      0.79      0.79       576
weighted avg       0.81      0.82      0.81       576

