In [447]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import sklearn
from sklearn.metrics import accuracy_score, classification_report, roc_auc_score
from sklearn.model_selection import GridSearchCV, cross_validate
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler

# Exploring Data

In [448]:
# Load the dataset from a CSV file expected in the notebook's working directory.
df = pd.read_csv("diabetes.csv")

In [449]:
# Preview the first five rows to sanity-check the columns and value formats.
df.head(n=5)

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [450]:
# Dimensions of the loaded frame as (rows, columns).
df.shape

(768, 9)

In [451]:
# Print each column's non-null count and dtype to check for missing values and unexpected types.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 768 entries, 0 to 767
Data columns (total 9 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Pregnancies               768 non-null    int64  
 1   Glucose                   768 non-null    int64  
 2   BloodPressure             768 non-null    int64  
 3   SkinThickness             768 non-null    int64  
 4   Insulin                   768 non-null    int64  
 5   BMI                       768 non-null    float64
 6   DiabetesPedigreeFunction  768 non-null    float64
 7   Age                       768 non-null    int64  
 8   Outcome                   768 non-null    int64  
dtypes: float64(2), int64(7)
memory usage: 54.1 KB


In [452]:
# Summary statistics (count, mean, std, quartiles, min/max) for every numeric column.
# NOTE(review): the output reports a minimum of 0 for Glucose, BloodPressure,
# SkinThickness, Insulin and BMI — presumably placeholders for missing
# measurements; confirm, and consider imputing them before scaling.
df.describe()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
count,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0
mean,3.845052,120.894531,69.105469,20.536458,79.799479,31.992578,0.471876,33.240885,0.348958
std,3.369578,31.972618,19.355807,15.952218,115.244002,7.88416,0.331329,11.760232,0.476951
min,0.0,0.0,0.0,0.0,0.0,0.0,0.078,21.0,0.0
25%,1.0,99.0,62.0,0.0,0.0,27.3,0.24375,24.0,0.0
50%,3.0,117.0,72.0,23.0,30.5,32.0,0.3725,29.0,0.0
75%,6.0,140.25,80.0,32.0,127.25,36.6,0.62625,41.0,1.0
max,17.0,199.0,122.0,99.0,846.0,67.1,2.42,81.0,1.0


In [453]:
# Class balance of the target: how many rows fall into each Outcome value.
df.Outcome.value_counts()

Unnamed: 0_level_0,count
Outcome,Unnamed: 1_level_1
0,500
1,268


# Preparing the model

In [454]:
# Feature matrix: every column except the label.
x = df.drop(columns=["Outcome"])
# Target vector: the Outcome column.
Y = df["Outcome"]

In [455]:
# Standardize every feature (StandardScaler removes the mean and scales to unit
# variance). KNN is distance-based, so features with large numeric ranges would
# otherwise dominate the neighbour search.
# Keep the fitted scaler: new raw samples must be transformed with the same
# statistics via scaler.transform(...) before they are passed to the model.
scaler = StandardScaler()
x_scaled = scaler.fit_transform(x)
# Rebuild a labelled frame; reuse x's index and columns so X stays aligned with Y.
X = pd.DataFrame(x_scaled, columns=x.columns, index=x.index)

In [456]:
# Fit a k-nearest-neighbours classifier on the standardized features.
# n_neighbors=5 is the library default, spelled out here for the reader.
model = KNeighborsClassifier(n_neighbors=5)
model.fit(X, Y)

In [457]:
# Draw ten rows from the scaled feature matrix; the fixed seed makes the draw repeatable.
# These rows are part of the data the model was fitted on, so predictions on them are in-sample.
test = X.sample(n=10, random_state=123)
test

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
236,0.936914,1.88113,0.770014,0.029077,0.974225,0.495927,0.344667,1.511083
395,-0.547919,0.191084,-0.574128,0.217261,1.694906,-0.544811,3.407067,-0.701198
36,2.12478,0.535352,0.356432,-1.288212,-0.692891,0.153245,-0.156673,0.149679
210,-0.547919,-1.248585,-0.470732,0.091805,-0.692891,-0.544811,-0.549288,-0.701198
483,-1.141852,-1.154694,0.666618,0.656358,0.392471,0.787841,-0.721435,-0.871374
743,1.530847,0.597947,1.286991,-1.288212,-0.692891,0.089785,0.791645,1.000557
408,1.23388,2.381884,0.253036,-1.288212,-0.692891,-0.773265,2.171839,0.49003
468,1.23388,-0.027996,-3.572597,-1.288212,-0.692891,-0.252897,-0.872441,0.404942
283,0.936914,1.255187,0.873409,-1.288212,-0.692891,-0.202129,-0.926803,1.170732
167,0.046014,-0.027996,-0.05715,-1.288212,-0.692891,-0.303664,0.716142,0.064591


In [458]:
# Smoke test: confirm the fitted model returns one class label per sampled row.
print(model.predict(test))

[1 0 0 0 0 1 1 0 1 0]


# Model evaluation

In [471]:
# Evaluate on the same rows the model was fitted on (model.fit(X, Y)), so these
# are in-sample scores and will look better than performance on unseen data.
y_pred = model.predict(X)               # hard class predictions for the report
y_proba = model.predict_proba(X)[:, 1]  # second probability column, used for ROC AUC
# The target Series is named `Y` (capital) in this notebook; a lowercase `y`
# and `y_pred` were never defined, which raised NameError on a fresh kernel.
print(classification_report(Y, y_pred))

              precision    recall  f1-score   support

       False       0.85      0.90      0.87       500
        True       0.79      0.70      0.74       268

    accuracy                           0.83       768
   macro avg       0.82      0.80      0.81       768
weighted avg       0.83      0.83      0.83       768



In [460]:
# ROC AUC from the predicted probabilities. The target Series is named `Y`
# (capital) in this notebook; a lowercase `y` is never defined.
roc_auc_score(Y, y_proba)

np.float64(0.9017686567164179)

# Testing

In [461]:
# Hand-entered, already-standardized samples, written one row per sample so each
# observation can be read left to right. The last field is the label.
sample_columns = [
    'Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin',
    'BMI', 'DiabetesPedigreeFunction', 'Age', 'Outcome',
]
sample_rows = [
    (0.9, 1.88, 0.77, 0.02, 0.97, 0.49, 0.34, 1.51, 0),
    (-0.5, 0.19, -0.57, 0.21, 1.69, -0.54, 3.40, -0.70, 1),
    (2.1, 0.53, 0.35, -1.28, -0.69, 0.15, -0.15, 0.14, 0),
    (-0.5, -1.25, -0.47, 0.09, -0.69, -0.54, -0.54, -0.70, 1),
    (-1.1, -1.15, 0.66, 0.65, 0.39, 0.78, -0.72, -0.87, 0),
    (1.5, 0.59, 1.28, -1.28, -0.69, 0.08, 0.79, 1.00, 1),
]

# Pivot the rows into the column-oriented mapping that the DataFrame is built from.
data = {
    column: [row[position] for row in sample_rows]
    for position, column in enumerate(sample_columns)
}

sample_data = pd.DataFrame(data)

print(sample_data)

   Pregnancies  Glucose  BloodPressure  SkinThickness  Insulin   BMI  \
0          0.9     1.88           0.77           0.02     0.97  0.49   
1         -0.5     0.19          -0.57           0.21     1.69 -0.54   
2          2.1     0.53           0.35          -1.28    -0.69  0.15   
3         -0.5    -1.25          -0.47           0.09    -0.69 -0.54   
4         -1.1    -1.15           0.66           0.65     0.39  0.78   
5          1.5     0.59           1.28          -1.28    -0.69  0.08   

   DiabetesPedigreeFunction   Age  Outcome  
0                      0.34  1.51        0  
1                      3.40 -0.70        1  
2                     -0.15  0.14        0  
3                     -0.54 -0.70        1  
4                     -0.72 -0.87        0  
5                      0.79  1.00        1  


In [462]:
# Strip the label column so only the feature columns are passed to the model, then predict.
testing_data = sample_data.drop(columns=["Outcome"])
model.predict(testing_data)

array([1, 0, 0, 0, 0, 1])

In [463]:
# Accuracy on the six hand-entered rows (accuracy_score is imported with the
# other sklearn.metrics names in the notebook's import cell).
# With only six samples each row moves the score by about 17 points, so this
# is a sanity check, not a performance estimate.
# NOTE(review): the Outcome labels in sample_data were typed by hand — confirm
# they are the true labels for these rows before reading anything into the score.
accuracy_score(sample_data['Outcome'], model.predict(testing_data))

0.5