## Machine learning model for diabetes prediction 

### step 1 : Importing the dataset 

In [5]:
#importing Dataset 
#dataset available on kaggle 
import pandas as pd
url="https://bd29ee0e-54ab-4daa-9671-d153865d1620.usrfiles.com/ugd/bd29ee_7d5d0d16a6454b0987a63e487c91c25b.csv"
df=pd.read_csv(url)
df.head()



Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


### step 2 : data preprocessing

In [6]:
dataset_new.isnull().sum()

Pregnancies                   0
Glucose                       5
BloodPressure                35
SkinThickness               227
Insulin                     374
BMI                          11
DiabetesPedigreeFunction      0
Age                           0
Outcome                       0
dtype: int64

In [7]:
#replacing nan values with 0 so that total empty entries can be identified 
import numpy as np
dataset_new = df
dataset_new[["Glucose", "BloodPressure", "SkinThickness", "Insulin", "BMI"]] = dataset_new[["Glucose", "BloodPressure", "SkinThickness", "Insulin", "BMI"]].replace(0, np.NaN)


In [8]:
dataset_new.head(5)

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148.0,72.0,35.0,,33.6,0.627,50,1
1,1,85.0,66.0,29.0,,26.6,0.351,31,0
2,8,183.0,64.0,,,23.3,0.672,32,1
3,1,89.0,66.0,23.0,94.0,28.1,0.167,21,0
4,0,137.0,40.0,35.0,168.0,43.1,2.288,33,1


####  Replacing the null values with the mean for particular column 

In [10]:

dataset_new["Glucose"].fillna(dataset_new["Glucose"].mean(), inplace = True)
dataset_new["BloodPressure"].fillna(dataset_new["BloodPressure"].mean(), inplace = True)
dataset_new["SkinThickness"].fillna(dataset_new["SkinThickness"].mean(), inplace = True)
dataset_new["Insulin"].fillna(dataset_new["Insulin"].mean(), inplace = True)
dataset_new["BMI"].fillna(dataset_new["BMI"].mean(), inplace = True)

In [11]:
dataset_new.isnull().sum()

Pregnancies                 0
Glucose                     0
BloodPressure               0
SkinThickness               0
Insulin                     0
BMI                         0
DiabetesPedigreeFunction    0
Age                         0
Outcome                     0
dtype: int64

In [17]:
# Selecting features - [Glucose, Insulin, BMI, Age]
X = dataset_new.iloc[:, [1, 4, 5, 7]].values
Y = dataset_new.iloc[:, 8].values
X=pd.DataFrame(X)
X

Unnamed: 0,0,1,2,3
0,148.0,155.548223,33.6,50.0
1,85.0,155.548223,26.6,31.0
2,183.0,155.548223,23.3,32.0
3,89.0,94.000000,28.1,21.0
4,137.0,168.000000,43.1,33.0
...,...,...,...,...
763,101.0,180.000000,32.9,63.0
764,122.0,155.548223,36.8,27.0
765,121.0,112.000000,26.2,30.0
766,126.0,155.548223,30.1,47.0


### step 3: splitting the data into training and testing datset

In [18]:
# Splitting X and Y
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.20, random_state = 42, stratify = dataset_new['Outcome'] )


### step 4 :Applying the algorithms 

In [19]:
print("X_train shape:", X_train.shape)
print("X_test shape:", X_test.shape)
print("Y_train shape:", Y_train.shape)
print("Y_test shape:", Y_test.shape)

X_train shape: (614, 4)
X_test shape: (154, 4)
Y_train shape: (614,)
Y_test shape: (154,)


In [20]:
# K nearest neighbors Algorithm
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier(n_neighbors = 24, metric = 'minkowski', p = 2)
knn.fit(X_train, Y_train)

KNeighborsClassifier(n_neighbors=24)

In [22]:
Y_pred_knn = knn.predict(X_test)
Y_pred_knn=pd.DataFrame(Y_pred_knn)
Y_pred_knn.head()
Y_pred_knn.tail()


Unnamed: 0,0
149,0
150,0
151,0
152,1
153,0


In [23]:
from sklearn.metrics import accuracy_score
accuracy_knn = accuracy_score(Y_test, Y_pred_knn)

accuracy_knn

0.6883116883116883

In [24]:
Y_test=pd.DataFrame(Y_test)
# o_vs_pred=pd.concat(Y_test,Y_pred_knn)
# o_vs_pred=pd.DataFrame(o_vs_pred)
# o_vs_pred

Y_test
o_vs_pred=pd.concat([Y_test,Y_pred_knn],axis=1)
o_vs_pred.head()
o_vs_pred.tail()




Unnamed: 0,0,0.1
149,0,0
150,0,0
151,0,0
152,1,1
153,0,0


In [25]:
o_vs_pred.rename(index={0: "original", 0: "predicted"})
o_vs_pred.tail()

Unnamed: 0,0,0.1
149,0,0
150,0,0
151,0,0
152,1,1
153,0,0


### step 5 : loading the model into pickle for future use in the application 

In [26]:
# knn.predict(183,0,23.3,30)

import pickle
pickle.dump(knn,open('aajdia.pkl','wb'))

import sys, json
model=pickle.load(open('aajdia.pkl','rb'))



#get our data as an array from read_in()
lines = [148,155,33,50]


final=model.predict(np.array([lines]).reshape(1,-1).astype(np.float64))

print(int(final))

1
