# Diabetes Prediction using ML

In [219]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.metrics import accuracy_score,confusion_matrix
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn import svm
from sklearn.utils import shuffle

In [220]:
#Data Collection and Analysis

In [221]:
dataset = pd.read_csv('diabetes.csv')

In [222]:
#Shuffle the data
dataset = shuffle(dataset)

In [223]:
dataset.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
246,10,122,68,0,0,31.2,0.258,41,0
168,4,110,66,0,0,31.9,0.471,29,0
755,1,128,88,39,110,36.5,1.057,37,1
324,2,112,75,32,0,35.7,0.148,21,0
614,11,138,74,26,144,36.1,0.557,50,1


In [224]:
#Finding no of rows and columns
dataset.shape

(768, 9)

In [225]:
#Getting the statistical details of the data
dataset.describe()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
count,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0
mean,3.845052,120.894531,69.105469,20.536458,79.799479,31.992578,0.471876,33.240885,0.348958
std,3.369578,31.972618,19.355807,15.952218,115.244002,7.88416,0.331329,11.760232,0.476951
min,0.0,0.0,0.0,0.0,0.0,0.0,0.078,21.0,0.0
25%,1.0,99.0,62.0,0.0,0.0,27.3,0.24375,24.0,0.0
50%,3.0,117.0,72.0,23.0,30.5,32.0,0.3725,29.0,0.0
75%,6.0,140.25,80.0,32.0,127.25,36.6,0.62625,41.0,1.0
max,17.0,199.0,122.0,99.0,846.0,67.1,2.42,81.0,1.0


In [226]:
#Value count of the outcome
dataset['Outcome'].value_counts()

0    500
1    268
Name: Outcome, dtype: int64

In [227]:
#0 for non diabetic
#1 for diabetic

In [228]:
dataset.groupby('Outcome').mean()

Unnamed: 0_level_0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
Outcome,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,3.298,109.98,68.184,19.664,68.792,30.3042,0.429734,31.19
1,4.865672,141.257463,70.824627,22.164179,100.335821,35.142537,0.5505,37.067164


In [229]:
#Seperating data and labels
X = dataset.drop(columns = 'Outcome', axis =1)
Y =dataset['Outcome']

In [230]:
X

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
246,10,122,68,0,0,31.2,0.258,41
168,4,110,66,0,0,31.9,0.471,29
755,1,128,88,39,110,36.5,1.057,37
324,2,112,75,32,0,35.7,0.148,21
614,11,138,74,26,144,36.1,0.557,50
...,...,...,...,...,...,...,...,...
606,1,181,78,42,293,40.0,1.258,22
374,2,122,52,43,158,36.2,0.816,28
119,4,99,76,15,51,23.2,0.223,21
121,6,111,64,39,0,34.2,0.260,24


In [231]:
Y

246    0
168    0
755    1
324    0
614    1
      ..
606    1
374    0
119    0
121    0
407    0
Name: Outcome, Length: 768, dtype: int64

In [232]:
#Now data need to be standardised for better prediction


In [233]:
scalar = StandardScaler()

In [234]:
scalar.fit(X)   #fitting the data

StandardScaler()

In [235]:
Standard_data = scalar.transform(X)   #Transforming the data

In [236]:
#All the data in the similar range so the model will predict better
Standard_data

array([[ 1.82781311,  0.03459802, -0.05715025, ..., -0.10059342,
        -0.64593181,  0.66020563],
       [ 0.04601433, -0.34096773, -0.16054575, ..., -0.01174995,
        -0.00264654, -0.36084741],
       [-0.84488505,  0.22238089,  0.97680475, ...,  0.57207858,
         1.76714301,  0.31985461],
       ...,
       [ 0.04601433, -0.68523633,  0.35643175, ..., -1.11594738,
        -0.75163597, -1.04154944],
       [ 0.63994726, -0.30967058, -0.26394125, ...,  0.28016432,
        -0.63989158, -0.78628618],
       [-1.14185152, -0.62264204, -0.36733675, ..., -1.2809424 ,
        -0.41036256, -0.70119842]])

In [237]:
#Now storing it in X
X = Standard_data

In [238]:
X

array([[ 1.82781311,  0.03459802, -0.05715025, ..., -0.10059342,
        -0.64593181,  0.66020563],
       [ 0.04601433, -0.34096773, -0.16054575, ..., -0.01174995,
        -0.00264654, -0.36084741],
       [-0.84488505,  0.22238089,  0.97680475, ...,  0.57207858,
         1.76714301,  0.31985461],
       ...,
       [ 0.04601433, -0.68523633,  0.35643175, ..., -1.11594738,
        -0.75163597, -1.04154944],
       [ 0.63994726, -0.30967058, -0.26394125, ...,  0.28016432,
        -0.63989158, -0.78628618],
       [-1.14185152, -0.62264204, -0.36733675, ..., -1.2809424 ,
        -0.41036256, -0.70119842]])

In [239]:
Y

246    0
168    0
755    1
324    0
614    1
      ..
606    1
374    0
119    0
121    0
407    0
Name: Outcome, Length: 768, dtype: int64

In [240]:
#Splitting the data
X_train,X_test,Y_train,Y_test = train_test_split(X,Y,test_size=0.15,stratify=Y,random_state =1)

In [241]:
#Training the model will train data

In [242]:
X_train.shape,X_test.shape

((652, 8), (116, 8))

In [243]:
classifier = svm.SVC(kernel = 'linear')

In [244]:
classifier.fit(X_train,Y_train)

SVC(kernel='linear')

In [245]:
#Finding the accuracy for the training data

In [246]:
X_pred_train = classifier.predict(X_train)

In [247]:
X_train_accuracy = accuracy_score(X_pred_train,Y_train)

In [248]:
print(f'The accuracy of the training data is : {X_train_accuracy}')

The accuracy of the training data is : 0.7837423312883436


In [249]:
#For test data accuracy
X_test_pred = classifier.predict(X_test)
X_test_Accuracy = accuracy_score(X_test_pred,Y_test)

In [250]:
print(f'The accuracy of the test data set is {X_test_Accuracy}')

The accuracy of the test data set is 0.7413793103448276


In [251]:
#Building a predictive system :-

In [252]:
input_data =(13,126,90,0,0,43.4,0.583,42)
#convert the input data into numpy array
input_data_narray = np.asarray(input_data)

In [253]:
input_data_narray

array([ 13.   , 126.   ,  90.   ,   0.   ,   0.   ,  43.4  ,   0.583,
        42.   ])

In [254]:
#Standardising the input data
#data need to be reshaped as the model actually expects to get 768,8 shape array :)
input_data_reshaped = input_data_narray.reshape(1,-1)
Standard_input = scalar.transform(input_data_reshaped)
#(1,-1) for telling the model we are using only 1 input set

In [255]:
Standard_input

array([[ 2.7187125 ,  0.1597866 ,  1.08020025, -1.28821221, -0.69289057,
         1.44782138,  0.33560676,  0.74529338]])

In [256]:
Predict_Output = classifier.predict(Standard_input)

In [257]:
#Conditional Statement
if Predict_Output ==0:
    print("You are alright. No diabetes. Enjoy!!!")
else:
    print("Gosh!! Take care. You have diabetes")

Gosh!! Take care. You have diabetes
