In [3]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Data viz. and EDA
import matplotlib.pyplot as plt 
%matplotlib inline  
import plotly.offline as py
import plotly.express as px
import plotly.graph_objs as go
from plotly.offline import download_plotlyjs,init_notebook_mode,plot, iplot
import plotly.tools as tls
import plotly.figure_factory as ff
py.init_notebook_mode(connected=True)

## For scaling data 
from mlxtend.preprocessing import minmax_scaling 

# Tensorflow / Deep Learning
import tensorflow as tf

import warnings
warnings.filterwarnings("ignore")

In [4]:
data = pd.read_csv("diabetes.csv")

In [5]:
# checking missing values if any
display(data.info(),data.head())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 768 entries, 0 to 767
Data columns (total 9 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Pregnancies               768 non-null    int64  
 1   Glucose                   768 non-null    int64  
 2   BloodPressure             768 non-null    int64  
 3   SkinThickness             768 non-null    int64  
 4   Insulin                   768 non-null    int64  
 5   BMI                       768 non-null    float64
 6   DiabetesPedigreeFunction  768 non-null    float64
 7   Age                       768 non-null    int64  
 8   Outcome                   768 non-null    int64  
dtypes: float64(2), int64(7)
memory usage: 54.1 KB


None

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


'Outcome' specifies whether the person is diabetic or not. 

on close inspection it is found that there are many '0' values that doesn't make any revalence. So we are considering them as null values.

# Affected People from Diabeties. 

In [6]:
## lets see how many are affected by diabeties 
D = data[data['Outcome'] == 1]
H = data[data['Outcome'] == 0]

## here I am using graph_obs as I am not able to costimize px. 

def target_count():
    trace = go.Bar( x = data['Outcome'].value_counts().values.tolist(), 
                    y = ['healthy','diabetic' ], 
                    orientation = 'h', 
                    text=data['Outcome'].value_counts().values.tolist(), 
                    textfont=dict(size=15),
                    textposition = 'auto',
                    opacity = 0.5,marker=dict(
                    color=['lightskyblue', ' indigo'],
                    line=dict(color='#000000',width=1.5)))

    layout = dict(title =  'Count of Outcome variable')

    fig = dict(data = [trace], layout=layout)
    py.iplot(fig)

# --------------- donut chart to show there percentage -------------------- # 

def target_per():
    trace = go.Pie(labels=['healthy','diabetic' ],values=data['Outcome'].value_counts(),
                textfont=dict(size=15),
                opacity = 0.5,marker=dict(
                colors=['lightskyblue','indigo'],line=dict(color='#000000', width=1.5)),
                hole=0.6
                )
    layout = dict(title='Distribution of Outcome variable')
    fig = dict(data=[trace],layout=layout)
    py.iplot(fig)

In [7]:
target_count()
target_per()

In [8]:
## As seen earlier there is no null value. However on close inspection we find that null values are filled with '0'

data[['Glucose','BloodPressure','SkinThickness','Insulin','BMI','DiabetesPedigreeFunction','Age']] = data[['Glucose','BloodPressure','SkinThickness','Insulin','BMI','DiabetesPedigreeFunction','Age']].replace(0,np.NaN)    

In [9]:
## Checking the new null values found.
data.isnull().sum()

Pregnancies                   0
Glucose                       5
BloodPressure                35
SkinThickness               227
Insulin                     374
BMI                          11
DiabetesPedigreeFunction      0
Age                           0
Outcome                       0
dtype: int64

In [10]:
# Define missing plot to detect all missing values in dataset
def missing_plot(dataset, key) :
    null_feat = pd.DataFrame(len(dataset[key]) - dataset.isnull().sum(), columns = ['Count'])
    percentage_null = pd.DataFrame((dataset.isnull().sum())/len(dataset[key])*100, columns = ['Count'])
    percentage_null = percentage_null.round(2)

    trace = go.Bar(x = null_feat.index, y = null_feat['Count'] ,opacity = 0.8, text = percentage_null['Count'],  textposition = 'auto',marker=dict(color = '#7EC0EE',
            line=dict(color='#000000',width=1.5)))

    layout = dict(title =  "Missing Values (count & %)")

    fig = dict(data = [trace], layout=layout)
    py.iplot(fig)
    

In [11]:
missing_plot(data,'Outcome')

## Lets first fill null values and then find relations.

In [12]:
## to find the median for filling null values

def find_median(var):
    temp = data[data[var].notnull()]
    temp = data[[var,'Outcome']].groupby('Outcome')[[var]].median().reset_index()
    return temp

In [13]:
def density_plot(var,size_bin):
    tmp1 = D[var]
    tmp2 = H[var]
    
    hist_data = [tmp1,tmp2]
    labels = ['Diabeties','Healthy']
    color = ['skyblue','indigo']
    fig = ff.create_distplot(hist_data,labels,colors = color,show_hist=True,bin_size=size_bin,curve_type='kde')
    
    fig['layout'].update(title = var)

    py.iplot(fig, filename = 'Density plot')
    

In [14]:
density_plot('Insulin',0)

In [15]:
find_median('Insulin')

Unnamed: 0,Outcome,Insulin
0,0,102.5
1,1,169.5


In [16]:
## Now we will be filling these values instead of null values

data.loc[(data['Outcome'] == 0) & (data['Insulin'].isnull()), 'Insulin'] = 102.5
data.loc[(data['Outcome'] == 1) & (data['Insulin'].isnull()), 'Insulin'] = 169.5

In [17]:
# SkinThickness density plot 

density_plot('SkinThickness',0)

In [18]:
find_median('SkinThickness')

Unnamed: 0,Outcome,SkinThickness
0,0,27.0
1,1,32.0


In [19]:
## Now we will be filling these values instead of null values

data.loc[(data['Outcome'] == 0) & (data['SkinThickness'].isnull()), 'SkinThickness'] = 27.0
data.loc[(data['Outcome'] == 1) & (data['SkinThickness'].isnull()), 'SkinThickness'] = 32.0

In [20]:
density_plot('BloodPressure',0)

In [21]:
find_median('BloodPressure')

Unnamed: 0,Outcome,BloodPressure
0,0,70.0
1,1,74.5


In [22]:
data.loc[(data['Outcome'] == 0) & (data['BloodPressure'].isnull()), 'BloodPressure'] = 27.0
data.loc[(data['Outcome'] == 1) & (data['BloodPressure'].isnull()), 'BloodPressure'] = 32.0

In [23]:
density_plot('BMI',0)

In [24]:
find_median('BMI')

Unnamed: 0,Outcome,BMI
0,0,30.1
1,1,34.3


In [25]:
data.loc[(data['Outcome'] == 0) & (data['BMI'].isnull()), 'BMI'] = 30.1
data.loc[(data['Outcome'] == 1) & (data['BMI'].isnull()), 'BMI'] = 34.3

In [26]:
density_plot('Glucose',0)

In [27]:
find_median('Glucose')

Unnamed: 0,Outcome,Glucose
0,0,107.0
1,1,140.0


In [28]:
data.loc[(data['Outcome'] == 0) & (data['Glucose'].isnull()) , 'Glucose'] = 107.0
data.loc[(data['Outcome'] == 1) & (data['Glucose'].isnull()) , 'Glucose'] = 140.0

In [29]:
## lets check if any null value is still left

display(data.isnull().sum())

Pregnancies                 0
Glucose                     0
BloodPressure               0
SkinThickness               0
Insulin                     0
BMI                         0
DiabetesPedigreeFunction    0
Age                         0
Outcome                     0
dtype: int64

Now that we do not have any null values we can start with finding some correlations between the data presents 

In [30]:
def correlation_plot():
    #correlation
    correlation = data.corr()
    #tick labels
    matrix_cols = correlation.columns.tolist()
    #convert to array
    corr_array  = np.array(correlation)
    trace = go.Heatmap(z = corr_array,
                       x = matrix_cols,
                       y = matrix_cols,
                       colorscale='Viridis',
                       colorbar   = dict() 
                      )
    layout = go.Layout(dict(title = 'Correlation Matrix for variables',
                            #autosize = False,
                            #height  = 1400,
                            #width   = 1600,
                            margin  = dict(r = 0 ,l = 100,
                                           t = 0,b = 100,
                                         ),
                            yaxis   = dict(tickfont = dict(size = 9)),
                            xaxis   = dict(tickfont = dict(size = 9)),
                           )
                      )
    fig = go.Figure(data = [trace],layout = layout)
    py.iplot(fig)

In [31]:
correlation_plot()

We find 3 pairs which were having good correlations. 

In [32]:
def plot_feat1_feat2(feat1, feat2) :  
    D = data[(data['Outcome'] != 0)]
    H = data[(data['Outcome'] == 0)]
    trace0 = go.Scatter(
        x = D[feat1],
        y = D[feat2],
        name = 'diabetic',
        mode = 'markers', 
        opacity=0.8,
        marker = dict(color = 'lightskyblue',
            line = dict(
                width = 1)))

    trace1 = go.Scatter(
        x = H[feat1],
        y = H[feat2],
        name = 'healthy',
        opacity=0.8,
        mode = 'markers',
        marker = dict(color = 'indigo',
            line = dict(
                width = 1)))

    layout = dict(title = feat1 +" "+"vs"+" "+ feat2,
                  yaxis = dict(title = feat2,zeroline = False),
                  xaxis = dict(title = feat1, zeroline = False)
                 )

    plots = [trace0, trace1]

    fig = dict(data = plots, layout=layout)
    py.iplot(fig)


In [33]:
plot_feat1_feat2('Pregnancies', 'Age')

Here we can see that people with Age < 35 and Pragnancies < 6 are less likly to be affected with diageties.  

In [34]:
plot_feat1_feat2('Glucose', 'Insulin')


Here we can see that people with Glucose> 100 and Insulin > 180 are more likly to be affected with diageties.  

In [35]:
plot_feat1_feat2('SkinThickness', 'BMI')

Here we can see that people with SkinThickness< 30 and BMI <45 are less likly to be affected with diageties.  

Scaling data using minmax_scaling

We are doing this step as to to load the data in the model and avoid multi-variable multi-output data problem. 

In [36]:
scaled_data = minmax_scaling(data,columns=['Pregnancies','Glucose','BloodPressure','SkinThickness','Insulin','BMI','DiabetesPedigreeFunction','Age'])

Creating the binary classifier model.  

In [37]:

def sgd_model():
    model = tf.keras.Sequential([
    tf.keras.layers.Dense(8, activation='relu', input_shape=[len(scaled_data.keys())]),
    tf.keras.layers.Dense(4, activation='relu'),
    tf.keras.layers.Dense(1,activation='sigmoid')
  ])

    optimizer = tf.keras.optimizers.SGD(0.01)

    model.compile(loss='binary_crossentropy', optimizer=optimizer, metrics=['accuracy'])
    return model

def adam_model():
    model = tf.keras.Sequential([
    tf.keras.layers.Dense(8, activation='relu', input_shape=[len(scaled_data.keys())]),
    tf.keras.layers.Dense(4, activation='relu'),
    tf.keras.layers.Dense(1,activation='sigmoid')
  ])

    optimizer = tf.keras.optimizers.Adam(0.01)

    model.compile(loss='binary_crossentropy', optimizer=optimizer, metrics=['accuracy'])
    return model
  

def rms_model():
    model = tf.keras.Sequential([
    tf.keras.layers.Dense(8, activation='relu', input_shape=[len(scaled_data.keys())]),
    tf.keras.layers.Dense(4, activation='relu'),
    tf.keras.layers.Dense(1,activation='sigmoid')
  ])

    optimizer = tf.keras.optimizers.RMSprop(0.01)

    model.compile(loss='binary_crossentropy', optimizer=optimizer, metrics=['accuracy'])
    return model
  

sgdModel = sgd_model()

adamModel = adam_model()

rmsModel = rms_model()

In [38]:
rmsModel.summary()

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_6 (Dense)             (None, 8)                 72        
                                                                 
 dense_7 (Dense)             (None, 4)                 36        
                                                                 
 dense_8 (Dense)             (None, 1)                 5         
                                                                 
Total params: 113
Trainable params: 113
Non-trainable params: 0
_________________________________________________________________


In [39]:
EPOCHS = 1000

sgdHistory = sgdModel.fit(scaled_data, data['Outcome'],epochs=EPOCHS, validation_split=0.3, verbose=2)

adamHistory = adamModel.fit(scaled_data, data['Outcome'],epochs=EPOCHS, validation_split=0.3, verbose=2)

rmsHistory = rmsModel.fit(scaled_data, data['Outcome'],epochs=EPOCHS, validation_split=0.3, verbose=2)

Epoch 1/1000
17/17 - 1s - loss: 0.7011 - accuracy: 0.6480 - val_loss: 0.6952 - val_accuracy: 0.6580 - 725ms/epoch - 43ms/step
Epoch 2/1000
17/17 - 0s - loss: 0.6972 - accuracy: 0.6480 - val_loss: 0.6918 - val_accuracy: 0.6580 - 46ms/epoch - 3ms/step
Epoch 3/1000
17/17 - 0s - loss: 0.6942 - accuracy: 0.6480 - val_loss: 0.6890 - val_accuracy: 0.6580 - 50ms/epoch - 3ms/step
Epoch 4/1000
17/17 - 0s - loss: 0.6916 - accuracy: 0.6480 - val_loss: 0.6864 - val_accuracy: 0.6580 - 52ms/epoch - 3ms/step
Epoch 5/1000
17/17 - 0s - loss: 0.6893 - accuracy: 0.6480 - val_loss: 0.6841 - val_accuracy: 0.6580 - 47ms/epoch - 3ms/step
Epoch 6/1000
17/17 - 0s - loss: 0.6872 - accuracy: 0.6480 - val_loss: 0.6819 - val_accuracy: 0.6580 - 66ms/epoch - 4ms/step
Epoch 7/1000
17/17 - 0s - loss: 0.6851 - accuracy: 0.6480 - val_loss: 0.6798 - val_accuracy: 0.6580 - 61ms/epoch - 4ms/step
Epoch 8/1000
17/17 - 0s - loss: 0.6832 - accuracy: 0.6480 - val_loss: 0.6778 - val_accuracy: 0.6580 - 66ms/epoch - 4ms/step
Epoch 

In [40]:
sgdHist = pd.DataFrame(sgdHistory.history)

adamHist = pd.DataFrame(adamHistory.history)

rmsHist = pd.DataFrame(rmsHistory.history)

#### Final Training and Validation Accuracy 

In [42]:
sgd_accuracy = (sgdHist['accuracy'].tail().sum())*100/5 

adam_accuracy = (adamHist['accuracy'].tail().sum())*100/5 

rms_accuracy = (rmsHist['accuracy'].tail().sum())*100/5 

print("SDG Accuracy = {}% ".format(sgd_accuracy))

print("Adam Accuracy = {}% ".format(adam_accuracy))

print("RMS Accuracy = {}% ".format(rms_accuracy))

# datatest = pd.read_csv("D:\SEM5\MiniProject\diabetes.csv")
# hist = pd.DataFrame(history.history)
# hist['epoch'] = history.epoch
# hist.tail()

SDG Accuracy = 79.73929286003113% 
Adam Accuracy = 89.05027985572815% 
RMS Accuracy = 89.7951591014862% 


In [43]:
# dpsModel.save('dps-dl-model.h5')
adamModel.save('dps-dl-model')
# tf.keras.models.save_model(dpsModel,"D:\SEM5\MiniProject")

INFO:tensorflow:Assets written to: dps-dl-model\assets
