In [1]:
import pandas as pd

# Load the Pima diabetes table from the project's datasets folder.
data = pd.read_csv('../datasets/pima.csv')

# DataFrame.info() prints its report and returns None, so wrapping it in
# display() rendered a stray "None". Call it on its own, then show the head.
data.info()
display(data.head())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 768 entries, 0 to 767
Data columns (total 9 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Pregnancies               768 non-null    int64  
 1   Glucose                   768 non-null    int64  
 2   BloodPressure             768 non-null    int64  
 3   SkinThickness             768 non-null    int64  
 4   Insulin                   768 non-null    int64  
 5   BMI                       768 non-null    float64
 6   DiabetesPedigreeFunction  768 non-null    float64
 7   Age                       768 non-null    int64  
 8   Outcome                   768 non-null    int64  
dtypes: float64(2), int64(7)
memory usage: 54.1 KB


None

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [2]:
import plotly.graph_objects as go
import plotly.offline as py

# Two row subsets keyed on the label: D holds Outcome != 0, H holds Outcome == 0
D = data.loc[data['Outcome'].ne(0)]
H = data.loc[data['Outcome'].eq(0)]

#------------COUNT-----------------------
def target_count():
    """Render a horizontal bar chart with the number of rows per Outcome class.

    Reads the module-level ``data`` frame and displays the figure through
    ``py.iplot``. Returns nothing.
    """
    # value_counts() orders by frequency, not by class value. Pin the order to
    # [0, 1] so the 'healthy' (0) / 'diabetic' (1) labels cannot be swapped
    # if class 1 ever outnumbers class 0.
    counts = data['Outcome'].value_counts().reindex([0, 1], fill_value=0).tolist()

    trace = go.Bar(x=counts,
                   y=['healthy', 'diabetic'],
                   orientation='h',
                   text=counts,
                   textfont=dict(size=15),
                   textposition='auto',
                   opacity=0.8,
                   marker=dict(color=['lightskyblue', 'gold'],
                               line=dict(color='#000000', width=1.5)))

    layout = dict(title =  'Count of Outcome variable')

    fig = dict(data=[trace], layout=layout)
    py.iplot(fig)

#------------PERCENTAGE-------------------
def target_percent():
    """Render a pie chart with the share of rows in each Outcome class.

    Reads the module-level ``data`` frame and displays the figure through
    ``py.iplot``. Returns nothing.
    """
    # Fix the class order to [0, 1] so each count is paired with the right
    # label regardless of which class is more frequent.
    counts = data['Outcome'].value_counts().reindex([0, 1], fill_value=0).tolist()

    trace = go.Pie(labels=['healthy', 'diabetic'],
                   values=counts,
                   textfont=dict(size=15),
                   opacity=0.8,
                   marker=dict(colors=['lightskyblue', 'gold'],
                               line=dict(color='#000000', width=1.5)))

    layout = dict(title =  'Distribution of Outcome variable')

    fig = dict(data=[trace], layout=layout)
    py.iplot(fig)

In [3]:
# Class balance of Outcome: absolute counts (bar chart), then shares (pie chart).
target_count()
target_percent()

In [4]:
import numpy as np

# Replace 0 with NaN in these five measurement columns so zeros are handled as
# missing values from here on. np.nan is used because the np.NaN alias was
# removed in NumPy 2.0.
data[['Glucose','BloodPressure','SkinThickness','Insulin','BMI']] = (
    data[['Glucose','BloodPressure','SkinThickness','Insulin','BMI']].replace(0, np.nan)
)

In [5]:
# Bar chart summarising data availability per column of a dataset
def missing_plot(dataset, key) :
    """Plot, for every column, the non-null count with the missing share as label.

    dataset: DataFrame to inspect.
    key:     column name used only to obtain the row count via len(dataset[key]).

    Bar height is the number of non-null entries per column; the text on each
    bar is the percentage of missing entries, rounded to two decimals.
    """
    n_rows = len(dataset[key])
    n_missing = dataset.isnull().sum()

    present = pd.DataFrame(n_rows - n_missing, columns=['Count'])
    pct_missing = pd.DataFrame(n_missing / n_rows * 100, columns=['Count']).round(2)

    bar_style = dict(color='#7EC0EE', line=dict(color='#000000', width=1.5))
    trace = go.Bar(x=present.index,
                   y=present['Count'],
                   opacity=0.8,
                   text=pct_missing['Count'],
                   textposition='auto',
                   marker=bar_style)

    py.iplot(dict(data=[trace], layout=dict(title="Missing Values (count & %)")))


In [6]:
# Plot per-column availability; 'Outcome' is only used to get the row count.
missing_plot(data, 'Outcome')

In [7]:
def median_target(var):
    """Return the median of column `var` for each Outcome value.

    Rows where `var` is null are excluded first. The result is a DataFrame
    with the columns 'Outcome' and `var`, one row per Outcome value, computed
    from the module-level `data` frame.
    """
    observed = data.loc[data[var].notnull(), [var, 'Outcome']]
    per_outcome = observed.groupby(['Outcome'])[[var]].median()
    return per_outcome.reset_index()

In [8]:
import plotly.figure_factory as ff

def plot_distribution(data_select, size_bin) :
    """Draw histogram + KDE curves of one column, split by outcome group.

    data_select: name of the column in the module-level `data` frame to plot.
    size_bin:    bin size forwarded to plotly's create_distplot.

    The figure is displayed through py.iplot; nothing is returned.
    """
    # Slice the current `data` on every call rather than reading the
    # module-level D/H snapshots. Those go stale as soon as `data` is
    # modified afterwards (zero -> NaN replacement, imputation).
    # Null values are dropped so they are not passed to the density estimate.
    observed = data[data[data_select].notnull()]
    diabetic = observed.loc[observed['Outcome'] != 0, data_select]
    healthy = observed.loc[observed['Outcome'] == 0, data_select]
    hist_data = [diabetic, healthy]

    group_labels = ['diabetic', 'healthy']
    colors = ['#FFD700', '#7EC0EE']

    fig = ff.create_distplot(hist_data, group_labels, colors = colors, show_hist = True, bin_size = size_bin, curve_type='kde')

    fig['layout'].update(title = data_select)

    py.iplot(fig, filename = 'Density plot')

In [9]:
# Insulin: distribution per outcome group, then the per-Outcome median of the non-null values.
plot_distribution('Insulin', 0)
median_target('Insulin')

Unnamed: 0,Outcome,Insulin
0,0,102.5
1,1,169.5


In [10]:
# Replacing with median values based on outcome
# The medians are computed per Outcome group here instead of being copied by
# hand from the table above, so they cannot drift from the data.
# NOTE(review): this imputation uses the target label - confirm that is intended.
data['Insulin'] = data['Insulin'].fillna(
    data.groupby('Outcome')['Insulin'].transform('median')
)

In [11]:
# Glucose: distribution per outcome group, then the per-Outcome median of the non-null values.
plot_distribution('Glucose', 0)
median_target('Glucose')

Unnamed: 0,Outcome,Glucose
0,0,107.0
1,1,140.0


In [12]:
# Fill missing Glucose with the median of the row's Outcome group, computed
# from the data rather than hard-coded.
data['Glucose'] = data['Glucose'].fillna(
    data.groupby('Outcome')['Glucose'].transform('median')
)

In [13]:
# SkinThickness: distribution per outcome group (bin size 10), then the per-Outcome median.
plot_distribution('SkinThickness', 10)
median_target('SkinThickness')

Unnamed: 0,Outcome,SkinThickness
0,0,27.0
1,1,32.0


In [14]:
# Fill missing SkinThickness with the median of the row's Outcome group,
# computed from the data rather than hard-coded.
data['SkinThickness'] = data['SkinThickness'].fillna(
    data.groupby('Outcome')['SkinThickness'].transform('median')
)

In [15]:
# BloodPressure: distribution per outcome group (bin size 5), then the per-Outcome median.
plot_distribution('BloodPressure', 5)
median_target('BloodPressure')

Unnamed: 0,Outcome,BloodPressure
0,0,70.0
1,1,74.5


In [16]:
# Fill missing BloodPressure with the median of the row's Outcome group,
# computed from the data rather than hard-coded.
data['BloodPressure'] = data['BloodPressure'].fillna(
    data.groupby('Outcome')['BloodPressure'].transform('median')
)

In [17]:
# BMI: distribution per outcome group, then the per-Outcome median of the non-null values.
plot_distribution('BMI', 0)
median_target('BMI')

Unnamed: 0,Outcome,BMI
0,0,30.1
1,1,34.3


In [18]:
# Fill missing BMI with the median of the row's Outcome group, computed from
# the data rather than hard-coded.
data['BMI'] = data['BMI'].fillna(
    data.groupby('Outcome')['BMI'].transform('median')
)

In [19]:
# Distribution plots for the three columns that were not part of the zero -> NaN replacement
plot_distribution('Age', 0)
plot_distribution('Pregnancies', 0)
plot_distribution('DiabetesPedigreeFunction', 0)

In [20]:
# Re-draw the availability chart after the fills above.
missing_plot(data, 'Outcome')

In [21]:
def plot_feat1_feat2(feat1, feat2) :
    """Scatter-plot `feat1` (x axis) against `feat2` (y axis), one trace per outcome group.

    Rows of the module-level `data` frame are split into diabetic (Outcome != 0)
    and healthy (Outcome == 0); the figure is displayed through py.iplot.
    """
    diabetic_rows = data[data['Outcome'] != 0]
    healthy_rows = data[data['Outcome'] == 0]

    # (legend name, rows, marker colour) for each trace, diabetic first
    groups = (
        ('diabetic', diabetic_rows, '#FFD700'),
        ('healthy', healthy_rows, '#7EC0EE'),
    )
    plots = [
        go.Scatter(x=rows[feat1],
                   y=rows[feat2],
                   name=label,
                   mode='markers',
                   marker=dict(color=colour, line=dict(width=1)))
        for label, rows, colour in groups
    ]

    layout = dict(title=feat1 + " vs " + feat2,
                  yaxis=dict(title=feat2, zeroline=False),
                  xaxis=dict(title=feat1, zeroline=False))

    py.iplot(dict(data=plots, layout=layout))

def barplot(var_select, sub) :
    """Show per-value counts of `var_select` for each outcome group plus the diabetic share.

    var_select: column of the module-level `data` frame to summarise.
    sub:        text appended to the column name in the figure title.

    Two bar traces show value counts for diabetic and healthy rows; a line on
    a secondary y axis (fixed range 0-75) shows the percentage of diabetic
    rows per value, taken from a crosstab against Outcome.
    """
    diabetic_counts = data[data['Outcome'] != 0][var_select].value_counts()
    healthy_counts = data[data['Outcome'] == 0][var_select].value_counts()

    share = pd.DataFrame(pd.crosstab(data[var_select], data['Outcome']))
    share['% diabetic'] = share[1] / (share[1] + share[0]) * 100

    def count_bar(counts, label, fill):
        # One bar trace: x = distinct values, y/text = how often each occurs.
        return go.Bar(x=counts.keys().tolist(),
                      y=counts.values.tolist(),
                      text=counts.values.tolist(),
                      textposition='auto',
                      name=label,
                      opacity=0.8,
                      marker=dict(color=fill,
                                  line=dict(color='#000000', width=1)))

    diabetic_bar = count_bar(diabetic_counts, 'diabetic', 'gold')
    healthy_bar = count_bar(healthy_counts, 'healthy', 'lightskyblue')

    share_line = go.Scatter(x=share.index,
                            y=share['% diabetic'],
                            yaxis='y2',
                            name='% diabetic',
                            opacity=0.6,
                            marker=dict(color='black',
                                        line=dict(color='#000000', width=0.5)))

    secondary_axis = dict(range=[0, 75],
                          overlaying='y',
                          anchor='x',
                          side='right',
                          zeroline=False,
                          showgrid=False,
                          title='% diabetic')
    layout = dict(title=str(var_select) + ' ' + (sub),
                  xaxis=dict(),
                  yaxis=dict(title='Count'),
                  yaxis2=secondary_axis)

    py.iplot(go.Figure(data=[diabetic_bar, healthy_bar, share_line], layout=layout))

# Define pie plot to visualize how each variable's values are distributed within the diabetic and healthy groups
def plot_pie(var_select, sub) :
    """Draw two donut charts of `var_select`: diabetic rows (left) and healthy rows (right).

    var_select: column of the module-level `data` frame to summarise.
    sub:        subtitle text placed on a second line of the figure title.

    The figure is displayed through py.iplot; nothing is returned.
    """
    D = data[(data['Outcome'] != 0)]
    H = data[(data['Outcome'] == 0)]

    col =['Silver', 'mediumturquoise','#CF5C36','lightblue','magenta', '#FF5D73','#F2D7EE','mediumturquoise']

    diabetic_counts = D[var_select].value_counts()
    healthy_counts = H[var_select].value_counts()

    trace1 = go.Pie(values  = diabetic_counts.values.tolist(),
                    labels  = diabetic_counts.keys().tolist(),
                    textfont=dict(size=15), opacity = 0.8,
                    hole = 0.5,
                    hoverinfo = "label+percent+name",
                    domain  = dict(x = [.0,.48]),
                    name    = "Diabetic",
                    marker  = dict(colors = col, line = dict(width = 1.5)))
    trace2 = go.Pie(values  = healthy_counts.values.tolist(),
                    labels  = healthy_counts.keys().tolist(),
                    textfont=dict(size=15), opacity = 0.8,
                    hole = 0.5,
                    hoverinfo = "label+percent+name",
                    marker  = dict(line = dict(width = 1.5)),
                    domain  = dict(x = [.52,1]),
                    name    = "Healthy" )

    # The group sizes in the annotations are taken from the data instead of
    # being hard-coded, so they stay correct if the rows change.
    layout = go.Layout(dict(title = var_select + " distribution by target <br>"+(sub),
                            annotations = [ dict(text = "Diabetic"+" : "+str(len(D)),
                                                font = dict(size = 13),
                                                showarrow = False,
                                                x = .22, y = -0.1),
                                            dict(text = "Healthy"+" : "+str(len(H)),
                                                font = dict(size = 13),
                                                showarrow = False,
                                                x = .8,y = -.1)]))

    fig  = go.Figure(data = [trace1,trace2],layout = layout)
    py.iplot(fig)

In [22]:
# Scatter of Glucose (x) against Age (y), coloured by outcome group.
plot_feat1_feat2('Glucose','Age')

In [23]:
# N1 is 1 where Age <= 30 and Glucose <= 120, otherwise 0
data['N1'] = ((data['Age'] <= 30) & (data['Glucose'] <= 120)).astype('int64')

barplot('N1', ':Glucose <= 120 and Age <= 30')

In [24]:
# Donut charts of the N1 flag for the diabetic and healthy groups.
plot_pie('N1', '(Glucose <= 120 and Age <= 30)')

In [25]:
# N2 is 1 where BMI <= 30, otherwise 0 (30 is the author's cut-off for a healthy BMI)
data['N2'] = (data['BMI'] <= 30).astype('int64')

barplot('N2', ': BMI <= 30')
plot_pie('N2', 'BMI <= 30')

In [26]:
# Scatter of Pregnancies (x) against Age (y), coloured by outcome group.
plot_feat1_feat2('Pregnancies','Age')

In [27]:
# N3 is 1 where Age <= 30 and Pregnancies <= 6, otherwise 0
data['N3'] = ((data['Age'] <= 30) & (data['Pregnancies'] <= 6)).astype('int64')

barplot('N3', ': Age <= 30 and Pregnancies <= 6')
plot_pie('N3', 'Age <= 30 and Pregnancies <= 6')

In [28]:
# Scatter of Glucose (x) against BloodPressure (y), coloured by outcome group.
plot_feat1_feat2('Glucose','BloodPressure')

In [29]:
# N4 is 1 where Glucose <= 105 and BloodPressure <= 80, otherwise 0
data['N4'] = ((data['Glucose'] <= 105) & (data['BloodPressure'] <= 80)).astype('int64')

barplot('N4', ': Glucose <= 105 and BloodPressure <= 80')
plot_pie('N4', 'Glucose <= 105 and BloodPressure <= 80')

In [30]:
# Skin Thickness: N5 is 1 where SkinThickness <= 20, otherwise 0
data['N5'] = (data['SkinThickness'] <= 20).astype('int64')

barplot('N5', ':SkinThickness <= 20')
plot_pie('N5', 'SkinThickness <= 20')

In [31]:
# SkinThickness and BMI: scatter of SkinThickness (x) against BMI (y), coloured by outcome group

plot_feat1_feat2('SkinThickness','BMI')

In [32]:
# N6 is 1 where BMI < 30 and SkinThickness <= 20, otherwise 0
data['N6'] = ((data['BMI'] < 30) & (data['SkinThickness'] <= 20)).astype('int64')

barplot('N6', ': BMI < 30 and SkinThickness <= 20')
plot_pie('N6', 'BMI < 30 and SkinThickness <= 20')

In [33]:
# Glucose and BMI: scatter of Glucose (x) against BMI (y), coloured by outcome group

plot_feat1_feat2('Glucose','BMI')

In [34]:
# N7 is 1 where Glucose <= 105 and BMI <= 30, otherwise 0
data['N7'] = ((data['Glucose'] <= 105) & (data['BMI'] <= 30)).astype('int64')

barplot('N7', ': Glucose <= 105 and BMI <= 30')
plot_pie('N7', 'Glucose <= 105 and BMI <= 30')

In [35]:
# Insulin distribution per outcome group, shown ahead of the Insulin-based flag defined next.
plot_distribution('Insulin', 0)

In [36]:
# N9 is 1 where Insulin < 200, otherwise 0
data['N9'] = (data['Insulin'] < 200).astype('int64')

barplot('N9', ': Insulin < 200')
plot_pie('N9', 'Insulin < 200')

In [37]:
# Blood pressure: N10 is 1 where BloodPressure < 80, otherwise 0
data['N10'] = (data['BloodPressure'] < 80).astype('int64')

barplot('N10', ': BloodPressure < 80')
plot_pie('N10', 'BloodPressure < 80')

In [38]:
# Pregnancies: distribution per outcome group

plot_distribution('Pregnancies', 0)

In [39]:
# N11 is 1 where Pregnancies is below 4 and not equal to 0, otherwise 0
data['N11'] = ((data['Pregnancies'] < 4) & (data['Pregnancies'] != 0)).astype('int64')

barplot('N11', ': Pregnancies > 0 and < 4')
plot_pie('N11', 'Pregnancies > 0 and < 4')

In [40]:
# data['N0'] = data['BMI'] * data['SkinThickness']

# data['N8'] =  data['Pregnancies'] / data['Age']

# data['N13'] = data['Glucose'] / data['DiabetesPedigreeFunction']

# data['N12'] = data['Age'] * data['DiabetesPedigreeFunction']

# data['N14'] = data['Age'] / data['Insulin']

In [41]:
# Re-derive the two row subsets from the current `data`: D holds Outcome != 0, H holds Outcome == 0
D = data.loc[data['Outcome'].ne(0)]
H = data.loc[data['Outcome'].eq(0)]

In [44]:
from sklearn.preprocessing import StandardScaler, LabelEncoder

target_col = ["Outcome"]

# Distinct-value counts per column, computed once and reused below.
n_unique = data.nunique()

# Columns with fewer than 12 distinct values are treated as categorical.
cat_cols   = n_unique[n_unique < 12].keys().tolist()
#numerical columns
num_cols   = [x for x in data.columns if x not in cat_cols + target_col]
#Binary columns with 2 values
bin_cols   = n_unique[n_unique == 2].keys().tolist()
#Columns more than 2 values
multi_cols = [i for i in cat_cols if i not in bin_cols]

#Label encoding Binary columns
le = LabelEncoder()
for i in bin_cols :
    data[i] = le.fit_transform(data[i])

#Duplicating columns for multi value columns
data = pd.get_dummies(data = data, columns=multi_cols )

#Scaling Numerical columns
# NOTE(review): the scaler is fitted on every row, before the train/test split
# performed later in the notebook - confirm this is intended.
std = StandardScaler()
scaled = std.fit_transform(data[num_cols])
# Carry over data's own index: the merge below joins on index, and a fresh
# RangeIndex would misalign rows whenever data is not indexed 0..n-1.
scaled = pd.DataFrame(scaled, columns=num_cols, index=data.index)

#dropping original values merging scaled values for numerical columns
df_data_og = data.copy()
data = data.drop(columns = num_cols)
data = data.merge(scaled,left_index=True,right_index=True,how = "left")
data

Unnamed: 0,Outcome,N1,N2,N3,N4,N5,N6,N7,N9,N10,N11,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
0,1,0,0,0,0,0,0,0,1,1,0,0.639947,0.864625,-0.032180,0.665181,0.311604,0.169483,0.468492,1.425995
1,0,0,1,0,1,0,0,1,1,1,1,-0.844885,-1.204727,-0.528124,-0.010112,-0.440843,-0.848549,-0.365061,-0.190672
2,1,0,1,0,0,0,0,0,1,1,0,1.233880,2.014265,-0.693438,0.327535,0.311604,-1.328478,0.604397,-0.105584
3,0,1,1,1,1,0,0,1,1,1,1,-0.844885,-1.073339,-0.528124,-0.685405,-0.536303,-0.630399,-0.920763,-1.041549
4,1,0,0,0,0,0,0,0,1,1,0,-1.141852,0.503310,-2.677212,0.665181,0.294758,1.551096,5.484909,-0.020496
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
763,0,0,0,0,1,0,0,0,1,1,0,1.827813,-0.679177,0.298449,2.128317,0.429525,0.067679,-0.908682,2.532136
764,0,0,0,1,0,0,0,0,1,1,1,-0.547919,0.010607,-0.197495,-0.235210,-0.440843,0.634868,-0.398282,-0.531023
765,0,0,1,1,0,0,0,0,1,1,0,0.342981,-0.022240,-0.032180,-0.685405,-0.334153,-0.906722,-0.685193,-0.275760
766,1,0,0,0,0,0,0,0,1,1,1,-0.844885,0.141994,-1.024067,0.327535,0.311604,-0.339533,-0.371101,1.170732


In [45]:
# Feature matrix (every column except the label) and label vector
x = data.drop(columns='Outcome')
y = data['Outcome'].copy()

In [46]:
# # GridSearch + LightGBM & KNN

# knn_clf = KNeighborsClassifier()

# voting_clf = VotingClassifier(estimators=[ 
#     ('lgbm_clf', lgbm_clf),
#     ('knn', KNeighborsClassifier())], voting='soft', weights = [1,1])

# params = {
#       'knn__n_neighbors': np.arange(1,30)
#       }
      
# grid = GridSearchCV(estimator=voting_clf, param_grid=params, cv=5)

# grid.fit(x,y)

# print("Best Score:" + str(grid.best_score_))
# print("Best Parameters: " + str(grid.best_params_))

In [47]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# TRAIN-TEST SPLIT (80% train, fixed seed)
x_train, x_test, y_train, y_test = train_test_split(x, y, train_size=0.8, random_state=42)

# Scale x with a standard scaler fitted on the training rows only
scaler = StandardScaler()
scaler.fit(x_train)
# Keep the original row labels on the rebuilt frames. Without index=..., they
# get a fresh 0..n-1 index and no longer align with y_train / y_test, which
# keep the shuffled labels from the split.
x_train = pd.DataFrame(scaler.transform(x_train), columns=x.columns, index=x_train.index)
x_test = pd.DataFrame(scaler.transform(x_test), columns=x.columns, index=x_test.index)

In [69]:
from sklearn.ensemble import GradientBoostingClassifier
# Gradient boosting with depth-5 trees. random_state is fixed, as it already
# is for the train/test split and the SVC, so the score is reproducible.
gbc = GradientBoostingClassifier(max_depth=5, random_state=42)
gbc.fit(x_train, y_train)
gbc.score(x_test, y_test)

0.8701298701298701

In [52]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV

# Hyper-parameter grid for KNeighborsClassifier:
# 6 * 2 * 4 * 3 * 4 * 1 = 576 combinations.
params = {
    'n_neighbors': [3, 5, 7, 10, 20, 30],
    'weights': ['uniform', 'distance'],
    'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute'],
    'leaf_size': [30, 60, 90],
    'p': [1, 2, 3, 4],
    'metric': ["minkowski"],
    # metric_params: dict | None = None,
    # 'n_jobs': [50]
}

# Exhaustive search with 5-fold cross-validation on the training split, run on 4 parallel workers.
grid_knn = GridSearchCV(estimator=KNeighborsClassifier(), param_grid=params, cv=5, verbose=3, n_jobs=4)
grid_knn.fit(x_train,y_train)
# Report the best parameter set, the estimator built from it, and its mean cross-validated score.
print(grid_knn.best_params_)
print(grid_knn.best_estimator_)
print(grid_knn.best_score_)

Fitting 5 folds for each of 576 candidates, totalling 2880 fits
{'algorithm': 'auto', 'leaf_size': 30, 'metric': 'minkowski', 'n_neighbors': 20, 'p': 1, 'weights': 'distance'}
KNeighborsClassifier(n_neighbors=20, p=1, weights='distance')
0.8307210449153672


In [55]:
from sklearn.neighbors import KNeighborsClassifier

# KNN with n_neighbors=20, p=1, weights='distance' - the same values the grid search printed as best.
# It is fitted on the training split and scored on the held-out test split.
knn = KNeighborsClassifier(n_neighbors=20, p=1, weights='distance')
knn.fit(x_train,y_train)
knn.score(x_test,y_test)

0.7792207792207793

In [None]:
# SAVING THE MODEL USING PICKLE PACKAGE

import pickle

# Save the fitted diabetes classifier as a pickle file.
# NOTE(review): the file name says "knn" but the object written below is `gbc`
# (the gradient boosting model) - confirm which model is meant to be persisted
# and align the name or the object accordingly.
model_pkl_file = "../pickles/diabetes-knn.pkl"

# Binary write mode; the context manager closes the file after dumping.
with open(model_pkl_file, 'wb') as file:  
    pickle.dump(gbc, file)

In [56]:
from sklearn.svm import SVC # "Support vector classifier"  
# RBF-kernel support vector classifier: fit on the training split, then report test accuracy
svm = SVC(
    kernel='rbf',
    C=1000,
    gamma=0.1,
    random_state=42,
).fit(x_train, y_train)
svm.score(x_test, y_test)

0.7792207792207793