In [1]:
import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()
matplotlib.rcParams["figure.figsize"]=(20,10)
from sklearn.model_selection import train_test_split
from sklearn.model_selection import ShuffleSplit
from sklearn.model_selection import cross_val_score
from sklearn.impute import SimpleImputer 
from sklearn import metrics

Exporting The Data

In [2]:
df1= pd.read_csv('diabetes.csv')
print(df1.shape)
df1.head(10)


(768, 9)


Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1
5,5,116,74,0,0,25.6,0.201,30,0
6,3,78,50,32,88,31.0,0.248,26,1
7,10,115,0,0,0,35.3,0.134,29,0
8,2,197,70,45,543,30.5,0.158,53,1
9,8,125,96,0,0,0.0,0.232,54,1


Feature Engineering

In [3]:
#checking null value is present or not
df1.isnull().values.any()

False

In [4]:
#correlation
cormat= df1.corr()
cormat

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
Pregnancies,1.0,0.129459,0.141282,-0.081672,-0.073535,0.017683,-0.033523,0.544341,0.221898
Glucose,0.129459,1.0,0.15259,0.057328,0.331357,0.221071,0.137337,0.263514,0.466581
BloodPressure,0.141282,0.15259,1.0,0.207371,0.088933,0.281805,0.041265,0.239528,0.065068
SkinThickness,-0.081672,0.057328,0.207371,1.0,0.436783,0.392573,0.183928,-0.11397,0.074752
Insulin,-0.073535,0.331357,0.088933,0.436783,1.0,0.197859,0.185071,-0.042163,0.130548
BMI,0.017683,0.221071,0.281805,0.392573,0.197859,1.0,0.140647,0.036242,0.292695
DiabetesPedigreeFunction,-0.033523,0.137337,0.041265,0.183928,0.185071,0.140647,1.0,0.033561,0.173844
Age,0.544341,0.263514,0.239528,-0.11397,-0.042163,0.036242,0.033561,1.0,0.238356
Outcome,0.221898,0.466581,0.065068,0.074752,0.130548,0.292695,0.173844,0.238356,1.0


In [5]:
#to find data is balanced or not
true_length= len(df1[df1['Outcome']== 1])
false_length= len(df1[df1['Outcome']== 0])
true_length, false_length

(268, 500)

Checking how many 0 values are there in features

In [6]:
print(df1.shape)
print("no. of 0's in Glucose {0}".format(len(df1.loc[df1.Glucose==0])))
print("no. of 0's in BloodPressure {0}".format(len(df1.loc[df1.BloodPressure==0])))
print("no. of 0's in SkinThickness {0}".format(len(df1.loc[df1.SkinThickness==0])))
print("no. of 0's in Insulin {0}".format(len(df1.loc[df1.Insulin==0])))
print("no. of 0's in BMI {0}".format(len(df1.loc[df1.BMI==0])))
print("no. of 0's in DiabetesPedigreeFunction {0}".format(len(df1.loc[df1.DiabetesPedigreeFunction==0])))

(768, 9)
no. of 0's in Glucose 5
no. of 0's in BloodPressure 35
no. of 0's in SkinThickness 227
no. of 0's in Insulin 374
no. of 0's in BMI 11
no. of 0's in DiabetesPedigreeFunction 0


In [7]:
#Now we will fill these 0's with mean


imputer_mean = SimpleImputer(missing_values = 0, strategy ='mean')

df1[['Glucose', 'BloodPressure' ,'SkinThickness','Insulin', 'BMI' ]] = imputer_mean.fit_transform(df1[['Glucose', 'BloodPressure' ,'SkinThickness','Insulin', 'BMI' ]])

In [8]:
print(df1.shape)
print("no. of 0's in Glucose {0}".format(len(df1.loc[df1.Glucose==0])))
print("no. of 0's in BloodPressure {0}".format(len(df1.loc[df1.BloodPressure==0])))
print("no. of 0's in SkinThickness {0}".format(len(df1.loc[df1.SkinThickness==0])))
print("no. of 0's in Insulin {0}".format(len(df1.loc[df1.Insulin==0])))
print("no. of 0's in BMI {0}".format(len(df1.loc[df1.BMI==0])))
print("no. of 0's in DiabetesPedigreeFunction {0}".format(len(df1.loc[df1.DiabetesPedigreeFunction==0])))

(768, 9)
no. of 0's in Glucose 0
no. of 0's in BloodPressure 0
no. of 0's in SkinThickness 0
no. of 0's in Insulin 0
no. of 0's in BMI 0
no. of 0's in DiabetesPedigreeFunction 0


In [9]:
df1.head(10)

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148.0,72.0,35.0,155.548223,33.6,0.627,50,1
1,1,85.0,66.0,29.0,155.548223,26.6,0.351,31,0
2,8,183.0,64.0,29.15342,155.548223,23.3,0.672,32,1
3,1,89.0,66.0,23.0,94.0,28.1,0.167,21,0
4,0,137.0,40.0,35.0,168.0,43.1,2.288,33,1
5,5,116.0,74.0,29.15342,155.548223,25.6,0.201,30,0
6,3,78.0,50.0,32.0,88.0,31.0,0.248,26,1
7,10,115.0,72.405184,29.15342,155.548223,35.3,0.134,29,0
8,2,197.0,70.0,45.0,543.0,30.5,0.158,53,1
9,8,125.0,96.0,29.15342,155.548223,32.457464,0.232,54,1


## Machine Learning Model

In [10]:
x=df1.drop(['Outcome'], axis = 1)
y= df1.Outcome

In [11]:
y

0      1
1      0
2      1
3      0
4      1
      ..
763    0
764    0
765    0
766    1
767    0
Name: Outcome, Length: 768, dtype: int64

In [12]:
#train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.2, random_state= 100)

## Logistic Regression

In [13]:
from sklearn.linear_model import LogisticRegression
log_reg= LogisticRegression(random_state=100)
log_reg.fit(x_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=100, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [14]:
pred=log_reg.predict(x_test)
pred

array([0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0,
       1, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0,
       0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 1, 0, 0, 0,
       0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
       1, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0],
      dtype=int64)

Evaluation of the Model

In [15]:
#classification report
from sklearn.metrics import classification_report
print(classification_report(y_test,pred))

              precision    recall  f1-score   support

           0       0.74      0.82      0.78       101
           1       0.57      0.45      0.51        53

    accuracy                           0.69       154
   macro avg       0.66      0.64      0.64       154
weighted avg       0.68      0.69      0.69       154



In [16]:
#confusion matrix
from sklearn.metrics import confusion_matrix
mat=confusion_matrix(y_test,pred)
print("Accuracy = {0:.3f}".format(metrics.accuracy_score(y_test,pred)))
mat

Accuracy = 0.695


array([[83, 18],
       [29, 24]], dtype=int64)

In [17]:
#Generate AUC-ROC curve
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
sc=roc_auc_score(y_test,pred)
sc

0.6373061834485336

## Random Forest Model

In [18]:
from sklearn.ensemble import RandomForestClassifier
rfcls=RandomForestClassifier(n_estimators=10, random_state=350)
rfcls.fit(x_train,y_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=10,
                       n_jobs=None, oob_score=False, random_state=350,
                       verbose=0, warm_start=False)

In [19]:
rfpred=rfcls.predict(x_test)

In [20]:
#Generate AUC-ROC curve
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
sc=roc_auc_score(y_test,rfpred)
sc

0.6144218195404445

In [21]:
print(classification_report(y_test,rfpred))

              precision    recall  f1-score   support

           0       0.72      0.85      0.78       101
           1       0.57      0.38      0.45        53

    accuracy                           0.69       154
   macro avg       0.65      0.61      0.62       154
weighted avg       0.67      0.69      0.67       154



In [22]:
print("Accuracy = {0:.3f}".format(metrics.accuracy_score(y_test,rfpred)))

Accuracy = 0.688


## XGBOOSt Using RandomisedSearchCV

In [23]:
#Hyper Parameter Optimization
params={
    'learning-rate': [0.05,0.10,0.15,0.20,0.25,0.30],
    'max_depth': [3,4,5,6,8,10,12,15],
    'min_child_weight': [1,3,5,7],
    'gamma': [0.0,0.1,0.2,0.3,0.4],
    'colsample_bytree': [0.3,0.4,0.5,0.7]
}

In [25]:
#Hyper Parameter Optimization using Randomized search cv
from sklearn.model_selection import RandomizedSearchCV
import xgboost

In [32]:
classifier= xgboost.XGBClassifier()
random_search=RandomizedSearchCV(classifier, param_distributions= params, scoring= 'roc_auc', n_jobs = -1, cv=5, verbose=3)


In [33]:
random_search.fit(x_train,y_train.ravel())

Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  24 tasks      | elapsed:   23.6s
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:   25.2s finished


RandomizedSearchCV(cv=5, error_score=nan,
                   estimator=XGBClassifier(base_score=0.5, booster='gbtree',
                                           colsample_bylevel=1,
                                           colsample_bynode=1,
                                           colsample_bytree=1, gamma=0,
                                           learning_rate=0.1, max_delta_step=0,
                                           max_depth=3, min_child_weight=1,
                                           missing=None, n_estimators=100,
                                           n_jobs=1, nthread=None,
                                           objective='binary:logistic',
                                           random_state=0, reg_alpha=0,
                                           reg_lambda=1, sc...
                                           verbosity=1),
                   iid='deprecated', n_iter=10, n_jobs=-1,
                   param_distributions={'colsample_bytree': [

In [34]:
random_search.best_estimator_

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=0.5, gamma=0.2,
              learning-rate=0.1, learning_rate=0.1, max_delta_step=0,
              max_depth=15, min_child_weight=5, missing=None, n_estimators=100,
              n_jobs=1, nthread=None, objective='binary:logistic',
              random_state=0, reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
              seed=None, silent=None, subsample=1, verbosity=1)

In [36]:
classifier= xgboost.XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=0.5, gamma=0.2, learning_rate=0.1, max_delta_step=0,
              max_depth=15, min_child_weight=5, missing=None, n_estimators=100,
              n_jobs=1, nthread=None, objective='binary:logistic',
              random_state=0, reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
              seed=None, silent=None, subsample=1, verbosity=1)

In [37]:
classifier.fit(x_train,y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=0.5, gamma=0.2,
              learning_rate=0.1, max_delta_step=0, max_depth=15,
              min_child_weight=5, missing=None, n_estimators=100, n_jobs=1,
              nthread=None, objective='binary:logistic', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
              silent=None, subsample=1, verbosity=1)

In [40]:
xgpred=classifier.predict(x_test)

In [41]:
print(classification_report(y_test,xgpred))

              precision    recall  f1-score   support

           0       0.76      0.83      0.80       101
           1       0.61      0.51      0.56        53

    accuracy                           0.72       154
   macro avg       0.69      0.67      0.68       154
weighted avg       0.71      0.72      0.71       154



In [42]:
print("Accuracy = {0:.3f}".format(metrics.accuracy_score(y_test,xgpred)))

Accuracy = 0.721


In [43]:
roc_auc_score(y_test,xgpred)

0.6705585652904913