In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
np.random.seed(42)

# Getting the Data Ready

In [2]:
diabetes_data = pd.read_csv("diabetes-dataset.csv")
diabetes_data.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,2,138,62,35,0,33.6,0.127,47,1
1,0,84,82,31,125,38.2,0.233,23,0
2,0,145,0,0,0,44.2,0.63,31,1
3,0,135,68,42,250,42.3,0.365,24,1
4,1,139,62,41,480,40.7,0.536,21,0


In [3]:
diabetes_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2000 entries, 0 to 1999
Data columns (total 9 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Pregnancies               2000 non-null   int64  
 1   Glucose                   2000 non-null   int64  
 2   BloodPressure             2000 non-null   int64  
 3   SkinThickness             2000 non-null   int64  
 4   Insulin                   2000 non-null   int64  
 5   BMI                       2000 non-null   float64
 6   DiabetesPedigreeFunction  2000 non-null   float64
 7   Age                       2000 non-null   int64  
 8   Outcome                   2000 non-null   int64  
dtypes: float64(2), int64(7)
memory usage: 140.8 KB


Its good that all of the colums are numerical.\
It also shows that there are no null or NaN values in the data which is excellent.\
Lets explore the data further.

In [4]:
diabetes_data.describe()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
count,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0
mean,3.7035,121.1825,69.1455,20.935,80.254,32.193,0.47093,33.0905,0.342
std,3.306063,32.068636,19.188315,16.103243,111.180534,8.149901,0.323553,11.786423,0.474498
min,0.0,0.0,0.0,0.0,0.0,0.0,0.078,21.0,0.0
25%,1.0,99.0,63.5,0.0,0.0,27.375,0.244,24.0,0.0
50%,3.0,117.0,72.0,23.0,40.0,32.3,0.376,29.0,0.0
75%,6.0,141.0,80.0,32.0,130.0,36.8,0.624,40.0,1.0
max,17.0,199.0,122.0,110.0,744.0,80.6,2.42,81.0,1.0


This shows that there are:
- rows where `Glucose` is 0
- rows where `BloodPressure` is 0
- rows where `SkinThickness` is 0
- rows where `Insulin` is 0
- rows where `BMI` is 0
which is not possible.\
Lets impute those rows with there column mean.

In [5]:
# First lets check how many 0 values are in these columns 
featureList = ['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI']
diabetes_data[featureList].isin([0]).sum()

Glucose           13
BloodPressure     90
SkinThickness    573
Insulin          956
BMI               28
dtype: int64

for `Glucose`, `BloodPressure`, `SkinThickness` and `BMI` 
due to low number of '0' values we can impute it with their mean.\
But we will have to check it imputing in `Insulin` will affect accuracy
beacuse there are around ~50% data that is '0'.

In [6]:
# For Glucose
diabetes_data["Glucose"] = diabetes_data["Glucose"].replace({ 0 : diabetes_data["Glucose"].mean()})

# For BloodPressure
diabetes_data["BloodPressure"] = diabetes_data["BloodPressure"].replace({ 0 : diabetes_data["BloodPressure"].mean()})

# For SkinThickness
diabetes_data["SkinThickness"] = diabetes_data["SkinThickness"].replace({ 0 : diabetes_data["SkinThickness"].mean()})

# For BMI
diabetes_data["BMI"] = diabetes_data["BMI"].replace({ 0 : diabetes_data["BMI"].mean()})

# Lets checkif it worked
diabetes_data[featureList].isin([0]).sum()

Glucose            0
BloodPressure      0
SkinThickness      0
Insulin          956
BMI                0
dtype: int64

Because there are 956/2000 rows of `Insulin` where data is '0'
we will train 2 models where we drop `Insulin` and where we impute it.

In [7]:
diabetes_data_idroped = diabetes_data.drop(diabetes_data[diabetes_data["Insulin"] == 0].index, axis=0)
diabetes_data_idroped.shape

(1044, 9)

In [8]:
# Splitting data into X & y

X = diabetes_data_idroped.drop(["Outcome"], axis=1)
y = diabetes_data_idroped["Outcome"]

X.shape , y.shape

((1044, 8), (1044,))

In [9]:
# Splitind data into ttraining and testing
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y)

# Fitting the data to the right model

In [10]:
from sklearn.dummy import DummyClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier

In [11]:
from sklearn.model_selection import KFold, cross_val_score
for model in [ 
    DummyClassifier,
    DecisionTreeClassifier,
    KNeighborsClassifier,
    GaussianNB,
    SVC,
    RandomForestClassifier]:
    
    cls = model()
    kf = KFold(n_splits = 5)
    score = cross_val_score(cls, X_train, y_train, cv = kf, scoring="roc_auc")
    
    print(f"{model.__name__:22}  AUC: \t {score.mean():.3f} STD: {score.std():.2f}")

DummyClassifier         AUC: 	 0.500 STD: 0.00
DecisionTreeClassifier  AUC: 	 0.942 STD: 0.02
KNeighborsClassifier    AUC: 	 0.866 STD: 0.03
GaussianNB              AUC: 	 0.833 STD: 0.03
SVC                     AUC: 	 0.846 STD: 0.02
RandomForestClassifier  AUC: 	 0.990 STD: 0.00


Without any hyper-parameter tuning we can see that RandomForestClassifier is the best model.

In [12]:
# Fitting the modle model
cls = RandomForestClassifier()

# Fitting the model
cls.fit(X_train, y_train)

# Prediction
y_preds = cls.predict(X_test)

# Evaluating the model

In [13]:
from sklearn.metrics import classification_report
print(classification_report(y_test, y_preds))

              precision    recall  f1-score   support

           0       0.99      0.99      0.99       185
           1       0.97      0.97      0.97        76

    accuracy                           0.98       261
   macro avg       0.98      0.98      0.98       261
weighted avg       0.98      0.98      0.98       261



In [14]:
from sklearn.metrics import roc_auc_score
print(roc_auc_score(y_test, y_preds))

0.9814366998577526


This is great!!

# Tuning the parameters

In [15]:
from sklearn.model_selection import RandomizedSearchCV

# Define a grid of hyperparameters
grid = {"n_estimators": [10, 100, 200, 500, 1000, 1200],
        "max_depth": [None, 5, 10, 20, 30],
        "max_features": ["auto", "sqrt"],
        "min_samples_split": [2, 4, 6],
        "min_samples_leaf": [1, 2, 4]}

# Split into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y)

# Set n_jobs to -1 to use all cores (NOTE: n_jobs=-1 is broken as of 8 Dec 2019, using n_jobs=1 works)
clf = RandomForestClassifier(n_jobs=1)

# Setup RandomizedSearchCV
rs_clf = RandomizedSearchCV(estimator=clf,
                            param_distributions=grid,
                            n_iter=10, # try 10 models total
                            cv=5, # 5-fold cross-validation
                            verbose=2) # print out results

# Fit the RandomizedSearchCV version of clf
rs_clf.fit(X_train, y_train);

# Find the best hyperparameters
print(rs_clf.best_params_)

# Scoring automatically uses the best hyperparameters
rs_clf.score(X_test, y_test)

Fitting 5 folds for each of 10 candidates, totalling 50 fits
[CV] END max_depth=None, max_features=sqrt, min_samples_leaf=4, min_samples_split=2, n_estimators=200; total time=   0.2s
[CV] END max_depth=None, max_features=sqrt, min_samples_leaf=4, min_samples_split=2, n_estimators=200; total time=   0.2s
[CV] END max_depth=None, max_features=sqrt, min_samples_leaf=4, min_samples_split=2, n_estimators=200; total time=   0.2s
[CV] END max_depth=None, max_features=sqrt, min_samples_leaf=4, min_samples_split=2, n_estimators=200; total time=   0.2s
[CV] END max_depth=None, max_features=sqrt, min_samples_leaf=4, min_samples_split=2, n_estimators=200; total time=   0.2s
[CV] END max_depth=10, max_features=sqrt, min_samples_leaf=2, min_samples_split=6, n_estimators=1200; total time=   1.7s
[CV] END max_depth=10, max_features=sqrt, min_samples_leaf=2, min_samples_split=6, n_estimators=1200; total time=   1.7s
[CV] END max_depth=10, max_features=sqrt, min_samples_leaf=2, min_samples_split=6, n_es

0.9846743295019157