In [1]:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = 'all'

In [2]:
import numpy as np
import pandas as pd
import seaborn as sb
import matplotlib.pyplot as plt

In [23]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV

In [28]:
import time

## read data and explore
0 for independent variable means missing: `SkinThickness`, `Insulin`, perhaps others

In [4]:
data = pd.read_csv('diabetes.csv')

In [6]:
data.shape
data.columns
data.info()
data.head()

(768, 9)

Index(['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin',
       'BMI', 'DiabetesPedigreeFunction', 'Age', 'Outcome'],
      dtype='object')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 768 entries, 0 to 767
Data columns (total 9 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Pregnancies               768 non-null    int64  
 1   Glucose                   768 non-null    int64  
 2   BloodPressure             768 non-null    int64  
 3   SkinThickness             768 non-null    int64  
 4   Insulin                   768 non-null    int64  
 5   BMI                       768 non-null    float64
 6   DiabetesPedigreeFunction  768 non-null    float64
 7   Age                       768 non-null    int64  
 8   Outcome                   768 non-null    int64  
dtypes: float64(2), int64(7)
memory usage: 54.1 KB


Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


#### these columns with `min = 0` indicate missing values
#### `Glucose`, `BloodPressure`, `SkinThickness`, `Insulin`, `BMI`

In [7]:
data.describe()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
count,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0
mean,3.845052,120.894531,69.105469,20.536458,79.799479,31.992578,0.471876,33.240885,0.348958
std,3.369578,31.972618,19.355807,15.952218,115.244002,7.88416,0.331329,11.760232,0.476951
min,0.0,0.0,0.0,0.0,0.0,0.0,0.078,21.0,0.0
25%,1.0,99.0,62.0,0.0,0.0,27.3,0.24375,24.0,0.0
50%,3.0,117.0,72.0,23.0,30.5,32.0,0.3725,29.0,0.0
75%,6.0,140.25,80.0,32.0,127.25,36.6,0.62625,41.0,1.0
max,17.0,199.0,122.0,99.0,846.0,67.1,2.42,81.0,1.0


In [10]:
cols_with_missing = ['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI']
(data[cols_with_missing]==0).sum()

Glucose            5
BloodPressure     35
SkinThickness    227
Insulin          374
BMI               11
dtype: int64

## replace `0` with `NaN` so will be ignored by `pandas`, `NumPy`, `scikit-learn`

In [13]:
data[cols_with_missing] = data[cols_with_missing].replace(0, np.NaN)
data.isnull().sum()

Pregnancies                   0
Glucose                       5
BloodPressure                35
SkinThickness               227
Insulin                     374
BMI                          11
DiabetesPedigreeFunction      0
Age                           0
Outcome                       0
dtype: int64

In [14]:
data.head(10)

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148.0,72.0,35.0,,33.6,0.627,50,1
1,1,85.0,66.0,29.0,,26.6,0.351,31,0
2,8,183.0,64.0,,,23.3,0.672,32,1
3,1,89.0,66.0,23.0,94.0,28.1,0.167,21,0
4,0,137.0,40.0,35.0,168.0,43.1,2.288,33,1
5,5,116.0,74.0,,,25.6,0.201,30,0
6,3,78.0,50.0,32.0,88.0,31.0,0.248,26,1
7,10,115.0,,,,35.3,0.134,29,0
8,2,197.0,70.0,45.0,543.0,30.5,0.158,53,1
9,8,125.0,96.0,,,,0.232,54,1


## impute `NaN` with `mean`

In [15]:
data.fillna(data.mean(), inplace=True)
data.isnull().sum()
data.head(10)

Pregnancies                 0
Glucose                     0
BloodPressure               0
SkinThickness               0
Insulin                     0
BMI                         0
DiabetesPedigreeFunction    0
Age                         0
Outcome                     0
dtype: int64

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148.0,72.0,35.0,155.548223,33.6,0.627,50,1
1,1,85.0,66.0,29.0,155.548223,26.6,0.351,31,0
2,8,183.0,64.0,29.15342,155.548223,23.3,0.672,32,1
3,1,89.0,66.0,23.0,94.0,28.1,0.167,21,0
4,0,137.0,40.0,35.0,168.0,43.1,2.288,33,1
5,5,116.0,74.0,29.15342,155.548223,25.6,0.201,30,0
6,3,78.0,50.0,32.0,88.0,31.0,0.248,26,1
7,10,115.0,72.405184,29.15342,155.548223,35.3,0.134,29,0
8,2,197.0,70.0,45.0,543.0,30.5,0.158,53,1
9,8,125.0,96.0,29.15342,155.548223,32.457464,0.232,54,1


## set input and output arrays

In [16]:
X = data.values[:, 0:8]
y = data.values[:, 8]
X.shape
y.shape

(768, 8)

(768,)

## `LogisticRegression` model

In [20]:
lr = LogisticRegression(penalty='l1', dual=False, max_iter=110, solver='liblinear')

In [21]:
lr.get_params()

{'C': 1.0,
 'class_weight': None,
 'dual': False,
 'fit_intercept': True,
 'intercept_scaling': 1,
 'l1_ratio': None,
 'max_iter': 110,
 'multi_class': 'auto',
 'n_jobs': None,
 'penalty': 'l1',
 'random_state': None,
 'solver': 'liblinear',
 'tol': 0.0001,
 'verbose': 0,
 'warm_start': False}

In [22]:
lr.fit(X, y)
lr.score(X, y)

LogisticRegression(max_iter=110, penalty='l1', solver='liblinear')

0.7747395833333334

## k-fold cross validation

In [26]:
kfold = KFold(n_splits=3, random_state=7, shuffle=True)
result_kfold = cross_val_score(lr, X, y, cv=kfold, scoring='accuracy')

In [27]:
result_kfold
result_kfold.mean()

array([0.7578125, 0.75     , 0.78125  ])

0.7630208333333334

## hyper parameter tuning: `dual`, `max_iter`

In [29]:
param_dual = [True, False]
param_max_iter = [100, 110, 120, 130, 140]
param_grid = dict(dual=param_dual, max_iter=param_max_iter)

lr = LogisticRegression(penalty='l2')
grid_cv = GridSearchCV(estimator=lr, param_grid=param_grid, cv=3, n_jobs=-1)

In [30]:
start_time = time.time()
result_grid = grid_cv.fit(X, y)
end_time = time.time()

In [31]:
result_grid.best_score_
result_grid.best_params_
print('execution time, s = ', end_time - start_time)

0.7682291666666666

{'dual': False, 'max_iter': 140}

execution time, ms =  4.1427412033081055


## hyper parameter tuning: `dual`, `max_iter`, `C`
`C` is inverse of regularization strength, smaller is stronger regularization

In [48]:
param_dual = [True, False]
param_max_iter = [100, 110, 120, 130, 140]
param_C = [1.0, 1.5, 2.0, 2.5]
param_grid = dict(dual=param_dual, max_iter=param_max_iter, C=param_C)

lr = LogisticRegression(penalty='l2')
grid_cv = GridSearchCV(estimator=lr, param_grid=param_grid, cv=3, n_jobs=-1)

In [49]:
start_time = time.time()
result_grid = grid_cv.fit(X, y)
end_time = time.time()

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


In [50]:
result_grid.best_score_
result_grid.best_params_
print('execution time, s = ', end_time - start_time)

0.7786458333333334

{'C': 1.5, 'dual': False, 'max_iter': 130}

execution time, ms =  1.3020274639129639


## random search hyper parameter tuning

In [36]:
random_cv = RandomizedSearchCV(estimator=lr, param_distributions=param_grid, cv=3, n_jobs=-1)

In [37]:
start_time = time.time()
result_random = random_cv.fit(X, y)
end_time = time.time()

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


In [38]:
result_random.best_score_
result_random.best_params_
print('execution time, s = ', end_time - start_time)

0.7721354166666666

{'max_iter': 140, 'dual': False, 'C': 2.5}

execution time, ms =  3.8236806392669678
