In [28]:
import os
import numpy as np
import pandas as pd
import pickle
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LogisticRegressionCV
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.metrics import classification_report
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.impute import SimpleImputer
from scipy.stats import uniform

In [29]:
def model_evaluate(model, X_test, y_test):
    y_pred = model.predict(X_test)
    print(classification_report(y_test, y_pred))

In [30]:
diabetes = pd.read_csv('data/diabetes.csv')

In [31]:
diabetes.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


## Data Imputation

After checking that there are no null values, we find that there are some zero values for the columns (Glucose, BloodPressure, SkinThickness, Insulin and BMI). That does not make any sense (a human can’t have blood pressure measured as zero!)

How to handle those weird zero values? There are a bunch of techniques to impute data. We will just replace it by of the column separated by the Outcome.

In [32]:
diabetes.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [33]:
COLS = ['Glucose',
        'BloodPressure',
        'SkinThickness',
        'Insulin',
        'BMI']
for col in COLS:
        imput_values = diabetes[col].groupby(diabetes['Outcome']).mean()
        non_diabetes_mean = imput_values.loc[0]
        diabetes_mean = imput_values.loc[1]

        non_diabetes_index = diabetes[(diabetes[col]==0)&(diabetes['Outcome']==0)].index
        diabetes_index = diabetes[(diabetes[col]==0)&(diabetes['Outcome']==1)].index

        diabetes.loc[non_diabetes_index, col] = non_diabetes_mean
        diabetes.loc[diabetes_index, col] = diabetes_mean

In [34]:
diabetes.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148.0,72.0,35.0,100.335821,33.6,0.627,50,1
1,1,85.0,66.0,29.0,68.792,26.6,0.351,31,0
2,8,183.0,64.0,22.164179,100.335821,23.3,0.672,32,1
3,1,89.0,66.0,23.0,94.0,28.1,0.167,21,0
4,0,137.0,40.0,35.0,168.0,43.1,2.288,33,1


In [35]:
X = diabetes[['Pregnancies',
              'Glucose',
              'BloodPressure',
              'SkinThickness',
              'Insulin',
              'BMI',
              'DiabetesPedigreeFunction',
              'Age']] # features
y = diabetes['Outcome'] # labels

In [36]:
# 80% training and 20% test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25)

## Model 1
No regularization

In [22]:
LRmodel = LogisticRegression(penalty = 'none', max_iter=200)
LRmodel.fit(X_train, y_train)

In [23]:
LRmodel.coef_

array([[ 1.56904339e-01,  3.59885659e-02, -1.31999823e-02,
         4.52820706e-03, -4.21528774e-04,  9.11683647e-02,
         7.67648424e-01,  1.41142254e-02]])

In [24]:
model_evaluate(LRmodel, X_test, y_test)

              precision    recall  f1-score   support

           0       0.78      0.87      0.82       108
           1       0.59      0.43      0.50        46

    accuracy                           0.74       154
   macro avg       0.69      0.65      0.66       154
weighted avg       0.73      0.74      0.73       154



## Model 2
Standard scaling and Lasso regularization

The advantage of using a cross-validation estimator over the canonical estimator class along with grid search is that they can take advantage of warm-starting by reusing precomputed results in the previous steps of the cross-validation process. This generally leads to speed improvements

By default, LogisticRegressionCV estimator will be refitted on the full training dataset after finding the best combination of hyper-parameters.

In [37]:
pipe = make_pipeline(StandardScaler(), LogisticRegressionCV(fit_intercept = True,
                                                            penalty = 'l1',
                                                            solver = 'liblinear',
                                                            cv = 10,
                                                            random_state = 47))
pipe.fit(X_train, y_train)

In [38]:
model_evaluate(pipe, X_test, y_test)

              precision    recall  f1-score   support

           0       0.80      0.88      0.83       121
           1       0.75      0.62      0.68        71

    accuracy                           0.78       192
   macro avg       0.77      0.75      0.76       192
weighted avg       0.78      0.78      0.78       192



## Model 3
Model tuning

In [102]:
model_to_tune = LogisticRegression(solver = 'liblinear')
penalty = ['l1', 'l2']

# C is random sample of the U[0, 4] distribution
C=uniform(loc=0, scale=4)

hyperparameters = dict(C=C, penalty=penalty)

In [103]:
# Create randomized search 10-fold cross validation and 100 iterations
cv = 10
clf = RandomizedSearchCV(model_to_tune,
                         hyperparameters,
                         random_state=1,
                         n_iter=100,
                         cv=cv,
                         verbose=0)
# Fit randomized search
best_model = clf.fit(X_train, y_train)
(best_model.best_score_, best_model.best_params_)

(0.7558170280274986, {'C': 1.668088018810296, 'penalty': 'l1'})

In [104]:
model_evaluate(best_model, X_test, y_test)

              precision    recall  f1-score   support

           0       0.82      0.92      0.87       103
           1       0.79      0.59      0.67        51

    accuracy                           0.81       154
   macro avg       0.80      0.76      0.77       154
weighted avg       0.81      0.81      0.80       154



## Model 4
Standard Scaling and Model tuning

While using a grid of parameter settings is currently the most widely used method for parameter optimization, other search methods have more favorable properties. RandomizedSearchCV implements a randomized search over parameters, where each setting is sampled from a distribution over possible parameter values. 

In contrast to GridSearchCV, not all parameter values are tried out, but rather a fixed number of parameter settings is sampled from the specified distributions. The number of parameter settings that are tried is given by n_iter.

In [39]:
pipe = make_pipeline(StandardScaler(), LogisticRegression(solver = 'liblinear'))

penalty = ['l1', 'l2']

# C is a uniform distribution U[0, 4]
C=uniform(loc=0, scale=10)

hyperparameters_distributions = {"logisticregression__C":C, "logisticregression__penalty":penalty}

In [40]:
# Create randomized search 10-fold cross validation and 100 iterations
cv = 10
clf = RandomizedSearchCV(estimator = pipe,
                         param_distributions = hyperparameters_distributions,
                         random_state = 1,
                         n_iter = 100,
                         cv = cv)
# Fit randomized search
best_model = clf.fit(X_train, y_train)
(best_model.best_score_, best_model.best_params_)

(0.7692075015124017,
 {'logisticregression__C': 2.0445224973151745,
  'logisticregression__penalty': 'l1'})

In [41]:
model_evaluate(best_model, X_test, y_test)

              precision    recall  f1-score   support

           0       0.80      0.88      0.84       121
           1       0.76      0.62      0.68        71

    accuracy                           0.79       192
   macro avg       0.78      0.75      0.76       192
weighted avg       0.78      0.79      0.78       192



In [42]:
with open('pipeline.pickle','wb') as f:
    #pickle.dump(pipe, f)
    pickle.dump(clf, f)

In [43]:
with open('pipeline.pickle', 'rb') as f:
  loaded_pipe = pickle.load(f)
  
model_evaluate(loaded_pipe, X_test, y_test)

              precision    recall  f1-score   support

           0       0.80      0.88      0.84       121
           1       0.76      0.62      0.68        71

    accuracy                           0.79       192
   macro avg       0.78      0.75      0.76       192
weighted avg       0.78      0.79      0.78       192



In [44]:
def predict_diabetes(model, new_data):
    # Predict diabetes
    predictions = model.predict(new_data)

    pred_to_label = {0: 'Negative', 1: 'Positive'}

    # Make a list of predictions
    data = []
    for t, pred in zip(new_data, predictions):
        data.append((pred, pred_to_label[pred]))

    return data

if __name__=="__main__":
    # Sample to classify should be in a list.
    new_sample = np.array([1, 109, 56, 21, 135, 25.2, 0.833, 23])
    new_sample = [new_sample]
    predictions = predict_diabetes(loaded_pipe, new_sample)
    print(predictions)

[(0, 'Negative')]




In [45]:
loaded_pipe.predict(X_test)

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0,
       1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1,
       1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0,
       0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1,
       1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0,
       1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0], dtype=int64)