In [1]:
# Load libraries
import warnings
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from collections import Counter
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
SEED = 90089

In [2]:
# Read data
patient_df = pd.read_csv('../data/hyperglycemic_patients_w_categories.csv', usecols=list(range(3,18)))
patient_df

Unnamed: 0,glucose_max,anchor_age,dod,gender,dbp_mean,sbp_mean,glucose_mean,heart_rate_mean,spo2_mean,resp_rate_mean,temperature_mean,apsiii,glucose_score,avg_bmi_value,label
0,277.0,36,0,0,64.320000,112.880000,221.272727,101.800000,92.000000,17.346154,37.186250,38,3,26.750000,0
1,2340.0,35,0,0,77.769231,122.961538,468.592593,105.361111,96.093023,19.813953,36.727586,87,5,30.558477,0
2,531.0,23,1,0,64.169118,119.661765,275.352941,154.028986,83.698413,25.584615,39.725000,158,5,30.558477,1
3,259.0,19,1,1,49.300000,74.200000,259.000000,89.750000,70.000000,26.333333,31.900000,88,3,30.558477,0
4,406.0,24,0,0,57.333333,92.848485,292.000000,86.631579,96.162162,15.289474,33.590000,150,5,30.558477,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14072,297.0,91,1,0,65.250000,118.214286,142.444444,58.740741,99.925926,15.615385,32.610000,135,3,30.558477,1
14073,265.0,91,1,0,55.280000,137.200000,203.000000,60.769231,99.800000,15.700000,37.008571,86,3,30.558477,1
14074,253.0,91,1,0,46.357143,104.750000,232.400000,71.296296,99.925926,18.370370,35.839091,48,3,31.072549,0
14075,417.0,91,1,1,73.181818,120.318182,111.600000,88.565217,99.954545,15.687500,37.156000,26,5,30.558477,0


### Without Standardization

In [3]:
# Data split
X = patient_df.loc[:, patient_df.columns != 'label']
y = patient_df.iloc[:, -1]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=SEED)

In [4]:
X_train.head()

Unnamed: 0,glucose_max,anchor_age,dod,gender,dbp_mean,sbp_mean,glucose_mean,heart_rate_mean,spo2_mean,resp_rate_mean,temperature_mean,apsiii,glucose_score,avg_bmi_value
9792,506.0,73,1,0,51.52381,124.714286,322.142857,77.590909,97.238095,18.47619,36.85,52,5,24.54
12480,226.0,83,1,1,54.75,81.5,226.0,101.1,81.777778,23.857143,35.0,111,3,38.281818
3769,284.0,55,1,0,41.454545,109.545455,274.230769,109.0,97.0,17.1875,37.62,57,5,30.558477
7152,213.0,65,0,0,66.6,127.36,174.25,77.178571,96.214286,21.607143,36.902857,21,3,30.558477
8129,288.0,68,0,0,63.090909,130.272727,223.6,68.782609,97.347826,19.291667,36.648333,36,3,31.05


In [5]:
# Tune model with stratifiedKFold cross validation
lr = LogisticRegression(random_state=SEED)

parameters = {
    'penalty': ['l1', 'l2'],
    'C'      : [0.1, 1, 10],
    'solver' : ['liblinear', 'saga']}
grid_lr = GridSearchCV(lr, parameters, cv=5, n_jobs=6)
grid_lr.fit(X_train, y_train)

GridSearchCV(cv=5, estimator=LogisticRegression(random_state=90089), n_jobs=6,
             param_grid={'C': [0.1, 1, 10], 'penalty': ['l1', 'l2'],
                         'solver': ['liblinear', 'saga']})

In [6]:
print("----         Results from Grid Search           ---")
print("\n The best estimator across ALL searched params:\n", grid_lr.best_estimator_)
print("\n The best parameters across ALL searched params:\n", grid_lr.best_params_)

----         Results from Grid Search           ---

 The best estimator across ALL searched params:
 LogisticRegression(C=1, penalty='l1', random_state=90089, solver='liblinear')

 The best parameters across ALL searched params:
 {'C': 1, 'penalty': 'l1', 'solver': 'liblinear'}


In [7]:
best_lr = LogisticRegression(
    C = 1,
    penalty = 'l1',
    solver = 'liblinear',
    random_state=SEED)

In [8]:
best_lr.fit(X_train, y_train)
best_lr.score(X_test, y_test)

0.8249289772727273

### With Standardization

In [13]:
numerical_features = ['glucose_max', 'dbp_mean', 'sbp_mean', 'glucose_mean', 
                      'heart_rate_mean', 'spo2_mean', 'resp_rate_mean', 'temperature_mean', 
                      'apsiii', 'glucose_score', 'avg_bmi_value']

# Standardized variable for consistent measurement across numerical values
scaler = StandardScaler()
X_std = scaler.fit_transform(patient_df[numerical_features])

# Interpolate categorical variables
X_cat = patient_df[['anchor_age', 'dod', 'gender']]


X = np.concatenate([X_std, X_cat.to_numpy()], axis=1)
y = patient_df["label"]
print("X Standardized data Shape: ", X_std.shape)
print("X Categorical data shape:  ", X_cat.shape)
print("X, y shape:                ", X.shape, y.shape)

X Standardized data Shape:  (14077, 11)
X Categorical data shape:   (14077, 3)
X, y shape:                 (14077, 14) (14077,)


In [14]:
X_train_std, X_test_std, y_train_std, y_test_std = train_test_split(X, y, test_size=0.2, random_state=SEED)

In [16]:
lr_std = LogisticRegression(random_state=SEED)

parameters = {
    'penalty': ['l1', 'l2'],
    'C'      : [0.1, 1, 10],
    'solver' : ['liblinear', 'saga']}
grid_lr_std = GridSearchCV(lr_std, parameters, cv=5, n_jobs=6)
grid_lr_std.fit(X_train_std, y_train_std)



GridSearchCV(cv=5, estimator=LogisticRegression(random_state=90089), n_jobs=6,
             param_grid={'C': [0.1, 1, 10], 'penalty': ['l1', 'l2'],
                         'solver': ['liblinear', 'saga']})

In [17]:
print("----         Results from Grid Search           ---")
print("\n The best estimator across ALL searched params:\n", grid_lr_std.best_estimator_)
print("\n The best parameters across ALL searched params:\n", grid_lr_std.best_params_)

----         Results from Grid Search           ---

 The best estimator across ALL searched params:
 LogisticRegression(C=0.1, random_state=90089, solver='saga')

 The best parameters across ALL searched params:
 {'C': 0.1, 'penalty': 'l2', 'solver': 'saga'}


In [18]:
best_lr_std = LogisticRegression(
    C = 0.1,
    penalty = 'l2',
    solver = 'saga',
    random_state=SEED)

In [19]:
best_lr_std.fit(X_train_std, y_train_std)
best_lr_std.score(X_test_std, y_test_std)



0.8274147727272727