In [1]:
# Load libraries
import warnings
import numpy as np
import pandas as pd
import os
from collections import Counter
from imblearn.over_sampling import SMOTE
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler

from sklearn.model_selection import GridSearchCV, StratifiedKFold, cross_val_score, train_test_split
from sklearn.exceptions import ConvergenceWarning
from sklearn.metrics import classification_report
SEED = 90089

In [2]:
# disable deprecation warnings and convergence warning after production 
os.environ["PYTHONWARNINGS"] = "ignore::UserWarning"

### `LogisticRegression` Classifier on Hyperclygemic patients

In [3]:
# Read data
patient_df = pd.read_csv('../data/hyperglycemic_patients_w_bin_categories.csv', index_col=0)
patient_df = patient_df.drop(['los'], axis=1)

# Previewing data
print("Dataframe shape:   ", patient_df.shape)
print("Dataframe Features:", patient_df.columns)
patient_df.head()

Dataframe shape:    (13424, 26)
Dataframe Features: Index(['glucose_max', 'anchor_age', 'dod', 'gender', 'dbp_mean', 'sbp_mean',
       'glucose_mean', 'heart_rate_mean', 'spo2_mean', 'resp_rate_mean',
       'temperature_mean', 'apsiii', 'glucose_score', 'avg_bmi_value',
       'avg_sofa', 'admission_type', 'label', 'AMBULATORY OBSERVATION',
       'DIRECT EMER.', 'DIRECT OBSERVATION', 'ELECTIVE', 'EU OBSERVATION',
       'EW EMER.', 'OBSERVATION ADMIT', 'SURGICAL SAME DAY ADMISSION',
       'URGENT'],
      dtype='object')


Unnamed: 0,glucose_max,anchor_age,dod,gender,dbp_mean,sbp_mean,glucose_mean,heart_rate_mean,spo2_mean,resp_rate_mean,...,label,AMBULATORY OBSERVATION,DIRECT EMER.,DIRECT OBSERVATION,ELECTIVE,EU OBSERVATION,EW EMER.,OBSERVATION ADMIT,SURGICAL SAME DAY ADMISSION,URGENT
0,277.0,36,0,0,64.32,112.88,221.272727,101.8,92.0,17.346154,...,0,0,0,0,0,0,0,1,0,0
1,2340.0,35,0,0,77.769231,122.961538,468.592593,105.361111,96.093023,19.813953,...,1,0,0,0,0,0,1,0,0,0
2,259.0,19,1,1,49.3,74.2,259.0,89.75,70.0,26.333333,...,0,0,0,0,0,0,1,0,0,0
3,406.0,24,0,0,57.333333,92.848485,292.0,86.631579,96.162162,15.289474,...,2,0,0,0,0,0,1,0,0,0
4,398.0,31,0,0,78.6,133.8,285.0,100.826087,90.304348,25.173913,...,0,0,0,0,0,0,1,0,0,0


In [14]:
print("Null value in Feature set summary:\n",patient_df.isnull().sum(), "\n","--"*15)

Null value in Feature set summary:
 glucose_max                    0
anchor_age                     0
dod                            0
gender                         0
dbp_mean                       0
sbp_mean                       0
glucose_mean                   0
heart_rate_mean                0
spo2_mean                      0
resp_rate_mean                 0
temperature_mean               0
apsiii                         0
glucose_score                  0
avg_bmi_value                  0
avg_sofa                       0
admission_type                 0
label                          0
AMBULATORY OBSERVATION         0
DIRECT EMER.                   0
DIRECT OBSERVATION             0
ELECTIVE                       0
EU OBSERVATION                 0
EW EMER.                       0
OBSERVATION ADMIT              0
SURGICAL SAME DAY ADMISSION    0
URGENT                         0
dtype: int64 
 ------------------------------


### LR without `StandardScaler`

In [6]:
# Data split
X = patient_df.loc[:, patient_df.columns != 'label']
y = patient_df.iloc[:, -1]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=SEED)

In [20]:
# Resample data to tackle class imbalance
oversample = SMOTE()
X_sampled, y_sampled = oversample.fit_resample(X_train, y_train)

# summarize the resampled label distribution
counter = Counter(y_sampled)
print(counter)

Counter({0: 6729, 1: 6729, 2: 6729})


In [21]:
# Tune model with stratifiedKFold cross validation
lr = LogisticRegression(random_state=SEED)

parameters = {
    'penalty' : ['l1', 'l2'],
    'C'       : [0.1, 1, 10],
    'solver'  : ['liblinear', 'saga'], 
    'max_iter': [500, 1000]}
grid_lr = GridSearchCV(lr, parameters, cv=5, n_jobs=8)
grid_lr.fit(X_sampled, y_sampled)

GridSearchCV(cv=5, estimator=LogisticRegression(random_state=90089), n_jobs=8,
             param_grid={'C': [0.1, 1, 10], 'max_iter': [500, 1000],
                         'penalty': ['l1', 'l2'],
                         'solver': ['liblinear', 'saga']})

In [10]:
print("----         Results from Grid Search           ---")
print("\n The best estimator across ALL searched params:\n", grid_lr.best_estimator_)
print("\n The best parameters across ALL searched params:\n", grid_lr.best_params_)

----         Results from Grid Search           ---

 The best estimator across ALL searched params:
 LogisticRegression(C=1, max_iter=500, penalty='l1', random_state=90089,
                   solver='liblinear')

 The best parameters across ALL searched params:
 {'C': 1, 'max_iter': 500, 'penalty': 'l1', 'solver': 'liblinear'}


In [11]:
best_lr = LogisticRegression(C=1,
                            penalty='l1',
                            solver='liblinear',
                            random_state=SEED,
                            max_iter=500,
                            n_jobs=8)

In [22]:
best_lr.fit(X_sampled, y_sampled)

  " = {}.".format(effective_n_jobs(self.n_jobs))


LogisticRegression(C=1, max_iter=500, n_jobs=8, penalty='l1',
                   random_state=90089, solver='liblinear')

In [23]:
print("Model Accuracy based on Testing dataset: {:.5f}".format(best_lr.score(X_test, y_test)))

Model Accuracy based on Testing dataset: 0.56777


### LR Model with `StandardScaler`

In [4]:
numerical_features = ['glucose_max', 'anchor_age', 'dbp_mean', 'sbp_mean',
       'glucose_mean', 'heart_rate_mean', 'spo2_mean', 'resp_rate_mean',
       'temperature_mean', 'apsiii', 'glucose_score', 'avg_bmi_value', 'avg_sofa']


# Standardized variable for consistent measurement across numerical values
scaler = StandardScaler()
X_std = scaler.fit_transform(patient_df[numerical_features])

# Interpolate categorical variables
X_cat = patient_df[['dod', 'gender', 'AMBULATORY OBSERVATION',
       'DIRECT EMER.', 'DIRECT OBSERVATION', 'ELECTIVE', 'EU OBSERVATION',
       'EW EMER.', 'OBSERVATION ADMIT', 'SURGICAL SAME DAY ADMISSION',
       'URGENT']]


X = np.concatenate([X_std, X_cat.to_numpy()], axis=1)
y = patient_df["label"]
print("X Standardized data Shape: ", X_std.shape)
print("X Categorical data shape:  ", X_cat.shape)
print("X, y shape:                ", X.shape, y.shape)

X Standardized data Shape:  (13424, 13)
X Categorical data shape:   (13424, 11)
X, y shape:                 (13424, 24) (13424,)


In [5]:
X_train_std, X_test_std, y_train_std, y_test_std = train_test_split(X, y, test_size=0.2, random_state=SEED)

In [6]:
# Resample data to tackle class imbalance
oversample = SMOTE()
X_sampled_std, y_sampled_std = oversample.fit_resample(X_train_std, y_train_std)

# summarize the resampled label distribution
counter = Counter(y_sampled_std)
print(counter)

Counter({1: 6704, 0: 6704, 2: 6704})


In [8]:
lr_std = LogisticRegression(random_state=SEED)

parameters = {
    'penalty' : ['l1', 'l2'],
    'C'       : [0.1, 1, 10],
    'solver'  : ['liblinear', 'saga'], 
    'max_iter': [100, 500]}
grid_lr_std = GridSearchCV(lr_std, parameters, cv=5, n_jobs=6)
grid_lr_std.fit(X_sampled_std, y_sampled_std)



GridSearchCV(cv=5, estimator=LogisticRegression(random_state=90089), n_jobs=6,
             param_grid={'C': [0.1, 1, 10], 'max_iter': [100, 500],
                         'penalty': ['l1', 'l2'],
                         'solver': ['liblinear', 'saga']})

In [9]:
print("----         Results from Grid Search           ---")
print("\n The best estimator across ALL searched params:\n", grid_lr_std.best_estimator_)
print("\n The best parameters across ALL searched params:\n", grid_lr_std.best_params_)

----         Results from Grid Search           ---

 The best estimator across ALL searched params:
 LogisticRegression(C=0.1, penalty='l1', random_state=90089, solver='saga')

 The best parameters across ALL searched params:
 {'C': 0.1, 'max_iter': 100, 'penalty': 'l1', 'solver': 'saga'}


In [10]:
best_lr_std = LogisticRegression(C = 0.1,
                                penalty = 'l1',
                                solver = 'saga',
                                random_state=SEED,
                                max_iter=100, 
                                n_jobs=8)
best_lr_std.fit(X_sampled_std, y_sampled_std)



LogisticRegression(C=0.1, n_jobs=8, penalty='l1', random_state=90089,
                   solver='saga')

In [11]:
print("Model Accuracy based on Testing dataset: {:.5f}".format(best_lr_std.score(X_test_std, y_test_std)))

Model Accuracy based on Testing dataset: 0.57803


In [13]:
y_pred = best_lr_std.predict(X_test_std)
print(classification_report(y_pred, y_test_std))

              precision    recall  f1-score   support

           0       0.69      0.79      0.73      1449
           1       0.29      0.33      0.31       595
           2       0.60      0.34      0.43       641

    accuracy                           0.58      2685
   macro avg       0.53      0.48      0.49      2685
weighted avg       0.58      0.58      0.57      2685

