### `RandomForest` Modelling on Hyperglycemic data with imbalanced labelling

In [10]:
import numpy as np
import pandas as pd
import warnings

from imblearn.over_sampling import SMOTE

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier


from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import classification_report
from sklearn.feature_selection import SelectFromModel

from collections import Counter

SEED=90089

In [11]:
# Read Data
patient_df = pd.read_csv('../data/hyperglycemic_patients_w_bin_categories.csv', index_col=0)

# print detail summary
print("Dataframe shape:   ", patient_df.shape)
print("Dataframe Features:", patient_df.columns)
patient_df.head()

Dataframe shape:    (13243, 27)
Dataframe Features: Index(['glucose_max', 'anchor_age', 'dod', 'gender', 'dbp_mean', 'sbp_mean',
       'glucose_mean', 'heart_rate_mean', 'spo2_mean', 'resp_rate_mean',
       'temperature_mean', 'apsiii', 'glucose_score', 'los', 'avg_bmi_value',
       'avg_sofa', 'admission_type', 'label', 'AMBULATORY OBSERVATION',
       'DIRECT EMER.', 'DIRECT OBSERVATION', 'ELECTIVE', 'EU OBSERVATION',
       'EW EMER.', 'OBSERVATION ADMIT', 'SURGICAL SAME DAY ADMISSION',
       'URGENT'],
      dtype='object')


Unnamed: 0,glucose_max,anchor_age,dod,gender,dbp_mean,sbp_mean,glucose_mean,heart_rate_mean,spo2_mean,resp_rate_mean,...,label,AMBULATORY OBSERVATION,DIRECT EMER.,DIRECT OBSERVATION,ELECTIVE,EU OBSERVATION,EW EMER.,OBSERVATION ADMIT,SURGICAL SAME DAY ADMISSION,URGENT
0,277.0,36,0,0,64.32,112.88,221.272727,101.8,1.963788,1.239203,...,0,0,0,0,0,0,0,1,0,0
1,259.0,19,1,1,49.3,74.2,259.0,89.75,1.845098,1.420506,...,0,0,0,0,0,0,1,0,0,0
2,406.0,24,0,0,57.333333,92.848485,292.0,86.631579,1.983004,1.184393,...,2,0,0,0,0,0,1,0,0,0
3,398.0,31,0,0,78.6,133.8,285.0,100.826087,1.955709,1.400951,...,0,0,0,0,0,0,1,0,0,0
4,487.0,34,0,1,107.0625,164.4375,235.777778,77.0,1.996731,1.39887,...,2,0,0,0,0,0,0,0,0,1


In [12]:
print("Null value in Feature set summary:\n",patient_df.isnull().sum(), "\n","--"*15)
# print("Label Distirbution:\n", patient_df['label'].value_counts())

Null value in Feature set summary:
 glucose_max                    0
anchor_age                     0
dod                            0
gender                         0
dbp_mean                       0
sbp_mean                       0
glucose_mean                   0
heart_rate_mean                0
spo2_mean                      0
resp_rate_mean                 0
temperature_mean               0
apsiii                         0
glucose_score                  0
los                            0
avg_bmi_value                  0
avg_sofa                       0
admission_type                 0
label                          0
AMBULATORY OBSERVATION         0
DIRECT EMER.                   0
DIRECT OBSERVATION             0
ELECTIVE                       0
EU OBSERVATION                 0
EW EMER.                       0
OBSERVATION ADMIT              0
SURGICAL SAME DAY ADMISSION    0
URGENT                         0
dtype: int64 
 ------------------------------


In [13]:
print("Data Cluster Distribution", Counter(patient_df['label'].value_counts()))

Data Cluster Distribution Counter({8229: 1, 3178: 1, 1836: 1})


In [30]:
numerical_features = ['glucose_max', 'anchor_age', 'dbp_mean', 'sbp_mean',
       'glucose_mean', 'heart_rate_mean', 'spo2_mean', 'resp_rate_mean',
       'temperature_mean', 'apsiii', 'glucose_score', 'avg_sofa', 'avg_bmi_value']


# Standardized variable for consistent measurement across numerical values
scaler = StandardScaler()
X_std = scaler.fit_transform(patient_df[numerical_features])

# Interpolate categorical variables
X_cat = patient_df[['gender', 'AMBULATORY OBSERVATION',
       'DIRECT EMER.', 'DIRECT OBSERVATION', 'ELECTIVE', 'EU OBSERVATION',
       'EW EMER.', 'OBSERVATION ADMIT', 'SURGICAL SAME DAY ADMISSION',
       'URGENT']]


X = np.concatenate([X_std, X_cat.to_numpy()], axis=1)
y = patient_df["label"]
print("X Standardized data Shape: ", X_std.shape)
print("X Categorical data shape:  ", X_cat.shape)
print("X, y shape:                ", X.shape, y.shape)

X Standardized data Shape:  (13243, 13)
X Categorical data shape:   (13243, 10)
X, y shape:                 (13243, 23) (13243,)


In [31]:
# Split data based on training 
X_train, X_test, y_train, y_test = train_test_split(X, y, 
                                                    train_size=0.8, 
                                                    random_state=SEED)
print("Train data shape:        ", X_train.shape, y_train.shape)
print("Test data shape:         ", X_test.shape, y_test.shape)

Train data shape:         (10594, 23) (10594,)
Test data shape:          (2649, 23) (2649,)


In [32]:
# Resample data to tackle class imbalance
oversample = SMOTE()
X_train, y_train = oversample.fit_resample(X_train, y_train)

# summarize the resampled label distribution
print("Train label distribution:", dict(Counter(y_train)))

Train label distribution: {0: 6585, 1: 6585, 2: 6585}


### Hyperparameter Tunning on `RandomForest`

In [17]:
param_grid = {
 'bootstrap'        : [True, False],
 'max_depth'        : [20, 60, 100],
 'min_samples_leaf' : [1, 4],
 'max_features'     : ['sqrt', 'log2'],
 'min_samples_split': [2, 5, 10],
 'n_estimators'     : [50, 100, 200]}
rf_model = RandomForestClassifier(random_state=SEED)

In [18]:
# disable deprecation warnings
warnings.filterwarnings('ignore')

In [19]:
random_grid_rf = RandomizedSearchCV(rf_model, param_distributions=param_grid, 
                                cv=5, n_jobs=8)
random_grid_rf.fit(X_train, y_train)

RandomizedSearchCV(cv=5, estimator=RandomForestClassifier(random_state=90089),
                   n_jobs=8,
                   param_distributions={'bootstrap': [True, False],
                                        'max_depth': [20, 60, 100],
                                        'max_features': ['sqrt', 'log2'],
                                        'min_samples_leaf': [1, 4],
                                        'min_samples_split': [2, 5, 10],
                                        'n_estimators': [50, 100, 200]})

In [20]:
print("----         Results from Grid Search           ---")
print("\n The best estimator across ALL searched params:\n", random_grid_rf.best_estimator_)

## Removed scoring due to combinations yielding NaN results
#  print("\n The best score across ALL searched params:\n", grid_RF.best_score_) 
print("\n The best parameters across ALL searched params:\n", random_grid_rf.best_params_)

----         Results from Grid Search           ---

 The best estimator across ALL searched params:
 RandomForestClassifier(bootstrap=False, max_depth=100, max_features='log2',
                       random_state=90089)

 The best parameters across ALL searched params:
 {'n_estimators': 100, 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_features': 'log2', 'max_depth': 100, 'bootstrap': False}


In [33]:
# Instantiate RF classifier
rf_clf = RandomForestClassifier(n_estimators=100, 
                                min_samples_split=2,
                                min_samples_leaf=1,
                                max_depth=100,
                                max_features='sqrt',
                                bootstrap=False,
                                random_state=SEED,
                                n_jobs=4)

### Evaluation on `RandomForest` with balance data

In [34]:
rf_clf.fit(X_train, y_train)
print("Model Accuracy based on Testing dataset:", rf_clf.score(X_test, y_test))

Model Accuracy based on Testing dataset: 0.627029067572669


In [35]:
print("RF Feature Importance:", rf_clf.feature_importances_, "\n", "_"*60)

y_pred = rf_clf.predict(X_test)
print(classification_report(y_test, y_pred))

RF Feature Importance: [6.69792419e-02 6.74505155e-02 6.72762382e-02 6.74838967e-02
 6.71498383e-02 6.74050302e-02 6.88355677e-02 7.01150329e-02
 7.26254522e-02 1.25818335e-01 8.94232325e-03 1.16624649e-01
 5.44429205e-02 2.66215443e-02 1.32877752e-06 5.89418636e-03
 5.09101638e-05 2.33868331e-03 9.94083572e-05 1.71578655e-02
 9.67376381e-03 4.93247384e-03 1.20807947e-02] 
 ____________________________________________________________
              precision    recall  f1-score   support

           0       0.73      0.83      0.78      1644
           1       0.35      0.22      0.27       635
           2       0.40      0.42      0.41       370

    accuracy                           0.63      2649
   macro avg       0.50      0.49      0.49      2649
weighted avg       0.59      0.63      0.60      2649



In [36]:
feature_selection = SelectFromModel(rf_clf, prefit=True)
feature_idx = feature_selection.get_support()

#Identifying features after feature selections
feature_df = patient_df[numerical_features+['gender', 'AMBULATORY OBSERVATION',
       'DIRECT EMER.', 'DIRECT OBSERVATION', 'ELECTIVE', 'EU OBSERVATION',
       'EW EMER.', 'OBSERVATION ADMIT', 'SURGICAL SAME DAY ADMISSION',
       'URGENT']]
feature_name = feature_df.columns[feature_idx]

print("selected_features:    ", feature_name)

X_new = feature_selection.transform(X)
print("Select Feature.shape: ", X_new.shape)

selected_features:     Index(['glucose_max', 'anchor_age', 'dbp_mean', 'sbp_mean', 'glucose_mean',
       'heart_rate_mean', 'spo2_mean', 'resp_rate_mean', 'temperature_mean',
       'apsiii', 'avg_sofa', 'avg_bmi_value'],
      dtype='object')
Select Feature.shape:  (13243, 12)


In [37]:
# Split data based on training 
X_train, X_test, y_train, y_test = train_test_split(X_new, y, 
                                                    train_size=0.8, 
                                                    random_state=SEED)

# Resample data to tackle class imbalance
oversample = SMOTE()
X_train, y_train = oversample.fit_resample(X_train, y_train)

print("Train data shape:", X_train.shape, y_train.shape)
print("Test data shape:", X_test.shape, y_test.shape)

Train data shape: (19755, 12) (19755,)
Test data shape: (2649, 12) (2649,)


### Fitting `RandomForest` again after feature selections

In [38]:
# Instantiate SVC classifierr
rf_clf = RandomForestClassifier(n_estimators=100, 
                                min_samples_split=2,
                                min_samples_leaf=1,
                                max_features='log2',
                                max_depth=None,
                                bootstrap=False,
                                random_state=SEED,
                                n_jobs=4)
rf_clf.fit(X_train, y_train)
print("Model Accuracy based on Testing dataset:", rf_clf.score(X_test, y_test))

Model Accuracy based on Testing dataset: 0.6258965647414119


In [39]:
y_pred = rf_clf.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.74      0.82      0.78      1644
           1       0.35      0.22      0.27       635
           2       0.40      0.44      0.42       370

    accuracy                           0.63      2649
   macro avg       0.50      0.50      0.49      2649
weighted avg       0.60      0.63      0.61      2649

