### `RandomForest` Modelling on Hyperglycemic data with imbalanced labelling

In [63]:
import numpy as np
import pandas as pd
import warnings

from imblearn.over_sampling import SMOTE

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier


from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import classification_report
from sklearn.feature_selection import SelectFromModel

from collections import Counter

SEED=90089

In [64]:
# Read Data
patient_df = pd.read_csv('../data/hyperglycemic_patients_w_bin_categories.csv', index_col=0)

# print detail summary
print("Dataframe shape:   ", patient_df.shape)
print("Dataframe Features:", patient_df.columns)
patient_df.head()

Dataframe shape:    (12872, 18)
Dataframe Features: Index(['subject_id', 'stay_id', 'glucose_max', 'anchor_age', 'dod', 'gender',
       'dbp_mean', 'sbp_mean', 'glucose_mean', 'heart_rate_mean', 'spo2_mean',
       'resp_rate_mean', 'temperature_mean', 'apsiii', 'glucose_score', 'los',
       'avg_bmi_value', 'label'],
      dtype='object')


Unnamed: 0,subject_id,stay_id,glucose_max,anchor_age,dod,gender,dbp_mean,sbp_mean,glucose_mean,heart_rate_mean,spo2_mean,resp_rate_mean,temperature_mean,apsiii,glucose_score,los,avg_bmi_value,label
0,12041046,31249096,277.0,36,0,0,64.32,112.88,221.272727,101.8,92.0,17.346154,37.18625,38,3,1.938414,26.75,0
1,17421995,38100564,2340.0,35,0,0,77.769231,122.961538,468.592593,105.361111,96.093023,19.813953,36.727586,87,5,4.472905,30.558477,1
2,10352416,35043893,531.0,23,1,0,64.169118,119.661765,275.352941,154.028986,83.698413,25.584615,39.725,158,5,11.636042,30.558477,2
3,16924291,34261137,259.0,19,1,1,49.3,74.2,259.0,89.75,70.0,26.333333,31.9,88,3,0.309664,30.558477,0
4,13553087,36461140,398.0,31,0,0,78.6,133.8,285.0,100.826087,90.304348,25.173913,36.593333,34,5,2.794167,30.558477,0


In [65]:
print("Null value in Feature set summary:\n",patient_df.isnull().sum(), "\n","--"*15)
# print("Label Distirbution:\n", patient_df['label'].value_counts())

Null value in Feature set summary:
 subject_id          0
stay_id             0
glucose_max         0
anchor_age          0
dod                 0
gender              0
dbp_mean            0
sbp_mean            0
glucose_mean        0
heart_rate_mean     0
spo2_mean           0
resp_rate_mean      0
temperature_mean    0
apsiii              0
glucose_score       0
los                 0
avg_bmi_value       0
label               0
dtype: int64 
 ------------------------------


In [66]:
print("Data Cluster Distribution", Counter(patient_df['label'].value_counts()))

Data Cluster Distribution Counter({8389: 1, 3165: 1, 1318: 1})


In [67]:
numerical_features = ['glucose_max', 'anchor_age', 'dbp_mean', 
                    'sbp_mean', 'glucose_mean', 'heart_rate_mean', 'spo2_mean', 
                    'resp_rate_mean', 'temperature_mean', 'apsiii', 'glucose_score', 'avg_bmi_value']

# Standardized variable for consistent measurement across numerical values
scaler = StandardScaler()
X_std = scaler.fit_transform(patient_df[numerical_features])

# Interpolate categorical variables
X_cat = patient_df[['gender']]


X = np.concatenate([X_std, X_cat.to_numpy()], axis=1)
y = patient_df["label"]
print("X Standardized data Shape: ", X_std.shape)
print("X Categorical data shape:  ", X_cat.shape)
print("X, y shape:                ", X.shape, y.shape)

X Standardized data Shape:  (12872, 12)
X Categorical data shape:   (12872, 1)
X, y shape:                 (12872, 13) (12872,)


In [68]:
# Split data based on training 
X_train, X_test, y_train, y_test = train_test_split(X, y, 
                                                    train_size=0.8, 
                                                    random_state=SEED)
print("Train data shape:        ", X_train.shape, y_train.shape)
print("Test data shape:         ", X_test.shape, y_test.shape)

Train data shape:         (10297, 13) (10297,)
Test data shape:          (2575, 13) (2575,)


In [69]:
# Resample data to tackle class imbalance
oversample = SMOTE()
X_train, y_train = oversample.fit_resample(X_train, y_train)

# summarize the resampled label distribution
print("Train label distribution:", dict(Counter(y_train)))

Train label distribution: {0: 6729, 1: 6729, 2: 6729}


### Hyperparameter Tunning on `RandomForest`

In [70]:
param_grid = {
 'bootstrap'        : [True, False],
 'max_depth'        : [20, 40, 80, 100, None],
 'min_samples_leaf' : [1, 4],
 'max_features'     : ['sqrt', 'log2'],
 'min_samples_split': [2, 5, 10],
 'n_estimators'     : [200, 600, 1000]}
rf_model = RandomForestClassifier(random_state=SEED)

In [71]:
# disable deprecation warnings
warnings.filterwarnings('ignore')

In [72]:
random_grid_rf = RandomizedSearchCV(rf_model, param_distributions=param_grid, 
                                cv=5, n_jobs=8)
random_grid_rf.fit(X_train, y_train)

In [73]:
print("----         Results from Grid Search           ---")
print("\n The best estimator across ALL searched params:\n", random_grid_rf.best_estimator_)

## Removed scoring due to combinations yielding NaN results
#  print("\n The best score across ALL searched params:\n", grid_RF.best_score_) 
print("\n The best parameters across ALL searched params:\n", random_grid_rf.best_params_)

----         Results from Grid Search           ---

 The best estimator across ALL searched params:
 RandomForestClassifier(bootstrap=False, max_depth=80, n_estimators=1000,
                       random_state=90089)

 The best parameters across ALL searched params:
 {'n_estimators': 1000, 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_features': 'sqrt', 'max_depth': 80, 'bootstrap': False}


In [74]:
# Instantiate RF classifier
rf_clf = RandomForestClassifier(n_estimators=1000, 
                                min_samples_split=2,
                                min_samples_leaf=1,
                                max_depth=80,
                                max_features='sqrt',
                                bootstrap=False,
                                random_state=SEED,
                                n_jobs=4)

### Evaluation on `RandomForest` with balance data

In [76]:
rf_clf.fit(X_train, y_train)
print("Model Accuracy based on Testing dataset:", rf_clf.score(X_test, y_test))

Model Accuracy based on Testing dataset: 0.6256310679611651


In [77]:
print("RF Feature Importance:", rf_clf.feature_importances_, "\n", "_"*60)

y_pred = rf_clf.predict(X_test)
print(classification_report(y_pred, y_test))

RF Feature Importance: [0.08042353 0.08198513 0.08312544 0.08458926 0.07599475 0.08211903
 0.08618964 0.08354406 0.08519538 0.14904972 0.0094305  0.06919537
 0.02915819] 
 ____________________________________________________________
              precision    recall  f1-score   support

           0       0.82      0.73      0.78      1869
           1       0.25      0.37      0.30       429
           2       0.30      0.30      0.30       277

    accuracy                           0.63      2575
   macro avg       0.46      0.47      0.46      2575
weighted avg       0.67      0.63      0.65      2575



In [56]:
feature_selection = SelectFromModel(rf_clf, prefit=True)
feature_idx = feature_selection.get_support()

#Identifying features after feature selections
feature_df = patient_df[numerical_features+['gender']]
feature_name = feature_df.columns[feature_idx]

print("selected_features:    ", feature_name)

X_new = feature_selection.transform(X)
print("Select Feature.shape: ", X_new.shape)

selected_features:     Index(['glucose_max', 'anchor_age', 'dbp_mean', 'sbp_mean', 'heart_rate_mean',
       'spo2_mean', 'resp_rate_mean', 'temperature_mean', 'apsiii'],
      dtype='object')
Select Feature.shape:  (12872, 9)


In [57]:
# Split data based on training 
X_train, X_test, y_train, y_test = train_test_split(X_new, y, 
                                                    train_size=0.8, 
                                                    random_state=SEED)

# Resample data to tackle class imbalance
oversample = SMOTE()
X_train, y_train = oversample.fit_resample(X_train, y_train)

print("Train data shape:", X_train.shape, y_train.shape)
print("Test data shape:", X_test.shape, y_test.shape)

Train data shape: (20187, 9) (20187,)
Test data shape: (2575, 9) (2575,)


### Fitting `RandomForest` again after feature selections

In [58]:
# Instantiate SVC classifierr
rf_clf = RandomForestClassifier(n_estimators=1400, 
                                min_samples_split=2,
                                min_samples_leaf=1,
                                max_features='log2',
                                max_depth=None,
                                bootstrap=False,
                                random_state=SEED,
                                n_jobs=4)
rf_clf.fit(X_train, y_train)
print("Model Accuracy based on Testing dataset:", rf_clf.score(X_test, y_test))

Model Accuracy based on Testing dataset: 0.610873786407767


In [60]:
y_pred = rf_clf.predict(X_test)
print(classification_report(y_pred, y_test))

              precision    recall  f1-score   support

           0       0.79      0.74      0.77      1760
           1       0.26      0.35      0.30       475
           2       0.35      0.29      0.31       340

    accuracy                           0.61      2575
   macro avg       0.47      0.46      0.46      2575
weighted avg       0.63      0.61      0.62      2575

