### `RandomForest` Modelling on Hyperglycemic data with imbalanced labelling

In [11]:
import numpy as np
import pandas as pd
import warnings

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.multiclass import OneVsRestClassifier
from sklearn.preprocessing import LabelBinarizer

from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import classification_report
SEED=90089

In [12]:
# Read Data
patient_df = pd.read_csv('../data/hyperglycemic_patients_w_bin_categories.csv', index_col=0)

# print detail summary
print("Dataframe shape:   ", patient_df.shape)
print("Dataframe Features:", patient_df.columns)
patient_df.head()

Dataframe shape:    (12872, 18)
Dataframe Features: Index(['subject_id', 'stay_id', 'glucose_max', 'anchor_age', 'dod', 'gender',
       'dbp_mean', 'sbp_mean', 'glucose_mean', 'heart_rate_mean', 'spo2_mean',
       'resp_rate_mean', 'temperature_mean', 'apsiii', 'glucose_score', 'los',
       'avg_bmi_value', 'label'],
      dtype='object')


Unnamed: 0,subject_id,stay_id,glucose_max,anchor_age,dod,gender,dbp_mean,sbp_mean,glucose_mean,heart_rate_mean,spo2_mean,resp_rate_mean,temperature_mean,apsiii,glucose_score,los,avg_bmi_value,label
0,12041046,31249096,277.0,36,0,0,64.32,112.88,221.272727,101.8,92.0,17.346154,37.18625,38,3,1.938414,26.75,0
1,17421995,38100564,2340.0,35,0,0,77.769231,122.961538,468.592593,105.361111,96.093023,19.813953,36.727586,87,5,4.472905,30.558477,1
2,10352416,35043893,531.0,23,1,0,64.169118,119.661765,275.352941,154.028986,83.698413,25.584615,39.725,158,5,11.636042,30.558477,2
3,16924291,34261137,259.0,19,1,1,49.3,74.2,259.0,89.75,70.0,26.333333,31.9,88,3,0.309664,30.558477,0
4,13553087,36461140,398.0,31,0,0,78.6,133.8,285.0,100.826087,90.304348,25.173913,36.593333,34,5,2.794167,30.558477,0


In [13]:
print("Null value in Feature set summary:\n",patient_df.isnull().sum())

Null value in Feature set summary:
 subject_id          0
stay_id             0
glucose_max         0
anchor_age          0
dod                 0
gender              0
dbp_mean            0
sbp_mean            0
glucose_mean        0
heart_rate_mean     0
spo2_mean           0
resp_rate_mean      0
temperature_mean    0
apsiii              0
glucose_score       0
los                 0
avg_bmi_value       0
label               0
dtype: int64


In [14]:
print("Label Distirbution:\n", patient_df['label'].value_counts())

Label Distirbution:
 0    8389
1    3165
2    1318
Name: label, dtype: int64


In [15]:
numerical_features = ['glucose_max', 'anchor_age', 'dbp_mean', 
                    'sbp_mean', 'glucose_mean', 'heart_rate_mean', 'spo2_mean', 
                    'resp_rate_mean', 'temperature_mean', 'apsiii', 'glucose_score', 'avg_bmi_value']

# Standardized variable for consistent measurement across numerical values
scaler = StandardScaler()
X_std = scaler.fit_transform(patient_df[numerical_features])

# Interpolate categorical variables
X_cat = patient_df[['gender']] 


X = np.concatenate([X_std, X_cat.to_numpy()], axis=1)
y = patient_df["label"]
print("X Standardized data Shape: ", X_std.shape)
print("X Categorical data shape:  ", X_cat.shape)
print("X, y shape:                ", X.shape, y.shape)

X Standardized data Shape:  (12872, 12)
X Categorical data shape:   (12872, 1)
X, y shape:                 (12872, 13) (12872,)


In [16]:
# Split data based on training 
X_train, X_test, y_train, y_test = train_test_split(X, y, 
                                                    train_size=0.8, 
                                                    random_state=SEED)
print("Train data shape:", X_train.shape, y_train.shape)
print("Test data shape:", X_test.shape, y_test.shape)

Train data shape: (10297, 13) (10297,)
Test data shape: (2575, 13) (2575,)


### Hyperparameter Tunning on `RandomForest`

In [17]:
param_grid = {
 'bootstrap'        : [True, False],
 'max_depth'        : [20, 60, 100, None],
 'min_samples_leaf' : [1, 4],
 'max_features'     : ['sqrt', 'log2'],
 'min_samples_split': [2, 5, 10],
 'n_estimators'     : [600, 1000, 1400]}
rf_model = RandomForestClassifier(random_state=SEED)

In [154]:
# disable deprecation warnings
warnings.filterwarnings('ignore')

In [58]:
random_grid_rf = RandomizedSearchCV(rf_model, param_distributions=param_grid, 
                                    n_iter=100, cv=5, n_jobs=7)
random_grid_rf.fit(X_train, y_train)

In [67]:
print("----         Results from Grid Search           ---")
print("\n The best estimator across ALL searched params:\n", random_grid_rf.best_estimator_)

## Removed scoring due to combinations yielding NaN results
#  print("\n The best score across ALL searched params:\n", grid_RF.best_score_) 
print("\n The best parameters across ALL searched params:\n", random_grid_rf.best_params_)

----         Results from Grid Search           ---

 The best estimator across ALL searched params:
 RandomForestClassifier(max_features='log2', min_samples_leaf=4,
                       min_samples_split=5, n_estimators=1400,
                       random_state=90089)

 The best parameters across ALL searched params:
 {'n_estimators': 1400, 'min_samples_split': 5, 'min_samples_leaf': 4, 'max_features': 'log2', 'max_depth': None, 'bootstrap': True}


In [18]:
# Instantiate RandomForest classifierr
rf_clf = RandomForestClassifier(n_estimators=600, 
                                min_samples_split=2,
                                min_samples_leaf=1,
                                max_features='log2',
                                max_depth=20,
                                random_state=SEED,
                                n_jobs=4)

### Evaluation on `RandomForest` with imbalance data

In [19]:
rf_clf.fit(X_train, y_train)
print("Model Accuracy based on Testing dataset:", rf_clf.score(X_test, y_test))

Model Accuracy based on Testing dataset: 0.6485436893203883


In [20]:
print("RF Feature Importance:", rf_clf.feature_importances_)

y_pred = rf_clf.predict(X_test)
print(classification_report(y_pred, y_test))

RF Feature Importance: [0.08032324 0.07204832 0.08521041 0.08940546 0.08247345 0.08653198
 0.09636299 0.08798932 0.09839143 0.13658905 0.00816293 0.06338162
 0.01312981]
              precision    recall  f1-score   support

           0       0.94      0.68      0.79      2283
           1       0.15      0.37      0.21       251
           2       0.06      0.39      0.10        41

    accuracy                           0.65      2575
   macro avg       0.38      0.48      0.37      2575
weighted avg       0.85      0.65      0.72      2575

