### `RandomForest` Modelling on Hyperglycemic data with imbalanced labelling

In [103]:
import numpy as np
import pandas as pd
import warnings

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier

from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import classification_report
SEED=90089

In [90]:
# Read Data
patient_df = pd.read_csv('../data/hyperglycemic_patients_w_categories.csv', index_col=0)

# print detail summary
print("Dataframe shape:   ", patient_df.shape)
print("Dataframe Features:", patient_df.columns)
patient_df.head()

Dataframe shape:    (14077, 17)
Dataframe Features: Index(['subject_id', 'stay_id', 'glucose_max', 'anchor_age', 'dod', 'gender',
       'dbp_mean', 'sbp_mean', 'glucose_mean', 'heart_rate_mean', 'spo2_mean',
       'resp_rate_mean', 'temperature_mean', 'apsiii', 'glucose_score',
       'avg_bmi_value', 'label'],
      dtype='object')


Unnamed: 0,subject_id,stay_id,glucose_max,anchor_age,dod,gender,dbp_mean,sbp_mean,glucose_mean,heart_rate_mean,spo2_mean,resp_rate_mean,temperature_mean,apsiii,glucose_score,avg_bmi_value,label
0,12041046,31249096,277.0,36,0,0,64.32,112.88,221.272727,101.8,92.0,17.346154,37.18625,38,3,26.75,0
1,17421995,38100564,2340.0,35,0,0,77.769231,122.961538,468.592593,105.361111,96.093023,19.813953,36.727586,87,5,30.558477,0
2,10352416,35043893,531.0,23,1,0,64.169118,119.661765,275.352941,154.028986,83.698413,25.584615,39.725,158,5,30.558477,1
3,16924291,34261137,259.0,19,1,1,49.3,74.2,259.0,89.75,70.0,26.333333,31.9,88,3,30.558477,0
4,15705944,37056020,406.0,24,0,0,57.333333,92.848485,292.0,86.631579,96.162162,15.289474,33.59,150,5,30.558477,1


In [110]:
print("Null value in Feature set summary:\n",patient_df.isnull().sum(), "\n----------------")
print("Label Distirbution:\n", patient_df['label'].value_counts())

Null value in Feature set summary:
 subject_id          0
stay_id             0
glucose_max         0
anchor_age          0
dod                 0
gender              0
dbp_mean            0
sbp_mean            0
glucose_mean        0
heart_rate_mean     0
spo2_mean           0
resp_rate_mean      0
temperature_mean    0
apsiii              0
glucose_score       0
avg_bmi_value       0
label               0
dtype: int64 
----------------
Label Distirbution:
 0    11759
1     1979
2      339
Name: label, dtype: int64


In [111]:
patient_df['label'].value_counts()

0    11759
1     1979
2      339
Name: label, dtype: int64

In [131]:
numerical_features = ['glucose_max', 'anchor_age', 'dbp_mean', 
                    'sbp_mean', 'glucose_mean', 'heart_rate_mean', 'spo2_mean', 
                    'resp_rate_mean', 'temperature_mean', 'apsiii', 'glucose_score', 'avg_bmi_value']

# Standardized variable for consistent measurement across numerical values
scaler = StandardScaler()
X_std = scaler.fit_transform(patient_df[numerical_features])

# Interpolate categorical variables
X_cat = patient_df[['dod', 'gender']]


X = np.concatenate([X_std, X_cat.to_numpy()], axis=1)
y = patient_df["label"]
print("X Standardized data Shape: ", X_std.shape)
print("X Categorical data shape:  ", X_cat.shape)
print("X, y shape:                ", X.shape, y.shape)

X Standardized data Shape:  (14077, 12)
X Categorical data shape:   (14077, 2)
X, y shape:                 (14077, 14) (14077,)


In [132]:
# Split data based on training 
X_train, X_test, y_train, y_test = train_test_split(X, y, 
                                                    train_size=0.8, 
                                                    random_state=SEED)
print("Train data shape:", X_train.shape, y_train.shape)
print("Test data shape:", X_test.shape, y_test.shape)

Train data shape: (11261, 14) (11261,)
Test data shape: (2816, 14) (2816,)


### Hyperparameter Tunning on `RandomForest`

In [133]:
param_grid = {
 'bootstrap'        : [True, False],
 'max_depth'        : [20, 40, 60, 80, 100, None],
 'min_samples_leaf' : [1, 4],
 'max_features'     : ['sqrt', 'log2'],
 'min_samples_split': [2, 5, 10],
 'n_estimators'     : [600, 1000, 1400]}
rf_model = RandomForestClassifier(random_state=SEED)

In [134]:
# disable deprecation warnings
warnings.filterwarnings('ignore')

In [135]:
random_grid_rf = RandomizedSearchCV(rf_model, param_distributions=param_grid, 
                                    n_iter = 100, cv=5, n_jobs=6)
random_grid_rf.fit(X_train, y_train)

RandomizedSearchCV(cv=5, estimator=RandomForestClassifier(random_state=90089),
                   n_iter=100, n_jobs=6,
                   param_distributions={'bootstrap': [True, False],
                                        'max_depth': [20, 40, 60, 80, 100,
                                                      None],
                                        'max_features': ['sqrt', 'log2'],
                                        'min_samples_leaf': [1, 4],
                                        'min_samples_split': [2, 5, 10],
                                        'n_estimators': [600, 1000, 1400]})

In [136]:
print("----         Results from Grid Search           ---")
print("\n The best estimator across ALL searched params:\n", grid_RF.best_estimator_)

## Removed scoring due to combinations yielding NaN results
#  print("\n The best score across ALL searched params:\n", grid_RF.best_score_) 
print("\n The best parameters across ALL searched params:\n", grid_RF.best_params_)

----         Results from Grid Search           ---

 The best estimator across ALL searched params:
 RandomForestClassifier(max_depth=20, n_estimators=600, random_state=90089)

 The best parameters across ALL searched params:
 {'bootstrap': True, 'max_depth': 20, 'max_features': 'auto', 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 600}


In [137]:
# Instantiate SVC classifierr
rf_clf = RandomForestClassifier(n_estimators=600, 
                                min_samples_split=2,
                                min_samples_leaf=1,
                                max_features='auto',
                                max_depth=20,
                                random_state=SEED,
                                n_jobs=4)

### Evaluation on `RandomForest` with imbalance data

In [138]:
rf_clf.fit(X_train, y_train)
print("Model Accuracy based on Testing dataset:", rf_clf.score(X_test, y_test))

Model Accuracy based on Testing dataset: 0.8323863636363636


In [130]:
y_pred = rf_clf.predict(X_test)
print(classification_report(y_pred, y_test))

              precision    recall  f1-score   support

           0       0.99      0.84      0.91      2760
           1       0.08      0.61      0.14        56
           2       0.00      0.00      0.00         0

    accuracy                           0.83      2816
   macro avg       0.36      0.48      0.35      2816
weighted avg       0.98      0.83      0.89      2816

