# Random Forest
Classification of hypoglycemia, euglycemia, and hyperglycemia from MIMIC-IV electronic health records using a random forest.

### chartEventsPred run

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE

# Load the chart-events prediction table from the working directory.
data = pd.read_csv('./chartEventsPred.csv', delimiter=',')

# Any column pandas parsed with the generic `object` dtype is re-typed as a
# categorical; each converted column is announced so the change is visible.
object_columns = [name for name in data.columns if data[name].dtype == object]
for name in object_columns:
    print(name + " is an object, converting to category...")
    data[name] = data[name].astype('category')

label is an object, converting to category...
charttime is an object, converting to category...


In [2]:
# Show every column name of the loaded table as a plain Python list.
column_names = list(data.columns)
print(column_names)

['subject_id', 'label', 'charttime', 'glucose', 'next_glucose', 'mean_last3', 'std_last3', 'trend', 'BUN', 'Blood Pressure', 'Creatinine', 'Diastolic Blood Pressure', 'Heart Rate', 'Hemoglobin', 'O2 saturation pulseoxymetry', 'PTT', 'Respiratory Rate', 'Sodium (serum)', 'Systolic Blood Pressure', 'WBC']


In [3]:
# Target: the class label column.
dataY = data['label']

# Features: everything except the target itself, the identifier/timestamp
# columns and `next_glucose`.
non_feature_columns = ['next_glucose', 'subject_id', 'label', 'charttime']
dataX = data.drop(columns=non_feature_columns)

# Count the missing entries per feature column.
missing_per_feature = dataX.isna().sum()
print("Missing values in each feature column:")
print(missing_per_feature)

Missing values in each feature column:
glucose                            0
mean_last3                         0
std_last3                      41990
trend                          41990
BUN                              484
Blood Pressure                   192
Creatinine                       469
Diastolic Blood Pressure         155
Heart Rate                         0
Hemoglobin                      1125
O2 saturation pulseoxymetry       86
PTT                            29312
Respiratory Rate                 233
Sodium (serum)                   465
Systolic Blood Pressure          154
WBC                             1167
dtype: int64


In [4]:
# Encode labels, split into train/validation/test (60/20/20), impute missing
# values and oversample the training split.
#
# Produces for the cells below:
#   class_labels                      - original label names, indexed by encoded class
#   train_X, train_y                  - SMOTE-resampled training arrays
#   val_X, val_y / test_X, test_y     - untouched (only imputed) hold-out arrays

# Transform the label column to a numeric representation. The encoder is always
# fitted on the raw `data['label']` column (not on `dataY`) so that re-running
# this cell does not fit it on already-encoded integers.
le = preprocessing.LabelEncoder()
le.fit(data['label'])

# store class labels
class_labels = le.classes_
dataY = le.transform(data['label'])

# NOTE(review): the split is row-level and `subject_id` is not used for
# grouping, so if a subject has several rows they can land in different
# splits - confirm whether a group-aware split is required.
train_data_X_, test_data_X, train_data_y_, test_data_y = train_test_split(
    dataX, dataY, test_size=0.2, shuffle=True, random_state=0)

# 0.25 of the remaining 80% -> 20% of the full data for validation
train_data_X, val_data_X, train_data_y, val_data_y = train_test_split(
    train_data_X_, train_data_y_, test_size=0.25, shuffle=True, random_state=0)

# Impute missing values with the median, as RandomForestClassifier (and SMOTE)
# cannot handle them. The medians are computed on the training split ONLY and
# then applied to every split, so no validation/test statistics leak into the
# training features.
train_medians = train_data_X.median()
train_data_X = train_data_X.fillna(train_medians)
val_data_X = val_data_X.fillna(train_medians)
test_data_X = test_data_X.fillna(train_medians)
# Kept imputed as well in case the combined train+validation frame is reused.
train_data_X_ = train_data_X_.fillna(train_medians)

assert not train_data_X.isna().any().any(), "training features still contain NaN after imputation"
assert not val_data_X.isna().any().any(), "validation features still contain NaN after imputation"
assert not test_data_X.isna().any().any(), "test features still contain NaN after imputation"

# Apply SMOTE only to training data
smote = SMOTE(random_state=42)
train_X_resampled, train_y_resampled = smote.fit_resample(train_data_X, train_data_y)

# Convert to NumPy arrays
train_X = train_X_resampled.to_numpy()
train_y = train_y_resampled

test_X = test_data_X.to_numpy()
test_y = test_data_y

val_X = val_data_X.to_numpy()
val_y = val_data_y

print(f"Training set size: {train_X.shape[0]} samples")
print(f"Validation set size: {val_X.shape[0]} samples")
print(f"Test set size: {test_X.shape[0]} samples")

Training set size: 1514610 samples
Validation set size: 210059 samples
Test set size: 210059 samples


### Initial model and evaluation

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Baseline forest configuration:
#   max_depth     - caps how deep each tree may grow (limits overfitting)
#   n_estimators  - number of trees; trades accuracy against training time
#   class_weight  - 'balanced_subsample' re-weights classes per bootstrap sample
#   random_state  - fixed seed for reproducible trees
initial_rf_params = {
    'max_depth': 10,
    'n_estimators': 350,
    'class_weight': 'balanced_subsample',
    'random_state': 0,
}

clf = RandomForestClassifier(**initial_rf_params)
clf.fit(train_X, train_y)

In [None]:
# Predict the encoded class for every row of the held-out validation split
# using the initial (un-tuned) forest.
val_pred = clf.predict(val_X)

In [None]:
from sklearn.metrics import classification_report, confusion_matrix

# Per-class precision/recall/F1 on the validation split, shown with the
# original label names instead of their encoded integers.
val_report = classification_report(val_y, val_pred, target_names=class_labels)

# Rows are true classes, columns are predicted classes.
val_confusion = confusion_matrix(val_y, val_pred)

print(val_report)
print("Confusion Matrix:\n", val_confusion)

### Grid Search 
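Searches the hyperparameter grid below with 3-fold cross-validation and keeps the combination with the highest macro-averaged F1 score.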

In [None]:
from sklearn.metrics import f1_score
from sklearn.model_selection import GridSearchCV

# define hyperparameter grid
param_grid = {
    'max_depth': [6, 8, 10, 12],
    'n_estimators': [100, 250, 350, 500],
    'class_weight': ['balanced_subsample', 'balanced']
}

# set up grid search: 3-fold CV, model selection by macro-averaged F1,
# all available cores
grid_search = GridSearchCV(
    estimator=RandomForestClassifier(random_state=0),
    param_grid=param_grid,
    scoring='f1_macro',
    cv=3,
    n_jobs=-1,
    verbose=1
)

# run grid search on the training data
# NOTE: train_X/train_y are already SMOTE-resampled, so each CV fold contains
# synthetic samples interpolated from rows that may sit in the other folds.
# The cross-validated score is therefore optimistic; the held-out validation
# score printed below is the one to trust.
grid_search.fit(train_X, train_y)

print("\nBest parameters found:", grid_search.best_params_)

# best_score_ is the mean cross-validated score of the best candidate on the
# training folds - it is NOT a validation-set score.
print("Mean cross-validated macro-F1 (training folds): {:.3f}".format(grid_search.best_score_))

# Score the refit best estimator on the untouched validation split.
val_f1 = f1_score(val_y, grid_search.predict(val_X), average='macro')
print("On validation F1 score is: {:.3f}".format(val_f1))

### Final model

In [None]:
from sklearn.metrics import f1_score

# Combine train and validation sets to retrain using the full training data.
# NOTE(review): train_X is SMOTE-resampled while val_X is not, so the
# validation rows re-introduce the original class imbalance into the combined
# set - confirm this is intended.
full_train_X = np.concatenate((train_X, val_X), axis=0)
full_train_y = np.concatenate((train_y, val_y), axis=0)

# Extract best parameters
best_params = grid_search.best_params_

# Train new model with best parameters
final_clf = RandomForestClassifier(
    max_depth=best_params['max_depth'],
    n_estimators=best_params['n_estimators'],
    class_weight=best_params['class_weight'],
    random_state=0
)

final_clf.fit(full_train_X, full_train_y)

# Evaluate with the test set. Macro-averaged F1 is used so the number is
# comparable with the 'f1_macro' scoring of the grid search.
test_pred = final_clf.predict(test_X)
test_f1 = f1_score(test_y, test_pred, average='macro')

print("Test F1 score: {:.3f}".format(test_f1))

### Feature Importance
Finds the features that contribute most to assigning a sample to one of the three classes, which helps with model interpretability.

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# use the best model from grid search
best_model = grid_search.best_estimator_

# get feature importances (one value per input feature)
importances = best_model.feature_importances_

# train_X is a plain NumPy array and carries no column names, so the names are
# taken from the feature DataFrame it was built from; the column order is
# unchanged by splitting, resampling and the NumPy conversion. Generic names
# are only used if the counts do not line up.
if hasattr(train_X, 'columns'):
    feature_names = list(train_X.columns)
elif len(dataX.columns) == len(importances):
    feature_names = list(dataX.columns)
else:
    feature_names = [f'Feature {i}' for i in range(len(importances))]

# dataframe for easy sorting and plotting
feat_df = pd.DataFrame({'Feature': feature_names, 'Importance': importances})
feat_df = feat_df.sort_values(by='Importance', ascending=False)

# plot top 10 features, most important at the top
fig, ax = plt.subplots(figsize=(8, 5))
feat_df.head(10).plot(kind='barh', x='Feature', y='Importance', legend=False, ax=ax)
ax.set_title('Top 10 Feature Importances')
ax.set_xlabel('Importance')
ax.invert_yaxis()
fig.tight_layout()
plt.show()