In [2]:
import pandas as pd
from sklearn.preprocessing import Normalizer
from sklearn.model_selection import StratifiedKFold, RandomizedSearchCV, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import make_scorer, roc_auc_score, f1_score, precision_score, accuracy_score, recall_score

# For parallel processing (-1 means use all cores, None means 1 core)
PARALLEL = -1

# Load the data
train = pd.read_csv('../data/preprocessed/prp_combined_Y1.csv')
test = pd.read_csv('../data/preprocessed/prp_combined_Y2.csv')

## Random Forests Classifier
This notebook utilises a Random Forests Classifier to determine `DaysInHospital`. The problem is modelled as a classification task with 16 classes (0 - 15).
Hyperparameters are tested using GridSearch, with the best estimator being selected via `roc_auc` score.

In [3]:
# Separate the data into features and target
Y_train = train['DaysInHospitalY2']
X_train = train.drop(columns=['DaysInHospitalY2', 'MemberID'])
Y_test = test['DaysInHospitalY3']
X_test = test.drop(columns=['DaysInHospitalY3', 'MemberID'])

In [12]:
outer_split = StratifiedKFold(n_splits=20, shuffle=True, random_state=42)
inner_split = StratifiedKFold(n_splits=4, shuffle=True, random_state=42)

# Hyperparameters
param_grid = {
    'n_estimators': [25, 50, 100, 150],
    'max_depth': [5, 10, 15, 20],
    'min_samples_split': [2, 4, 8],
    'min_samples_leaf': [1, 2, 4, 8],
    'criterion': ['gini', 'entropy']
}

rfc = RandomForestClassifier()

# Scores
scoring = {
    'roc_auc': make_scorer(roc_auc_score),
    'f1': make_scorer(f1_score),
    'precision': make_scorer(precision_score),
    'recall': make_scorer(recall_score)
}

rscv = RandomizedSearchCV(
    estimator=rfc, 
    param_distributions=param_grid, 
    n_iter=30, 
    cv=inner_split, 
    scoring=scoring, 
    refit='roc_auc', 
    n_jobs=PARALLEL, 
    verbose=4, 
    random_state=42
)

scores = {
    'roc_auc': [],
    'f1': [],
    'precision': [],
    'recall': []
}

i = 1

for train_idx, test_idx in outer_split.split(X_train, Y_train):
    print(f"Loop {i}")
    i += 1

    X_train_outer, X_test_outer = X_train.iloc[train_idx], X_train.iloc[test_idx]
    Y_train_outer, Y_test_outer = Y_train.iloc[train_idx], Y_train.iloc[test_idx]

    rscv.fit(X_train_outer, Y_train_outer)

    Y_pred = rscv.best_estimator_.predict(X_test_outer)
    Y_pred_prob = rscv.best_estimator_.predict_proba(X_test_outer)

    roc_auc = roc_auc_score(Y_test_outer, Y_pred_prob, multi_class='ovr')
    f1 = f1_score(Y_test_outer, Y_pred, average='macro')
    precision = precision_score(Y_test_outer, Y_pred, average='macro')
    recall = recall_score(Y_test_outer, Y_pred, average='macro')

    scores['roc_auc'].append(roc_auc)
    scores['f1'].append(f1)
    scores['precision'].append(precision)
    scores['recall'].append(precision)

    print(f"ROC AUC: {roc_auc}")
    print(f"F1: {f1}")
    print(f"Precision: {precision}")
    print(f"Recall: {recall}")


Loop 1
Fitting 4 folds for each of 15 candidates, totalling 60 fits




ROC AUC: 0.744833723539401
F1: 0.8452631578947368
Precision: 0.8452631578947368
Recall: 0.8452631578947368
Loop 2
Fitting 4 folds for each of 15 candidates, totalling 60 fits




ROC AUC: 0.7409314844479786
F1: 0.8452631578947368
Precision: 0.8452631578947368
Recall: 0.8452631578947368
Loop 3
Fitting 4 folds for each of 15 candidates, totalling 60 fits


KeyboardInterrupt: 