In [None]:
# Importing libraries and configuring the notebook environment
import json 
import numpy as np 
import pandas as pd 
import sys 
from pathlib import Path
# Add the parent of the current working directory to the import path;
# presumably this is the project root that holds the `src` package
# imported in the next cell -- confirm against the repository layout.
sys.path.append(str(Path.cwd().parents[0]))
import warnings
# NOTE(review): this silences every warning for the rest of the session,
# including ones that may point at real data or convergence problems.
warnings.filterwarnings('ignore')

In [2]:
from sklearn.pipeline import Pipeline
from sklearn.metrics import (roc_auc_score, precision_score, recall_score)
from src.preprocessing import building_pipeline
from sklearn.ensemble import RandomForestClassifier
from src.dataset import finalizing_dataset
from sklearn.model_selection import train_test_split

In [None]:
# Retrieving the column lists stored as JSON files in the working directory
# (presumably written by an earlier notebook -- confirm).
# Each file is opened in a `with` block so its handle is closed as soon as
# the JSON has been parsed, instead of being left open for the kernel's lifetime.
with open('categorical_cols.json') as cat_file:
    cat_cols = json.load(cat_file)

with open('numerical_cols.json') as num_file:
    num_cols = json.load(num_file)

with open('feature_list.json') as feature_file:
    features = json.load(feature_file)

In [None]:
# Splitting the data into train and test sets
# (the original author notes this mirrors a split done previously).
df = finalizing_dataset()

target = 'readmitted_30'

# Features: everything except the target and the 'readmitted' column.
X = df.drop(columns=[target, 'readmitted'])
y = df[target]

# 80/20 split, stratified on the target, with a fixed seed for repeatability.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

In [None]:
# Creating a Random Forest classifier pipeline for comparison
rf_preprocessor = building_pipeline(cat_cols, num_cols)

rf_model = RandomForestClassifier(
    n_estimators=200,
    max_depth=None,
    min_samples_leaf=50,
    class_weight="balanced",
    random_state=42,
)

# Preprocessing runs first, then the forest is fit on its output.
RF_pipeline = Pipeline(
    steps=[
        ('preprocessor', rf_preprocessor),
        ('model', rf_model),
    ]
)

In [None]:
# Fitting the Random Forest pipeline and scoring it on the held-out test set
RF_pipeline.fit(X_train, y_train)

# Second column of predict_proba, used as the score for ROC AUC.
y_prob_RF = RF_pipeline.predict_proba(X_test)[:, 1]
# Hard class labels, used for recall and precision.
y_pred_RF = RF_pipeline.predict(X_test)

RF_results = dict(
    model="Random Forest",
    roc_auc=roc_auc_score(y_test, y_prob_RF),
    recall=recall_score(y_test, y_pred_RF),
    precision=precision_score(y_test, y_pred_RF),
)

RF_results

{'model': 'Random Forest',
 'roc_auc': 0.661689762502973,
 'recall': 0.5596653456627037,
 'precision': 0.17391899288451013}

In [None]:
# Logistic Regression metrics, hard-coded here for the comparison below
# (presumably copied from the baseline notebook's output -- confirm they are current).
LR_results = dict(
    model='Logistic Regression (balanced)',
    roc_auc=0.6422624400871046,
    recall=0.5486569793042713,
    precision=0.1669793621013133,
)

In [None]:
# Side-by-side comparison: one row per model, Logistic Regression first
comparison_rows = [LR_results, RF_results]
results_df = pd.DataFrame(comparison_rows)
results_df

Unnamed: 0,model,roc_auc,recall,precision
0,Logistic Regression (balanced),0.642262,0.548657,0.166979
1,Random Forest,0.66169,0.559665,0.173919


The Random Forest improves performance slightly relative to the logistic regression baseline on all three metrics (ROC AUC, recall, and precision). This suggests that the data contains some non-linear relationships or feature interactions that a linear model is unable to capture, which would explain the Random Forest's slight improvement.