In [1]:
# Standard-library JSON reader and pandas for the results table.
import json
import pandas as pd

# scikit-learn pieces used in this notebook: pipeline container, evaluation
# metrics, and the gradient-boosting estimator.
from sklearn.pipeline import Pipeline
from sklearn.metrics import roc_auc_score, recall_score, precision_score
from sklearn.ensemble import GradientBoostingClassifier

import sys 
from pathlib import Path
# Append the parent of the current working directory to the import path,
# presumably so the project-local `src` package imported below resolves when
# the notebook is run from its own sub-directory.
sys.path.append(str(Path.cwd().parents[0]))
import warnings
# Silences every warning for the rest of the session.
# NOTE(review): this also hides deprecation / numerical warnings from
# scikit-learn and pandas; consider narrowing the filter.
warnings.filterwarnings('ignore')

# Project-local helpers: preprocessing pipeline factory and dataset builder.
from src.preprocessing import building_pipeline
from src.dataset import finalizing_dataset
from sklearn.model_selection import train_test_split

In [2]:
# Load the column-name lists stored as JSON next to the notebook.
# Each file is opened in a context manager so the handle is closed
# deterministically (the previous `json.load(open(...))` form left all three
# handles open until garbage collection), and UTF-8 is requested explicitly
# so decoding does not depend on the platform's default locale encoding.
with open('categorical_cols.json', encoding='utf-8') as cat_file:
    cat_cols = json.load(cat_file)  # handed to building_pipeline() later on

with open('numerical_cols.json', encoding='utf-8') as num_file:
    num_cols = json.load(num_file)  # handed to building_pipeline() later on

with open('feature_list.json', encoding='utf-8') as feature_file:
    # NOTE(review): `features` is not referenced in the cells shown here;
    # presumably it is the full feature list used elsewhere — confirm.
    features = json.load(feature_file)

In [3]:
# Name of the binary label column to predict.
target = 'readmitted_30'

# Build the modelling table with the project helper.
df = finalizing_dataset()

# Separate the label from the predictors. 'readmitted' is removed together
# with the label.
# NOTE(review): presumably 'readmitted' is the raw outcome from which
# 'readmitted_30' is derived, so keeping it would leak the label — confirm.
non_feature_columns = [target, 'readmitted']
y = df[target]
X = df.drop(columns=non_feature_columns)

# Stratified 80/20 hold-out split with a fixed seed so the class ratio is
# preserved in both parts and the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

In [14]:
# Preprocessing stage produced by the project helper from the categorical and
# numerical column lists.
gb_preprocessor = building_pipeline(cat_cols, num_cols)

# Gradient-boosting classifier: 200 shallow (depth-3) trees with a 0.05
# learning rate; the seed is fixed for repeatable fits.
gb_classifier = GradientBoostingClassifier(
    n_estimators=200,
    learning_rate=0.05,
    max_depth=3,
    random_state=42,
)

# Chain preprocessing and model so both are fitted on the training data only.
GB_pipeline = Pipeline(steps=[
    ('preprocessor', gb_preprocessor),
    ('model', gb_classifier),
])

In [15]:
# Fit preprocessing + gradient boosting on the training split.
GB_pipeline.fit(X_train, y_train)

# Score the hold-out split: probability of the positive class (second column
# of predict_proba) for ROC AUC, and hard class labels for recall / precision.
GB_y_prob = GB_pipeline.predict_proba(X_test)[:, 1]
GB_y_pred = GB_pipeline.predict(X_test)

# NOTE(review): the recorded output of this cell shows recall of roughly
# 0.009, i.e. the hard predictions almost never flag the positive class.
# ROC AUC is computed from the probabilities and is unaffected by that.
GB_results = dict(
    model="Gradient Boosting",
    roc_auc=roc_auc_score(y_test, GB_y_prob),
    recall=recall_score(y_test, GB_y_pred),
    precision=precision_score(y_test, GB_y_pred),
)

# Last expression of the cell: display the metrics dictionary.
GB_results

{'model': 'Gradient Boosting',
 'roc_auc': 0.6773767728352162,
 'recall': 0.008806693086745927,
 'precision': 0.625}

In [18]:
# Metrics of the two comparison models, entered as literals.
# NOTE(review): these numbers are hard-coded, presumably copied from the
# notebooks that trained those models — confirm they come from the same
# train/test split before comparing.
LR_results = dict(
    model='Logistic Regression (balanced)',
    roc_auc=0.6422624400871046,
    recall=0.5486569793042713,
    precision=0.1669793621013133,
)

RF_results = dict(
    model='Random Forest',
    roc_auc=0.661689762502973,
    recall=0.5596653456627037,
    precision=0.17391899288451013,
)

# One row per model, in the order LR, RF, GB.
model_rows = [LR_results, RF_results, GB_results]
results = pd.DataFrame(model_rows)

In [19]:
# Display the side-by-side metric table for the three models.
results

Unnamed: 0,model,roc_auc,recall,precision
0,Logistic Regression (balanced),0.642262,0.548657,0.166979
1,Random Forest,0.66169,0.559665,0.173919
2,Gradient Boosting,0.677377,0.008807,0.625


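Gradient Boosting achieved the highest ROC AUC of the three models (0.677, versus 0.662 for Random Forest and 0.642 for Logistic Regression), meaning it ranks patients by readmission risk better than the others. Note, however, that its hard predictions identify almost none of the readmissions (recall ≈ 0.009, versus ≈ 0.55 for the other two models), so ROC AUC alone does not make it the best classifier as currently configured. Gradient Boosting was therefore chosen for further analysis, where its decision threshold and/or handling of class imbalance should be tuned.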