In [None]:
# Importing libraries and configuring the kernel session
import json 
import numpy as np 
import pandas as pd 
import sys 
from pathlib import Path
# Put the parent of the current working directory on the module search path.
# Presumably this lets the project-level `src` package imported in later cells
# resolve; it assumes the kernel's working directory is one level below the
# project root -- TODO confirm.
sys.path.append(str(Path.cwd().parents[0]))
import warnings
# Silence every Python warning for the rest of the session.
# NOTE(review): this also hides scikit-learn diagnostics (e.g. convergence or
# undefined-metric warnings) raised by the modelling cells below.
warnings.filterwarnings('ignore')

In [2]:
from sklearn.pipeline import Pipeline
from sklearn.dummy import DummyClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (roc_auc_score, precision_score, recall_score)
from src.preprocessing import building_pipeline


In [None]:
# Retrieving columns used
# Each JSON file is read inside a context manager so its handle is closed as
# soon as parsing finishes (the bare json.load(open(...)) form never closed it).
# The paths are relative, so they resolve against the kernel's working directory.
with open('categorical_cols.json') as cat_file:
    cat_cols = json.load(cat_file)
with open('numerical_cols.json') as num_file:
    num_cols = json.load(num_file)
# NOTE(review): `features` is not used by any cell visible in this notebook.
with open('feature_list.json') as feature_file:
    features = json.load(feature_file)

In [4]:
# Recreating the split from previous file 
from src.dataset import finalizing_dataset
from sklearn.model_selection import train_test_split

# Dataset returned by the project's finalizing_dataset helper.
df = finalizing_dataset()

# Separate the label from the predictors. 'readmitted' is dropped together with
# the target -- presumably because the target is derived from it; TODO confirm.
target = 'readmitted_30'
X = df.drop(columns=[target, 'readmitted'])
y = df[target]

# Hold out 20% of the rows, stratified on the label, with a fixed seed so the
# same split is produced on every run.
split_options = dict(test_size=0.2, stratify=y, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)


In [None]:
# Dummy model baseline
# A classifier that always predicts the most frequent training class; its
# metrics are the reference point for the real model.

dum_pipeline = Pipeline(steps=[
    ('preprocessor', building_pipeline(
        cat_cols, num_cols)),
    ('model', DummyClassifier(strategy='most_frequent'))
])

dum_pipeline.fit(X_train, y_train)

ypred_dum = dum_pipeline.predict(X_test)
# Column 1 of predict_proba is the score for the second class in classes_.
yprob_dum = dum_pipeline.predict_proba(X_test)[:,1]

results = {
    "model": "Dummy (most frequent)",
    "roc_auc": roc_auc_score(y_test, yprob_dum),
    "recall": recall_score(y_test, ypred_dum),
    # When the baseline never predicts the positive class, precision is 0/0.
    # zero_division=0 reports that case as 0.0 explicitly, rather than relying
    # on scikit-learn's warn-and-fallback default (whose warning is silenced
    # globally in this notebook).
    "precision": precision_score(y_test, ypred_dum, zero_division=0)}

results

{'model': 'Dummy (most frequent)',
 'roc_auc': 0.5,
 'recall': 0.0,
 'precision': 0.0}

In [None]:
# logistic regression balanced model
# Preprocessing followed by a liblinear logistic regression whose class weights
# are set to 'balanced'.

lr_preprocessor = building_pipeline(cat_cols, num_cols)
lr_estimator = LogisticRegression(
    solver='liblinear',
    class_weight='balanced',
    max_iter=100,
)
LR_pipeline = Pipeline(steps=[
    ('preprocessor', lr_preprocessor),
    ('model', lr_estimator),
])

# Fit on the training split
LR_pipeline.fit(X_train, y_train)

# Hard labels feed recall/precision; column 1 of predict_proba feeds ROC AUC
y_pred_LR = LR_pipeline.predict(X_test)
y_proba_LR = LR_pipeline.predict_proba(X_test)[:, 1]

# Collect the test-set metrics in the same key order as the baseline's dict
LR_results = {"model": "Logistic Regression (balanced)"}
LR_results["roc_auc"] = roc_auc_score(y_test, y_proba_LR)
LR_results["recall"] = recall_score(y_test, y_pred_LR)
LR_results["precision"] = precision_score(y_test, y_pred_LR)

LR_results


{'model': 'Logistic Regression (balanced)',
 'roc_auc': 0.6422624400871046,
 'recall': 0.5486569793042713,
 'precision': 0.1669793621013133}

In [7]:
# One row per model, one column per metric, for a side-by-side comparison
model_rows = [results, LR_results]
result_df = pd.DataFrame(model_rows)

result_df



Unnamed: 0,model,roc_auc,recall,precision
0,Dummy (most frequent),0.5,0.0,0.0
1,Logistic Regression (balanced),0.642262,0.548657,0.166979


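The logistic regression model outperforms the Dummy classifier on every reported metric (ROC AUC 0.64 vs. 0.50, recall 0.55 vs. 0.00, precision 0.17 vs. 0.00), which shows that it has learned some signal from the features rather than simply predicting the majority class. Its precision is still low, however, so most of the cases it flags as positive are false positives.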