In [1]:
# Importing libraries

import pandas as pd 
import numpy as np 
import sys 
from pathlib import Path
from sklearn.model_selection import train_test_split
import json 

In [2]:
# Make the parent of the notebook's working directory importable
# (presumably the project root that holds the `src` package imported in the next cell).
# The membership guard keeps this cell idempotent: re-running it does not stack
# duplicate entries onto sys.path.
project_root = str(Path.cwd().parent)
if project_root not in sys.path:
    sys.path.append(project_root)

In [3]:
# Uploading libraries/functions built from previous files 

from src.dataset import finalizing_dataset
from src.features import drop_col_for_train
from src.preprocessing import column_type, building_pipeline

In [4]:
# Load the modelling dataset through the project's finalizing_dataset() helper (imported from src.dataset).
# The outputs of the next two cells show the result is a 101,766-row, 53-column table.

df = finalizing_dataset()

In [5]:
# Preview the first rows to sanity-check the loaded columns and values
df.head()

Unnamed: 0,race,gender,age,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,num_lab_procedures,num_procedures,num_medications,...,diabetesMed,readmitted,readmitted_30,num_inpatient,num_emergency,num_outpatient,total_visits,num_medication_change,insulin_used,has_diabetes_complications
0,Caucasian,Female,[0-10),6,25,1,1,41,0,1,...,No,NO,0,0,0,0,0,0,0,1
1,Caucasian,Female,[10-20),1,1,7,3,59,0,18,...,Yes,>30,0,0,0,0,0,1,1,0
2,AfricanAmerican,Female,[20-30),1,1,7,2,11,5,13,...,Yes,NO,0,1,0,2,3,0,0,0
3,Caucasian,Male,[30-40),1,1,7,2,44,1,16,...,Yes,NO,0,0,0,0,0,1,1,1
4,Caucasian,Male,[40-50),1,1,7,1,51,0,8,...,Yes,NO,0,0,0,0,0,0,1,0


In [6]:
# (rows, columns) of the full dataset before the target/feature split
df.shape

(101766, 53)

In [7]:
# Name the prediction target, then separate the label from the feature matrix
target = 'readmitted_30'

# Two label columns leave the features: the target itself and 'readmitted'
# (presumably the raw outcome the binary target was derived from; keeping it would leak the answer).
label_cols = [target, 'readmitted']
X = df.drop(columns=label_cols)
y = df[target]

In [8]:
# Partition the feature columns into two groups with the project helper

categorical_cols, numerical_cols = column_type(X)

# Show how many columns landed in each group: categorical first, then numerical
display(len(categorical_cols), len(numerical_cols))

33

18

Binary columns were treated as categorical variables; continuous columns were treated as numerical variables.

In [9]:
# Hold out 20% of the rows for testing; stratifying on y keeps the class ratio
# the same in both partitions, and the fixed seed makes the split repeatable.

split_options = dict(test_size=0.2, stratify=y, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

In [10]:
# Mean of the target in each partition (train first, then test);
# the two values should be nearly equal after a stratified split

display(y_train.mean(), y_test.mean())

0.11160516877118852

0.11157512036946055

The target variable is imbalanced, so a stratified train/test split was used to preserve the class distribution.
Train rate: 11.16%
Test rate: 11.16%

In [11]:
# Assemble the preprocessing object from the two column groups via the project helper

preprocessor = building_pipeline(categorical_cols, numerical_cols)

# Bare expression so the notebook renders the resulting object
preprocessor

In [12]:
# (rows, columns) of the train and test feature matrices, in that order

display(X_train.shape, X_test.shape)

(81412, 51)

(20354, 51)

# Checking for possible data leakage

In [13]:
# Row-level check: no index label may appear in both partitions.
# Only index labels are compared; column values are not inspected.
shared_rows = set(X_train.index) & set(X_test.index)
assert not shared_rows, "Overlap detected!"

In [14]:
# Identifier columns must not be used as features.
# NOTE(review): only the two names listed here are guarded — confirm they match the
# identifier column names the dataset actually uses (the raw patient key may be named differently).
feature_names = set(X_train.columns)
for col in ("patient_id", "encounter_id"):
    assert col not in feature_names, f"Leakage column detected: {col}"

# Freezing data for reproducibility 

In [15]:
# Record how the split was made. random_state, test_size and stratified are written by hand
# and must mirror the arguments passed to train_test_split earlier in the notebook;
# the two positive rates are computed from the actual partitions.
split_metadata = {
    "random_state": 42,
    "test_size": 0.2,
    "stratified": True,
    "train_positive_rate": float(y_train.mean()),
    "test_positive_rate": float(y_test.mean())
}

# Artifacts to persist as JSON in the current working directory (file name -> payload).
frozen_artifacts = {
    'feature_list.json': X_train.columns.tolist(),
    "categorical_cols.json": categorical_cols,
    "numerical_cols.json": numerical_cols,
    "split_metadata.json": split_metadata,
}

# Write each file inside a `with` block so the handle is flushed and closed
# deterministically instead of being left open until garbage collection.
for filename, payload in frozen_artifacts.items():
    with open(filename, "w") as fh:
        json.dump(payload, fh)

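###### The feature list, column-type lists, and split settings are saved to JSON for better reproducibility and consistent comparison across models. The checks above found no train/test index overlap and no identifier columns among the features.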