## Library Imports and Data Loading

In [107]:
import joblib as jl
import numpy as np
import pandas as pd

from datetime import datetime

from lib import map_icd9_to_category, parse_dosage

from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder, OrdinalEncoder

In [97]:
# Read the raw dataset from the working directory and report its dimensions
df_orig = pd.read_csv(filepath_or_buffer='dataset.csv')
print("Dataset loaded successfully!", f"Shape: {df_orig.shape}", sep="\n")
df_orig.head(5)

Dataset loaded successfully!
Shape: (66587, 49)


Unnamed: 0,index,encounter_id,patient_id,race,gender,age,weight,time_in_hospital,medical_specialty,num_lab_procedures,...,X19,X20,X21,X22,X23,X24,X25,change,diabetesMed,readmitted
0,69375,197029140,51521940,Hispanic,Male,[80-90),?,4,Emergency/Trauma,39,...,No,No,No,No,No,No,No,No,Yes,1
1,57272,163571946,92631357,Caucasian,Male,[60-70),?,3,Emergency/Trauma,34,...,No,No,No,No,No,No,No,No,No,0
2,82347,256497366,30023982,Caucasian,Female,[50-60),?,6,Nephrology,50,...,No,Down,No,No,No,No,No,Ch,Yes,1
3,89608,289891212,93606021,AfricanAmerican,Female,[70-80),?,5,?,63,...,No,Down,No,No,No,No,No,Ch,Yes,1
4,24091,81873900,23464296,Caucasian,Male,[70-80),?,9,Psychiatry,17,...,No,No,No,No,No,No,No,No,No,1


In [98]:
# Discard rows that are exact repeats of an earlier row (the first occurrence
# is kept) and report the frame size before and after
shape_before = df_orig.shape
print(f"Shape before removing duplicates: {shape_before}")
df_orig = df_orig.drop_duplicates(keep='first')
shape_after = df_orig.shape
print(f"Shape after removing duplicates: {shape_after}")

Shape before removing duplicates: (66587, 49)
Shape after removing duplicates: (48911, 49)


## Feature Engineering

In [99]:
# Start an empty feature dataframe (no columns yet) that shares the row index of
# the original dataframe; the feature columns are added to it in the cells below
df = pd.DataFrame(index=df_orig.index)

In [100]:
# Define the features that will be used to train the model.
# Keep a trailing comma after every entry: two adjacent string literals with no
# comma between them are silently concatenated by Python into a single string.
feature_cols = [
    # Features kept as-is from original dataset
    'time_in_hospital',
    'num_lab_procedures',
    'num_procedures',
    'num_medications',
    'number_outpatient',
    'number_emergency',
    'number_inpatient',
    'number_diagnoses',

    # Reprocessed / Encoded features
    'race',     # fill missing with 'Unknown' and one-hot-encode
    'gender',   # one-hot-encode
    'age',      # ordinal encode
    'medical_specialty',  # fill missing with 'Unknown' and target-encode or feature-encode (eventually rare specialities can be grouped as 'Other')
    'diag_1', 'diag_2', 'diag_3', 'diag_4', # convert into categories using ICD-9 codes and one-hot-encode
    'X1', 'X2', # replace missing with '0' and convert to numerical (by removing leading '>')
    'X3', 'X4', 'X5', 'X6', 'X7', 'X9', 'X10', 'X11', 'X12', 'X13', 'X14', 'X15', 'X16', 'X17', 'X20', 'X21', 'X22', 'X23', 'X24',  # replace missing with 'No' and one-hot-encode
    'change',   # binary encode
    'diabetesMed',  # binary encode

    # Newly engineered features
    'total_prior_visits',  # = number_inpatient + number_outpatient + number_emergency
    'acute_care_ratio', # = number_emergency / total_prior_visits
    'meds_per_day',  # = num_medications / time_in_hospital
    'labs_per_day',  # = num_lab_procedures / time_in_hospital
    'procedures_per_day',   # = num_procedures / time_in_hospital
    'medication_intensity',  # = count of 'X3' to 'X24' that aren't 'No'
    'medication_stability',  # = count of 'X3' to 'X24' that are 'Steady' / medication_intensity
]

In [101]:
# Numeric columns that go into the feature dataframe unchanged
keep_as_is_features = [
    'time_in_hospital', 'num_lab_procedures', 'num_procedures', 'num_medications',
    'number_outpatient', 'number_emergency', 'number_inpatient', 'number_diagnoses',
]

# Attach each of them to the feature dataframe, in the order listed above
df = df.assign(**{name: df_orig[name] for name in keep_as_is_features})

print(f"Keep-as-is features copied. New df shape: {df.shape}")
df.head()

Keep-as-is features copied. New df shape: (48911, 8)


Unnamed: 0,time_in_hospital,num_lab_procedures,num_procedures,num_medications,number_outpatient,number_emergency,number_inpatient,number_diagnoses
0,4,39,1,9,1,1,4,9
1,3,34,1,12,0,0,0,6
2,6,50,2,24,0,0,5,9
3,5,63,0,14,0,0,0,7
4,9,17,1,11,0,0,0,9


In [102]:
# Encode 'race': both '?' and NaN become the category 'Unknown', then the
# column is one-hot-encoded with the first category dropped
race_data = df_orig['race'].replace('?', np.nan).fillna('Unknown').to_frame()

race_encoder = OneHotEncoder(drop='first', sparse_output=False)
race_encoder.fit(race_data)
race_df = pd.DataFrame(
    data=race_encoder.transform(race_data),
    index=df_orig.index,
    columns=race_encoder.get_feature_names_out(['race']),
)

df = pd.concat([df, race_df], axis=1)

print(f"Race feature processed. New df shape: {df.shape}")
df.head()

Race feature processed. New df shape: (48911, 13)


Unnamed: 0,time_in_hospital,num_lab_procedures,num_procedures,num_medications,number_outpatient,number_emergency,number_inpatient,number_diagnoses,race_Asian,race_Caucasian,race_Hispanic,race_Other,race_Unknown
0,4,39,1,9,1,1,4,9,0.0,0.0,1.0,0.0,0.0
1,3,34,1,12,0,0,0,6,0.0,1.0,0.0,0.0,0.0
2,6,50,2,24,0,0,5,9,0.0,1.0,0.0,0.0,0.0
3,5,63,0,14,0,0,0,7,0.0,0.0,0.0,0.0,0.0
4,9,17,1,11,0,0,0,9,0.0,1.0,0.0,0.0,0.0


In [103]:
# Encode 'gender' as one-hot columns (first category dropped)
gender_data = df_orig[['gender']]

gender_encoder = OneHotEncoder(drop='first', sparse_output=False)
gender_matrix = gender_encoder.fit_transform(gender_data)
gender_columns = gender_encoder.get_feature_names_out(['gender'])
gender_df = pd.DataFrame(gender_matrix, index=df_orig.index, columns=gender_columns)

df = pd.concat([df, gender_df], axis=1)

print(f"Gender feature processed. New df shape: {df.shape}")
df.head()

Gender feature processed. New df shape: (48911, 15)


Unnamed: 0,time_in_hospital,num_lab_procedures,num_procedures,num_medications,number_outpatient,number_emergency,number_inpatient,number_diagnoses,race_Asian,race_Caucasian,race_Hispanic,race_Other,race_Unknown,gender_Male,gender_Unknown/Invalid
0,4,39,1,9,1,1,4,9,0.0,0.0,1.0,0.0,0.0,1.0,0.0
1,3,34,1,12,0,0,0,6,0.0,1.0,0.0,0.0,0.0,1.0,0.0
2,6,50,2,24,0,0,5,9,0.0,1.0,0.0,0.0,0.0,0.0,0.0
3,5,63,0,14,0,0,0,7,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,9,17,1,11,0,0,0,9,0.0,1.0,0.0,0.0,0.0,1.0,0.0


In [104]:
# Encode 'age' ordinally: the ten-year brackets '[0-10)' ... '[90-100)' are
# mapped, in ascending order, to 0.0 ... 9.0
age_categories = [f'[{lower}-{lower + 10})' for lower in range(0, 100, 10)]

age_encoder = OrdinalEncoder(categories=[age_categories])
age_codes = age_encoder.fit_transform(df_orig[['age']])
age_df = pd.DataFrame(age_codes, index=df_orig.index, columns=['age'])

df = pd.concat([df, age_df], axis=1)

print(f"Age feature processed. New df shape: {df.shape}")
df.head()

Age feature processed. New df shape: (48911, 16)


Unnamed: 0,time_in_hospital,num_lab_procedures,num_procedures,num_medications,number_outpatient,number_emergency,number_inpatient,number_diagnoses,race_Asian,race_Caucasian,race_Hispanic,race_Other,race_Unknown,gender_Male,gender_Unknown/Invalid,age
0,4,39,1,9,1,1,4,9,0.0,0.0,1.0,0.0,0.0,1.0,0.0,8.0
1,3,34,1,12,0,0,0,6,0.0,1.0,0.0,0.0,0.0,1.0,0.0,6.0
2,6,50,2,24,0,0,5,9,0.0,1.0,0.0,0.0,0.0,0.0,0.0,5.0
3,5,63,0,14,0,0,0,7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,7.0
4,9,17,1,11,0,0,0,9,0.0,1.0,0.0,0.0,0.0,1.0,0.0,7.0


In [105]:
# Encode 'medical_specialty': '?' and NaN become 'Unknown', infrequent values
# are pooled into 'Other', and the result is one-hot-encoded
speciality_data = df_orig['medical_specialty'].replace('?', np.nan).fillna('Unknown')

# A speciality keeps its own category only if it covers at least `threshold`
# (as a fraction) of all rows
threshold = 0.01
min_rows = len(df_orig) * threshold
speciality_counts = speciality_data.value_counts()
specialities_to_keep = speciality_counts.loc[speciality_counts >= min_rows].index
is_frequent = speciality_data.isin(specialities_to_keep)
speciality_data = speciality_data.where(is_frequent, 'Other')

speciality_encoder = OneHotEncoder(drop='first', sparse_output=False)
speciality_matrix = speciality_encoder.fit_transform(speciality_data.to_frame())
speciality_df = pd.DataFrame(
    data=speciality_matrix,
    index=df_orig.index,
    columns=speciality_encoder.get_feature_names_out(['medical_specialty']),
)

df = pd.concat([df, speciality_df], axis=1)

print(f"Medical specialty feature processed. New df shape: {df.shape}")
df.head()

Medical specialty feature processed. New df shape: (48911, 26)


Unnamed: 0,time_in_hospital,num_lab_procedures,num_procedures,num_medications,number_outpatient,number_emergency,number_inpatient,number_diagnoses,race_Asian,race_Caucasian,...,medical_specialty_Emergency/Trauma,medical_specialty_Family/GeneralPractice,medical_specialty_InternalMedicine,medical_specialty_Nephrology,medical_specialty_Orthopedics,medical_specialty_Orthopedics-Reconstructive,medical_specialty_Other,medical_specialty_Radiologist,medical_specialty_Surgery-General,medical_specialty_Unknown
0,4,39,1,9,1,1,4,9,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,3,34,1,12,0,0,0,6,0.0,1.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,6,50,2,24,0,0,5,9,0.0,1.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
3,5,63,0,14,0,0,0,7,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,9,17,1,11,0,0,0,9,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0


In [106]:
# Encode the first diagnosis 'diag_1': map each code to a category with
# map_icd9_to_category, pool infrequent categories into 'Other', one-hot-encode
# TODO: add also diag_2, diag_3, diag_4 if needed

diag_1_data = df_orig['diag_1'].apply(map_icd9_to_category)

# A category is kept only if it covers at least `threshold` (as a fraction)
# of all rows
threshold = 0.01
diag_1_min_rows = len(df_orig) * threshold
diag_1_counts = diag_1_data.value_counts()
diag_1_to_keep = diag_1_counts.loc[diag_1_counts >= diag_1_min_rows].index
diag_1_is_rare = ~diag_1_data.isin(diag_1_to_keep)
diag_1_data = diag_1_data.mask(diag_1_is_rare, 'Other')

diag_1_encoder = OneHotEncoder(drop='first', sparse_output=False)
diag_1_matrix = diag_1_encoder.fit_transform(diag_1_data.to_frame())
diag_1_df = pd.DataFrame(
    data=diag_1_matrix,
    index=df_orig.index,
    columns=diag_1_encoder.get_feature_names_out(['diag_1']),
)

df = pd.concat([df, diag_1_df], axis=1)

print(f"Diag_1 feature processed. New df shape: {df.shape}")
df.head()

Diag_1 feature processed. New df shape: (48911, 41)


Unnamed: 0,time_in_hospital,num_lab_procedures,num_procedures,num_medications,number_outpatient,number_emergency,number_inpatient,number_diagnoses,race_Asian,race_Caucasian,...,diag_1_Injury_Poisoning,diag_1_Mental,diag_1_Musculoskeletal,diag_1_Neoplasms,diag_1_Nervous_System,diag_1_Other,diag_1_Respiratory,diag_1_Skin,diag_1_Supplementary_V,diag_1_Symptoms_Signs
0,4,39,1,9,1,1,4,9,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,3,34,1,12,0,0,0,6,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,6,50,2,24,0,0,5,9,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,5,63,0,14,0,0,0,7,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,9,17,1,11,0,0,0,9,0.0,1.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
# For 'X1' and 'X2', parse each raw value with parse_dosage and derive two
# columns: '<col>_dosage' (parsed value, missing -> 0) and '<col>_given'
# (1 when a parsed value exists and is positive, else 0)
for col in ('X1', 'X2'):
    dosage = df_orig[col].apply(parse_dosage)
    was_given = dosage.notna() & dosage.gt(0)
    df[f"{col}_dosage"] = dosage.fillna(0)
    df[f"{col}_given"] = was_given.astype(int)

print(f"X1 and X2 features processed. New df shape: {df.shape}")
df.head()

X1 and X2 features processed. New df shape: (48911, 45)


Unnamed: 0,time_in_hospital,num_lab_procedures,num_procedures,num_medications,number_outpatient,number_emergency,number_inpatient,number_diagnoses,race_Asian,race_Caucasian,...,diag_1_Nervous_System,diag_1_Other,diag_1_Respiratory,diag_1_Skin,diag_1_Supplementary_V,diag_1_Symptoms_Signs,X1_dosage,X1_given,X2_dosage,X2_given
0,4,39,1,9,1,1,4,9,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0,0.0,0
1,3,34,1,12,0,0,0,6,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0
2,6,50,2,24,0,0,5,9,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0
3,5,63,0,14,0,0,0,7,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0
4,9,17,1,11,0,0,0,9,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,200.0,1,0.0,0


In [109]:
# Add the other 'X*' features, one-hot-encoded (first category of each column
# dropped).
# NOTE(review): the feature plan says missing values in these columns should be
# replaced with 'No', but no fill is applied here - confirm the columns contain
# no missing values.
x_cols = ['X3', 'X4', 'X5', 'X6', 'X7', 'X9', 'X10', 'X11', 'X12', 'X13', 'X14', 'X15', 'X16', 'X17', 'X20', 'X21', 'X22', 'X23', 'X24']

# Fit ONE encoder on all columns at once. Categories are still learned (and the
# first one dropped) independently per column, so the generated columns and
# their order are the same as encoding column by column, but:
#   * `x_encoder` ends up knowing the categories of every column in `x_cols`
#     (refitting it per column would leave it fitted on the last column only,
#     making it unusable for transforming new data), and
#   * the feature dataframe is extended with a single concat instead of one
#     concat per column.
x_encoder = OneHotEncoder(drop='first', sparse_output=False)
x_df = pd.DataFrame(
    x_encoder.fit_transform(df_orig[x_cols]),
    columns=x_encoder.get_feature_names_out(x_cols),
    index=df_orig.index
)

df = pd.concat([df, x_df], axis=1)

print(f"X* features processed. New df shape: {df.shape}")
df.head()

X* features processed. New df shape: (48911, 88)


Unnamed: 0,time_in_hospital,num_lab_procedures,num_procedures,num_medications,number_outpatient,number_emergency,number_inpatient,number_diagnoses,race_Asian,race_Caucasian,...,X17_Steady,X20_No,X20_Steady,X20_Up,X21_No,X21_Steady,X21_Up,X22_Steady,X23_Steady,X24_Steady
0,4,39,1,9,1,1,4,9,0.0,0.0,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
1,3,34,1,12,0,0,0,6,0.0,1.0,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
2,6,50,2,24,0,0,5,9,0.0,1.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
3,5,63,0,14,0,0,0,7,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
4,9,17,1,11,0,0,0,9,0.0,1.0,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0


In [110]:
# Binary-encode 'change': 'Ch' -> 1, 'No' -> 0 (values absent from the mapping
# become NaN)
change_encoder = dict(Ch=1, No=0)
change_encoded = df_orig['change'].map(change_encoder)
df['change'] = change_encoded

print(f"Change feature processed. New df shape: {df.shape}")
df.head()

Change feature processed. New df shape: (48911, 89)


Unnamed: 0,time_in_hospital,num_lab_procedures,num_procedures,num_medications,number_outpatient,number_emergency,number_inpatient,number_diagnoses,race_Asian,race_Caucasian,...,X20_No,X20_Steady,X20_Up,X21_No,X21_Steady,X21_Up,X22_Steady,X23_Steady,X24_Steady,change
0,4,39,1,9,1,1,4,9,0.0,0.0,...,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0
1,3,34,1,12,0,0,0,6,0.0,1.0,...,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0
2,6,50,2,24,0,0,5,9,0.0,1.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1
3,5,63,0,14,0,0,0,7,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1
4,9,17,1,11,0,0,0,9,0.0,1.0,...,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0


In [111]:
# Binary-encode 'diabetesMed': 'Yes' -> 1, 'No' -> 0 (values absent from the
# mapping become NaN)
diabetesMed_encoder = dict(Yes=1, No=0)
diabetesMed_encoded = df_orig['diabetesMed'].map(diabetesMed_encoder)
df['diabetesMed'] = diabetesMed_encoded

print(f"DiabetesMed feature processed. New df shape: {df.shape}")
df.head()

DiabetesMed feature processed. New df shape: (48911, 90)


Unnamed: 0,time_in_hospital,num_lab_procedures,num_procedures,num_medications,number_outpatient,number_emergency,number_inpatient,number_diagnoses,race_Asian,race_Caucasian,...,X20_Steady,X20_Up,X21_No,X21_Steady,X21_Up,X22_Steady,X23_Steady,X24_Steady,change,diabetesMed
0,4,39,1,9,1,1,4,9,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0,1
1,3,34,1,12,0,0,0,6,0.0,1.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0,0
2,6,50,2,24,0,0,5,9,0.0,1.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1,1
3,5,63,0,14,0,0,0,7,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1,1
4,9,17,1,11,0,0,0,9,0.0,1.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0,0


In [None]:
# TODO: add newly engineered features if needed

## Data Splitting

In [118]:
# Hold out 10% of the rows for validation, stratified on the target so both
# parts keep the same class proportions; the seed makes the split repeatable
y = df_orig['readmitted']
split_parts = train_test_split(df, y, test_size=0.1, stratify=y, random_state=42)
X_train, X_val, y_train, y_val = split_parts

for part_name, part in (("Training", X_train), ("Validation", X_val)):
    print(f"{part_name} set shape: {part.shape}")

Training set shape: (44019, 90)
Validation set shape: (4892, 90)


## Model Training

In [None]:
# Fit a 200-tree random forest (fixed seed) on the training split; `fit`
# returns the estimator itself, which is then displayed as the cell output
forest_params = {'n_estimators': 200, 'random_state': 42}
model = RandomForestClassifier(**forest_params).fit(X_train, y_train)
model

0,1,2
,n_estimators,200
,criterion,'gini'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


## Model Evaluation

In [120]:
# Predict on the validation split and print per-class precision / recall / F1
y_pred = model.predict(X_val)
validation_report = classification_report(y_true=y_val, y_pred=y_pred)
print(validation_report)

              precision    recall  f1-score   support

           0       0.63      0.69      0.66      2631
           1       0.59      0.52      0.55      2261

    accuracy                           0.61      4892
   macro avg       0.61      0.61      0.61      4892
weighted avg       0.61      0.61      0.61      4892

