## Library Imports and Data Loading

In [26]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, classification_report
import joblib as jl
from datetime import datetime

In [27]:
# Read the raw dataset from disk; `df` stays untouched for the rest of the
# notebook so later cells can always refer back to the original rows.
df = pd.read_csv(filepath_or_buffer='dataset.csv')

print("Dataset loaded successfully!")
print(f"Shape: {df.shape}")

# Preview the first five rows.
df.head(5)

Dataset loaded successfully!
Shape: (66587, 49)


Unnamed: 0,index,encounter_id,patient_id,race,gender,age,weight,time_in_hospital,medical_specialty,num_lab_procedures,...,X19,X20,X21,X22,X23,X24,X25,change,diabetesMed,readmitted
0,69375,197029140,51521940,Hispanic,Male,[80-90),?,4,Emergency/Trauma,39,...,No,No,No,No,No,No,No,No,Yes,1
1,57272,163571946,92631357,Caucasian,Male,[60-70),?,3,Emergency/Trauma,34,...,No,No,No,No,No,No,No,No,No,0
2,82347,256497366,30023982,Caucasian,Female,[50-60),?,6,Nephrology,50,...,No,Down,No,No,No,No,No,Ch,Yes,1
3,89608,289891212,93606021,AfricanAmerican,Female,[70-80),?,5,?,63,...,No,Down,No,No,No,No,No,Ch,Yes,1
4,24091,81873900,23464296,Caucasian,Male,[70-80),?,9,Psychiatry,17,...,No,No,No,No,No,No,No,No,No,1


## Data Exploration and Cleaning

In [28]:
# Structural overview: dtype and non-null count for every column
print("Dataset Info:")
df.info()

# Number of cells pandas already recognises as missing, per column
null_counts = df.isnull().sum()
print("\nMissing values:")
print(null_counts)

# Class balance of the prediction target
target_counts = df['readmitted'].value_counts()
print("\nTarget distribution:")
print(target_counts)

Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 66587 entries, 0 to 66586
Data columns (total 49 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   index               66587 non-null  int64  
 1   encounter_id        66587 non-null  int64  
 2   patient_id          66587 non-null  int64  
 3   race                66587 non-null  object 
 4   gender              66587 non-null  object 
 5   age                 66587 non-null  object 
 6   weight              66587 non-null  object 
 7   time_in_hospital    66587 non-null  int64  
 8   medical_specialty   66587 non-null  object 
 9   num_lab_procedures  66587 non-null  int64  
 10  num_procedures      66587 non-null  int64  
 11  num_medications     66587 non-null  int64  
 12  number_outpatient   66587 non-null  int64  
 13  number_emergency    66587 non-null  int64  
 14  number_inpatient    66587 non-null  int64  
 15  diag_1              66587 non-null  obj

In [29]:
# Schema of the training data: every categorical feature with its full list of
# allowed levels, the numerical features, the identifier and the target.

# Levels shared by all the X* dosage-change columns listed below.
DOSAGE_CHANGE_LEVELS = ['No', 'Steady', 'Up', 'Down']

# Indices of the X* columns treated as categorical (X8, X18 and X19 are not used).
DOSAGE_CHANGE_INDICES = [*range(3, 8), *range(9, 18), *range(20, 25)]

EXPECTED_CATEGORICAL_FEATURES = {
    'gender': ['Male', 'Female', 'Unknown/Invalid'],
    'age': [
        '[0-10)', '[10-20)', '[20-30)', '[30-40)', '[40-50)',
        '[50-60)', '[60-70)', '[70-80)', '[80-90)', '[90-100)',
    ],
    'change': ['Ch', 'No'],
    'diabetesMed': ['Yes', 'No'],
}
# Each column gets its own copy of the level list, appended in index order.
for dosage_index in DOSAGE_CHANGE_INDICES:
    EXPECTED_CATEGORICAL_FEATURES[f'X{dosage_index}'] = list(DOSAGE_CHANGE_LEVELS)

EXPECTED_NUMERICAL_FEATURES = [
    'time_in_hospital',
    'num_lab_procedures',
    'num_procedures',
    'num_medications',
    'number_outpatient',
    'number_emergency',
    'number_inpatient',
    'number_diagnoses',
    'diag_1',
    'diag_2',
    'diag_3',
    'diag_4',
    'diag_5',
    'X1',
    'X2',
]

ID_COLUMN = 'encounter_id'
TARGET_COLUMN = 'readmitted'

# Column order used everywhere downstream: categorical, numerical, then target.
EXPECTED_COLUMNS = [*EXPECTED_CATEGORICAL_FEATURES, *EXPECTED_NUMERICAL_FEATURES, TARGET_COLUMN]

print(f"Expected categorical features: {len(EXPECTED_CATEGORICAL_FEATURES.keys())}")
print(f"Expected numerical features: {len(EXPECTED_NUMERICAL_FEATURES)}")
print(f"Total expected features: {len(EXPECTED_COLUMNS)}")

Expected categorical features: 23
Expected numerical features: 15
Total expected features: 39


In [30]:
# Work on an independent deep copy so the raw frame `df` is never modified
df_clean = df.copy(deep=True)

In [31]:
# The dataset marks unknown values with a literal '?'; turn those cells into NaN
df_clean = df_clean.replace(to_replace='?', value=np.nan)

In [32]:
# Replace non-numeric diag_* values with NaN.
# A value is kept only when its text form starts with a numeric character;
# everything else becomes NaN.
for col in ['diag_1', 'diag_2', 'diag_3', 'diag_4', 'diag_5']:
    # str(v)[:1] (not [0]) so an empty-string cell yields '' instead of raising IndexError.
    starts_with_digit = df_clean[col].map(lambda v: str(v)[:1].isnumeric()).astype(bool)
    # errors='coerce' turns values that start with a digit but still are not valid
    # numbers into NaN, rather than aborting the whole cleaning step with a ValueError.
    df_clean[col] = pd.to_numeric(df_clean[col].where(starts_with_digit), errors='coerce')

In [33]:
# Parse X1 and X2 columns to make them numerical
def parse_dosage(value):
    """Convert one raw X1/X2 cell into a number.

    Parameters
    ----------
    value : object
        A single cell: missing, a string such as ``'>200'``, or a number.

    Returns
    -------
    float or int
        * ``0`` when the cell is missing (``pd.isna`` is true);
        * the number following ``'>'`` for strings that start with ``'>'``
          (``NaN`` if that remainder is not a valid float);
        * the value itself, as ``float``, when it is already numeric
          (Python or numpy integer/float);
        * ``NaN`` for anything else (other strings, booleans, ...).
    """
    if pd.isna(value):
        return 0

    if isinstance(value, str):
        if not value.startswith('>'):
            return np.nan
        try:
            return float(value[1:])
        except ValueError:
            return np.nan

    # Booleans are integers in Python but are not dosage values.
    if isinstance(value, bool):
        return np.nan

    # Accept every real numeric type, not only Python floats: integer cells and
    # numpy scalar types used to fall through and be turned into NaN.
    if isinstance(value, (int, float, np.integer, np.floating)):
        return float(value)

    return np.nan

# Convert each dosage column that is actually present; absent ones are skipped.
present_dosage_columns = [name for name in ('X1', 'X2') if name in df_clean.columns]
for dosage_column in present_dosage_columns:
    parsed_values = df_clean[dosage_column].apply(parse_dosage)
    df_clean[dosage_column] = parsed_values

In [34]:
# Drop unnecessary columns, and ensure all expected ones are present
missing_columns = [col for col in EXPECTED_COLUMNS if col not in df_clean.columns]
if missing_columns:
    # Raise instead of calling exit(): inside a notebook kernel exit() does not
    # report a failure, whereas an exception stops "Run All" at this cell with
    # a visible traceback and message.
    raise ValueError(
        f"[ERROR]: Missing some of the expected columns in training dataset: {missing_columns}"
    )

# Every expected column is present at this point, so keep exactly those,
# in the canonical EXPECTED_COLUMNS order.
available_columns = list(EXPECTED_COLUMNS)
df_clean = df_clean[available_columns]

print(f"Columns kept: {len(available_columns)}")
print(f"Columns dropped: {len(df.columns) - len(available_columns)}")

Columns kept: 39
Columns dropped: 10


In [35]:
# Keep the first occurrence of every distinct row and discard exact repeats
print(f"Shape before removing duplicates: {df_clean.shape}")
df_clean = df_clean.drop_duplicates(keep='first')
print(f"Shape after removing duplicates: {df_clean.shape}")

Shape before removing duplicates: (66587, 39)
Shape after removing duplicates: (48911, 39)


In [36]:
# Count NaN cells per column and list the most incomplete columns first
nan_counts = df_clean.isnull().sum()
missing_analysis = nan_counts.sort_values(ascending=False)

print("Missing values per column:")
print(missing_analysis)

Missing values per column:
diag_4                3129
diag_3                3129
X2                    2397
diag_2                1317
X1                    1231
diag_1                 818
gender                   0
X5                       0
X4                       0
X3                       0
diabetesMed              0
change                   0
age                      0
X6                       0
X7                       0
X15                      0
X16                      0
X9                       0
X10                      0
X11                      0
X12                      0
X13                      0
X14                      0
X24                      0
X23                      0
X22                      0
X21                      0
X20                      0
X17                      0
time_in_hospital         0
num_lab_procedures       0
number_diagnoses         0
number_inpatient         0
number_emergency         0
number_outpatient        0
num_medications          0
n

In [37]:
# Discard every row that still contains at least one NaN cell
print(f"Shape before dropping NaNs: {df_clean.shape}")
df_clean = df_clean.dropna(axis=0, how='any')
print(f"Shape after dropping NaNs: {df_clean.shape}")

Shape before dropping NaNs: (48911, 39)
Shape after dropping NaNs: (40887, 39)


In [38]:
# Announce the end of the cleaning stage and preview the cleaned frame.
print("Data cleaning completed!")
# Last expression of the cell: rendered as a table of the first rows of df_clean.
df_clean.head()

Data cleaning completed!


Unnamed: 0,gender,age,change,diabetesMed,X3,X4,X5,X6,X7,X9,...,number_inpatient,number_diagnoses,diag_1,diag_2,diag_3,diag_4,diag_5,X1,X2,readmitted
0,Male,[80-90),No,Yes,No,No,No,No,No,No,...,4,9,783.0,403.0,585.0,585.0,32.970966,0.0,0.0,1
1,Male,[60-70),No,No,No,No,No,No,No,No,...,0,6,564.0,788.0,569.0,569.0,94.116731,0.0,0.0,0
2,Female,[50-60),Ch,Yes,No,No,No,No,No,No,...,5,9,574.0,403.0,585.0,585.0,64.138297,0.0,0.0,1
4,Male,[70-80),No,No,No,No,No,No,No,No,...,0,9,311.0,276.0,300.0,300.0,82.898368,200.0,0.0,1
5,Male,[40-50),Ch,Yes,No,No,No,No,No,No,...,0,5,410.0,250.02,414.0,414.0,91.161586,0.0,0.0,0


In [39]:
# Export the original (raw) rows that survived the cleaning process:
#   sample_input.csv  - every raw column except the target
#   sample_output.csv - the encounter id together with the target
survived_cleaning = df.index.isin(df_clean.index)
samples_df = df.loc[survived_cleaning].copy()

sample_input_df = samples_df.drop(columns=[TARGET_COLUMN])
sample_output_df = samples_df.loc[:, [ID_COLUMN, TARGET_COLUMN]].copy()

for frame, csv_path in (
    (sample_input_df, "sample_input.csv"),
    (sample_output_df, "sample_output.csv"),
):
    frame.to_csv(csv_path, index=False)

## Pipeline Definition

In [40]:
# Restrict the expected feature lists to the columns present in df_clean,
# preserving the order in which the features were declared.
cleaned_columns = df_clean.columns

available_num_features = []
for feature in EXPECTED_NUMERICAL_FEATURES:
    if feature in cleaned_columns:
        available_num_features.append(feature)

available_cat_features = []
for feature in EXPECTED_CATEGORICAL_FEATURES:
    if feature in cleaned_columns:
        available_cat_features.append(feature)

print(f"Available categorical features: {len(available_cat_features)}")
print(f"Available numerical features: {len(available_num_features)}")

Available categorical features: 23
Available numerical features: 15


In [41]:
# Preprocessing stage of the pipeline:
#   - numerical features are min-max scaled;
#   - categorical features are one-hot encoded against their declared levels,
#     dropping the first level of each feature.
declared_levels = [EXPECTED_CATEGORICAL_FEATURES[feature] for feature in available_cat_features]

numeric_scaler = MinMaxScaler()
categorical_encoder = OneHotEncoder(
    categories=declared_levels,
    drop='first',
    sparse_output=False,
)

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_scaler, available_num_features),
        ('cat', categorical_encoder, available_cat_features),
    ]
)

In [42]:
# Full model: preprocessing followed by a random-forest classifier.
# The fixed random_state keeps training reproducible across runs.
forest_classifier = RandomForestClassifier(n_estimators=200, random_state=42)

ml_pipeline = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('classifier', forest_classifier),
    ]
)

## Data Splitting

In [43]:
# Separate the target column (y) from the input features (x)
y = df_clean[TARGET_COLUMN]
x = df_clean.drop(TARGET_COLUMN, axis=1)

In [44]:
# Hold out 10% of the rows for validation; the split is stratified on the
# target and seeded so it is reproducible.
x_train, x_val, y_train, y_val = train_test_split(
    x,
    y,
    test_size=0.1,
    random_state=42,
    stratify=y,
)

In [45]:
# Report the size of each split (rows, feature columns).
print(f"Training set shape: {x_train.shape}")
print(f"Validation set shape: {x_val.shape}")
# Report the per-class row counts of the target in each split.
print(f"Training target distribution: {pd.Series(y_train).value_counts().to_dict()}")
print(f"Validation target distribution: {pd.Series(y_val).value_counts().to_dict()}")

Training set shape: (36798, 38)
Validation set shape: (4089, 38)
Training target distribution: {0: 19536, 1: 17262}
Validation target distribution: {0: 2171, 1: 1918}


## Pipeline Training

In [46]:
# Fit the preprocessing steps and the classifier on the training split only.
# As the last expression of the cell, the fitted pipeline is also displayed.
ml_pipeline.fit(x_train, y_train)

0,1,2
,steps,"[('preprocessor', ...), ('classifier', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('num', ...), ('cat', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,feature_range,"(0, ...)"
,copy,True
,clip,False

0,1,2
,categories,"[['Male', 'Female', ...], ['[0-10)', '[10-20)', ...], ...]"
,drop,'first'
,sparse_output,False
,dtype,<class 'numpy.float64'>
,handle_unknown,'error'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,n_estimators,200
,criterion,'gini'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [47]:
# Print the repr of each named step of the pipeline.
print(f"\nPipeline components:")
print(f"- Preprocessor: {ml_pipeline.named_steps['preprocessor']}")
print(f"- Classifier: {ml_pipeline.named_steps['classifier']}")


Pipeline components:
- Preprocessor: ColumnTransformer(transformers=[('num', MinMaxScaler(),
                                 ['time_in_hospital', 'num_lab_procedures',
                                  'num_procedures', 'num_medications',
                                  'number_outpatient', 'number_emergency',
                                  'number_inpatient', 'number_diagnoses',
                                  'diag_1', 'diag_2', 'diag_3', 'diag_4',
                                  'diag_5', 'X1', 'X2']),
                                ('cat',
                                 OneHotEncoder(categories=[['Male', 'Female',
                                                            'Unknown/Invalid'],
                                                           ['[0-10)', '[10-20...
                                                           ['No', 'Steady',
                                                            'Up', 'Down'],
                                                

## Model Evaluation

In [48]:
# Predict on both splits, then compare against the true labels.
y_train_pred, y_val_pred = (ml_pipeline.predict(features) for features in (x_train, x_val))

train_accuracy = accuracy_score(y_true=y_train, y_pred=y_train_pred)
val_accuracy = accuracy_score(y_true=y_val, y_pred=y_val_pred)

print(f"Training Accuracy: {train_accuracy*100:.2f}%")
print(f"Validation Accuracy: {val_accuracy*100:.2f}%")

Training Accuracy: 100.00%
Validation Accuracy: 62.66%


In [49]:
# Per-class precision, recall and F1 on the validation split
validation_report = classification_report(y_val, y_val_pred)
print("\n=== Classification Report (Validation Set) ===")
print(validation_report)


=== Classification Report (Validation Set) ===
              precision    recall  f1-score   support

           0       0.63      0.74      0.68      2171
           1       0.63      0.50      0.56      1918

    accuracy                           0.63      4089
   macro avg       0.63      0.62      0.62      4089
weighted avg       0.63      0.63      0.62      4089



## Pipeline Export

In [50]:
# Bundle the fitted pipeline with the metadata needed to use it later:
# input column order, feature lists, measured accuracies and a timestamp.
creation_timestamp = datetime.now().isoformat()

artifacts = {
    "pipeline": ml_pipeline,
    "feature_names": list(x.columns),
    "categorical_features": list(EXPECTED_CATEGORICAL_FEATURES),
    "numerical_features": EXPECTED_NUMERICAL_FEATURES,
    "training_accuracy": train_accuracy,
    "validation_accuracy": val_accuracy,
    "created_at": creation_timestamp,
    "model_type": "RandomForestClassifier with preprocessing pipeline",
}

# Persist the bundle to disk; joblib returns the list of files it wrote.
jl.dump(artifacts, filename="model_pipeline.joblib")

['model_pipeline.joblib']