In [11]:
# 1. Import Libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, precision_score, recall_score
import joblib

# 2. Load Dataset
df = pd.read_csv("readmission_data.csv")  # Replace with your dataset path

# 3. Define Target and Features
# Encode the target as 1 ('Yes') / 0 ('No'). Series.map turns every label that
# is not a key of the mapping into NaN, so validate before overwriting the
# column: failing here names the offending labels instead of surfacing later
# as an unrelated error during the stratified split or model fitting.
target_mapping = {'Yes': 1, 'No': 0}
encoded_target = df['readmitted'].map(target_mapping)
unmapped_rows = encoded_target.isna()
if unmapped_rows.any():
    unexpected_labels = df.loc[unmapped_rows, 'readmitted'].unique().tolist()
    raise ValueError(
        "Column 'readmitted' must contain only 'Yes' or 'No'; "
        f"found unexpected values: {unexpected_labels}"
    )
df['readmitted'] = encoded_target

# Features are every column except the target.
X = df.drop(columns='readmitted')
y = df['readmitted']

# 4. Identify Feature Types by inspecting DataFrame columns
# 'hospital_stay', 'patient_visits' and 'num_diagnosis' are treated as
# categorical even when pandas infers a numeric dtype for them.
forced_categorical = ['hospital_stay', 'patient_visits', 'num_diagnosis']

# Fail early with a clear message if an expected column is absent.
missing_cols = [col for col in forced_categorical if col not in X.columns]
if missing_cols:
    raise ValueError(f"Expected columns not found in dataset: {missing_cols}")

# Numeric features: every numeric column except the forced-categorical ones.
# Filtering (instead of list.remove) also works when one of those columns was
# not inferred as numeric in the first place.
numerical_cols = [
    col for col in X.select_dtypes(include=np.number).columns
    if col not in forced_categorical
]

# Categorical features: every non-numeric column, followed by the
# forced-categorical ones. Using exclude=np.number (rather than
# include='object') keeps bool/category/string columns from ending up in
# neither list, where the ColumnTransformer would silently drop them.
categorical_cols = [
    col for col in X.select_dtypes(exclude=np.number).columns
    if col not in forced_categorical
]
categorical_cols.extend(forced_categorical)


# 5. Train-Test Split
# Hold out 20% of the rows, stratified on the target, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# 6. Preprocessing Pipeline
# Numeric columns are standardized; categorical columns are one-hot encoded,
# with categories unseen during fitting ignored at transform time.
scaling_step = ('num', StandardScaler(), numerical_cols)
encoding_step = ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols)
preprocessor = ColumnTransformer(transformers=[scaling_step, encoding_step])

# 7. Build Pipeline with Random Forest
# Preprocessing and the classifier live in one estimator so the same
# transformations are applied at fit and predict time.
forest = RandomForestClassifier(n_estimators=100, max_depth=10, random_state=42)
pipeline = Pipeline(steps=[
    ('preprocess', preprocessor),
    ('classifier', forest),
])

# 8. Cross-Validation
# 5-fold stratified CV on the training split only; reports mean and standard
# deviation of the per-fold accuracy.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
scores = cross_val_score(pipeline, X_train, y_train, scoring='accuracy', cv=cv)
mean_accuracy = scores.mean()
accuracy_std = scores.std()
print(f"Cross-validated Accuracy: {mean_accuracy:.3f} ± {accuracy_std:.3f}")

# 9. Train Final Model
# Fit preprocessing and classifier together on the full training split.
pipeline.fit(X_train, y_train)

# 10. Evaluate Model
# All metrics below are computed on the held-out test split.
y_pred = pipeline.predict(X_test)

test_confusion = confusion_matrix(y_test, y_pred)
print("\nConfusion Matrix:\n", test_confusion)

test_report = classification_report(y_test, y_pred)
print("\nClassification Report:\n", test_report)

test_precision = precision_score(y_test, y_pred)
test_recall = recall_score(y_test, y_pred)
print(f"Precision: {test_precision:.3f}")
print(f"Recall: {test_recall:.3f}")

# 11. Save Model for Deployment
# Persists the fitted pipeline (preprocessing + classifier) as a single file.
joblib.dump(pipeline, "readmission_model.pkl")

Cross-validated Accuracy: 0.632 ± 0.002

Confusion Matrix:
 [[1376  444]
 [ 786  686]]

Classification Report:
               precision    recall  f1-score   support

           0       0.64      0.76      0.69      1820
           1       0.61      0.47      0.53      1472

    accuracy                           0.63      3292
   macro avg       0.62      0.61      0.61      3292
weighted avg       0.62      0.63      0.62      3292

Precision: 0.607
Recall: 0.466


['readmission_model.pkl']