# C. Preprocessing Pipeline and Feature Engineering

In [2]:
import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.metrics import confusion_matrix, precision_score, recall_score
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder

# Step 1: Simulate Data
# Seed NumPy's global RNG so every draw below (and therefore `df`) is reproducible.
np.random.seed(42)
n_samples = 1000

# Each call advances the shared global random stream, so the columns are drawn
# one at a time in a fixed order; reordering these lines would change the data.
simulated = {}
simulated['age'] = np.random.randint(20, 90, size=n_samples)  # upper bound is exclusive
simulated['gender'] = np.random.choice(['Male', 'Female'], size=n_samples)
simulated['ethnicity'] = np.random.choice(
    ['White', 'Black', 'Hispanic', 'Asian', 'Other'], size=n_samples
)
simulated['diagnosis_code'] = np.random.choice(['D1', 'D2', 'D3', 'D4'], size=n_samples)
simulated['length_of_stay'] = np.random.randint(1, 15, size=n_samples)  # upper bound is exclusive
simulated['lab_result1'] = np.random.normal(100, 15, size=n_samples)
simulated['lab_result2'] = np.random.normal(5, 1.5, size=n_samples)
# The label is sampled on its own (roughly 30% positives) and does not depend on
# any of the feature columns above.
simulated['readmitted'] = np.random.choice([0, 1], size=n_samples, p=[0.7, 0.3])  # 30% readmitted

df = pd.DataFrame(simulated)

# Step 2: Define features and target
# `readmitted` is the label; every other column of `df` is a predictor.
y = df["readmitted"]
X = df.drop(columns="readmitted")

# Step 3: Preprocessing
# Column groups routed to the two sub-pipelines below.
numeric_features = ['age', 'length_of_stay', 'lab_result1', 'lab_result2']
categorical_features = ['gender', 'ethnicity', 'diagnosis_code']

# Numeric columns: fill missing values with the column mean, then standardize.
numeric_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
])

# Categorical columns: fill missing values with the most frequent category, then
# one-hot encode. handle_unknown='ignore' keeps transform from raising on a
# category that was not present when the encoder was fitted.
categorical_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Apply each sub-pipeline to its own column group and concatenate the results.
preprocessor = ColumnTransformer(transformers=[
    ('num', numeric_pipeline, numeric_features),
    ('cat', categorical_pipeline, categorical_features),
])

# Step 4: Model pipeline
# Preprocessing and classifier are chained so the transformers are fitted only
# on whatever data the pipeline itself is fitted on.
forest = RandomForestClassifier(n_estimators=100, random_state=42)
model = make_pipeline(preprocessor, forest)

# Step 5: Split data
# 80/20 hold-out, stratified on the label so both parts keep the same class mix.
split = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)
X_train, X_test, y_train, y_test = split

# Step 6: Train model
model.fit(X_train, y_train)

# Step 7: Predictions and evaluation
# Hard class labels for the held-out rows.
y_pred = model.predict(X_test)

# Step 8: Metrics
# Confusion matrix plus precision and recall, all computed with the scorers'
# default settings on the same (y_test, y_pred) pair.
cm = confusion_matrix(y_test, y_pred)
precision, recall = (
    scorer(y_test, y_pred) for scorer in (precision_score, recall_score)
)

print("Confusion Matrix:\n", cm)
print(f"Precision: {precision:.2f}")
print(f"Recall: {recall:.2f}")


Confusion Matrix:
 [[130  10]
 [ 54   6]]
Precision: 0.38
Recall: 0.10


# Optimization (5 points)

In [3]:
# Hyperparameter search over the random-forest step of `model`.
# Keys use the `<step name>__<parameter>` convention; the forest step is
# addressed as 'randomforestclassifier'.
param_grid = {
    'randomforestclassifier__n_estimators': [100, 200],
    'randomforestclassifier__max_depth': [5, 10, None]
}

# Candidates are ranked by recall using 5-fold cross-validation on the training
# split only, so the test split is not touched during model selection.
grid_search = GridSearchCV(model, param_grid, cv=5, scoring='recall')
grid_search.fit(X_train, y_train)

print("Best parameters:", grid_search.best_params_)
# Mean cross-validated recall of the winning candidate; without it the chosen
# parameters cannot be compared against anything.
print(f"Best cross-validated recall: {grid_search.best_score_:.2f}")

# With the default refit=True the best candidate is retrained on the whole
# training split, and grid_search.predict delegates to that refit estimator.
# Score it on the held-out split with the same metrics as the baseline so the
# two runs are directly comparable.
y_pred_tuned = grid_search.predict(X_test)
cm_tuned = confusion_matrix(y_test, y_pred_tuned)
precision_tuned = precision_score(y_test, y_pred_tuned)
recall_tuned = recall_score(y_test, y_pred_tuned)

print("Tuned Confusion Matrix:\n", cm_tuned)
print(f"Tuned Precision: {precision_tuned:.2f}")
print(f"Tuned Recall: {recall_tuned:.2f}")


Best parameters: {'randomforestclassifier__max_depth': None, 'randomforestclassifier__n_estimators': 100}
