# Data Preprocessing

---

In [None]:
# 1_data_preprocessing.ipynb

# ðŸ“Œ Step 1: Library Imports
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
from imblearn.over_sampling import SMOTE
import matplotlib.pyplot as plt
import seaborn as sns

# 📌 Step 2: Load Dataset
# Missing entries are assumed to be written as a literal '?' in this file
# (that is how the UCI "Diabetes 130-US hospitals" release encodes them --
# TODO confirm against the raw CSV). Parsing '?' as NaN is what allows the
# dropna() call in Step 4 to actually remove incomplete rows; otherwise each
# '?' is kept as an ordinary string value.
df = pd.read_csv('../data/raw/diabetic_data.csv', na_values='?')
print("Initial shape:", df.shape)

# 📌 Step 3: Drop High Missingness Columns
# These three columns are removed from the frame entirely.
drop_cols = ['weight', 'payer_code', 'medical_specialty']
df = df.drop(columns=drop_cols)

# 📌 Step 4: Handle Missing Values
# Keep only the rows in which every one of the fields below is non-null.
required_fields = ['race', 'gender', 'diag_1', 'diag_2', 'diag_3']
df = df.dropna(subset=required_fields)

# 📌 Step 5: Encode Categorical Features
# Binary-encode gender (Male -> 1, Female -> 0).
# Series.map() turns every value that is absent from the mapping into NaN.
# Rows carrying any other gender label are therefore dropped explicitly here
# (consistent with Step 4, which already discards rows without a gender)
# rather than letting NaN slip into the feature matrix.
gender_codes = {'Male': 1, 'Female': 0}
encoded_gender = df['gender'].map(gender_codes)
has_known_gender = encoded_gender.notna()
print("Rows dropped for unmapped gender:", int((~has_known_gender).sum()))
# .copy() makes the filtered frame independent, so the column assignment
# below does not write into a view of the previous frame.
df = df.loc[has_known_gender].copy()
df['gender'] = encoded_gender.loc[has_known_gender].astype(int)

# One-hot encode nominal categorical features.
# drop_first=True omits the first level of each column, avoiding a redundant
# indicator per encoded feature.
categorical_cols = ['admission_type_id', 'discharge_disposition_id', 'admission_source_id']
df = pd.get_dummies(df, columns=categorical_cols, drop_first=True)

# 📌 Step 6: Create Custom Features
# Sum of the outpatient, emergency and inpatient visit counts.
df['service_utilization'] = df['number_outpatient'] + df['number_emergency'] + df['number_inpatient']

# Per-row count of 'Ch' values across every column whose name contains
# 'change'. Done as one vectorized comparison + row sum instead of a row-wise
# apply(), which invokes a Python lambda once per row; the result is the same.
# NOTE(review): if only a single column name contains 'change', this is a 0/1
# flag rather than a count -- confirm which columns were meant to be included.
df['med_change_count'] = (df.filter(like='change') == 'Ch').sum(axis=1)

# 📌 Step 7: Log Transform Skewed Features
# log1p computes log(1 + x) element-wise, so zero counts map to zero.
# The ufunc is applied to the whole column selection in a single call.
skewed_cols = ['number_emergency', 'number_inpatient', 'service_utilization']
df[skewed_cols] = np.log1p(df[skewed_cols])

# 📌 Step 8: Split Data, Then Normalize Numerical Features
# Binary target: 1 when readmitted == '<30', 0 for every other value.
y = df['readmitted'].apply(lambda x: 1 if x == '<30' else 0)  # Binary classification
features = df.drop(columns=['readmitted'])

# SMOTE synthesizes new samples by interpolating between existing ones, so it
# needs a purely numeric, NaN-free matrix. Columns that are still non-numeric
# at this point are reported and left out of X (encode them upstream to keep
# them), and rows with a missing value in the remaining columns are dropped.
# NOTE(review): any numeric identifier columns still present in df will remain
# in X -- confirm whether they should be removed before modelling.
non_numeric_cols = features.select_dtypes(exclude=['number', 'bool']).columns.tolist()
if non_numeric_cols:
    print("Excluded non-numeric columns from X:", non_numeric_cols)
features = features.drop(columns=non_numeric_cols)
complete_rows = features.notna().all(axis=1)
X = features.loc[complete_rows]
y = y.loc[complete_rows]

X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.2, random_state=42)

# Fit the scaler on the training split only and reuse its min/max for the test
# split. Fitting on the full dataset before splitting would leak test-set
# statistics into training. Note that df itself is no longer scaled; only
# X_train / X_test are.
scaler = MinMaxScaler()
num_cols = ['time_in_hospital', 'num_lab_procedures', 'num_medications']
X_train = X_train.copy()  # independent copies so the assignments below are safe
X_test = X_test.copy()
X_train[num_cols] = scaler.fit_transform(X_train[num_cols])
X_test[num_cols] = scaler.transform(X_test[num_cols])

# 📌 Step 9: Apply SMOTE
# Oversample the training split only; the test split keeps its original
# class distribution.
smote = SMOTE(random_state=42)
X_train_bal, y_train_bal = smote.fit_resample(X_train, y_train)

print("Post-SMOTE class distribution:\n", pd.Series(y_train_bal).value_counts())


ModuleNotFoundError: No module named 'imblearn'