In [1]:
# 04. Model Training (FINAL FIXED VERSION)

import pandas as pd
from sklearn.preprocessing import StandardScaler, LabelEncoder
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestClassifier
import joblib

# -----------------------------
# LOAD PREPROCESSED DATA
# -----------------------------
train = pd.read_csv("train_processed.csv")

# FULL FEATURE SET: every column except the "Churn" target.
X = train.drop(columns="Churn")
y = train["Churn"]

# Identify non-numeric columns (should be NONE after preprocessing).
# "string" and "category" are listed alongside "object" so that text columns
# are still caught when pandas stores them with its dedicated string dtype or
# as categoricals instead of plain object; otherwise they would skip encoding
# and reach StandardScaler as raw text.
categorical_cols = X.select_dtypes(
    include=["object", "string", "category"]
).columns.tolist()

# If there are still categorical columns -> encode them.
# One fitted encoder is kept per column so the same mapping can be re-applied
# at inference time.
label_encoders = {}
X_encoded = X.copy()

for col in categorical_cols:
    le = LabelEncoder()
    # astype(str) hands the encoder uniformly typed values (any missing
    # entries are encoded through their string form).
    X_encoded[col] = le.fit_transform(X[col].astype(str))
    label_encoders[col] = le

# -----------------------------
# SCALE NUMERIC FEATURES
# -----------------------------
# Fit the scaler on the encoded feature frame, then apply the learned
# standardisation to that same frame. The fitted scaler is kept because it is
# persisted later for inference.
scaler = StandardScaler()
scaler.fit(X_encoded)
X_scaled = scaler.transform(X_encoded)

# -----------------------------
# BALANCE THE DATA
# -----------------------------
# Resample the scaled features and the target with SMOTE; the fixed seed keeps
# the resampled set reproducible between runs.
sm = SMOTE(random_state=42)
X_bal, y_bal = sm.fit_resample(X=X_scaled, y=y)

# -----------------------------
# TRAIN MODEL
# -----------------------------
# Build the random forest and fit it on the resampled data in one step
# (fit() returns the estimator itself, so `rf` is the trained model).
rf = RandomForestClassifier(
    # Reproducibility and class handling
    random_state=42,
    class_weight="balanced",
    # Forest size and tree-shape limits
    n_estimators=400,
    max_depth=12,
    min_samples_leaf=4,
    min_samples_split=10,
).fit(X_bal, y_bal)

# -----------------------------
# SAVE EVERYTHING FOR INFERENCE
# -----------------------------

# Output filename -> object to persist, written in this order:
#   model, scaler, label encoders (possibly an empty dict), and the column
#   order used during training.
artifacts = {
    "rf_model.pkl": rf,
    "scaler.pkl": scaler,
    "label_encoders.pkl": label_encoders,
    "feature_names.pkl": list(X_encoded.columns),
}

for filename, artifact in artifacts.items():
    joblib.dump(artifact, filename)

print("Model training complete. All artifacts saved.")

Model training complete. All artifacts saved.
