In [1]:
# train_compare_models.ipynb

import pandas as pd
import joblib
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    roc_auc_score,
    classification_report
)

# --- Load the cleaned churn data ---
df = pd.read_csv("../data/churn_clean.csv")

# The "Churn" column is the target; every other column is a feature.
# pd.get_dummies one-hot encodes the non-numeric feature columns.
y = df["Churn"]
X = pd.get_dummies(df.drop(columns="Churn"))

# Hold out 20% of the rows for evaluation. stratify=y keeps the class
# proportions the same in both parts; random_state pins the shuffle.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# --- Logistic Regression (trained on scaled features) ---
# with_mean=False skips centering, so each column is only divided by its
# standard deviation. The scaler is fitted on the training split alone and
# then reused unchanged for the test split.
scaler = StandardScaler(with_mean=False).fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

log_reg = LogisticRegression(class_weight="balanced", max_iter=1000).fit(
    X_train_scaled, y_train
)
y_pred_lr = log_reg.predict(X_test_scaled)
y_proba_lr = log_reg.predict_proba(X_test_scaled)[:, 1]  # second column, i.e. classes_[1]

# --- Random Forest (trained on the unscaled one-hot features) ---
rf = RandomForestClassifier(
    n_estimators=200,
    class_weight="balanced",
    random_state=42,
).fit(X_train, y_train)
y_pred_rf = rf.predict(X_test)
y_proba_rf = rf.predict_proba(X_test)[:, 1]  # second column, i.e. classes_[1]

# --- Metrics function ---
def evaluate_model(name, y_true, y_pred, y_proba):
    """Build one row of the model comparison table.

    Args:
        name: Label stored under the "Model" key.
        y_true: Ground-truth labels.
        y_pred: Predicted labels, scored by accuracy, precision, recall and F1.
        y_proba: Predicted scores/probabilities, used only for ROC-AUC.

    Returns:
        dict with the keys "Model", "Accuracy", "Precision", "Recall",
        "F1 Score" and "ROC-AUC", in that order.
    """
    # Metrics that compare hard label predictions against the truth.
    label_metrics = (
        ("Accuracy", accuracy_score),
        ("Precision", precision_score),
        ("Recall", recall_score),
        ("F1 Score", f1_score),
    )

    row = {"Model": name}
    for label, metric in label_metrics:
        row[label] = metric(y_true, y_pred)

    # ROC-AUC is computed from the scores rather than the thresholded labels.
    row["ROC-AUC"] = roc_auc_score(y_true, y_proba)
    return row

# --- Compare both models ---
# One metrics row per model, both scored against the same held-out y_test.
results = [
    evaluate_model("Logistic Regression", y_test, y_pred_lr, y_proba_lr),
    evaluate_model("Random Forest", y_test, y_pred_rf, y_proba_rf),
]

results_df = pd.DataFrame(results)
print("📊 Model Comparison:")
print(results_df)

# --- Save the Random Forest for the dashboard ---
# NOTE: rf is saved unconditionally; it is NOT selected from results_df.
# Check the comparison table before treating it as the better model.
joblib.dump(rf, "../app/model.pkl")
# rf was fitted on the one-hot encoded frame X, so the encoded column index is
# stored next to the model (presumably so the consumer can rebuild the same
# columns at prediction time; confirm against the app code).
joblib.dump(X.columns, "../app/model_features.pkl")

print("\n✅ Random Forest model saved as model.pkl for dashboard use.")


📊 Model Comparison:
                 Model  Accuracy  Precision    Recall  F1 Score   ROC-AUC
0  Logistic Regression  0.789212   0.630508  0.497326  0.556054  0.831610
1        Random Forest  0.787083   0.631206  0.475936  0.542683  0.828155

✅ Random Forest model saved as model.pkl for dashboard use.
