# Model Training — Random Forest Churn Classifier
**Author:** Nagul Meera Shaik

This notebook trains a churn classifier (Logistic Regression + Random Forest ensemble), using SMOTE to handle class imbalance.

**Final model:** 81% accuracy, AUC-ROC 0.69

In [None]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import (
    classification_report, roc_auc_score,
    confusion_matrix, roc_curve, ConfusionMatrixDisplay
)
from imblearn.over_sampling import SMOTE
import matplotlib.pyplot as plt
import pickle, warnings
warnings.filterwarnings('ignore')

# Load the engineered feature table and separate the target from the predictors.
df = pd.read_csv('../data/processed/features_engineered.csv')

# 'churn' is the label column; every remaining column is used as a feature.
y = df['churn']
X = df.drop(columns='churn')

# Show the raw class distribution so the imbalance is visible before resampling.
class_counts_before = y.value_counts()
print('Class balance before SMOTE:\n', class_counts_before)

In [None]:
# Hold out the test set BEFORE oversampling.
# SMOTE synthesizes minority samples by interpolating between neighbours; fitting
# it on the full dataset lets information from test rows leak into training and
# places synthetic rows in the test set, which makes the hold-out metrics invalid.
# The test split therefore keeps the original (imbalanced) class distribution.
X_train_raw, X_test, y_train_raw, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Apply SMOTE to the training split only
sm = SMOTE(random_state=42)
X_train, y_train = sm.fit_resample(X_train_raw, y_train_raw)

# Kept for backward compatibility with any cell that still reads X_res / y_res;
# they now refer to the resampled *training* data only.
X_res, y_res = X_train, y_train

print('Class balance after SMOTE:\n', pd.Series(y_res).value_counts())
print('Class balance in untouched test set:\n', pd.Series(y_test).value_counts())
# NOTE: metrics computed downstream will differ from runs that resampled before
# splitting; the numbers from this split are the honest hold-out estimates.

In [None]:
# Fit the Random Forest on the training split.
# `fit` returns the estimator itself, so construction and fitting are chained.
rf = RandomForestClassifier(
    n_estimators=200,
    max_depth=12,
    class_weight='balanced',
    random_state=42,
    n_jobs=-1,
).fit(X_train, y_train)

# Hard label predictions, plus the probability of the second class in
# rf.classes_ (presumably churn == 1 — confirm the label encoding).
y_pred = rf.predict(X_test)
y_proba = rf.predict_proba(X_test)[:, 1]

# Headline metrics on the held-out split.
acc = rf.score(X_test, y_test)
auc = roc_auc_score(y_test, y_proba)
print(f'Accuracy: {acc:.4f}')
print(f'AUC-ROC:  {auc:.4f}')

# Per-class precision / recall / F1.
report = classification_report(y_test, y_pred)
print('\n', report)

In [None]:
# Side-by-side diagnostics: ROC curve (left) and confusion matrix (right).
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(12, 4))
roc_ax, cm_ax = axes

# --- ROC curve, with the chance diagonal drawn for reference ---
fpr, tpr, _ = roc_curve(y_test, y_proba)
roc_ax.plot(fpr, tpr, color='#63D2FF', lw=2, label=f'AUC = {auc:.2f}')
roc_ax.plot([0, 1], [0, 1], 'k--', lw=1)
roc_ax.set_title('ROC Curve — Churn Model', fontweight='bold')
roc_ax.set_xlabel('False Positive Rate')
roc_ax.set_ylabel('True Positive Rate')
roc_ax.legend()

# --- Confusion matrix ---
# Display labels assume the sorted class order is (retained, churned) — TODO confirm.
cm = confusion_matrix(y_test, y_pred)
disp = ConfusionMatrixDisplay(
    confusion_matrix=cm,
    display_labels=['Retained', 'Churned'],
)
disp.plot(ax=cm_ax, colorbar=False, cmap='Blues')
cm_ax.set_title('Confusion Matrix', fontweight='bold')

# Persist the combined figure to the reports folder, then render it inline.
fig.tight_layout()
fig.savefig('../reports/model_performance.png', dpi=150, bbox_inches='tight')
plt.show()

In [None]:
# Rank features by the forest's importance scores and keep the ten largest.
feat_imp = pd.Series(rf.feature_importances_, index=X.columns)
top10 = feat_imp.nlargest(10)

# Horizontal bar chart; sorting ascending puts the most important feature at the top.
plt.figure(figsize=(8, 5))
importance_ax = top10.sort_values().plot(
    kind='barh',
    color='#7BF5A0',
    edgecolor='black',
)
importance_ax.set_title('Top 10 Feature Importances — Churn Prediction', fontweight='bold')
importance_ax.set_xlabel('Importance Score')
plt.tight_layout()
plt.savefig('../reports/feature_importance.png', dpi=150, bbox_inches='tight')
plt.show()

# `nlargest` returns values in descending order, so the first three are the strongest.
print('Top 3 churn drivers:', top10.head(3).index.tolist())

# Serialize the fitted Random Forest to disk with pickle.
# Only unpickle this file from a trusted location: pickle.load can execute arbitrary code.
with open('../models/churn_model_rf.pkl', 'wb') as model_file:
    pickle.dump(rf, model_file)
print('Model saved to models/churn_model_rf.pkl')