In [None]:
# -----------------------------
# PHASE 2: FEATURE ENGINEERING + ADVANCED MODEL
# -----------------------------
import pandas as pd
import numpy as np
import os
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, roc_auc_score, confusion_matrix
import joblib

# -----------------------------
# Load cleaned dataset
# -----------------------------
# Read the raw CSV, coerce TotalCharges to a numeric dtype (values that cannot
# be parsed become NaN), then discard the rows where that coercion produced NaN
# and renumber the index from zero.
df = (
    pd.read_csv('churn_data.csv')
    .assign(TotalCharges=lambda frame: pd.to_numeric(frame['TotalCharges'], errors='coerce'))
    .dropna(subset=['TotalCharges'])
    .reset_index(drop=True)
)

# -----------------------------
# FEATURE ENGINEERING
# -----------------------------
# Bucket tenure into labelled bands used as a categorical feature.
# pd.cut intervals are right-closed and left-open by default, so without
# include_lowest=True a tenure of exactly 0 would fall outside the first bin
# and become NaN; include_lowest closes the first interval on the left.
# Tenure values above 72 still fall outside the bins and become NaN.
bins = [0, 12, 24, 48, 60, 72]
labels = ['0-12','13-24','25-48','49-60','61-72']
df['tenure_group'] = pd.cut(df['tenure'], bins=bins, labels=labels, include_lowest=True)

# Number of service columns whose value is exactly 'Yes', per customer.
# Computed as a vectorized comparison + row sum instead of a row-wise apply;
# the resulting counts are the same.
# NOTE(review): InternetService appears to hold plan names (e.g. 'DSL',
# 'Fiber optic') rather than 'Yes', so it likely never contributes to this
# count — confirm whether it should be counted as "subscribed" instead. Left
# unchanged here because any code that scores with the saved model must
# compute num_services the same way.
service_cols = ['PhoneService','MultipleLines','InternetService','OnlineSecurity','OnlineBackup',
                'DeviceProtection','TechSupport','StreamingTV','StreamingMovies']
df['num_services'] = (df[service_cols] == 'Yes').sum(axis=1)

# Total charges spread over tenure; the +1 keeps the denominator non-zero
# when tenure is 0.
df['avg_monthly_charge'] = df['TotalCharges'] / (df['tenure'] + 1)

# -----------------------------
# PREPARE DATA
# -----------------------------
# Target: 1 where Churn is the string 'Yes', 0 otherwise.
y = df['Churn'].eq('Yes').astype(int)

# Features: everything except the identifier and the target, with categorical
# columns expanded to indicator columns (first level of each dropped).
feature_frame = df.drop(columns=['customerID','Churn'])
X = pd.get_dummies(feature_frame, drop_first=True)

# -----------------------------
# TRAIN TEST SPLIT
# -----------------------------
# Hold out 20% of the rows as a test set; the split is stratified on y and
# uses a pinned random_state of 42.
split_options = dict(test_size=0.2, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

# -----------------------------
# MODEL TRAINING
# -----------------------------
# Random forest with 200 trees, depth capped at 10 and random_state pinned to 42.
# fit() returns the estimator itself, so construction and fitting are chained.
forest_params = {"n_estimators": 200, "max_depth": 10, "random_state": 42}
rf = RandomForestClassifier(**forest_params).fit(X_train, y_train)

# -----------------------------
# MODEL EVALUATION
# -----------------------------
# Score the held-out split: hard labels feed the classification report and
# confusion matrix; the probability in column 1 (the class encoded as 1,
# i.e. Churn == 'Yes') feeds ROC-AUC.
y_proba = rf.predict_proba(X_test)[:, 1]
y_pred = rf.predict(X_test)

report_text = classification_report(y_test, y_pred)
auc_value = roc_auc_score(y_test, y_proba)
conf_mat = confusion_matrix(y_test, y_pred)

print("\n--- RANDOM FOREST EVALUATION ---")
print(report_text)
print("ROC-AUC:", auc_value)
print("Confusion Matrix:\n", conf_mat)

# -----------------------------
# SAVE MODEL AND FEATURE COLUMNS
# -----------------------------
# Ensure the models folder exists. The directory is a sibling of the current
# working directory; abspath collapses the '..' component so that the path
# used for saving and shown in the message below is a clean absolute path.
models_dir = os.path.abspath(os.path.join(os.getcwd(), '..', 'models'))  # adjust if necessary
os.makedirs(models_dir, exist_ok=True)

# Save the fitted model
joblib.dump(rf, os.path.join(models_dir, 'rf_churn_model.joblib'))

# Save the feature column names (in training order) next to the model
joblib.dump(X.columns.tolist(), os.path.join(models_dir, 'feature_columns.joblib'))

print(f"Model and feature columns saved successfully in {models_dir}")
# -----------------------------
# Save Random Forest model and feature columns safely
# -----------------------------
import joblib
import os
import pandas as pd

# This cell expects two variables to already exist in the notebook session:
#   rf - the trained Random Forest model (replace with your variable name if different)
#   X  - the one-hot encoded feature DataFrame used for training
#
# If training has not been run in this session, run the Phase 2 training code
# first (or reload a previously saved model) before executing this cell.

# -----------------------------
# Make sure 'models' folder exists in project root
# -----------------------------
# Adjust this path to your churn_project folder
# The project root is taken to be the parent of the current working directory
# (assumes the notebook runs from a subfolder such as notebooks/).
project_root = os.path.abspath(os.path.join(os.getcwd(), os.pardir))
models_dir = os.path.join(project_root, "models")
os.makedirs(models_dir, exist_ok=True)

# Destination files for the two artifacts
model_path = os.path.join(models_dir, "rf_churn_model.joblib")
columns_path = os.path.join(models_dir, "feature_columns.joblib")

# Persist the fitted model and the list of feature column names
joblib.dump(rf, model_path)
joblib.dump(list(X.columns), columns_path)

print("✅ Model and feature columns saved successfully in:", models_dir)

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn,tenure_group,num_services,avg_monthly_charge
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,Month-to-month,Yes,Electronic check,29.85,29.85,No,0-12,1,14.925
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,No,One year,No,Mailed check,56.95,1889.5,No,25-48,3,53.985714
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes,0-12,3,36.05
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,No,One year,No,Bank transfer (automatic),42.3,1840.75,No,25-48,3,40.016304
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes,0-12,1,50.55


In [None]:
# Scratch cell: prints a fixed string and uses none of the variables defined above.
print("hello")