In [None]:
# Imports
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, classification_report, confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
import joblib
import pickle
import warnings
# Global notebook settings: silence every warning category for the rest of the
# session and switch seaborn to its 'whitegrid' theme for all subsequent plots.
warnings.simplefilter('ignore')
sns.set_theme(style='whitegrid')

In [None]:
# Paths - adjust if your file name/location differs.
# Every location is relative to the notebook's working directory so the
# notebook can be re-run on any machine without editing absolute paths.
RAW_CSV = '../data/raw/WA_Fn-UseC_-Telco-Customer-Churn.csv'
PROCESSED_CSV = '../data/processed/churn_processed.csv'
MODEL_PATH = '../models/churn_model.pkl'

# Ensure the output directories exist before any later cell writes to them
os.makedirs(os.path.dirname(PROCESSED_CSV), exist_ok=True)
os.makedirs(os.path.dirname(MODEL_PATH), exist_ok=True)

# Load raw data from the configured location (RAW_CSV is the single source of
# truth for the input file); fail early with an actionable message when the
# file is not where the configuration says it should be.
if not os.path.isfile(RAW_CSV):
    raise FileNotFoundError(
        f"Raw dataset not found at {RAW_CSV!r}; place the CSV there or update RAW_CSV."
    )
df = pd.read_csv(RAW_CSV)
df.head()

In [None]:
# Column-by-column summary of the raw frame
df.info()

# Count the nulls in each column; the heading begins with a newline so it is
# visually separated from the info() output above.
null_counts = df.isnull().sum()
print('\nMissing values per column:')
print(null_counts)

In [None]:
# Coerce TotalCharges to a numeric dtype; entries that cannot be parsed as
# numbers become NaN (errors='coerce') and are counted below.
total_charges = pd.to_numeric(df['TotalCharges'], errors='coerce')
print('TotalCharges nulls:', total_charges.isnull().sum())

# Replace the NaNs with the column median and store the cleaned series back
df['TotalCharges'] = total_charges.fillna(total_charges.median())

In [None]:

# EDA - distribution of target and key features
# matplotlib is already imported in the first cell, so it is not re-imported
# here. Each chart is drawn on an explicitly created Axes instead of relying on
# pyplot's implicit "current figure" state.

# Class balance of the target
fig, ax = plt.subplots(figsize=(6, 4))
sns.countplot(x='Churn', data=df, ax=ax)
ax.set_title('Churn distribution')
plt.show()

# Histogram of tenure (30 bins, no KDE overlay)
fig, ax = plt.subplots(figsize=(8, 4))
sns.histplot(df['tenure'], kde=False, bins=30, ax=ax)
ax.set_title('Tenure distribution')
plt.show()

# MonthlyCharges compared across the two Churn groups
fig, ax = plt.subplots(figsize=(8, 4))
sns.boxplot(x='Churn', y='MonthlyCharges', data=df, ax=ax)
ax.set_title('MonthlyCharges by Churn')
plt.show()

In [None]:
# Preprocessing & feature engineering
# Work on an independent copy of the raw frame; the customerID column is
# removed when present (errors='ignore' makes the drop a no-op otherwise).
df_proc = df.copy().drop(columns=['customerID'], errors='ignore')

# Encode the Yes/No columns (the Churn target included) as 1/0.
# Columns that are absent from the frame are skipped.
yes_no_codes = {'Yes': 1, 'No': 0}
yes_no_cols = ('Partner', 'Dependents', 'PhoneService', 'PaperlessBilling', 'Churn')
for col in yes_no_cols:
    if col not in df_proc.columns:
        continue
    df_proc[col] = df_proc[col].map(yes_no_codes)

# gender: Male -> 1, Female -> 0
if 'gender' in df_proc.columns:
    df_proc['gender'] = df_proc['gender'].map({'Male': 1, 'Female': 0})

# One-hot encode the multi-level categorical columns that exist in the frame;
# drop_first=True leaves out the first level of each encoded column.
multi_level_cols = [
    'MultipleLines', 'InternetService', 'OnlineSecurity', 'OnlineBackup',
    'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies',
    'Contract', 'PaymentMethod',
]
present_cols = [col for col in multi_level_cols if col in df_proc.columns]
df_proc = pd.get_dummies(df_proc, columns=present_cols, drop_first=True)

print('Processed shape:', df_proc.shape)
df_proc.head()

In [None]:
# Persist the engineered frame to PROCESSED_CSV for reproducibility.
# index=False keeps the row index out of the written file.
df_proc.to_csv(path_or_buf=PROCESSED_CSV, index=False)
print('Saved processed CSV to', PROCESSED_CSV)

In [None]:
# Separate the Churn target from the predictor columns
target_col = 'Churn'
y = df_proc[target_col]
X = df_proc.drop(columns=[target_col])

# Hold out 20% of the rows for testing. The split is stratified on the target
# and uses a fixed random_state so it is repeatable.
split_parts = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = split_parts
print('Train shape:', X_train.shape, 'Test shape:', X_test.shape)

In [None]:
# Candidate classifiers, keyed by the label shown in the comparison table
models = {
    'LogisticRegression': LogisticRegression(max_iter=1000),
    'RandomForest': RandomForestClassifier(n_estimators=200, random_state=42),
    'XGBoost': XGBClassifier(
        use_label_encoder=False,
        eval_metric='logloss',
        random_state=42,
        n_estimators=200,
    ),
}

# Fit each candidate on the training split, score it on the test split, and
# collect one (model, accuracy, f1, auc) record per candidate.
results = []
for name, clf in models.items():
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    # Column 1 of predict_proba is used as the score for the ROC AUC
    y_score = clf.predict_proba(X_test)[:, 1]
    acc, f1, auc = (
        accuracy_score(y_test, y_pred),
        f1_score(y_test, y_pred),
        roc_auc_score(y_test, y_score),
    )
    results.append((name, acc, f1, auc))
    print(f"{name}: Acc={acc:.4f}, F1={f1:.4f}, AUC={auc:.4f}")

# Comparison table, highest accuracy first
results_df = pd.DataFrame(results, columns=['model', 'accuracy', 'f1', 'auc'])
results_df = results_df.sort_values(by='accuracy', ascending=False)
results_df

In [None]:
# The comparison table is sorted by accuracy (descending), so its first row
# names the winner; look up the corresponding fitted estimator.
best_name = results_df['model'].iloc[0]
best_model = models[best_name]
print('Best model:', best_name)

# Serialize the winning estimator to MODEL_PATH with joblib
joblib.dump(value=best_model, filename=MODEL_PATH)
print('Saved model to', MODEL_PATH)

In [None]:
# Detailed evaluation of best model
# The estimator is read back from MODEL_PATH, so the numbers below describe
# the artifact that was written to disk.
best = joblib.load(MODEL_PATH)
preds = best.predict(X_test)
proba = best.predict_proba(X_test)[:, 1]

print(classification_report(y_test, preds))

# Confusion matrix rendered as an annotated heatmap
cm = confusion_matrix(y_test, preds)
fig_cm, ax_cm = plt.subplots(figsize=(5, 4))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=ax_cm)
ax_cm.set_title('Confusion matrix')
ax_cm.set_xlabel('Predicted')
ax_cm.set_ylabel('Actual')
plt.show()

# ROC curve
# `auc` is imported in this cell because the model-comparison loop binds the
# name `auc` to a float score; the import restores the sklearn function.
from sklearn.metrics import roc_curve, auc
fpr, tpr, _ = roc_curve(y_test, proba)
roc_auc = auc(fpr, tpr)
fig_roc, ax_roc = plt.subplots(figsize=(6, 5))
ax_roc.plot(fpr, tpr, label=f'AUC = {roc_auc:.3f}')
# Dashed diagonal from (0, 0) to (1, 1) as a visual reference line
ax_roc.plot([0, 1], [0, 1], '--', color='gray')
ax_roc.set_xlabel('False Positive Rate')
ax_roc.set_ylabel('True Positive Rate')
ax_roc.set_title('ROC Curve')
ax_roc.legend()
plt.show()

In [None]:
# SHAP explainability (optional - can be slow)
# Best-effort cell: any exception raised inside the try block is caught and
# reported as a printed message instead of stopping the notebook.
try:
    # The import sits inside the try, so an ImportError is handled by the
    # except clause below as well.
    import shap
    # Build an explainer for the reloaded best model and evaluate it on the test features
    explainer = shap.Explainer(best)
    shap_values = explainer(X_test)
    shap.summary_plot(shap_values, X_test)
except Exception as e:
    print('SHAP failed or is slow in this environment:', e)

In [None]:
# Save X_test and y_test for reproducible evaluation by app or tests.
# The files go next to the processed dataset: the directory is derived from
# PROCESSED_CSV rather than hard-coded, so changing that setting moves these
# files too. `or '.'` covers a PROCESSED_CSV with no directory component.
processed_dir = os.path.dirname(PROCESSED_CSV) or '.'
os.makedirs(processed_dir, exist_ok=True)

# index=False keeps the row index out of both files
X_test.to_csv(os.path.join(processed_dir, 'X_test.csv'), index=False)
y_test.to_csv(os.path.join(processed_dir, 'y_test.csv'), index=False)
print('Saved X_test and y_test to', processed_dir)