# In [None]:
# ===============================
# CUSTOMER CHURN PREDICTION MODEL
# ===============================

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_auc_score, roc_curve

# -------------------------------
# 1. Load Dataset
# -------------------------------
# The CSV is looked up relative to the current working directory.
DATA_PATH = "Telco-Customer-Churn.csv"
df = pd.read_csv(DATA_PATH)
print("Dataset Loaded")

# -------------------------------
# 2. Data Cleaning
# -------------------------------
# One pipeline, three steps:
#   1. parse TotalCharges as numbers; anything unparseable becomes NaN
#   2. discard every row that has a NaN in any column
#   3. remove the customerID column so it is not used as a feature
df = (
    df.assign(TotalCharges=pd.to_numeric(df["TotalCharges"], errors="coerce"))
    .dropna()
    .drop(columns="customerID")
)

# -------------------------------
# 3. Encode Categorical Variables
# -------------------------------
# Every object-dtype column (features and the target alike) is replaced by
# integer codes. A separate LabelEncoder is fitted per column and stored in
# `label_encoders`, keyed by column name, so that each mapping can later be
# inspected (`.classes_`), reversed (`.inverse_transform`) or re-applied to
# new data. Refitting a single shared encoder would keep only the mapping of
# the last column processed.
le = LabelEncoder()  # stays bound even if there are no object columns
label_encoders = {}
for col in df.select_dtypes(include="object").columns:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    label_encoders[col] = le

# -------------------------------
# 4. Split Features and Target
# -------------------------------
# "Churn" is the label; every other remaining column is a feature.
y = df["Churn"]
X = df.drop(columns="Churn")

# -------------------------------
# 5. Train Test Split
# -------------------------------
# 80/20 split, reproducible via the fixed seed, stratified on the label so
# both partitions keep the same class proportions.
split_options = {"test_size": 0.2, "random_state": 42, "stratify": y}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

# -------------------------------
# 6. Feature Scaling
# -------------------------------
# Learn the per-feature mean and standard deviation from the training
# partition only, then apply that same transformation to both partitions so
# no statistics from the test data influence the fit.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

# -------------------------------
# 7. Logistic Regression Model
# -------------------------------
# Default hyper-parameters; fit() returns the estimator itself, so the
# construction and training can be chained.
log_model = LogisticRegression().fit(X_train, y_train)
log_pred = log_model.predict(X_test)

# -------------------------------
# 8. Random Forest Model
# -------------------------------
# 200 trees with a fixed seed for reproducible results.
rf_model = RandomForestClassifier(n_estimators=200, random_state=42).fit(
    X_train, y_train
)

# Hard class predictions for the accuracy / report / confusion matrix.
rf_pred = rf_model.predict(X_test)

# Probability of the second entry in rf_model.classes_ (column index 1),
# used for the ROC analysis.
# NOTE(review): presumably this is the "churned" class - confirm against
# the label encoding.
rf_prob = rf_model.predict_proba(X_test)[:, 1]

# -------------------------------
# 9. Model Evaluation
# -------------------------------
# Test-set accuracy of both models, printed in a fixed order.
accuracy_lines = (
    ("\nLogistic Regression Accuracy:", log_pred),
    ("Random Forest Accuracy:", rf_pred),
)
for heading, predictions in accuracy_lines:
    print(heading, accuracy_score(y_test, predictions))

# Per-class precision / recall / F1 for the random forest only.
rf_report = classification_report(y_test, rf_pred)
print("\nRandom Forest Classification Report:")
print(rf_report)

# -------------------------------
# 10. Confusion Matrix
# -------------------------------
# Rows are the actual classes, columns the predicted ones; cells are
# annotated with integer counts.
cm = confusion_matrix(y_test, rf_pred)
ax = sns.heatmap(cm, annot=True, fmt="d", cmap="Blues")
ax.set(
    title="Confusion Matrix - Random Forest",
    xlabel="Predicted",
    ylabel="Actual",
)
plt.show()

# -------------------------------
# 11. ROC Curve
# -------------------------------
# Area under the ROC curve and the curve's points, both computed from the
# random forest's predicted probabilities.
roc = roc_auc_score(y_test, rf_prob)
fpr, tpr, _ = roc_curve(y_test, rf_prob)

# Start a fresh figure explicitly. Without it the curve is drawn on whatever
# figure is current, which is still the previous plot whenever plt.show()
# does not close it (non-interactive backends, non-blocking show).
plt.figure()
plt.plot(fpr, tpr, label=f"AUC = {roc:.2f}")
plt.plot([0, 1], [0, 1], "--")  # chance-level diagonal for reference
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title("ROC Curve")
plt.legend()
plt.show()

# -------------------------------
# 12. Feature Importance
# -------------------------------
# Pair each feature name with the importance the fitted random forest
# assigned to it, then rank from most to least important.
feature_importance = pd.DataFrame(
    {"Feature": X.columns, "Importance": rf_model.feature_importances_}
)
feature_importance = feature_importance.sort_values(
    by="Importance", ascending=False
)

top_features = feature_importance.head(10)
print("\nTop 10 Important Features:")
print(top_features)

# Persist the full ranking (not only the top 10), without the index column.
feature_importance.to_csv("important_features.csv", index=False)

print("\nProject Completed Successfully!")
