# In [None]:
# ===============================
# 1. Import Libraries
# ===============================
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pickle
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# ===============================
# 2. Load Dataset
# ===============================
# The path points at Colab session storage; upload the CSV there first.
DATA_PATH = "/content/WA_Fn-UseC_-Telco-Customer-Churn.csv"
df = pd.read_csv(DATA_PATH)

# A bare `df.head()` is only rendered when it is the last expression of a
# notebook cell, so print the preview explicitly to make it visible here.
print(df.head())

# ===============================
# 3. Data Exploration
# ===============================
print("Dataset Shape:", df.shape)
# DataFrame.info() writes its summary to stdout itself and returns None, so
# it is called directly; wrapping it in print() emitted a stray "None" line.
df.info()
print(df.describe())
print(df['Churn'].value_counts())

# Optional: visualize churn distribution
sns.countplot(x='Churn', data=df)
plt.show()

# ===============================
# 4. Preprocessing
# ===============================

# Drop the customerID column so it is not carried into the feature matrix.
df = df.drop(columns="customerID")

# Convert TotalCharges to numeric; entries that cannot be parsed become NaN
# (errors='coerce') and are then replaced by the column median.
# The filled column is assigned back rather than calling
# fillna(inplace=True) on the selected column: that chained in-place form
# raises a FutureWarning in pandas 2.x and no longer updates `df` once
# Copy-on-Write is in effect.
total_charges = pd.to_numeric(df["TotalCharges"], errors='coerce')
df["TotalCharges"] = total_charges.fillna(total_charges.median())

# Binary columns: Yes/No -> 1/0
binary_cols = ["Partner", "Dependents", "PhoneService", "PaperlessBilling", "Churn"]
yes_no_to_int = {'Yes': 1, 'No': 0}
for col in binary_cols:
    # NOTE: Series.map turns any value other than 'Yes'/'No' into NaN.
    df[col] = df[col].map(yes_no_to_int)

# Multi-category columns -> One-Hot Encoding
# Every remaining text column is encoded. Both the legacy `object` dtype and
# pandas' dedicated string dtype are accepted, so text columns are not
# silently skipped when pandas stores them with the newer dtype.
multi_cols = [
    col
    for col in df.columns
    if col not in binary_cols
    and (
        pd.api.types.is_object_dtype(df[col])
        or pd.api.types.is_string_dtype(df[col])
    )
]
# drop_first=True omits the first level of each encoded column.
df = pd.get_dummies(df, columns=multi_cols, drop_first=True)

# ===============================
# 5. Feature-Target Split
# ===============================
# The target is the Churn column; every other column is used as a feature.
y = df["Churn"]
X = df.drop(columns="Churn")

# ===============================
# 6. Train-Test Split
# ===============================
# Hold out 20% of the rows for testing. The split is stratified on y and
# uses a fixed random_state so it is reproducible.
split_options = {"test_size": 0.2, "random_state": 42, "stratify": y}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

# ===============================
# 7. Feature Scaling
# ===============================
# Fit the scaler on the training rows only, then apply that same fitted
# scaler to both the training and the test partition.
scaler = StandardScaler().fit(X_train)
X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)

# ===============================
# 8. Model Training: Logistic Regression
# ===============================
# fit() returns the estimator itself, so construction and training chain.
log_model = LogisticRegression(max_iter=1000).fit(X_train, y_train)
y_pred_log = log_model.predict(X_test)

log_accuracy = accuracy_score(y_test, y_pred_log)
print("Logistic Regression Accuracy:", log_accuracy)
print(classification_report(y_test, y_pred_log))

# Confusion Matrix
log_cm = confusion_matrix(y_test, y_pred_log)
log_ax = sns.heatmap(log_cm, annot=True, fmt="d", cmap="Blues")
log_ax.set_title("Logistic Regression Confusion Matrix")
plt.show()

# ===============================
# 9. Model Training: Random Forest
# ===============================
# fit() returns the estimator itself, so construction and training chain.
rf_model = RandomForestClassifier(n_estimators=100, random_state=42).fit(
    X_train, y_train
)
y_pred_rf = rf_model.predict(X_test)

rf_accuracy = accuracy_score(y_test, y_pred_rf)
print("Random Forest Accuracy:", rf_accuracy)
print(classification_report(y_test, y_pred_rf))

# Confusion Matrix
rf_cm = confusion_matrix(y_test, y_pred_rf)
rf_ax = sns.heatmap(rf_cm, annot=True, fmt="d", cmap="Greens")
rf_ax.set_title("Random Forest Confusion Matrix")
plt.show()

# ===============================
# Feature Importance (Random Forest)
# ===============================
# Pair each feature column name with the forest's importance score, then
# order the table from most to least important.
feature_importance = pd.DataFrame(
    {"Feature": X.columns, "Importance": rf_model.feature_importances_}
)
feature_importance = feature_importance.sort_values("Importance", ascending=False)

plt.figure(figsize=(10, 6))
sns.barplot(data=feature_importance, x="Importance", y="Feature")
plt.title("Feature Importance - Random Forest")
plt.show()

# Persist the trained random forest (same file and contents as before).
with open("/content/customer_churn_model.pkl", "wb") as f:
    pickle.dump(rf_model, f)

# The forest was fitted on StandardScaler output computed over the encoded
# feature columns, so new data must go through the same fitted scaler with
# the same column order before prediction. Save both next to the model;
# without them the model file cannot be used for inference on its own.
with open("/content/customer_churn_preprocessing.pkl", "wb") as f:
    pickle.dump({"scaler": scaler, "feature_names": list(X.columns)}, f)

# Bar chart comparing actual and predicted labels for the first 20 test rows.
# The index is reset so the bars are labelled 0..19 rather than by the
# original row labels carried over from y_test.
y_compare = pd.DataFrame({"Actual": y_test, "Predicted": y_pred_rf})
y_compare = y_compare.reset_index(drop=True)
y_compare.head(20).plot(kind='bar', figsize=(15,5))
plt.show()