In [None]:
import warnings
warnings.filterwarnings("ignore", category=FutureWarning)
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.cluster import KMeans
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, f1_score, recall_score
from sklearn.impute import SimpleImputer


In [None]:
# Read the raw churn table from the CSV that sits next to the notebook.
df = pd.read_csv(filepath_or_buffer="WA_Fn-UseC_-Telco-Customer-Churn.csv")
initial_shape = df.shape
print("Initial shape:", initial_shape)
# Bare trailing expression so the notebook renders a preview of the first rows.
df.head()


In [None]:
# Report and remove exact duplicate rows.
duplicates = df.duplicated().sum()
print(f"Duplicate rows found: {duplicates}")
df = df.drop_duplicates()

# Force TotalCharges to a numeric dtype; entries that cannot be parsed become
# NaN (errors="coerce") instead of raising.
df["TotalCharges"] = pd.to_numeric(df["TotalCharges"], errors="coerce")

# Visual outlier check on the coerced column.
plt.figure(figsize=(6, 3))
sns.boxplot(x=df["TotalCharges"])
plt.title("Boxplot - Outlier Check for TotalCharges")
plt.show()

# customerID is presumably a per-customer identifier rather than a predictor;
# the membership test keeps this step safe when the cell is run again.
if "customerID" in df.columns:
    df = df.drop(columns=["customerID"])

# Encode the target as 0/1 only while it still holds the raw labels.
# Series.map turns every value missing from the dict into NaN, so mapping an
# already-encoded (numeric) column a second time would wipe out the target.
if not pd.api.types.is_numeric_dtype(df["Churn"]):
    churn_encoded = df["Churn"].map({"No": 0, "Yes": 1})
    if churn_encoded.isna().any():
        # Fail here with a clear message rather than carrying NaN targets forward.
        raise ValueError("Churn contains values other than 'No'/'Yes'; cannot encode the target.")
    df["Churn"] = churn_encoded


In [None]:
# Bar chart with one bar per value of the Churn column.
fig_churn, ax_churn = plt.subplots(figsize=(5, 4))
sns.countplot(x="Churn", data=df, ax=ax_churn)
ax_churn.set_title("Churn Distribution")
plt.show()

# 30-bin histogram of the tenure column.
fig_tenure, ax_tenure = plt.subplots(figsize=(5, 4))
ax_tenure.hist(df["tenure"], bins=30, edgecolor="black")
ax_tenure.set_title("Tenure Distribution")
ax_tenure.set_xlabel("Tenure (months)")
plt.show()


In [None]:
# Separate the predictors from the target column.
X = df.drop(columns=["Churn"])
y = df["Churn"]

# Split the predictor columns into two complementary groups: every numeric
# dtype goes to `numeric_features`, everything else to `categorical_features`.
# Selecting by include/exclude of the same "number" selector (instead of naming
# int64/float64/object explicitly) guarantees that no column is silently left
# out of both lists, whatever integer width or string dtype pandas picked.
numeric_features = X.select_dtypes(include="number").columns.tolist()
categorical_features = X.select_dtypes(exclude="number").columns.tolist()
print("Numeric features:", numeric_features)
print("Categorical features:", categorical_features)


In [None]:
# 80/20 hold-out split; stratifying on y keeps the target proportions the same
# in both parts, and the fixed seed makes the split repeatable.
split_options = {"test_size": 0.2, "stratify": y, "random_state": 42}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

print("Train shape:", X_train.shape)
print("Test shape:", X_test.shape)


In [None]:
# --- Numeric matrix for clustering -------------------------------------------
# Median-impute then standardise the numeric columns. Both transformers are
# fitted on the training split only and merely applied to the test split.
km_imputer = SimpleImputer(strategy='median')
scaler_km = StandardScaler()
X_train_num = km_imputer.fit_transform(X_train[numeric_features])
X_test_num = km_imputer.transform(X_test[numeric_features])
X_train_scaled = scaler_km.fit_transform(X_train_num)
X_test_scaled = scaler_km.transform(X_test_num)

# --- Elbow curve: inertia for k = 1..9 on the training data ------------------
K_range = range(1, 10)
inertia = []
for k in K_range:
    kmeans_test = KMeans(n_clusters=k, n_init=10, random_state=42).fit(X_train_scaled)
    inertia.append(kmeans_test.inertia_)

fig_elbow, ax_elbow = plt.subplots(figsize=(6, 4))
ax_elbow.plot(K_range, inertia, marker='o')
ax_elbow.set_title('Elbow Method')
ax_elbow.set_xlabel('Number of Clusters')
ax_elbow.set_ylabel('Inertia')
plt.show()

# --- Cluster membership as an extra feature ----------------------------------
# A 2-cluster model is fitted on the training rows; test rows are only assigned
# to the clusters learned from training.
kmeans = KMeans(n_clusters=2, n_init=10, random_state=42)
X_train = X_train.copy()  # work on copies so the new column is added to independent frames
X_test = X_test.copy()
X_train["cluster"] = kmeans.fit_predict(X_train_scaled)
X_test["cluster"] = kmeans.predict(X_test_scaled)

# Store the labels as strings so the column is handled as a category, not a number.
for split_frame in (X_train, X_test):
    split_frame["cluster"] = split_frame["cluster"].astype(str)

categorical_features_with_cluster = [*categorical_features, "cluster"]
print("Cluster feature created.")
print("Updated Categorical features:", categorical_features_with_cluster)


In [None]:
# Numeric branch: fill missing values with the column median, then standardise.
numeric_transformer = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
])

# Categorical branch: one-hot encode, dropping the first level of each column;
# categories not seen during fit are ignored rather than raising.
categorical_transformer = OneHotEncoder(handle_unknown="ignore", drop="first")

# Route each column group to its branch.
column_specs = [
    ("num", numeric_transformer, numeric_features),
    ("cat", categorical_transformer, categorical_features_with_cluster),
]
preprocessor = ColumnTransformer(column_specs)


In [None]:
# Accumulates one metrics dict per evaluated model; re-running this cell resets it.
results = []


def evaluate_model(model, name, X_tr, y_tr, X_te, y_te):
    """Fit `model` on the training data, score it on the test data, and report.

    Parameters
    ----------
    model : object exposing ``fit(X, y)`` and ``predict(X)``
        The estimator (or pipeline / search object) to train. It is fitted
        in place.
    name : str
        Label used in the printed report and in the returned record.
    X_tr, y_tr
        Training features and target passed to ``model.fit``.
    X_te, y_te
        Test features passed to ``model.predict`` and the true test target the
        predictions are scored against.

    Returns
    -------
    dict
        ``{"Model", "Accuracy", "F1-Score", "Recall"}``. Recall is included so
        the value printed below also reaches any table built from these records.
    """
    model.fit(X_tr, y_tr)
    y_pred = model.predict(X_te)

    # All three metrics are called with their default settings.
    acc = accuracy_score(y_te, y_pred)
    f1 = f1_score(y_te, y_pred)
    recall = recall_score(y_te, y_pred)

    print(f"--- {name} ---")
    print(f"Accuracy: {acc:.4f} | F1: {f1:.4f} | Recall: {recall:.4f}")
    return {"Model": name, "Accuracy": acc, "F1-Score": f1, "Recall": recall}
# Baseline pipelines: the same preprocessing step followed by a classifier.
# All three reference the one `preprocessor` object defined earlier.
pipe_lr = Pipeline(steps=[("preprocess", preprocessor), ("clf", LogisticRegression(max_iter=1000))])
pipe_knn = Pipeline(steps=[("preprocess", preprocessor), ("clf", KNeighborsClassifier(n_neighbors=5))])
pipe_rf = Pipeline(steps=[("preprocess", preprocessor), ("clf", RandomForestClassifier(random_state=42))])

# Fit and score each baseline in turn, collecting its metrics record.
baseline_runs = [
    ("LogReg (Baseline)", pipe_lr),
    ("KNN", pipe_knn),
    ("RandomForest (Baseline)", pipe_rf),
]
for baseline_name, baseline_pipe in baseline_runs:
    results.append(evaluate_model(baseline_pipe, baseline_name, X_train, y_train, X_test, y_test))


In [None]:
# --- Tune the regularisation strength C of the logistic-regression pipeline ---
param_grid_lr = {"clf__C": [0.1, 1.0, 10.0]}
grid_lr = GridSearchCV(estimator=pipe_lr, param_grid=param_grid_lr, scoring="f1", cv=5, n_jobs=-1)
lr_tuned_metrics = evaluate_model(grid_lr, "LogReg (Tuned)", X_train, y_train, X_test, y_test)
results.append(lr_tuned_metrics)

# --- Confusion matrix of the best pipeline on the test split ------------------
best_lr = grid_lr.best_estimator_
y_pred_best_lr = best_lr.predict(X_test)
cm_lr = confusion_matrix(y_test, y_pred_best_lr)

fig_cm, ax_cm = plt.subplots(figsize=(4, 4))
sns.heatmap(cm_lr, annot=True, fmt="d", cmap="Blues", ax=ax_cm)
ax_cm.set_title("Confusion Matrix - Tuned Logistic Regression")
ax_cm.set_ylabel("True Label")
ax_cm.set_xlabel("Predicted Label")
plt.show()

# --- Rough revenue estimate ---------------------------------------------------
# Row 1 / column 0 of the matrix: true label 1 predicted as 0 (missed churners).
# The estimate multiplies that count by the mean MonthlyCharges over all of `df`.
avg_monthly_charge = df["MonthlyCharges"].mean()
false_negatives = cm_lr[1, 0]
revenue_risk = avg_monthly_charge * false_negatives

print(f"\n--- Business Impact Analysis ---")
print(f"False Negatives (Missed Churners): {false_negatives}")
print(f"Average Monthly Charge: ${avg_monthly_charge:.2f}")
print(f"Estimated Monthly Revenue at Risk: ${revenue_risk:.2f}")


In [None]:
# Grid over forest size and tree depth (None leaves the depth unrestricted).
param_grid_rf = {
    "clf__n_estimators": [100, 200],
    "clf__max_depth": [10, None],
}
grid_rf = GridSearchCV(estimator=pipe_rf, param_grid=param_grid_rf, scoring="f1", cv=5, n_jobs=-1)

# Fit the search, score it on the test split, and record the metrics.
rf_tuned_metrics = evaluate_model(grid_rf, "RandomForest (Tuned)", X_train, y_train, X_test, y_test)
results.append(rf_tuned_metrics)

# Keep the refitted best pipeline for later inspection.
best_rf = grid_rf.best_estimator_


In [None]:
# Best-effort inspection of the tuned logistic-regression coefficients; any
# failure is reported as a message instead of stopping the notebook.
try:
    fitted_steps = best_lr.named_steps
    feature_names = fitted_steps["preprocess"].get_feature_names_out()
    coefs = fitted_steps["clf"].coef_[0]  # first (only used) row of the coefficient matrix
    coef_df = pd.DataFrame({"Feature": feature_names, "Coefficient": coefs})

    # Largest coefficients first, then smallest first; head() shows five rows each.
    strongest_positive = coef_df.sort_values(by="Coefficient", ascending=False).head()
    strongest_negative = coef_df.sort_values(by="Coefficient", ascending=True).head()

    print("Top 5 indicators of Churn (Positive Coefs):")
    print(strongest_positive)
    print("\nTop 5 indicators of Retention (Negative Coefs):")
    print(strongest_negative)
except Exception as e:
    print(f"Could not extract coefficients: {e}")


In [None]:
# Turn the collected metric records into a table and rank it by F1, best first.
results_df = pd.DataFrame(data=results)
ranked_results = results_df.sort_values(by="F1-Score", ascending=False).reset_index(drop=True)

print("=" * 60)
print("Summary (Sorted by F1-Score):")
print(ranked_results)
