In [30]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score


In [31]:
# Load dataset (manual parsing due to quoted rows).
# The file is read without a header; column 0 then has its double quotes
# stripped and is split on commas into separate fields.
df_raw = pd.read_csv(
    "../data/WA_Fn-UseC_-Telco-Customer-Churn.csv",
    header=None,
    engine="python",
)

df_raw[0] = df_raw[0].str.replace('"', '')
df = df_raw[0].str.split(",", expand=True)

# The first parsed row carries the column names: promote it to the header
# and drop it from the data.
df.columns = df.iloc[0]
df = df[1:].reset_index(drop=True)

# Convert numeric columns; values that cannot be parsed become NaN.
numeric_cols = ["tenure", "MonthlyCharges", "TotalCharges", "SeniorCitizen"]
df = df.assign(
    **{name: pd.to_numeric(df[name], errors="coerce") for name in numeric_cols}
)

# Discard every row that still has a missing value in any column
# (this includes rows whose numeric fields failed to parse above).
df = df.dropna()

df.head()


Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


In [32]:
# Share of each Churn value within every contract type
# (normalize="index" makes each row sum to 1).
pd.crosstab(
    index=df["Contract"],
    columns=df["Churn"],
    normalize="index",
)


Churn,No,Yes
Contract,Unnamed: 1_level_1,Unnamed: 2_level_1
Month-to-month,0.572903,0.427097
One year,0.887228,0.112772
Two year,0.971513,0.028487


In [33]:
# Share of each Churn value within every internet-service category
# (normalize="index" makes each row sum to 1).
pd.crosstab(
    index=df["InternetService"],
    columns=df["Churn"],
    normalize="index",
)


Churn,No,Yes
InternetService,Unnamed: 1_level_1,Unnamed: 2_level_1
DSL,0.810017,0.189983
Fiber optic,0.581072,0.418928
No,0.925658,0.074342


In [34]:
# Mean monthly charge and mean tenure for each Churn group.
(
    df.groupby("Churn")[["MonthlyCharges", "tenure"]]
    .agg("mean")
)


Unnamed: 0_level_0,MonthlyCharges,tenure
Churn,Unnamed: 1_level_1,Unnamed: 2_level_1
No,61.307408,37.65001
Yes,74.441332,17.979133


In [35]:
# Dataset-wide averages, used below as the cut-offs for the two numeric rules.
mean_monthly = df["MonthlyCharges"].mean()
mean_tenure = df["tenure"].mean()

# One boolean flag per rule; each rule that holds for a customer adds one point.
risk_flags = [
    df["Contract"] == "Month-to-month",
    df["InternetService"] == "Fiber optic",
    df["MonthlyCharges"] > mean_monthly,
    df["tenure"] < mean_tenure,
]

# risk_score is the number of rules that fire (0-4).
df["risk_score"] = sum(flag.astype("int64") for flag in risk_flags)

# A customer is flagged as high risk when at least three rules fire.
df["is_high_risk"] = (df["risk_score"] >= 3).astype(int)

# Churn distribution within each risk_score level (rows sum to 1).
pd.crosstab(df["risk_score"], df["Churn"], normalize="index")


Churn,No,Yes
risk_score,Unnamed: 1_level_1,Unnamed: 2_level_1
0,0.972576,0.027424
1,0.942948,0.057052
2,0.762852,0.237148
3,0.695373,0.304627
4,0.384221,0.615779


In [36]:
# Encode the target as 0/1.
# The guard makes this cell safe to re-run: Series.map() turns any value that
# is not a key of the mapping into NaN, so mapping an already-encoded 0/1
# column a second time would wipe out the target and break the fit below.
if not pd.api.types.is_numeric_dtype(df["Churn"]):
    df["Churn"] = df["Churn"].map({"No": 0, "Yes": 1})

# Features: every column except the customer identifier and the target.
X = df.drop(["customerID", "Churn"], axis=1)
y = df["Churn"]

# One-hot encode the categorical columns; drop_first removes one level per
# column so the dummies are not perfectly collinear.
X = pd.get_dummies(X, drop_first=True)

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# class_weight="balanced" re-weights the classes inversely to their frequency;
# max_iter is raised above the default for the solver.
model = LogisticRegression(max_iter=2000, class_weight="balanced")
model.fit(X_train, y_train)

# Evaluate on the held-out rows.
y_pred = model.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))


Accuracy: 0.736318407960199
              precision    recall  f1-score   support

           0       0.91      0.72      0.80      1033
           1       0.50      0.79      0.62       374

    accuracy                           0.74      1407
   macro avg       0.70      0.75      0.71      1407
weighted avg       0.80      0.74      0.75      1407



In [37]:
# Pair every model input column with its fitted logistic-regression weight.
# NOTE(review): the model appears to be fitted on unscaled inputs, so the
# magnitudes of these weights are presumably not directly comparable between
# numeric columns and 0/1 dummy columns - confirm before reading this as a
# feature-importance ranking.
coefficients = pd.DataFrame(
    {
        "Feature": X.columns,
        "Coefficient": model.coef_[0],
    }
)

# Ten features with the largest (most positive) coefficients.
coefficients.sort_values("Coefficient", ascending=False).iloc[:10]


Unnamed: 0,Feature,Coefficient
12,InternetService_Fiber optic,0.922314
10,MultipleLines_No phone service,0.581136
30,PaymentMethod_Electronic check,0.289791
0,SeniorCitizen,0.271472
28,PaperlessBilling_Yes,0.242274
11,MultipleLines_Yes,0.105389
7,Partner_Yes,0.063928
25,StreamingMovies_Yes,0.047015
2,MonthlyCharges,0.028864
23,StreamingTV_Yes,0.022826
