In [1]:
from pyspark.sql import SparkSession

In [2]:
# Build (or reuse, via getOrCreate) the Spark session for this notebook,
# pointed at the standalone cluster master given below.
spark = (
    SparkSession.builder
    .appName("ChurnModelTraining")
    .master("spark://spark-master:7077")
    .getOrCreate()
)

In [30]:
# Read the telco parquet file from HDFS; at this point `df` is a Spark DataFrame.
df = spark.read.parquet("hdfs://namenode:8020/user/telco/cleaned/telco_cleaned.parquet")

In [31]:
# Convert the Spark DataFrame to pandas so the scikit-learn / imbalanced-learn code below can use it.
# NOTE(review): `df` changes type here, so this cell cannot be re-run on its own —
# once it has executed, `df` is a pandas DataFrame and has no `toPandas` method.
df = df.toPandas()

In [32]:
# Number of rows whose churn label equals 1.
len(df.loc[df["churn"] == 1])

1869

In [33]:
# Number of rows whose churn label equals 0.
len(df.loc[df["churn"] == 0])

5174

In [7]:
pip install imblearn

Note: you may need to restart the kernel to use updated packages.


In [34]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
from imblearn.over_sampling import SMOTE
from sklearn.compose import ColumnTransformer
from imblearn.pipeline import Pipeline

In [35]:
def detect_outliers(df, column, multiplier=1.5):
    """Return the IQR-based (lower, upper) fence values for one column.

    Despite its name, this function does not return the outlying rows; it
    returns the two bounds outside of which a value is treated as an outlier.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing ``column``.
    column : str
        Name of the column to analyse.
    multiplier : float, default 1.5
        How many interquartile ranges beyond Q1 / Q3 the fences are placed.
        The default reproduces the previously hard-coded 1.5 * IQR rule.

    Returns
    -------
    tuple
        ``(lower_bound, upper_bound)`` = ``(Q1 - multiplier * IQR, Q3 + multiplier * IQR)``.
    """
    series = df[column]
    q1, q3 = series.quantile(0.25), series.quantile(0.75)
    whisker = multiplier * (q3 - q1)
    return q1 - whisker, q3 + whisker

def cap_outliers(df, col_name, lower_bound, upper_bound):
    """Return a copy of ``df`` with ``col_name`` capped to the given bounds.

    Values below ``lower_bound`` are replaced by ``lower_bound`` and values
    above ``upper_bound`` by ``upper_bound``; all other values (including NaN,
    which compares False against both bounds) are kept as they are.

    The input frame is not modified: the capping is applied to a copy, so the
    caller's original data stays available and re-running a cell that calls
    this function cannot silently alter an earlier variable.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing ``col_name``.
    col_name : str
        Column to cap.
    lower_bound, upper_bound : scalar
        Capping limits.

    Returns
    -------
    pandas.DataFrame
        A new frame with the capped column.
    """
    capped = df.copy()
    values = capped[col_name]
    # Same two-step replacement as before: first the low side, then the high side.
    values = np.where(values < lower_bound, lower_bound, values)
    values = np.where(values > upper_bound, upper_bound, values)
    capped[col_name] = values
    return capped

In [36]:
# Cap each listed column to its own IQR fences, one column after another.
# NOTE(review): the fences are computed on the full frame, i.e. before the
# train/test split performed further down, so test rows influence the caps.
for column in ("totalcharges", "monthlycharges", "tenure"):
    lower, upper = detect_outliers(df, column)
    df = cap_outliers(df, column, lower, upper)

In [37]:
# Preview the first rows of the frame after the capping step.
df.head()

Unnamed: 0,customerid,gender,seniorcitizen,partner,dependents,tenure,phoneservice,multiplelines,internetservice,onlinesecurity,...,deviceprotection,techsupport,streamingtv,streamingmovies,contract,paperlessbilling,paymentmethod,monthlycharges,totalcharges,churn
0,7590-VHVEG,Female,No,Yes,No,1.0,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,0
1,5575-GNVDE,Male,No,No,No,34.0,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,0
2,3668-QPYBK,Male,No,No,No,2.0,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,1
3,7795-CFOCW,Male,No,No,No,45.0,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,0
4,9237-HQITU,Female,No,No,No,2.0,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,1


In [38]:
# Object-dtype columns that are deliberately left out of the model inputs.
excluded_string_cols = {
    "customerid", "gender", "partner", "dependents", "phoneservice",
    "multiplelines", "onlinebackup", "deviceprotection", "streamingtv", "streamingmovies",
}
# Numeric columns that are left out: the target (churn) and totalcharges.
excluded_numeric_cols = {"churn", "totalcharges"}

# Remaining object-dtype columns, in frame order.
string_cols = [
    name
    for name in df.columns
    if df[name].dtype == "object" and name not in excluded_string_cols
]
# Remaining numeric columns, in frame order.
numeric_cols = [
    name
    for name in df.select_dtypes(include=np.number).columns
    if name not in excluded_numeric_cols
]
# Final feature list: categorical columns first, then numeric ones.
features = string_cols + numeric_cols

In [39]:
# Model inputs: only the selected feature columns.
X = df[features]
# Target: the churn label column.
y = df["churn"]

In [40]:
# Hold out 20% of the rows as a test set; random_state fixes the shuffle so the split is repeatable.
# NOTE(review): the split is not stratified (no `stratify=y`) although the classes are imbalanced
# (1869 churn vs 5174 non-churn rows) — consider stratifying so both splits keep the same class ratio.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [41]:
# Column-wise preprocessing: one-hot encode the categorical columns and
# standardise the numeric ones. With handle_unknown="ignore", a category not
# seen during fit does not raise at transform time.
preprocessor = ColumnTransformer(
    transformers=[
        ("ohe", OneHotEncoder(sparse_output=False, handle_unknown="ignore"), string_cols),
        ("sc", StandardScaler(), numeric_cols),
    ]
)

In [42]:
# Random-forest pipeline: preprocessing -> SMOTE oversampling -> classifier.
# The Pipeline class comes from imblearn (not sklearn) so that a sampler can be a step.
# NOTE(review): SMOTE already rebalances the training data, so class_weight="balanced"
# on the forest is probably redundant — confirm whether both are intended.
model_pipeline = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        ("smote", SMOTE(random_state=42)),
        ("rf", RandomForestClassifier(class_weight="balanced", random_state=42)),
    ]
)

In [43]:
# Fit every pipeline step (preprocessing, SMOTE, random forest) on the training split.
model_pipeline.fit(X_train, y_train)



In [44]:
# Hard class predictions of the fitted pipeline for the held-out test split.
y_pred = model_pipeline.predict(X_test)

In [45]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
# Test-set summary for the current y_pred: accuracy, per-class report, confusion matrix.
for heading, value in (
    ("\n✅ Accuracy:", round(accuracy_score(y_test, y_pred), 4)),
    ("\n📊 Classification Report:\n", classification_report(y_test, y_pred, digits=4)),
    ("\n📉 Confusion Matrix:\n", confusion_matrix(y_test, y_pred)),
):
    print(heading, value)


✅ Accuracy: 0.7637

📊 Classification Report:
               precision    recall  f1-score   support

           0     0.8456    0.8301    0.8378      1036
           1     0.5510    0.5791    0.5647       373

    accuracy                         0.7637      1409
   macro avg     0.6983    0.7046    0.7013      1409
weighted avg     0.7676    0.7637    0.7655      1409


📉 Confusion Matrix:
 [[860 176]
 [157 216]]


In [46]:
# Probability of the second class (churn = 1) for every test row.
y_probs = model_pipeline.predict_proba(X_test)[:, 1]

# Evaluate several decision thresholds on that probability.
# NOTE(review): `>=` labels a probability of exactly 0.5 as churn, which is presumably
# why the 0.5 row can differ slightly from the predict()-based accuracy printed earlier.
for threshold in (0.5, 0.4, 0.3, 0.25):
    print(f"\n🎯 Threshold: {threshold}")
    # NOTE(review): this rebinds y_pred; after the loop it holds the labels for the
    # last threshold (0.25), not the output of model_pipeline.predict.
    y_pred = np.where(y_probs >= threshold, 1, 0)

    acc = accuracy_score(y_test, y_pred)
    print(f"✅ Accuracy: {acc:.4f}", "📊 Classification Report:", sep="\n")
    print(classification_report(y_test, y_pred))


🎯 Threshold: 0.5
✅ Accuracy: 0.7644
📊 Classification Report:
              precision    recall  f1-score   support

           0       0.85      0.83      0.84      1036
           1       0.55      0.59      0.57       373

    accuracy                           0.76      1409
   macro avg       0.70      0.71      0.70      1409
weighted avg       0.77      0.76      0.77      1409


🎯 Threshold: 0.4
✅ Accuracy: 0.7459
📊 Classification Report:
              precision    recall  f1-score   support

           0       0.87      0.77      0.82      1036
           1       0.52      0.67      0.58       373

    accuracy                           0.75      1409
   macro avg       0.69      0.72      0.70      1409
weighted avg       0.77      0.75      0.76      1409


🎯 Threshold: 0.3
✅ Accuracy: 0.7161
📊 Classification Report:
              precision    recall  f1-score   support

           0       0.90      0.69      0.78      1036
           1       0.48      0.78      0.59       3

In [47]:
from sklearn.ensemble import GradientBoostingClassifier

In [48]:
# Gradient-boosting pipeline: same preprocessing and SMOTE steps, different classifier.
# NOTE(review): this rebinds `model_pipeline`, so the previously fitted
# random-forest pipeline is no longer reachable under that name.
model_pipeline = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        ("smote", SMOTE(random_state=42)),
        ("GBC", GradientBoostingClassifier(random_state=42)),
    ]
)

In [49]:
# Fit the gradient-boosting pipeline on the training split.
model_pipeline.fit(X_train, y_train)



In [50]:
# Hard class predictions of the gradient-boosting pipeline for the test split.
y_pred = model_pipeline.predict(X_test)

In [None]:
# Display the predicted labels (NumPy abbreviates long arrays in the repr).
y_pred

array([1, 0, 0, ..., 0, 0, 1])

: 

In [51]:
# Test-set summary for the current y_pred: accuracy, per-class report, confusion matrix.
for heading, value in (
    ("\n✅ Accuracy:", round(accuracy_score(y_test, y_pred), 4)),
    ("\n📊 Classification Report:\n", classification_report(y_test, y_pred, digits=4)),
    ("\n📉 Confusion Matrix:\n", confusion_matrix(y_test, y_pred)),
):
    print(heading, value)


✅ Accuracy: 0.7686

📊 Classification Report:
               precision    recall  f1-score   support

           0     0.9157    0.7548    0.8275      1036
           1     0.5423    0.8070    0.6487       373

    accuracy                         0.7686      1409
   macro avg     0.7290    0.7809    0.7381      1409
weighted avg     0.8169    0.7686    0.7802      1409


📉 Confusion Matrix:
 [[782 254]
 [ 72 301]]


In [52]:
# Probability of the second class (churn = 1) for every test row.
y_probs = model_pipeline.predict_proba(X_test)[:, 1]

# Evaluate several decision thresholds on that probability.
for threshold in (0.5, 0.4, 0.3, 0.25):
    print(f"\n🎯 Threshold: {threshold}")
    # NOTE(review): this rebinds y_pred; after the loop it holds the labels for the
    # last threshold (0.25), not the output of model_pipeline.predict.
    y_pred = np.where(y_probs >= threshold, 1, 0)

    acc = accuracy_score(y_test, y_pred)
    print(f"✅ Accuracy: {acc:.4f}", "📊 Classification Report:", sep="\n")
    print(classification_report(y_test, y_pred))


🎯 Threshold: 0.5
✅ Accuracy: 0.7686
📊 Classification Report:
              precision    recall  f1-score   support

           0       0.92      0.75      0.83      1036
           1       0.54      0.81      0.65       373

    accuracy                           0.77      1409
   macro avg       0.73      0.78      0.74      1409
weighted avg       0.82      0.77      0.78      1409


🎯 Threshold: 0.4
✅ Accuracy: 0.7424
📊 Classification Report:
              precision    recall  f1-score   support

           0       0.94      0.70      0.80      1036
           1       0.51      0.87      0.64       373

    accuracy                           0.74      1409
   macro avg       0.72      0.78      0.72      1409
weighted avg       0.82      0.74      0.76      1409


🎯 Threshold: 0.3
✅ Accuracy: 0.6842
📊 Classification Report:
              precision    recall  f1-score   support

           0       0.95      0.60      0.74      1036
           1       0.45      0.91      0.60       3

In [54]:
# Class weights handed to the logistic regression below: both classes weighted 1.
weights = dict.fromkeys((0, 1), 1)

In [55]:
from sklearn.linear_model import LogisticRegression
# Logistic-regression pipeline: same preprocessing and SMOTE steps as the other models.
# NOTE(review): the final step is named "rf" although it holds a LogisticRegression;
# the name is kept as is because step names are part of the pipeline's parameter keys.
model_pipeline_LR = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        ("smote", SMOTE(random_state=42)),
        ("rf", LogisticRegression(class_weight=weights, random_state=42)),
    ]
)

In [56]:
# Fit the logistic-regression pipeline on the training split.
model_pipeline_LR.fit(X_train, y_train)



In [57]:
# Hard class predictions of the logistic-regression pipeline for the test split.
y_pred_LR = model_pipeline_LR.predict(X_test)

In [58]:
# Test-set summary for y_pred_LR: accuracy, per-class report, confusion matrix.
for heading, value in (
    ("\n✅ Accuracy:", round(accuracy_score(y_test, y_pred_LR), 4)),
    ("\n📊 Classification Report:\n", classification_report(y_test, y_pred_LR, digits=4)),
    ("\n📉 Confusion Matrix:\n", confusion_matrix(y_test, y_pred_LR)),
):
    print(heading, value)


✅ Accuracy: 0.7523

📊 Classification Report:
               precision    recall  f1-score   support

           0     0.9225    0.7239    0.8112      1036
           1     0.5201    0.8311    0.6398       373

    accuracy                         0.7523      1409
   macro avg     0.7213    0.7775    0.7255      1409
weighted avg     0.8160    0.7523    0.7659      1409


📉 Confusion Matrix:
 [[750 286]
 [ 63 310]]


In [59]:
# Probability of the second class (churn = 1) for every test row.
y_probs = model_pipeline_LR.predict_proba(X_test)[:, 1]

# Evaluate several decision thresholds on that probability.
for threshold in (0.5, 0.4, 0.3, 0.25):
    print(f"\n🎯 Threshold: {threshold}")
    # NOTE(review): this rebinds y_pred_LR; after the loop it holds the labels for the
    # last threshold (0.25), not the output of model_pipeline_LR.predict.
    y_pred_LR = np.where(y_probs >= threshold, 1, 0)

    acc = accuracy_score(y_test, y_pred_LR)
    print(f"✅ Accuracy: {acc:.4f}", "📊 Classification Report:", sep="\n")
    print(classification_report(y_test, y_pred_LR))


🎯 Threshold: 0.5
✅ Accuracy: 0.7523
📊 Classification Report:
              precision    recall  f1-score   support

           0       0.92      0.72      0.81      1036
           1       0.52      0.83      0.64       373

    accuracy                           0.75      1409
   macro avg       0.72      0.78      0.73      1409
weighted avg       0.82      0.75      0.77      1409


🎯 Threshold: 0.4
✅ Accuracy: 0.6998
📊 Classification Report:
              precision    recall  f1-score   support

           0       0.94      0.64      0.76      1036
           1       0.46      0.88      0.61       373

    accuracy                           0.70      1409
   macro avg       0.70      0.76      0.68      1409
weighted avg       0.81      0.70      0.72      1409


🎯 Threshold: 0.3
✅ Accuracy: 0.6395
📊 Classification Report:
              precision    recall  f1-score   support

           0       0.96      0.53      0.68      1036
           1       0.42      0.94      0.58       3