In [1]:
import numpy as np
import pandas as pd 

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report,accuracy_score,mean_absolute_error,mean_squared_error

In [2]:
df=pd.read_csv("churn.csv")

In [3]:
df.head()
df.info()
df.describe()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 64374 entries, 0 to 64373
Data columns (total 12 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   CustomerID         64374 non-null  int64 
 1   Age                64374 non-null  int64 
 2   Gender             64374 non-null  object
 3   Tenure             64374 non-null  int64 
 4   Usage Frequency    64374 non-null  int64 
 5   Support Calls      64374 non-null  int64 
 6   Payment Delay      64374 non-null  int64 
 7   Subscription Type  64374 non-null  object
 8   Contract Length    64374 non-null  object
 9   Total Spend        64374 non-null  int64 
 10  Last Interaction   64374 non-null  int64 
 11  Churn              64374 non-null  int64 
dtypes: int64(9), object(3)
memory usage: 5.9+ MB


Unnamed: 0,CustomerID,Age,Tenure,Usage Frequency,Support Calls,Payment Delay,Total Spend,Last Interaction,Churn
count,64374.0,64374.0,64374.0,64374.0,64374.0,64374.0,64374.0,64374.0,64374.0
mean,32187.5,41.970982,31.994827,15.080234,5.40069,17.133952,541.023379,15.49885,0.473685
std,18583.317451,13.924911,17.098234,8.81647,3.114005,8.852211,260.874809,8.638436,0.499311
min,1.0,18.0,1.0,1.0,0.0,0.0,100.0,1.0,0.0
25%,16094.25,30.0,18.0,7.0,3.0,10.0,313.0,8.0,0.0
50%,32187.5,42.0,33.0,15.0,6.0,19.0,534.0,15.0,0.0
75%,48280.75,54.0,47.0,23.0,8.0,25.0,768.0,23.0,1.0
max,64374.0,65.0,60.0,30.0,10.0,30.0,1000.0,30.0,1.0


In [4]:
df.isnull().sum()

CustomerID           0
Age                  0
Gender               0
Tenure               0
Usage Frequency      0
Support Calls        0
Payment Delay        0
Subscription Type    0
Contract Length      0
Total Spend          0
Last Interaction     0
Churn                0
dtype: int64

In [5]:
from sklearn.impute import SimpleImputer


# Preprocessing Pipeline

In [7]:
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

num_cols = [
    "Age",
    "Tenure",
    "Usage Frequency",
    "Support Calls",
    "Payment Delay",
    "Total Spend"
]

cat_cols = [
    "Gender",
    "Subscription Type",
    "Contract Length",
    "Last Interaction"
]


num_transformer = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler())
])

cat_transformer = Pipeline([
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("encoder", OneHotEncoder(drop="first"))
])

preprocess = ColumnTransformer([
    ("num", num_transformer, num_cols),
    ("cat", cat_transformer, cat_cols)
])


In [8]:
#TrainTest Split

from sklearn.model_selection import train_test_split

X = df[num_cols + cat_cols]
y = df["Churn"]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)


# MODEL 1 – Logistic Regression

In [9]:
from sklearn.linear_model import LogisticRegression

log_model = Pipeline([
    ("prep", preprocess),
    ("model", LogisticRegression())
])

log_model.fit(X_train, y_train)
y_pred = log_model.predict(X_test)


# MODEL 2 – KNN (Reality check model)

In [10]:
from sklearn.neighbors import KNeighborsClassifier

knn_model = Pipeline([
    ("prep", preprocess),
    ("model", KNeighborsClassifier(n_neighbors=5))
])


# MODEL 3 – SVM

In [11]:
from sklearn.svm import SVC

svm_model = Pipeline([
    ("prep", preprocess),
    ("model", SVC(kernel="rbf", C=1, gamma="scale"))
])


# MODEL 4 – Decision Tree

In [12]:
from sklearn.tree import DecisionTreeClassifier

dt_model = Pipeline([
    ("prep", preprocess),
    ("model", DecisionTreeClassifier(max_depth=5))
])


# MODEL 5 – Random Forest

In [13]:
from sklearn.ensemble import RandomForestClassifier

rf_model = Pipeline([
    ("prep", preprocess),
    ("model", RandomForestClassifier(
        n_estimators=300,
        max_depth=None
    ))
])


# MODEL 6 – AdaBoost

In [15]:
from sklearn.ensemble import AdaBoostClassifier

ada_model = Pipeline([
    ("prep", preprocess),
    ("model", AdaBoostClassifier(n_estimators=200))
])


# MODEL 7 – XGBoost

In [16]:
from xgboost import XGBClassifier

xgb_model = Pipeline([
    ("prep", preprocess),
    ("model", XGBClassifier(
        n_estimators=400,
        max_depth=5,
        learning_rate=0.05,
        subsample=0.8
    ))
])


In [17]:
# EVALUATION

from sklearn.metrics import classification_report

print(classification_report(y_test, y_pred))


              precision    recall  f1-score   support

           0       0.84      0.83      0.83      6776
           1       0.81      0.82      0.82      6099

    accuracy                           0.83     12875
   macro avg       0.83      0.83      0.83     12875
weighted avg       0.83      0.83      0.83     12875



In [18]:
def evaluate_model(model, X_train, X_test, y_train, y_test, name):
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    print("\n==============================")
    print(f"MODEL: {name}")
    print("==============================")
    print("Accuracy:", accuracy_score(y_test, y_pred))
    print("\nClassification Report:")
    print(classification_report(y_test, y_pred))
    print("\nConfusion Matrix:")
    print(confusion_matrix(y_test, y_pred))


In [19]:
evaluate_model(log_model, X_train, X_test, y_train, y_test, "Logistic Regression")
evaluate_model(knn_model, X_train, X_test, y_train, y_test, "KNN")
evaluate_model(svm_model, X_train, X_test, y_train, y_test, "SVM")
evaluate_model(dt_model, X_train, X_test, y_train, y_test, "Decision Tree")
evaluate_model(rf_model, X_train, X_test, y_train, y_test, "Random Forest")
evaluate_model(ada_model, X_train, X_test, y_train, y_test, "AdaBoost")
evaluate_model(xgb_model, X_train, X_test, y_train, y_test, "XGBoost")



MODEL: Logistic Regression
Accuracy: 0.8261747572815534

Classification Report:
              precision    recall  f1-score   support

           0       0.84      0.83      0.83      6776
           1       0.81      0.82      0.82      6099

    accuracy                           0.83     12875
   macro avg       0.83      0.83      0.83     12875
weighted avg       0.83      0.83      0.83     12875


Confusion Matrix:


NameError: name 'confusion_matrix' is not defined