In [26]:
import numpy as np
import pandas as pd 

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report,accuracy_score,mean_absolute_error,mean_squared_error


In [27]:
df=pd.read_csv("churn.csv")

In [28]:
X=df.drop("Churn",axis=1)
y=df["Churn"]

In [29]:
df.isnull().sum()

CustomerID           0
Age                  0
Gender               0
Tenure               0
Usage Frequency      0
Support Calls        0
Payment Delay        0
Subscription Type    0
Contract Length      0
Total Spend          0
Last Interaction     0
Churn                0
dtype: int64

In [32]:
X.shape,y.shape

((64374, 11), (64374,))

# Encoding + Scaling (Industry way)

In [37]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder

num_cols = [
    "Age",
    "Tenure",
    "Usage Frequency",
    "Support Calls",
    "Payment Delay",
    "Total Spend"
]

cat_cols = [
    "Gender",
    "Subscription Type",
    "Contract Length",
    "Last Interaction"
]

preprocess = ColumnTransformer([
    ("num", StandardScaler(), num_cols),
    ("cat", OneHotEncoder(handle_unknown="ignore"), cat_cols)
])


In [38]:
# Train test split 

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)


# 1. Logistic Regression (Baseline Model)

In [40]:
from sklearn.linear_model import LogisticRegression

model = Pipeline([
    ("prep", preprocess),
    ("clf", LogisticRegression())
])

model.fit(X_train, y_train)
y_pred = model.predict(X_test)

print(accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))


0.8317669902912621
              precision    recall  f1-score   support

           0       0.85      0.83      0.84      6793
           1       0.82      0.83      0.82      6082

    accuracy                           0.83     12875
   macro avg       0.83      0.83      0.83     12875
weighted avg       0.83      0.83      0.83     12875



In [41]:
#NaiveBayes

from sklearn.naive_bayes import GaussianNB

model = Pipeline([
    ("prep", preprocess),
    ("clf", GaussianNB())
])


In [42]:
#KNN

from sklearn.neighbors import KNeighborsClassifier

model = Pipeline([
    ("prep", preprocess),
    ("clf", KNeighborsClassifier(n_neighbors=5))
])


In [43]:
#SVM

from sklearn.svm import SVC

model = Pipeline([
    ("prep", preprocess),
    ("clf", SVC(kernel="rbf"))
])


In [44]:
#Decision Tree

from sklearn.tree import DecisionTreeClassifier

model = Pipeline([
    ("prep", preprocess),
    ("clf", DecisionTreeClassifier(max_depth=5))
])


In [46]:
#RandomFOreest

from sklearn.ensemble import RandomForestClassifier

model = Pipeline([
    ("prep", preprocess),
    ("clf", RandomForestClassifier(n_estimators=200))
])


In [47]:
#AdaBOOST

from sklearn.ensemble import AdaBoostClassifier

model = Pipeline([
    ("prep", preprocess),
    ("clf", AdaBoostClassifier(n_estimators=100))
])


In [48]:
#XGBOOST

from xgboost import XGBClassifier

model = Pipeline([
    ("prep", preprocess),
    ("clf", XGBClassifier(
        n_estimators=300,
        max_depth=5,
        learning_rate=0.05
    ))
])


ModuleNotFoundError: No module named 'xgboost'