STEP 1: IMPORT LIBRARIES

In [5]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score, classification_report
import joblib

STEP 2: LOAD DATASET


In [6]:
data = pd.read_csv("WA_Fn-UseC_-Telco-Customer-Churn.csv")

STEP 3: DATA PREPROCESSING

In [7]:
if "customerID" in data.columns:
    data = data.drop("customerID", axis=1)

STEP 4: ENCODING TARGET

In [8]:
data["Churn"] = data["Churn"].map({"Yes": 1, "No": 0})

STEP 5: SPLIT FEATURES AND TARGET

In [9]:
X = data.drop("Churn", axis=1)
y = data["Churn"]

print("Data loaded. Shape:", data.shape)

Data loaded. Shape: (7043, 20)


STEP 6: TRAIN TEST-SPLIT

In [10]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

print("Train size:", X_train.shape, "Test size:", X_test.shape)

Train size: (5634, 19) Test size: (1409, 19)


STEP 7: PREPROCESSING

In [11]:
cat_features = X.select_dtypes(include=["object"]).columns
num_features = X.select_dtypes(include=["int64", "float64"]).columns

TRANSFORMERS

In [12]:
numeric_transformer = Pipeline(steps=[
    ("scaler", StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ("onehot", OneHotEncoder(handle_unknown="ignore"))
])

COLUMN TRANSFORMERS

In [13]:
preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, num_features),
        ("cat", categorical_transformer, cat_features)
    ]
)

STEP 8: BUILD PIPELINES

In [14]:
pipe_lr = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("classifier", LogisticRegression(max_iter=1000))
])

pipe_rf = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("classifier", RandomForestClassifier(random_state=42))
])


STEP 9: HYPERPARAMETER TUNING

In [15]:
param_grid_lr = {
    "classifier__C": [0.1, 1.0, 10.0]
}

param_grid_rf = {
    "classifier__n_estimators": [100, 200],
    "classifier__max_depth": [5, 10, None]
}

LOGISTIC REGRESSION

In [16]:
grid_lr = GridSearchCV(pipe_lr, param_grid_lr, cv=3, scoring="accuracy", n_jobs=-1)
grid_lr.fit(X_train, y_train)


RANDOM FOREST

In [17]:
grid_rf = GridSearchCV(pipe_rf, param_grid_rf, cv=3, scoring="accuracy", n_jobs=-1)
grid_rf.fit(X_train, y_train)

STEP 10: EVALUATION

In [18]:
best_lr = grid_lr.best_estimator_
best_rf = grid_rf.best_estimator_

y_pred_lr = best_lr.predict(X_test)
y_pred_rf = best_rf.predict(X_test)

print("Logistic Regression Results:")
print("Accuracy:", accuracy_score(y_test, y_pred_lr))
print("F1 Score:", f1_score(y_test, y_pred_lr))
print(classification_report(y_test, y_pred_lr))

print("\nRandom Forest Results:")
print("Accuracy:", accuracy_score(y_test, y_pred_rf))
print("F1 Score:", f1_score(y_test, y_pred_rf))
print(classification_report(y_test, y_pred_rf))

Logistic Regression Results:
Accuracy: 0.7984386089425124
F1 Score: 0.5872093023255814
              precision    recall  f1-score   support

           0       0.84      0.89      0.87      1035
           1       0.64      0.54      0.59       374

    accuracy                           0.80      1409
   macro avg       0.74      0.72      0.73      1409
weighted avg       0.79      0.80      0.79      1409


Random Forest Results:
Accuracy: 0.7849538679914834
F1 Score: 0.5388127853881279
              precision    recall  f1-score   support

           0       0.83      0.90      0.86      1035
           1       0.63      0.47      0.54       374

    accuracy                           0.78      1409
   macro avg       0.73      0.69      0.70      1409
weighted avg       0.77      0.78      0.77      1409



STEP 11: SAVE BEST MODEL

In [19]:
joblib.dump(best_rf, "churn_pipeline.pkl")
print("\nBest pipeline saved as churn_pipeline.pkl")



Best pipeline saved as churn_pipeline.pkl


In [20]:
# To reload in another notebook/cell:
# model = joblib.load("churn_pipeline.pkl")
# model.predict(new_data)