### Problem Statement
Customer churn is a major challenge for telecom companies. Predicting whether a customer
is likely to leave helps businesses take proactive retention measures.

### Objective
Build a reusable, production-ready machine learning pipeline with scikit-learn
that predicts customer churn from the Telco Customer Churn dataset.



In [None]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, classification_report

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

import joblib


In [None]:
# Load the Telco churn CSV from the working directory, then preview the
# first five rows to sanity-check the parsed columns.
df = pd.read_csv(filepath_or_buffer="Telco-Customer-Churn.csv")
df.head(n=5)


In [None]:
# Convert TotalCharges to numeric; entries that cannot be parsed become NaN.
df["TotalCharges"] = pd.to_numeric(df["TotalCharges"], errors="coerce")

# Drop every row with a missing value in any column (this includes the rows
# whose TotalCharges could not be parsed above).
df.dropna(inplace=True)

# Encode target variable: "Yes" -> 1, "No" -> 0.
# The dtype guard keeps this cell safe to re-run: mapping an already-encoded
# 0/1 column through the Yes/No dictionary would turn every label into NaN.
if not pd.api.types.is_numeric_dtype(df["Churn"]):
    churn_encoded = df["Churn"].map({"Yes": 1, "No": 0})
    # Missing values were dropped above, so a NaN here can only come from a
    # label other than "Yes"/"No"; fail loudly instead of training on NaN.
    if churn_encoded.isna().any():
        unexpected = sorted(
            df.loc[churn_encoded.isna(), "Churn"].astype(str).unique()
        )
        raise ValueError(
            f"Unexpected 'Churn' labels (expected 'Yes'/'No'): {unexpected}"
        )
    df["Churn"] = churn_encoded.astype("int64")


In [None]:
# Target vector: the encoded Churn column.
y = df["Churn"]
# Feature matrix: every column except customerID and the target itself.
X = df.drop(["customerID", "Churn"], axis=1)

# Group the feature names by dtype so each group can be preprocessed
# differently further down.
numerical_features = X.select_dtypes(include=["int64", "float64"]).columns
categorical_features = X.select_dtypes(include=["object"]).columns


In [None]:
# Numeric columns are standardized.
numeric_transformer = Pipeline([("scaler", StandardScaler())])

# Categorical columns are one-hot encoded; categories unseen during fit are
# ignored at transform time rather than raising an error.
categorical_transformer = Pipeline(
    [("encoder", OneHotEncoder(handle_unknown="ignore"))]
)

# Route each column group to its own transformer.
preprocessor = ColumnTransformer([
    ("num", numeric_transformer, numerical_features),
    ("cat", categorical_transformer, categorical_features),
])


In [None]:
# Hold out 20% of the rows for testing. The split is stratified on y and
# seeded so it is reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    random_state=42,
    test_size=0.2,
)


In [None]:
# Preprocessing followed by a logistic-regression classifier; max_iter is
# raised to 1000 iterations.
lr_pipeline = Pipeline([
    ("preprocessor", preprocessor),
    ("classifier", LogisticRegression(max_iter=1000)),
])


In [None]:
# Preprocessing followed by a random-forest classifier with a fixed seed.
rf_pipeline = Pipeline([
    ("preprocessor", preprocessor),
    ("classifier", RandomForestClassifier(random_state=42)),
])


In [None]:
# Candidate values for the C parameter of the pipeline's "classifier" step.
lr_params = {"classifier__C": [0.01, 0.1, 1, 10]}

# Grid search over lr_params with 5-fold cross-validation, scored on
# accuracy; n_jobs=-1 requests all available cores.
lr_grid = GridSearchCV(
    estimator=lr_pipeline,
    param_grid=lr_params,
    scoring="accuracy",
    cv=5,
    n_jobs=-1,
)

lr_grid.fit(X_train, y_train)


In [None]:
# Candidate forest sizes and tree depths for the pipeline's "classifier" step.
rf_params = {
    "classifier__n_estimators": [100, 200],
    "classifier__max_depth": [None, 10, 20],
}

# Grid search over rf_params with 5-fold cross-validation, scored on
# accuracy; n_jobs=-1 requests all available cores.
rf_grid = GridSearchCV(
    estimator=rf_pipeline,
    param_grid=rf_params,
    scoring="accuracy",
    cv=5,
    n_jobs=-1,
)

rf_grid.fit(X_train, y_train)


In [None]:
def evaluate(model, name):
    """Print test-set accuracy and a classification report for a model.

    Predictions are made on the module-level ``X_test`` and compared with
    the module-level ``y_test``.

    Args:
        model: Fitted estimator exposing ``predict``.
        name: Heading printed above the metrics.
    """
    predictions = model.predict(X_test)
    print(f"\n{name}")
    accuracy = accuracy_score(y_test, predictions)
    print("Accuracy:", accuracy)
    report = classification_report(y_test, predictions)
    print(report)

evaluate(model=lr_grid.best_estimator_, name="Logistic Regression")
evaluate(model=rf_grid.best_estimator_, name="Random Forest")


In [None]:
# Pick the model to export by mean cross-validated accuracy instead of
# hard-coding one of the searches. Both searches use the same scoring and
# cv settings, so their best_score_ values are directly comparable, and
# selecting on CV scores keeps the held-out test set out of model selection.
# rf_grid is listed first so that Random Forest is kept when scores tie.
best_model = max(
    (rf_grid, lr_grid),
    key=lambda search: search.best_score_,
).best_estimator_

# Persist the whole fitted pipeline (preprocessing + classifier) to disk.
joblib.dump(best_model, "churn_prediction_pipeline.pkl")


In [None]:
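### Final Insights
- Wrapping preprocessing and the classifier in a single Pipeline keeps the workflow clean and reusable, and prevents leakage because scaling and encoding are fit only on the training folds.
- Random Forest achieved better performance than Logistic Regression.
- GridSearchCV improved model performance through hyperparameter tuning with 5-fold cross-validation.
- The exported pipeline bundles preprocessing with the model, so it can be loaded with `joblib.load` and deployed directly in production.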


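### Final Insights
- Wrapping preprocessing and the classifier in a single Pipeline keeps the workflow clean and reusable, and prevents leakage because scaling and encoding are fit only on the training folds.
- Random Forest achieved better performance than Logistic Regression.
- GridSearchCV improved model performance through hyperparameter tuning with 5-fold cross-validation.
- The exported pipeline bundles preprocessing with the model, so it can be loaded with `joblib.load` and deployed directly in production.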
