## End-to-End ML Pipeline with Scikit-learn Pipeline API

In [2]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
import joblib


In [3]:
# Replace with your actual dataset path
df = pd.read_csv("Telco-Customer-Churn.csv")

# Drop customerID (not useful for prediction)
df = df.drop("customerID", axis=1)

# Convert target column to binary (Yes -> 1, No -> 0)
df["Churn"] = df["Churn"].map({"Yes": 1, "No": 0})


In [4]:
X = df.drop("Churn", axis=1)
y = df["Churn"]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)


In [5]:
# Identify categorical & numerical features
categorical_features = X.select_dtypes(include=["object"]).columns
numerical_features = X.select_dtypes(include=["int64", "float64"]).columns

# Preprocessing: scale numeric, one-hot encode categorical
preprocessor = ColumnTransformer(
    transformers=[
        ("num", StandardScaler(), numerical_features),
        ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_features),
    ]
)


In [6]:
# Logistic Regression Pipeline
logreg_pipeline = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("classifier", LogisticRegression(max_iter=1000))
])

# Random Forest Pipeline
rf_pipeline = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("classifier", RandomForestClassifier(random_state=42))
])


In [7]:
# Define parameter grids
param_grid_logreg = {
    "classifier__C": [0.1, 1, 10],
    "classifier__penalty": ["l2"],
    "classifier__solver": ["lbfgs"]
}

param_grid_rf = {
    "classifier__n_estimators": [100, 200],
    "classifier__max_depth": [5, 10, None],
    "classifier__min_samples_split": [2, 5]
}

# Grid search for Logistic Regression
grid_logreg = GridSearchCV(logreg_pipeline, param_grid_logreg,
                           cv=5, scoring="accuracy", n_jobs=-1)

grid_logreg.fit(X_train, y_train)

# Grid search for Random Forest
grid_rf = GridSearchCV(rf_pipeline, param_grid_rf,
                       cv=5, scoring="accuracy", n_jobs=-1)

grid_rf.fit(X_train, y_train)


In [8]:
# Logistic Regression
y_pred_logreg = grid_logreg.predict(X_test)
print("Logistic Regression Best Params:", grid_logreg.best_params_)
print("Logistic Regression Accuracy:", accuracy_score(y_test, y_pred_logreg))
print(classification_report(y_test, y_pred_logreg))

# Random Forest
y_pred_rf = grid_rf.predict(X_test)
print("Random Forest Best Params:", grid_rf.best_params_)
print("Random Forest Accuracy:", accuracy_score(y_test, y_pred_rf))
print(classification_report(y_test, y_pred_rf))


Logistic Regression Best Params: {'classifier__C': 0.1, 'classifier__penalty': 'l2', 'classifier__solver': 'lbfgs'}
Logistic Regression Accuracy: 0.7984386089425124
              precision    recall  f1-score   support

           0       0.84      0.89      0.87      1035
           1       0.64      0.54      0.59       374

    accuracy                           0.80      1409
   macro avg       0.74      0.72      0.73      1409
weighted avg       0.79      0.80      0.79      1409

Random Forest Best Params: {'classifier__max_depth': None, 'classifier__min_samples_split': 5, 'classifier__n_estimators': 100}
Random Forest Accuracy: 0.78708303761533
              precision    recall  f1-score   support

           0       0.82      0.90      0.86      1035
           1       0.64      0.46      0.54       374

    accuracy                           0.79      1409
   macro avg       0.73      0.68      0.70      1409
weighted avg       0.77      0.79      0.78      1409



In [9]:
# Choose the best performing model
best_model = grid_rf if grid_rf.best_score_ > grid_logreg.best_score_ else grid_logreg

# Save the pipeline
joblib.dump(best_model.best_estimator_, "churn_prediction_pipeline.joblib")
print("Best model saved as churn_prediction_pipeline.joblib")


Best model saved as churn_prediction_pipeline.joblib
