# Task 2: End-to-End ML Pipeline for Customer Churn Prediction

In [None]:
!pip install pandas scikit-learn joblib openpyxl -q


In [None]:
import pandas as pd


##  Step 2: Load the Dataset

In [None]:
# Mount Google Drive at /content/drive so the dataset stored there can be
# read like a local file. Relies on the google.colab package being available.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Location of the churn CSV on the mounted Drive -- update path as needed.
file_path = (
    "/content/drive/MyDrive/telco-customer churn dtset/"
    "Telco Customer Churn (1).csv"
)

# Read the whole file into a DataFrame.
df = pd.read_csv(file_path)

# Preview the first rows.
df.head()

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


In [None]:
# (number of rows, number of columns) of df.
df.shape

(7043, 21)

##  Step 3: Data Preprocessing

In [None]:
# Class balance of the target: number of rows for each Churn label.
churn_counts = df.loc[:, "Churn"].value_counts()
print(churn_counts)


Churn
No     5174
Yes    1869
Name: count, dtype: int64


In [None]:
# Remove the customerID column when it exists; errors="ignore" turns this
# into a no-op if the column is already gone (e.g. when the cell is re-run).
df.drop(columns=["customerID"], errors="ignore", inplace=True)

In [None]:
# Convert TotalCharges to a numeric dtype. errors='coerce' replaces every
# value that cannot be parsed as a number with NaN instead of raising.
df["TotalCharges"] = pd.to_numeric(df["TotalCharges"], errors='coerce')

In [None]:
# Drop, in place, every row that has a missing value in any column.
df.dropna(axis=0, how="any", inplace=True)

In [None]:
# Target: the Churn label encoded as 1 for "Yes" and 0 for "No".
y = df["Churn"].map({"Yes": 1, "No": 0})

# Features: every column except the target.
X = df.drop(columns=["Churn"])

In [None]:
# Count how often each distinct combination of feature values (i.e. each
# unique row of X) occurs; a count above 1 means that feature row is duplicated.
X_counts = X.value_counts()
print(X_counts)


gender  SeniorCitizen  Partner  Dependents  tenure  PhoneService  MultipleLines  InternetService  OnlineSecurity       OnlineBackup         DeviceProtection     TechSupport          StreamingTV          StreamingMovies      Contract        PaperlessBilling  PaymentMethod     MonthlyCharges  TotalCharges
Male    0              No       No          1       Yes           No             No               No internet service  No internet service  No internet service  No internet service  No internet service  No internet service  Month-to-month  No                Mailed check      20.20           20.20           4
                                                                                 Fiber optic      No                   No                   No                   No                   No                   No                   Month-to-month  Yes               Electronic check  69.90           69.90           3
                                                                           

In [None]:
# (rows, feature columns) of the feature matrix X.
X.shape

(7032, 19)

In [None]:
# Shape of the target y; its length should equal the number of rows in X.
y.shape

(7032,)

##  Step 4: Train/Test Split

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 10% of the rows for testing. stratify=y keeps the churn ratio the
# same in both splits; random_state fixes the shuffle for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.1,
    random_state=42,
    stratify=y,
)

##  Step 5: Pipeline Construction

In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Columns handled as numeric features.
numeric_features = ["tenure", "MonthlyCharges", "TotalCharges"]

# Every other column of X is handled as categorical. The list is built by
# filtering X.columns rather than with a set difference, so it follows the
# DataFrame's column order and is identical on every run. Iteration order of
# a set of strings depends on Python's hash randomisation and would otherwise
# change the order of the encoded features between kernel sessions.
categorical_features = [col for col in X.columns if col not in numeric_features]



In [None]:
# Column-wise preprocessing: standardise the numeric columns and one-hot
# encode the categorical ones. handle_unknown="ignore" makes categories that
# were not seen during fit encode as all zeros instead of raising.
preprocessor = ColumnTransformer(
    transformers=[
        ("num", StandardScaler(), numeric_features),
        ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_features),
    ]
)

In [None]:
# Full model: preprocessing followed by a logistic-regression classifier.
# max_iter=1000 raises the solver's iteration limit.
pipeline = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        ("classifier", LogisticRegression(max_iter=1000)),
    ]
)

##  Step 6: Train the Model

In [None]:
# Fit the preprocessing steps and the classifier on the training split only.
pipeline.fit(X=X_train, y=y_train)

##  Step 7: Evaluate the Model

In [None]:
from sklearn.metrics import classification_report, confusion_matrix

# Predict churn for the held-out test split.
y_pred = pipeline.predict(X_test)

# Print the confusion matrix first, then the per-class precision/recall/F1
# report, each starting on its own line.
print(
    confusion_matrix(y_test, y_pred),
    classification_report(y_test, y_pred),
    sep="\n",
)

[[452  65]
 [ 85 102]]
              precision    recall  f1-score   support

           0       0.84      0.87      0.86       517
           1       0.61      0.55      0.58       187

    accuracy                           0.79       704
   macro avg       0.73      0.71      0.72       704
weighted avg       0.78      0.79      0.78       704



##  Step 8: GridSearchCV for Hyperparameter Tuning

In [None]:
from sklearn.model_selection import GridSearchCV

# Candidate estimators for the "classifier" step of the pipeline.
# The random forest gets a fixed random_state so that its cross-validation
# scores -- and therefore the selected model -- are reproducible across runs.
param_grid = {
    "classifier": [
        LogisticRegression(max_iter=1000),
        RandomForestClassifier(random_state=42),
    ],
}

# Pipeline whose "classifier" step is swapped out by the grid search; the
# estimator given here is only a placeholder.
grid_pipeline = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        ("classifier", LogisticRegression(max_iter=1000)),
    ]
)

# 5-fold cross-validation on the training split, scored by accuracy, using
# all available CPU cores (n_jobs=-1).
grid_search = GridSearchCV(grid_pipeline, param_grid, cv=5, scoring="accuracy", n_jobs=-1)
grid_search.fit(X_train, y_train)

print("Best Parameters:", grid_search.best_params_)
print("Best Score:", grid_search.best_score_)

Best Parameters: {'classifier': LogisticRegression(max_iter=1000)}
Best Score: 0.8048382443849166


##  Step 9: Export the Final Pipeline

In [None]:
import joblib

In [None]:
# Persist the best pipeline found by the grid search (preprocessing and
# classifier together) to a file in the current working directory.
joblib.dump(value=grid_search.best_estimator_, filename="telco_churn_model.pkl")



['telco_churn_model.pkl']

In [None]:
# Reload the saved pipeline to check that the exported file is usable.
# NOTE: joblib.load unpickles the file, so only load files you trust.
model = joblib.load("telco_churn_model.pkl")

# Predict on the first 100 rows of the test split as a smoke test.
sample = X_test.head(100)
print("Prediction:", model.predict(sample))

Prediction: [0 0 0 0 0 1 0 1 0 0 1 0 1 0 0 0 0 0 0 0 1 0 0 0 0 0 0 1 1 0 0 0 0 0 1 0 0
 1 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0 1 0 1 0 0 0 0 1 0 1 0 0 0 1 0 0 0 0 1 0 0
 0 0 0 0 1 1 0 0 0 0 1 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0]


##  Summary

- Loaded CSV dataset
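- Preprocessed the data, then trained a Logistic Regression pipeline and evaluated it on a held-out test set
- Compared Logistic Regression against Random Forest with GridSearchCV (5-fold cross-validation, accuracy)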
- Saved the final pipeline using joblib