In [1]:
!pip install pandas scikit-learn joblib





[notice] A new release of pip is available: 24.2 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip


## Load the Telco Churn Dataset

In [83]:
import pandas as pd

# Load dataset
df = pd.read_csv("Telco-Customer-Churn.csv")

# Drop customerID (not useful)
df.drop("customerID", axis=1, inplace=True)

# Convert target to binary
df["Churn"] = df["Churn"].map({"Yes": 1, "No": 0})


In [85]:
print ( df )

      gender  SeniorCitizen Partner Dependents  tenure PhoneService  \
0     Female              0     Yes         No       1           No   
1       Male              0      No         No      34          Yes   
2       Male              0      No         No       2          Yes   
3       Male              0      No         No      45           No   
4     Female              0      No         No       2          Yes   
...      ...            ...     ...        ...     ...          ...   
7038    Male              0     Yes        Yes      24          Yes   
7039  Female              0     Yes        Yes      72          Yes   
7040  Female              0     Yes        Yes      11           No   
7041    Male              1     Yes         No       4          Yes   
7042    Male              0      No         No      66          Yes   

         MultipleLines InternetService OnlineSecurity OnlineBackup  \
0     No phone service             DSL             No          Yes   
1      

## Data Preprocessing

In [88]:
X = df.drop("Churn", axis=1)
y = df["Churn"]


In [90]:
# Columns
num_cols = X.select_dtypes(include=["int64", "float64"]).columns.tolist()
cat_cols = X.select_dtypes(include=["object"]).columns.tolist()

print("Numerical:", num_cols)
print("Categorical:", cat_cols)


Numerical: ['SeniorCitizen', 'tenure', 'MonthlyCharges']
Categorical: ['gender', 'Partner', 'Dependents', 'PhoneService', 'MultipleLines', 'InternetService', 'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies', 'Contract', 'PaperlessBilling', 'PaymentMethod', 'TotalCharges']


## Build the ML Pipeline

In [24]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import classification_report


##  Preprocessing blocks

In [92]:
# Numerical transformer
num_transformer = Pipeline(steps=[
    ('scaler', StandardScaler())
])

# Categorical transformer
cat_transformer = Pipeline(steps=[
    ('encoder', OneHotEncoder(handle_unknown='ignore'))
])

# Full preprocessing
preprocessor = ColumnTransformer(transformers=[
    ('num', num_transformer, num_cols),
    ('cat', cat_transformer, cat_cols)
])


## Build Full Pipeline with Model

In [96]:
# clf = Pipeline(steps=[
#     ('preprocessor', preprocessor),
#     ('classifier', LogisticRegression(solver='liblinear'))
# ])

from sklearn.ensemble import RandomForestClassifier

clf = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(class_weight='balanced', random_state=42))
])


## Train/Test Split & Train


In [98]:

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.83      0.91      0.87      1036
           1       0.66      0.50      0.57       373

    accuracy                           0.80      1409
   macro avg       0.75      0.70      0.72      1409
weighted avg       0.79      0.80      0.79      1409



## Try Random Forest + GridSearchCV

In [100]:
# pipe = Pipeline([
#     ('preprocessor', preprocessor),
#     ('classifier', RandomForestClassifier(random_state=42))
# ])

## trying something to increse the accuracy 
## class_weight='balanced' in RandomForestClassifier is a simple yet powerful trick to help improve performance on imbalanced datasets. 

pipe = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(class_weight='balanced', random_state=42))
])


# Grid search params
param_grid = {
    'classifier__n_estimators': [50, 100],
    'classifier__max_depth': [5, 10, None],
    'classifier__min_samples_split': [2, 5]
}

# Grid search
grid_search = GridSearchCV(pipe, param_grid, cv=3, scoring='f1', n_jobs=-1)


grid_search.fit(X_train, y_train)
y_pred = grid_search.predict(X_test)

print("Best Parameters:", grid_search.best_params_)
print(classification_report(y_test, y_pred))






Best Parameters: {'classifier__max_depth': None, 'classifier__min_samples_split': 5, 'classifier__n_estimators': 100}
              precision    recall  f1-score   support

           0       0.87      0.85      0.86      1036
           1       0.61      0.66      0.63       373

    accuracy                           0.80      1409
   macro avg       0.74      0.75      0.75      1409
weighted avg       0.80      0.80      0.80      1409



## Export Pipeline using joblib 

In [102]:
import joblib

# Export model
joblib.dump(grid_search.best_estimator_, "churn_pipeline.pkl")


['churn_pipeline.pkl']

In [105]:
model = joblib.load("churn_pipeline.pkl")

import pandas as pd

# Predict on a single row (as DataFrame)
single_row = X.iloc[[0]] 
prediction = model.predict(single_row)

# print("Prediction:", prediction)
label = "Yes" if prediction[0] == 1 else "No"
print("Customer will churn?", label)


Customer will churn? No


## Summary 

| Concept        | Tool/Technique Used                  | Purpose                                  |
| -------------- | ------------------------------------ | ---------------------------------------- |
| Data Cleaning  | Pandas                               | Remove ID column, map "Yes/No" to 1/0    |
| Preprocessing  | `StandardScaler`, `OneHotEncoder`    | Prepare data for ML models               |
| Pipeline       | `Pipeline()` & `ColumnTransformer()` | Wrap entire process into reusable object |
| Classification | `LogisticRegression`, `RandomForest` | Predict churn outcome                    |
| Tuning         | `GridSearchCV()`                     | Try multiple model configurations        |
| Exporting      | `joblib.dump()`                      | Save trained pipeline                    |
| Predicting     | `model.predict(df)`                  | Reuse pipeline on new data               |
