Import Libraries

In [1]:
import os
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
import joblib


Load Dataset

In [2]:
# Known filenames for a local copy of the dataset, in lookup-priority order.
possible_files = [
    "Telco-Customer-Churn.csv",
    "WA_Fn-UseC_-Telco-Customer-Churn.csv",
    "telco.csv",
]

# First candidate that exists relative to the working directory, or None.
local_file = next(
    (candidate for candidate in possible_files if os.path.exists(candidate)),
    None,
)

if local_file is None:
    # No local copy found: fall back to reading the CSV straight from GitHub.
    url = "https://raw.githubusercontent.com/dphi-official/Datasets/master/Telco-Customer-Churn.csv"
    data = pd.read_csv(url)
else:
    data = pd.read_csv(local_file)


Target column


In [3]:
# Features: every column except the label and the customer identifier.
X = data.drop(columns=["Churn", "customerID"])

# Label: binary-encode churn ("Yes" -> 1, "No" -> 0); any other value maps to NaN.
y = data["Churn"].map({"No": 0, "Yes": 1})



Fix TotalCharges (object type with blanks)


In [4]:
# TotalCharges may be read as text; coerce it to numbers in one pass.
# Entries that cannot be parsed become NaN and are then replaced by 0.
if "TotalCharges" in X.columns:
    X["TotalCharges"] = pd.to_numeric(
        X["TotalCharges"], errors="coerce"
    ).fillna(0)


Align indices


In [5]:
# Re-index features and labels onto a common row index (outer join on axis 0)
# so that row i of X and element i of y refer to the same record.
X, y = X.align(y, join="outer", axis=0)


2. Identify feature types

In [6]:
# Column names split by dtype: int64/float64 columns are treated as numeric,
# object columns as categorical.
numeric_features = list(
    X.select_dtypes(include=["int64", "float64"]).columns
)
categorical_features = list(
    X.select_dtypes(include=["object"]).columns
)


3. Preprocessing


In [7]:
# Numeric columns: standardise with StandardScaler.
numeric_transformer = Pipeline([("scaler", StandardScaler())])

# Categorical columns: one-hot encode; handle_unknown="ignore" keeps transform
# from raising on categories that were not seen during fit.
categorical_transformer = Pipeline(
    [("encoder", OneHotEncoder(handle_unknown="ignore"))]
)

# Route each group of columns to its transformer.
preprocessor = ColumnTransformer(
    [
        ("num", numeric_transformer, numeric_features),
        ("cat", categorical_transformer, categorical_features),
    ]
)

4. Define Models + Pipelines


In [8]:
# Each model is wrapped in a Pipeline so that preprocessing is fitted together
# with the classifier on whatever data the pipeline is fitted on.
# The step name "classifier" is what the "classifier__*" grid keys refer to.
log_reg = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("classifier", LogisticRegression(max_iter=1000, solver="liblinear"))
])

# Seed the forest (same seed as the train/test split) so that the search
# results, the selected hyperparameters and the reported metrics are
# reproducible on a fresh Restart & Run All.
rf_clf = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("classifier", RandomForestClassifier(random_state=42))
])

5. Train/Test Split


In [9]:
# Hold out 20% of the rows for final evaluation. The split is seeded for
# repeatability and stratified on y so both parts keep the same class ratio.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)


6. Hyperparameter Tuning (Windows safe: n_jobs=1)


In [10]:
# Candidate hyperparameters; the "classifier__" prefix targets the pipeline
# step named "classifier".
param_grid_logreg = {"classifier__C": [0.1, 1, 10]}
param_grid_rf = {
    "classifier__n_estimators": [100, 200],
    "classifier__max_depth": [None, 10, 20],
}

# Settings shared by both searches: 5-fold CV scored by accuracy, run in a
# single process, with the best configuration refitted afterwards.
search_options = dict(cv=5, scoring="accuracy", n_jobs=1, refit=True)

grid_logreg = GridSearchCV(log_reg, param_grid_logreg, **search_options)
grid_rf = GridSearchCV(rf_clf, param_grid_rf, **search_options)


7. Fit models


In [11]:
# Run both grid searches on the training split only; the test split is not
# touched here. The progress messages are printed before each fit starts.
print("Training Logistic Regression...")
grid_logreg.fit(X_train, y_train)

print("Training Random Forest...")
# Last expression of the cell: the value returned by fit() is what the
# notebook renders as this cell's output.
grid_rf.fit(X_train, y_train)


Training Logistic Regression...
Training Random Forest...


0,1,2
,estimator,Pipeline(step...lassifier())])
,param_grid,"{'classifier__max_depth': [None, 10, ...], 'classifier__n_estimators': [100, 200]}"
,scoring,'accuracy'
,n_jobs,1
,refit,True
,cv,5
,verbose,0
,pre_dispatch,'2*n_jobs'
,error_score,
,return_train_score,False

0,1,2
,transformers,"[('num', ...), ('cat', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,categories,'auto'
,drop,
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,n_estimators,200
,criterion,'gini'
,max_depth,10
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


8. Evaluate


In [12]:
# Report held-out performance for each tuned model: selected hyperparameters,
# overall accuracy, and the per-class precision/recall/F1 table.
print("=== Evaluation on test set ===")
separator = "=" * 40
fitted_searches = (
    ("Logistic Regression", grid_logreg),
    ("Random Forest", grid_rf),
)
for name, model in fitted_searches:
    # predict() on the search object is called once and reused for both reports.
    y_pred = model.predict(X_test)
    test_accuracy = accuracy_score(y_test, y_pred)
    print("\n" + separator)
    print(name)
    print("Best Params:", model.best_params_)
    print("Accuracy:", test_accuracy)
    print(classification_report(y_test, y_pred))

=== Evaluation on test set ===

Logistic Regression
Best Params: {'classifier__C': 10}
Accuracy: 0.8048261178140526
              precision    recall  f1-score   support

           0       0.85      0.89      0.87      1035
           1       0.66      0.56      0.60       374

    accuracy                           0.80      1409
   macro avg       0.75      0.73      0.74      1409
weighted avg       0.80      0.80      0.80      1409


Random Forest
Best Params: {'classifier__max_depth': 10, 'classifier__n_estimators': 200}
Accuracy: 0.8034066713981547
              precision    recall  f1-score   support

           0       0.84      0.90      0.87      1035
           1       0.66      0.53      0.59       374

    accuracy                           0.80      1409
   macro avg       0.75      0.72      0.73      1409
weighted avg       0.79      0.80      0.80      1409



9. Save the best model

In [18]:
# Pick the search with the higher cross-validated score (best_score_);
# on a tie the logistic-regression search is kept.
if grid_rf.best_score_ > grid_logreg.best_score_:
    best_model = grid_rf
else:
    best_model = grid_logreg

# NOTE(review): this persists the whole GridSearchCV object rather than only
# its best_estimator_ pipeline — confirm that is what consumers of the file expect.
joblib.dump(best_model, "telco_churn_pipeline.joblib")
print("Best model saved as telco_churn_pipeline.joblib")

Best model saved as telco_churn_pipeline.joblib
