## Customer Churn Predictor

In [22]:
import pandas as pd 
import numpy as np
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.preprocessing import OneHotEncoder,OrdinalEncoder,LabelEncoder,StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer


In [23]:
df=pd.read_csv('D:\ML\Customer_Churn_Predictor\Customer_Churn\data\WA_Fn-UseC_-Telco-Customer-Churn.csv')

In [24]:
df.head()

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


In [25]:
df_copy=df.copy()

In [26]:
df_copy.columns

Index(['customerID', 'gender', 'SeniorCitizen', 'Partner', 'Dependents',
       'tenure', 'PhoneService', 'MultipleLines', 'InternetService',
       'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport',
       'StreamingTV', 'StreamingMovies', 'Contract', 'PaperlessBilling',
       'PaymentMethod', 'MonthlyCharges', 'TotalCharges', 'Churn'],
      dtype='object')

In [27]:
categorical_cols = df_copy.select_dtypes(include=["object", "category"]).columns.tolist()
numerical_cols = df_copy.select_dtypes(include=["int64", "float64"]).columns.tolist()
print("Categorical columns:", categorical_cols)
print("Numerical columns:", numerical_cols)

Categorical columns: ['customerID', 'gender', 'Partner', 'Dependents', 'PhoneService', 'MultipleLines', 'InternetService', 'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies', 'Contract', 'PaperlessBilling', 'PaymentMethod', 'TotalCharges', 'Churn']
Numerical columns: ['SeniorCitizen', 'tenure', 'MonthlyCharges']


In [28]:
for col in df.columns:
    if df[col].dtype == "object":
        print(f"{col}: {df[col].unique()}")


customerID: ['7590-VHVEG' '5575-GNVDE' '3668-QPYBK' ... '4801-JZAZL' '8361-LTMKD'
 '3186-AJIEK']
gender: ['Female' 'Male']
Partner: ['Yes' 'No']
Dependents: ['No' 'Yes']
PhoneService: ['No' 'Yes']
MultipleLines: ['No phone service' 'No' 'Yes']
InternetService: ['DSL' 'Fiber optic' 'No']
OnlineSecurity: ['No' 'Yes' 'No internet service']
OnlineBackup: ['Yes' 'No' 'No internet service']
DeviceProtection: ['No' 'Yes' 'No internet service']
TechSupport: ['No' 'Yes' 'No internet service']
StreamingTV: ['No' 'Yes' 'No internet service']
StreamingMovies: ['No' 'Yes' 'No internet service']
Contract: ['Month-to-month' 'One year' 'Two year']
PaperlessBilling: ['Yes' 'No']
PaymentMethod: ['Electronic check' 'Mailed check' 'Bank transfer (automatic)'
 'Credit card (automatic)']
TotalCharges: ['29.85' '1889.5' '108.15' ... '346.45' '306.6' '6844.5']
Churn: ['No' 'Yes']


In [29]:

df_copy.drop(columns=["customerID"],axis=1,inplace=True)

In [30]:
#Feature classification
# Input features (drop the target column)
X = df_copy.drop("Churn", axis=1)
# Target/output column
y = df_copy["Churn"]

# 1. Custom cleaner
def clean_total_charges(X):
    X = X.copy()
    X["TotalCharges"] = X["TotalCharges"].astype(str).str.strip()
    X["TotalCharges"] = pd.to_numeric(X["TotalCharges"], errors="coerce")
    X["TotalCharges"] = X["TotalCharges"].fillna(X["TotalCharges"].median())
    return X

clean_total_charges_transformer = FunctionTransformer(clean_total_charges)

# 2. Column selections
categorical_cols = ['gender', 'Partner', 'Dependents', 'PhoneService', 'MultipleLines',
                    'InternetService', 'OnlineSecurity', 'OnlineBackup', 'DeviceProtection',
                    'TechSupport', 'StreamingTV', 'StreamingMovies', 'PaperlessBilling',
                    'PaymentMethod']
ordinal_cols = ['Contract']
numerical_cols = ['SeniorCitizen', 'tenure', 'MonthlyCharges', 'TotalCharges']

# 3. Preprocessor
preprocessor = ColumnTransformer([
    ("ordinal", OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1,
                               categories=[['Month-to-month', 'One year', 'Two year']]), ordinal_cols),
    ("onehot", OneHotEncoder(handle_unknown="ignore"), categorical_cols),
     ("scaler", StandardScaler(), numerical_cols)
], remainder="passthrough")

# 4. Full Pipeline
pipeline = Pipeline([
    ("clean_total_charges", clean_total_charges_transformer),
    ("preprocessing", preprocessor),
    # ("model", LogisticRegression()) — You can add this step later
])

# 5. Encode y
le = LabelEncoder()
y = le.fit_transform(df_copy["Churn"])

# 6. Fit the pipeline
pipeline.fit(X, y)


0,1,2
,steps,"[('clean_total_charges', ...), ('preprocessing', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,func,<function cle...001E34D0D1480>
,inverse_func,
,validate,False
,accept_sparse,False
,check_inverse,True
,feature_names_out,
,kw_args,
,inv_kw_args,

0,1,2
,transformers,"[('ordinal', ...), ('onehot', ...), ...]"
,remainder,'passthrough'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,categories,"[['Month-to-month', 'One year', ...]]"
,dtype,<class 'numpy.float64'>
,handle_unknown,'use_encoded_value'
,unknown_value,-1
,encoded_missing_value,
,min_frequency,
,max_categories,

0,1,2
,categories,'auto'
,drop,
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,copy,True
,with_mean,True
,with_std,True


In [31]:
#let's split data for train and test
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.25,random_state=10)


In [32]:
#Hyperparameter tuning
lr=LogisticRegression(LogisticRegression(max_iter=1000))
c_values = [100, 10, 1.0, 0.1, 0.01]
params = [
    {
        'penalty': ['l1', 'l2'],
        'C': c_values,
        'solver': ['liblinear']
    },
    {
        'penalty': ['l2'],
        'C': c_values,
        'solver': ['lbfgs', 'newton-cg', 'sag']
    },
    {
        'penalty': ['l1', 'l2', 'elasticnet'],
        'C': c_values,
        'solver': ['saga'],
        'l1_ratio': [0.5]  # Only relevant for elasticnet
    }
]


## GridSearchCV
from sklearn.model_selection import GridSearchCV
grid_model=GridSearchCV(estimator=lr,param_grid=params,scoring='accuracy',cv=5,n_jobs=-1)


In [33]:
model_pipeline = Pipeline(steps=[
    ("clean_total_charges", clean_total_charges_transformer),
    ("preprocessing", preprocessor),
    ("Lregressor", grid_model)
])


In [34]:
model_pipeline.fit(X_train,y_train)



0,1,2
,steps,"[('clean_total_charges', ...), ('preprocessing', ...), ...]"
,transform_input,
,memory,
,verbose,False

0,1,2
,func,<function cle...001E34D0D1480>
,inverse_func,
,validate,False
,accept_sparse,False
,check_inverse,True
,feature_names_out,
,kw_args,
,inv_kw_args,

0,1,2
,transformers,"[('ordinal', ...), ('onehot', ...), ...]"
,remainder,'passthrough'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,categories,"[['Month-to-month', 'One year', ...]]"
,dtype,<class 'numpy.float64'>
,handle_unknown,'use_encoded_value'
,unknown_value,-1
,encoded_missing_value,
,min_frequency,
,max_categories,

0,1,2
,categories,'auto'
,drop,
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,estimator,LogisticRegre...ax_iter=1000))
,param_grid,"[{'C': [100, 10, ...], 'penalty': ['l1', 'l2'], 'solver': ['liblinear']}, {'C': [100, 10, ...], 'penalty': ['l2'], 'solver': ['lbfgs', 'newton-cg', ...]}, ...]"
,scoring,'accuracy'
,n_jobs,-1
,refit,True
,cv,5
,verbose,0
,pre_dispatch,'2*n_jobs'
,error_score,
,return_train_score,False

0,1,2
,penalty,'l1'
,dual,False
,tol,0.0001
,C,10
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,
,solver,'saga'
,max_iter,100


In [35]:
best_model = grid_model.best_estimator_

In [36]:
grid_model.best_score_

np.float64(0.8061390871821335)

In [37]:
final_pipeline = Pipeline(steps=[
    ("clean_total_charges", clean_total_charges_transformer),
    ("preprocessing", preprocessor),
    ("Lregressor", best_model)
])


In [38]:
final_pipeline.fit(X_train,y_train)





0,1,2
,steps,"[('clean_total_charges', ...), ('preprocessing', ...), ...]"
,transform_input,
,memory,
,verbose,False

0,1,2
,func,<function cle...001E34D0D1480>
,inverse_func,
,validate,False
,accept_sparse,False
,check_inverse,True
,feature_names_out,
,kw_args,
,inv_kw_args,

0,1,2
,transformers,"[('ordinal', ...), ('onehot', ...), ...]"
,remainder,'passthrough'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,categories,"[['Month-to-month', 'One year', ...]]"
,dtype,<class 'numpy.float64'>
,handle_unknown,'use_encoded_value'
,unknown_value,-1
,encoded_missing_value,
,min_frequency,
,max_categories,

0,1,2
,categories,'auto'
,drop,
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,penalty,'l1'
,dual,False
,tol,0.0001
,C,10
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,
,solver,'saga'
,max_iter,100


In [39]:
from sklearn.metrics import confusion_matrix,accuracy_score,classification_report
y_pred=final_pipeline.predict(X_test)

Analysis


In [40]:
score=accuracy_score(y_pred,y_test)
print(score)
print(classification_report(y_pred,y_test))
print(confusion_matrix(y_pred,y_test))

0.8006814310051107
              precision    recall  f1-score   support

           0       0.89      0.85      0.87      1369
           1       0.55      0.63      0.58       392

    accuracy                           0.80      1761
   macro avg       0.72      0.74      0.73      1761
weighted avg       0.81      0.80      0.81      1761

[[1163  206]
 [ 145  247]]


Pikling

In [41]:
import pickle
pickle.dump(final_pipeline, open("model.pkl",'wb'))

