# Telco Customer Churn

1. **Data Ingestion**

In [135]:
!pip install -Uq chardet

In [136]:
import pandas as pd
import numpy as np
import os
pd.options.display.max_columns = 200

In [137]:
pwd

'C:\\redoneproj'

In [138]:
FILE_NAME = "telco-customer-churn.csv"
DATA_FOLDER = "data"

In [139]:
main_path = os.getcwd()

In [140]:
main_path

'C:\\redoneproj'

In [141]:
file_path = os.path.join(os.path.join(main_path, DATA_FOLDER), FILE_NAME)
file_path

'C:\\redoneproj\\data\\telco-customer-churn.csv'

In [142]:
telco_data =pd.read_csv(file_path)

In [143]:
telco_data.head()

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,Yes,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,No,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,Yes,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,No,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,No,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


In [144]:
telco_data.shape

(7043, 21)

### Data Inspection

In [145]:
telco_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   customerID        7043 non-null   object 
 1   gender            7043 non-null   object 
 2   SeniorCitizen     7043 non-null   int64  
 3   Partner           7043 non-null   object 
 4   Dependents        7043 non-null   object 
 5   tenure            7043 non-null   int64  
 6   PhoneService      7043 non-null   object 
 7   MultipleLines     7043 non-null   object 
 8   InternetService   7043 non-null   object 
 9   OnlineSecurity    7043 non-null   object 
 10  OnlineBackup      7043 non-null   object 
 11  DeviceProtection  7043 non-null   object 
 12  TechSupport       7043 non-null   object 
 13  StreamingTV       7043 non-null   object 
 14  StreamingMovies   7043 non-null   object 
 15  Contract          7043 non-null   object 
 16  PaperlessBilling  7043 non-null   object 


In [146]:
telco_data['TotalCharges'] = (
    pd.to_numeric
    (telco_data
     ['TotalCharges'],
     errors='coerce')
 )


In [147]:
telco_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   customerID        7043 non-null   object 
 1   gender            7043 non-null   object 
 2   SeniorCitizen     7043 non-null   int64  
 3   Partner           7043 non-null   object 
 4   Dependents        7043 non-null   object 
 5   tenure            7043 non-null   int64  
 6   PhoneService      7043 non-null   object 
 7   MultipleLines     7043 non-null   object 
 8   InternetService   7043 non-null   object 
 9   OnlineSecurity    7043 non-null   object 
 10  OnlineBackup      7043 non-null   object 
 11  DeviceProtection  7043 non-null   object 
 12  TechSupport       7043 non-null   object 
 13  StreamingTV       7043 non-null   object 
 14  StreamingMovies   7043 non-null   object 
 15  Contract          7043 non-null   object 
 16  PaperlessBilling  7043 non-null   object 


##  Exploration Data Analysis

1. ## Model Building

In [148]:
from sklearn.model_selection import train_test_split
from autogluon.tabular import TabularDataset, TabularPredictor

In [149]:
telco_data.columns

Index(['customerID', 'gender', 'SeniorCitizen', 'Partner', 'Dependents',
       'tenure', 'PhoneService', 'MultipleLines', 'InternetService',
       'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport',
       'StreamingTV', 'StreamingMovies', 'Contract', 'PaperlessBilling',
       'PaymentMethod', 'MonthlyCharges', 'TotalCharges', 'Churn'],
      dtype='object')

In [150]:
X = telco_data[['customerID', 'gender', 'SeniorCitizen', 
                'Partner', 'Dependents',
                'tenure', 'PhoneService', 'MultipleLines',
                'InternetService',
                'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 
                'TechSupport', 'StreamingTV', 'StreamingMovies', 
                'Contract', 'PaperlessBilling',
                 'PaymentMethod', 'MonthlyCharges', 'TotalCharges',]]

In [151]:
y = telco_data["Churn"]

In [152]:
X_train, X_test, y_train, y_test = train_test_split( X, y, test_size=0.33, random_state=42)

In [153]:
X_train.shape, X_test.shape

((4718, 20), (2325, 20))

## Train Model

In [154]:
train_data, test_data = train_test_split( telco_data, test_size=0.33, random_state=42)

In [155]:
train_data.shape, test_data.shape

((4718, 21), (2325, 21))

In [156]:
label = 'Churn'
print("Summary of class variable: \n", train_data[label].describe())

Summary of class variable: 
 count     4718
unique       2
top         No
freq      3477
Name: Churn, dtype: object


In [157]:
save_path = 'model'
predictor = TabularPredictor(label=label, path=save_path).fit(train_data)

Beginning AutoGluon training ...
AutoGluon will save models to "model\"
AutoGluon Version:  0.6.2
Python Version:     3.9.16
Operating System:   Windows
Platform Machine:   AMD64
Platform Version:   10.0.19045
Train Data Rows:    4718
Train Data Columns: 20
Label Column: Churn
Preprocessing data ...
AutoGluon infers your prediction problem is: 'binary' (because only two unique label-values observed).
	2 unique label values:  ['No', 'Yes']
	If 'binary' is not the correct problem_type, please manually specify the problem_type parameter during predictor init (You may specify problem_type as one of: ['binary', 'multiclass', 'regression'])
Selected class <--> label mapping:  class 1 = Yes, class 0 = No
	Note: For your binary classification, AutoGluon arbitrarily selected which label-value represents positive (Yes) vs negative (No) class.
	To explicitly set the positive_class, either rename classes to 1 and 0, or specify positive_class in Predictor init.
Using Feature Generators to preproces

In [158]:
predictor.leaderboard(silent=True)

Unnamed: 0,model,score_val,pred_time_val,fit_time,pred_time_val_marginal,fit_time_marginal,stack_level,can_infer,fit_order
0,WeightedEnsemble_L2,0.812,0.056963,17.208059,0.002997,1.37615,2,True,14
1,LightGBMXT,0.806,0.022988,1.09532,0.022988,1.09532,1,True,3
2,NeuralNetTorch,0.806,0.030979,14.736589,0.030979,14.736589,1,True,12
3,CatBoost,0.802,0.016989,29.464593,0.016989,29.464593,1,True,7
4,LightGBM,0.802,0.021987,1.180267,0.021987,1.180267,1,True,4
5,NeuralNetFastAI,0.8,0.061734,15.091016,0.061734,15.091016,1,True,10
6,XGBoost,0.798,0.025987,1.165277,0.025987,1.165277,1,True,11
7,ExtraTreesEntr,0.796,0.141912,1.491077,0.141912,1.491077,1,True,9
8,LightGBMLarge,0.784,0.022987,2.595392,0.022987,2.595392,1,True,13
9,ExtraTreesGini,0.784,0.119926,1.420121,0.119926,1.420121,1,True,8


## Model Evaluation 

In [159]:
y_test = test_data[label]  # values to predict
test_data_nolab = test_data.drop(columns=[label])  # delete label column to prove we're not cheating
test_data_nolab.head()

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges
185,1024-GUALD,Female,0,Yes,No,1,No,No phone service,DSL,No,No,No,No,No,No,Month-to-month,Yes,Electronic check,24.8,24.8
2715,0484-JPBRU,Male,0,No,No,41,Yes,Yes,No,No internet service,No internet service,No internet service,No internet service,No internet service,No internet service,Month-to-month,Yes,Bank transfer (automatic),25.25,996.45
3825,3620-EHIMZ,Female,0,Yes,Yes,52,Yes,No,No,No internet service,No internet service,No internet service,No internet service,No internet service,No internet service,Two year,No,Mailed check,19.35,1031.7
1807,6910-HADCM,Female,0,No,No,1,Yes,No,Fiber optic,No,No,Yes,No,No,No,Month-to-month,No,Electronic check,76.35,76.35
132,8587-XYZSF,Male,0,No,No,67,Yes,No,DSL,No,No,No,Yes,No,No,Two year,No,Bank transfer (automatic),50.55,3260.1


In [160]:
save_model_predictor = TabularPredictor.load(save_path)

In [161]:
y_pred = save_model_predictor.predict(test_data_nolab)

In [162]:
y_pred[0:5]

185     Yes
2715     No
3825     No
1807    Yes
132      No
Name: Churn, dtype: object

In [163]:
print("Predictions:  \n", y_pred)
perf = predictor.evaluate_predictions(y_true=y_test, y_pred=y_pred, auxiliary_metrics=True)

Evaluation: accuracy on test data: 0.8008602150537635
Evaluations on test data:
{
    "accuracy": 0.8008602150537635,
    "balanced_accuracy": 0.6714926866069384,
    "mcc": 0.43917707755791546,
    "f1": 0.5141657922350473,
    "precision": 0.7538461538461538,
    "recall": 0.39012738853503187
}


Predictions:  
 185     Yes
2715     No
3825     No
1807    Yes
132      No
       ... 
4147     No
3542     No
3759    Yes
1114     No
4958     No
Name: Churn, Length: 2325, dtype: object


In [164]:
save_model_predictor.leaderboard(test_data, silent=True)

Unnamed: 0,model,score_test,score_val,pred_time_test,pred_time_val,fit_time,pred_time_test_marginal,pred_time_val_marginal,fit_time_marginal,stack_level,can_infer,fit_order
0,CatBoost,0.811613,0.802,0.04897,0.016989,29.464593,0.04897,0.016989,29.464593,1,True,7
1,NeuralNetFastAI,0.806882,0.8,0.195879,0.061734,15.091016,0.195879,0.061734,15.091016,1,True,10
2,RandomForestGini,0.804301,0.782,0.344787,0.180888,1.668966,0.344787,0.180888,1.668966,1,True,5
3,XGBoost,0.803011,0.798,0.065958,0.025987,1.165277,0.065958,0.025987,1.165277,1,True,11
4,WeightedEnsemble_L2,0.80086,0.812,0.105932,0.056963,17.208059,0.006994,0.002997,1.37615,2,True,14
5,NeuralNetTorch,0.80043,0.806,0.074955,0.030979,14.736589,0.074955,0.030979,14.736589,1,True,12
6,LightGBM,0.8,0.802,0.037976,0.021987,1.180267,0.037976,0.021987,1.180267,1,True,4
7,RandomForestEntr,0.79871,0.784,0.310807,0.146909,2.098697,0.310807,0.146909,2.098697,1,True,6
8,LightGBMXT,0.797849,0.806,0.023983,0.022988,1.09532,0.023983,0.022988,1.09532,1,True,3
9,ExtraTreesEntr,0.795269,0.796,0.343788,0.141912,1.491077,0.343788,0.141912,1.491077,1,True,9


In [165]:
test_data["predicted_Churn"] = y_pred

In [166]:
test_data.head()

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn,predicted_Churn
185,1024-GUALD,Female,0,Yes,No,1,No,No phone service,DSL,No,No,No,No,No,No,Month-to-month,Yes,Electronic check,24.8,24.8,Yes,Yes
2715,0484-JPBRU,Male,0,No,No,41,Yes,Yes,No,No internet service,No internet service,No internet service,No internet service,No internet service,No internet service,Month-to-month,Yes,Bank transfer (automatic),25.25,996.45,No,No
3825,3620-EHIMZ,Female,0,Yes,Yes,52,Yes,No,No,No internet service,No internet service,No internet service,No internet service,No internet service,No internet service,Two year,No,Mailed check,19.35,1031.7,No,No
1807,6910-HADCM,Female,0,No,No,1,Yes,No,Fiber optic,No,No,Yes,No,No,No,Month-to-month,No,Electronic check,76.35,76.35,Yes,Yes
132,8587-XYZSF,Male,0,No,No,67,Yes,No,DSL,No,No,No,Yes,No,No,Two year,No,Bank transfer (automatic),50.55,3260.1,No,No
