**TPOT installation**

In [None]:
pip install tpot



Import TPOT packages and load Telecom Churn Dataset


In [None]:
from __future__ import print_function

import os
import sys
import tempfile
import urllib
import urllib.request

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from tpot import TPOTClassifier

In [None]:
# Local cache location for the raw CSV and the remote location it is fetched from.
BASE_DIR = '/tmp'
OUTPUT_FILE = os.path.join(BASE_DIR, 'churn_data.csv')
CHURN_DATA_URL = 'https://raw.githubusercontent.com/srivatsan88/YouTubeLI/master/dataset/WA_Fn-UseC_-Telco-Customer-Churn.csv'

# Create the target directory if it is missing so the download cannot fail on the path.
os.makedirs(BASE_DIR, exist_ok=True)

# urlretrieve saves the file to OUTPUT_FILE and returns a (filename, headers) tuple,
# kept in `churn_data` as before.
churn_data = urllib.request.urlretrieve(CHURN_DATA_URL, OUTPUT_FILE)

In [None]:
# Load the downloaded CSV into a DataFrame.
churn_df = pd.read_csv(filepath_or_buffer=OUTPUT_FILE)

In [None]:
# Preview the first five rows of the raw data.
churn_df.head(n=5)

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


In [None]:
# Report the size of the dataset.
n_rows, n_cols = churn_df.shape
print(f"Rows    : {n_rows}")
print(f"Columns : {n_cols}")

Rows    : 7043
Columns : 21


Convert Categorical data to numerical encoding

In [None]:
from sklearn.compose import make_column_transformer
from sklearn.preprocessing import OrdinalEncoder

# For simplicity every categorical column (including the Churn target) is mapped to
# integer codes with an ordinal encoder; one-hot encoding would be an alternative.
categorical_columns = [
    'gender',
    'Partner',
    'Dependents',
    'PhoneService',
    'MultipleLines',
    'InternetService',
    'OnlineSecurity',
    'OnlineBackup',
    'DeviceProtection',
    'TechSupport',
    'StreamingTV',
    'StreamingMovies',
    'Contract',
    'PaperlessBilling',
    'PaymentMethod',
    'Churn',
]

# The transformer is given only the categorical columns, and its output is stored
# separately in `churn_transformed`.
ordinal_encoder = OrdinalEncoder()
column_trans = make_column_transformer((ordinal_encoder, categorical_columns))

churn_transformed = column_trans.fit_transform(churn_df)


In [None]:
# Wrap the encoded array in a DataFrame labelled with the categorical column names.
# The previous `churn_df.copy()` was dead code (overwritten on the next line) and has
# been dropped. Reusing churn_df's index keeps the rows aligned with the source frame
# even if that frame does not carry a default 0..n-1 index.
churn_df_trans = pd.DataFrame(
    churn_transformed,
    columns=categorical_columns,
    index=churn_df.index,
)

In [None]:
# Overwrite the categorical columns of churn_df with their encoded values.
# Whole-column assignment (aligned on the index) replaces each column outright, so the
# resulting dtype is that of the encoded data and does not depend on how
# `DataFrame.update` treats the original object-dtype columns in a given pandas version.
encoded_columns = list(churn_df_trans.columns)
churn_df[encoded_columns] = churn_df_trans

In [None]:
# Display the frame after the categorical columns have been replaced by their codes.
churn_df

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,0.0,0,1.0,0.0,1,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,2.0,29.85,29.85,0.0
1,5575-GNVDE,1.0,0,0.0,0.0,34,1.0,0.0,0.0,2.0,...,2.0,0.0,0.0,0.0,1.0,0.0,3.0,56.95,1889.5,0.0
2,3668-QPYBK,1.0,0,0.0,0.0,2,1.0,0.0,0.0,2.0,...,0.0,0.0,0.0,0.0,0.0,1.0,3.0,53.85,108.15,1.0
3,7795-CFOCW,1.0,0,0.0,0.0,45,0.0,1.0,0.0,2.0,...,2.0,2.0,0.0,0.0,1.0,0.0,0.0,42.30,1840.75,0.0
4,9237-HQITU,0.0,0,0.0,0.0,2,1.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,2.0,70.70,151.65,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7038,6840-RESVB,1.0,0,1.0,1.0,24,1.0,2.0,0.0,2.0,...,2.0,2.0,2.0,2.0,1.0,1.0,3.0,84.80,1990.5,0.0
7039,2234-XADUH,0.0,0,1.0,1.0,72,1.0,2.0,1.0,0.0,...,2.0,0.0,2.0,2.0,1.0,1.0,1.0,103.20,7362.9,0.0
7040,4801-JZAZL,0.0,0,1.0,1.0,11,0.0,1.0,0.0,2.0,...,0.0,0.0,0.0,0.0,0.0,1.0,2.0,29.60,346.45,0.0
7041,8361-LTMKD,1.0,1,1.0,0.0,4,1.0,2.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,3.0,74.40,306.6,1.0


In [None]:
# Count the null entries in every column.
null_counts = churn_df.isna().sum()
print(null_counts)

customerID          0
gender              0
SeniorCitizen       0
Partner             0
Dependents          0
tenure              0
PhoneService        0
MultipleLines       0
InternetService     0
OnlineSecurity      0
OnlineBackup        0
DeviceProtection    0
TechSupport         0
StreamingTV         0
StreamingMovies     0
Contract            0
PaperlessBilling    0
PaymentMethod       0
MonthlyCharges      0
TotalCharges        0
Churn               0
dtype: int64


In [None]:
# Opt in to pandas' `future.no_silent_downcasting` behaviour.
pd.set_option('future.no_silent_downcasting', True)

# Treat empty / whitespace-only strings as missing, then count missing values per column.
# The result is only inspected here; churn_df itself is left untouched.
blanks_as_nan = churn_df.replace(r'^\s*$', np.nan, regex=True)
missing_values = blanks_as_nan.isna().sum()
print(missing_values)

customerID          0
gender              0
SeniorCitizen       0
Partner             0
Dependents          0
tenure              0
PhoneService        0
MultipleLines       0
InternetService     0
OnlineSecurity      0
OnlineBackup        0
DeviceProtection    0
TechSupport         0
StreamingTV         0
StreamingMovies     0
Contract            0
PaperlessBilling    0
PaymentMethod       0
MonthlyCharges      0
TotalCharges        0
Churn               0
dtype: int64


In [None]:
# Persist the blank-string -> NaN replacement in churn_df.
churn_df = churn_df.replace(to_replace=r'^\s*$', value=np.nan, regex=True)

In [None]:
# Re-run the missing-value count, again treating whitespace-only strings as NaN.
without_blanks = churn_df.replace(r'^\s*$', np.nan, regex=True)
missing_values = without_blanks.isna().sum()
print(missing_values)

customerID          0
gender              0
SeniorCitizen       0
Partner             0
Dependents          0
tenure              0
PhoneService        0
MultipleLines       0
InternetService     0
OnlineSecurity      0
OnlineBackup        0
DeviceProtection    0
TechSupport         0
StreamingTV         0
StreamingMovies     0
Contract            0
PaperlessBilling    0
PaymentMethod       0
MonthlyCharges      0
TotalCharges        0
Churn               0
dtype: int64


In [None]:
# Print the dtype of every column in churn_df.
print(churn_df.dtypes)

customerID           object
gender              float64
SeniorCitizen         int64
Partner             float64
Dependents          float64
tenure                int64
PhoneService        float64
MultipleLines       float64
InternetService     float64
OnlineSecurity      float64
OnlineBackup        float64
DeviceProtection    float64
TechSupport         float64
StreamingTV         float64
StreamingMovies     float64
Contract            float64
PaperlessBilling    float64
PaymentMethod       float64
MonthlyCharges      float64
TotalCharges        float64
Churn               float64
dtype: object


Converting TotalCharges from object to numeric

In [None]:
# Convert TotalCharges to numbers; values that cannot be parsed become NaN.
# The column is addressed by name rather than by position 19, so the cell does not
# break if the column order changes. Assigning the whole column (instead of writing
# through `.iloc[:, 19]`, which sets values in place under pandas 2.x and can leave an
# object column as object) makes sure the stored dtype is actually numeric.
churn_df['TotalCharges'] = pd.to_numeric(churn_df['TotalCharges'], errors='coerce')

In [None]:
from sklearn.impute import SimpleImputer

# Imputer that replaces NaN entries with the median of the column it is fitted on.
imp_median = SimpleImputer(
    strategy='median',
    missing_values=np.nan,
)

In [None]:
# Fill missing TotalCharges values with the column median.
# The imputer expects a 2-D input, so the column is reshaped to (n_rows, 1); its output
# is flattened back to 1-D before being stored. Assigning by column name replaces the
# column as a whole, so the result does not depend on positional index 19 or on
# in-place `.iloc` assignment preserving an older dtype.
# NOTE(review): the median is computed on the full dataset, before the train/test split.
total_charges_2d = churn_df['TotalCharges'].to_numpy().reshape(-1, 1)
churn_df['TotalCharges'] = imp_median.fit_transform(total_charges_2d).ravel()

In [None]:
# Check once more for missing values (whitespace-only strings count as missing).
blank_free = churn_df.replace(r'^\s*$', np.nan, regex=True)
missing_values = blank_free.isna().sum()
print(missing_values)

customerID          0
gender              0
SeniorCitizen       0
Partner             0
Dependents          0
tenure              0
PhoneService        0
MultipleLines       0
InternetService     0
OnlineSecurity      0
OnlineBackup        0
DeviceProtection    0
TechSupport         0
StreamingTV         0
StreamingMovies     0
Contract            0
PaperlessBilling    0
PaymentMethod       0
MonthlyCharges      0
TotalCharges        0
Churn               0
dtype: int64


In [None]:
# Print the column dtypes again after the TotalCharges conversion and imputation.
print(churn_df.dtypes)

customerID           object
gender              float64
SeniorCitizen         int64
Partner             float64
Dependents          float64
tenure                int64
PhoneService        float64
MultipleLines       float64
InternetService     float64
OnlineSecurity      float64
OnlineBackup        float64
DeviceProtection    float64
TechSupport         float64
StreamingTV         float64
StreamingMovies     float64
Contract            float64
PaperlessBilling    float64
PaymentMethod       float64
MonthlyCharges      float64
TotalCharges        float64
Churn               float64
dtype: object


In [None]:
# Target: the encoded Churn column.
churn_df_y = churn_df["Churn"]

# Features: everything except the target and the customerID identifier.
churn_df_X = churn_df.drop(columns=["Churn", "customerID"])

In [None]:
# Hold out 25% of the rows for testing.
# random_state makes the split reproducible when the notebook is re-run from a fresh
# kernel; stratify keeps the churn / no-churn proportions equal in both partitions.
X_train, X_test, y_train, y_test = train_test_split(
    churn_df_X,
    churn_df_y,
    train_size=0.75,
    test_size=0.25,
    random_state=50,
    stratify=churn_df_y,
)

In [None]:
# Display the training feature matrix.
X_train

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges
3579,1.0,0,1.0,1.0,40,1.0,2.0,1.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,1.0,2.0,81.20,3292.30
3102,0.0,0,0.0,0.0,5,1.0,0.0,0.0,2.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,2.0,54.20,308.25
1898,0.0,1,0.0,0.0,5,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,2.0,78.95,378.40
1438,0.0,0,0.0,0.0,40,1.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,1.0,1.0,3.0,50.25,2023.55
6065,1.0,0,0.0,0.0,59,1.0,2.0,2.0,1.0,1.0,1.0,1.0,1.0,1.0,2.0,0.0,3.0,25.00,1510.50
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
659,1.0,0,1.0,0.0,37,1.0,0.0,2.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,20.35,697.65
3144,1.0,0,0.0,0.0,6,1.0,0.0,2.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,3.0,18.95,110.15
3407,0.0,1,1.0,0.0,67,1.0,0.0,2.0,1.0,1.0,1.0,1.0,1.0,1.0,2.0,1.0,0.0,20.05,1263.05
2428,1.0,0,0.0,1.0,38,1.0,2.0,1.0,0.0,0.0,2.0,0.0,2.0,2.0,0.0,1.0,0.0,99.25,3777.15


**Use the TPOT classifier to find the best model pipeline for the data. TPOT uses a genetic-algorithm-based search internally.**

In [None]:
# AutoML searches are compute-intensive, so it pays to distribute the work across
# several CPU cores (TPOT's `n_jobs` parameter); this first run does not set it.
# random_state is fixed so the genetic search gives the same result on every run.
tpot = TPOTClassifier(generations=4, population_size=10, verbosity=3, random_state=50)
tpot.fit(X_train, y_train)
print(tpot.score(X_test, y_test))

32 operators have been imported by TPOT.


Optimization Progress:   0%|          | 0/50 [00:00<?, ?pipeline/s]


Generation 1 - Current Pareto front scores:

-1	0.7974285785957972	XGBClassifier(input_matrix, XGBClassifier__learning_rate=0.1, XGBClassifier__max_depth=8, XGBClassifier__min_child_weight=15, XGBClassifier__n_estimators=100, XGBClassifier__n_jobs=1, XGBClassifier__subsample=0.9500000000000001, XGBClassifier__verbosity=0)

Generation 2 - Current Pareto front scores:

-1	0.804242997620481	XGBClassifier(input_matrix, XGBClassifier__learning_rate=0.1, XGBClassifier__max_depth=2, XGBClassifier__min_child_weight=10, XGBClassifier__n_estimators=100, XGBClassifier__n_jobs=1, XGBClassifier__subsample=0.4, XGBClassifier__verbosity=0)
Pipeline encountered that has previously been evaluated during the optimization process. Using the score from the previous evaluation.

Generation 3 - Current Pareto front scores:

-1	0.804242997620481	XGBClassifier(input_matrix, XGBClassifier__learning_rate=0.1, XGBClassifier__max_depth=2, XGBClassifier__min_child_weight=10, XGBClassifier__n_estimators=100, XGBCl

In [None]:
import multiprocessing

if __name__ == '__main__':
    # Use the 'forkserver' start method for the parallel search, but only on platforms
    # that provide it: requesting an unavailable start method raises ValueError.
    if 'forkserver' in multiprocessing.get_all_start_methods():
        multiprocessing.set_start_method('forkserver', force=True)
    # n_jobs pipelines are evaluated in parallel; cap the original value of 20 at the
    # number of CPU cores actually available on this machine.
    n_parallel_jobs = min(20, multiprocessing.cpu_count())
    tpot = TPOTClassifier(generations=2, population_size=20, verbosity=2, n_jobs=n_parallel_jobs, random_state=50)
    tpot.fit(X_train, y_train)

Optimization Progress:   0%|          | 0/60 [00:00<?, ?pipeline/s]


Generation 1 - Current best internal CV score: 0.8034843467790488

Generation 2 - Current best internal CV score: 0.8038625971159084

Best pipeline: ExtraTreesClassifier(ZeroCount(input_matrix), bootstrap=True, criterion=gini, max_features=0.55, min_samples_leaf=17, min_samples_split=11, n_estimators=100)


In [None]:
# Score the fitted TPOT pipeline on the held-out test set.
test_score = tpot.score(X_test, y_test)
print(test_score)

0.7995457126632595


The model does not appear to be overfitting: the test accuracy (≈0.800) is only slightly below the internal cross-validation accuracy (≈0.804).

Export the scikit learn code for the best classifier and display the code

In [None]:
# Export the code of the best pipeline found by TPOT to a Python script.
tpot.export('tpot_churn_pipeline.py')

In [None]:
# Print the exported script via a shell escape (needs a `cat` command on the system).
!cat tpot_churn_pipeline.py

import numpy as np
import pandas as pd
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from tpot.builtins import ZeroCount
from tpot.export_utils import set_param_recursive

# NOTE: Make sure that the outcome column is labeled 'target' in the data file
tpot_data = pd.read_csv('PATH/TO/DATA/FILE', sep='COLUMN_SEPARATOR', dtype=np.float64)
features = tpot_data.drop('target', axis=1)
training_features, testing_features, training_target, testing_target = \
            train_test_split(features, tpot_data['target'], random_state=50)

# Average CV score on the training set was: 0.8038625971159084
exported_pipeline = make_pipeline(
    ZeroCount(),
    ExtraTreesClassifier(bootstrap=True, criterion="gini", max_features=0.55, min_samples_leaf=17, min_samples_split=11, n_estimators=100)
)
# Fix random state for all the steps in exported pipeline
set_param_recursive(exported_pipeline.steps, 'random_