In [None]:
# Import potential libraries to use

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Read the csv file for the Telco Customer Churn dataset
customer_churn_df = pd.read_csv("/content/drive/My Drive/Colab Notebooks/Telco-Customer-Churn.csv")

### Data Preprocessing

In [None]:
# View the head of the dataset
customer_churn_df.head()

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


In [None]:
customer_churn_df.shape

(7043, 21)

In [None]:
customer_churn_df.isna().sum()

customerID          0
gender              0
SeniorCitizen       0
Partner             0
Dependents          0
tenure              0
PhoneService        0
MultipleLines       0
InternetService     0
OnlineSecurity      0
OnlineBackup        0
DeviceProtection    0
TechSupport         0
StreamingTV         0
StreamingMovies     0
Contract            0
PaperlessBilling    0
PaymentMethod       0
MonthlyCharges      0
TotalCharges        0
Churn               0
dtype: int64

In [None]:
customer_churn_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   customerID        7043 non-null   object 
 1   gender            7043 non-null   object 
 2   SeniorCitizen     7043 non-null   int64  
 3   Partner           7043 non-null   object 
 4   Dependents        7043 non-null   object 
 5   tenure            7043 non-null   int64  
 6   PhoneService      7043 non-null   object 
 7   MultipleLines     7043 non-null   object 
 8   InternetService   7043 non-null   object 
 9   OnlineSecurity    7043 non-null   object 
 10  OnlineBackup      7043 non-null   object 
 11  DeviceProtection  7043 non-null   object 
 12  TechSupport       7043 non-null   object 
 13  StreamingTV       7043 non-null   object 
 14  StreamingMovies   7043 non-null   object 
 15  Contract          7043 non-null   object 
 16  PaperlessBilling  7043 non-null   object 


In [116]:
# Repalce ' ' with 0
customer_churn_df['TotalCharges'].mask(customer_churn_df['TotalCharges'] == ' ', '0', inplace=True)

In [117]:
# Convert the TotalCharges from object to numerical (float)
customer_churn_df['TotalCharges'] = pd.to_numeric(customer_churn_df['TotalCharges'])

In [118]:
# Convert the 'Churn' column to binary values, where 'No' is mapped to 0 and 'Yes' is mapped to 1
customer_churn_df['Churn'] = customer_churn_df['Churn'].replace(['No', 'Yes'], [0, 1])


### Feature Engineering

In [None]:
categorical = customer_churn_df[['gender', 'SeniorCitizen', 'Partner', 'Dependents', 'PhoneService', 'MultipleLines', 'InternetService','OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies','Contract', 'PaperlessBilling', 'PaymentMethod']]
numerical = customer_churn_df[['tenure', 'MonthlyCharges', 'TotalCharges']]

In [None]:
import scipy.stats as stats
numerical_transform = numerical.apply(stats.zscore)
numerical_transform.shape

(7043, 3)

In [None]:
from sklearn.preprocessing import OneHotEncoder
encoder = OneHotEncoder(sparse=False)

categorical_transform = encoder.fit_transform(categorical)

# Create a DataFrame from the encoded values with column names
categorical_transform = pd.DataFrame(categorical_transform, columns= encoder.get_feature_names_out(categorical.columns))

categorical_transform.shape



(7043, 43)

In [None]:
features = pd.concat([numerical_transform, categorical_transform], axis = 1)
features.head()

Unnamed: 0,tenure,MonthlyCharges,TotalCharges,gender_Female,gender_Male,SeniorCitizen_0,SeniorCitizen_1,Partner_No,Partner_Yes,Dependents_No,...,StreamingMovies_Yes,Contract_Month-to-month,Contract_One year,Contract_Two year,PaperlessBilling_No,PaperlessBilling_Yes,PaymentMethod_Bank transfer (automatic),PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check
0,-1.277445,-1.160323,-0.992611,1.0,0.0,1.0,0.0,0.0,1.0,1.0,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
1,0.066327,-0.259629,-0.172165,0.0,1.0,1.0,0.0,1.0,0.0,1.0,...,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
2,-1.236724,-0.36266,-0.958066,0.0,1.0,1.0,0.0,1.0,0.0,1.0,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
3,0.514251,-0.746535,-0.193672,0.0,1.0,1.0,0.0,1.0,0.0,1.0,...,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0
4,-1.236724,0.197365,-0.938874,1.0,0.0,1.0,0.0,1.0,0.0,1.0,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0


### Train-Test Split

In [None]:
X = features
y = customer_churn_df['Churn']

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size= 0.2 , random_state= 1)

### Model Development

#### Question 14
What is the accuracy on the test set using the random forest classifier?

In [None]:
from sklearn.ensemble import RandomForestClassifier
rfc = RandomForestClassifier(n_estimators = 10, random_state=1)
rfc.fit(X_train, y_train)

In [None]:
rfc_pred = rfc.predict(X_test)
rfc_pred

array([0, 0, 0, ..., 0, 0, 1])

In [None]:
from sklearn.metrics import recall_score, accuracy_score, precision_score, f1_score, confusion_matrix
accuracy = accuracy_score(y_true=y_test, y_pred=rfc_pred)
print( 'Accuracy: {}' .format(round(accuracy, 4)))

Accuracy: 0.7906


#### Question 15


What is the accuracy on the test set using the xgboost classifier?

In [None]:
from xgboost import XGBClassifier
xgb = XGBClassifier()
xgb.fit(X_train, y_train)

In [None]:
xgb_pred = xgb.predict(X_test)
xgb_pred

array([0, 0, 0, ..., 0, 0, 1])

In [None]:
accuracy = accuracy_score(y_true=y_test, y_pred=xgb_pred)
print( 'Accuracy: {}' .format(round(accuracy, 4)))

Accuracy: 0.7935


#### Question 16

What is the accuracy on the test set using the LGBM classifier?

In [None]:
from lightgbm import LGBMClassifier
lgbm = LGBMClassifier()
lgbm.fit(X_train, y_train)

[LightGBM] [Info] Number of positive: 1521, number of negative: 4113
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000735 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 670
[LightGBM] [Info] Number of data points in the train set: 5634, number of used features: 46
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.269968 -> initscore=-0.994785
[LightGBM] [Info] Start training from score -0.994785


In [None]:
lgbm_pred = lgbm.predict(X_test)
lgbm_pred

array([0, 0, 0, ..., 0, 0, 1])

In [None]:
accuracy = accuracy_score(y_true=y_test, y_pred=lgbm_pred)
print( 'Accuracy: {}' .format(round(accuracy, 4) ))

Accuracy: 0.8133


#### Question 17

Using the ExtraTreesClassifier as your estimator with cv=5, n_iter=10, scoring = 'accuracy', n_jobs = -1, verbose = 1 and random_state = 1. What are the best hyperparameters from the randomized search CV?

In [None]:
from sklearn.ensemble import ExtraTreesClassifier
etc = ExtraTreesClassifier(n_estimators = 10, random_state=1)
etc.fit(X_train, y_train)

In [None]:
etc_pred = etc.predict(X_test)
etc_pred

array([0, 0, 0, ..., 0, 0, 1])

In [None]:
accuracy = accuracy_score(y_true=y_test, y_pred=etc_pred)
print( 'Accuracy: {}' .format(round(accuracy, 4)))

Accuracy: 0.7601


In [104]:
n_estimators = [50, 100, 300, 500, 1000]

min_samples_split = [2, 3, 5, 7, 9]

min_samples_leaf = [1, 2, 4, 6, 8]

max_features = ['auto', 'sqrt', 'log2', None]

hyperparameter_grid = {'n_estimators': n_estimators,

                       'min_samples_leaf': min_samples_leaf,

                       'min_samples_split': min_samples_split,

                       'max_features': max_features}


hyperparameter_grid

{'n_estimators': [50, 100, 300, 500, 1000],
 'min_samples_leaf': [1, 2, 4, 6, 8],
 'min_samples_split': [2, 3, 5, 7, 9],
 'max_features': ['auto', 'sqrt', 'log2', None]}

In [105]:
from sklearn.model_selection import RandomizedSearchCV

estimator = ExtraTreesClassifier(random_state=1)
model = RandomizedSearchCV(estimator, param_distributions = hyperparameter_grid,
                            cv=5, n_iter=10, scoring = 'accuracy',
                           n_jobs = -1, verbose = 1, random_state = 1)


model

In [106]:
model.fit(X_train, y_train)

Fitting 5 folds for each of 10 candidates, totalling 50 fits


In [108]:
# Extract best hyperparameters
print(model.best_params_)

{'n_estimators': 1000, 'min_samples_split': 9, 'min_samples_leaf': 8, 'max_features': 'sqrt'}


Answer 17: The best hyperparameters from the randomized search CV:

**{'n_estimators': 1000, 'min_samples_split': 9, 'min_samples_leaf': 8, 'max_features': 'sqrt'}**

#### Question 18

Train a new ExtraTreesClassifier Model with the new Hyperparameters from the RandomizedSearchCV (with random_state = 1). Is the accuracy of the new optimal model higher or lower than the initial ExtraTreesClassifier model with no hyperparameter tuning?

In [109]:
etc_ = ExtraTreesClassifier(n_estimators = 1000, min_samples_split = 9, min_samples_leaf = 8,
                            max_features = 'sqrt', random_state=1)

etc_.fit(X_train, y_train)
etc_pred_ = etc_.predict(X_test)
accuracy = accuracy_score(y_true=y_test, y_pred=etc_pred_)
print( 'Accuracy: {}'.format(round(accuracy, 4)))

Accuracy: 0.8041


Answer 18: The accuracy of the new optimal model higher than the initial ExtraTreesClassifier

#### Question 19

What other hyperparameters can be tuned for ExtraTreeClassifer?

Answer 19: **min_weight_fraction_leaf** and **max_leaf_nodes**

#### Question 20

Find the feature importance using the optimal ExtraTreesClassifier model. Which features are the two most important respectively?

In [115]:
importance = etc_.feature_importances_
feat_imp = pd.Series(importance, index = X.columns)
feat_imp = feat_imp.sort_values(ascending = False)
feat_imp

Contract_Month-to-month                    0.152237
tenure                                     0.092800
OnlineSecurity_No                          0.074998
InternetService_Fiber optic                0.065287
TechSupport_No                             0.064141
Contract_Two year                          0.054423
PaymentMethod_Electronic check             0.051666
TotalCharges                               0.047714
InternetService_DSL                        0.032687
OnlineBackup_No                            0.030077
Contract_One year                          0.028543
OnlineSecurity_Yes                         0.021700
DeviceProtection_No                        0.016857
MonthlyCharges                             0.014926
TechSupport_Yes                            0.014559
OnlineBackup_Yes                           0.012598
PaperlessBilling_No                        0.011883
PaperlessBilling_Yes                       0.011779
gender_Female                              0.010653
gender_Male 

Answer 20: **Contract_Month-to-month, tenure**