In [4]:
# Hyperparamter Tuning - XGBoost

# Here is the highlight of the process
# 1. Import the customer churn data (I have already cleaned it)
# 2. Split the data into test and train sets
# 3. Build data matrices - as XGBoost uses DMatrix
# 4. Find the logloss of the model with default parameters
# 5. Tune the parameters
# 6. Find the logloss of the model with tuned parameters

# For exploratory analysis and other models on this dataset, please use the following link
# https://github.com/Nickssingh/Churn-Prediction-Model-Telecommunication

In [5]:
import pandas as pd
import numpy as np

In [6]:
# We will import the dataset and view top rows
# I have already preapared the the data for analysis 
    # Removed the missing values
    # Converted the variables into appropriate data types
    # Encoded categorical variables using one hot encoding

df_churn=pd.read_csv("https://github.com/Nickssingh/Hyperparameter-tuning-XGBoost/raw/master/Data/telcom_customer_churn.csv")

In [7]:
df_churn.head()

Unnamed: 0,SeniorCitizen,tenure,MonthlyCharges,TotalCharges,Churn,gender_Male,Partner_Yes,Dependents_Yes,PhoneService_Yes,MultipleLines_No phone service,...,StreamingTV_No internet service,StreamingTV_Yes,StreamingMovies_No internet service,StreamingMovies_Yes,Contract_One year,Contract_Two year,PaperlessBilling_Yes,PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check
0,0,1,29.85,29.85,0,0,1,0,0,1,...,0,0,0,0,0,0,1,0,1,0
1,0,34,56.95,1889.5,0,1,0,0,1,0,...,0,0,0,0,1,0,0,0,0,1
2,0,2,53.85,108.15,1,1,0,0,1,0,...,0,0,0,0,0,0,1,0,0,1
3,0,45,42.3,1840.75,0,1,0,0,0,1,...,0,0,0,0,1,0,0,0,0,0
4,0,2,70.7,151.65,1,0,0,0,1,0,...,0,0,0,0,0,0,1,0,1,0


In [8]:
# Checking the dimension of the data

df_churn.shape

(7032, 31)

In [9]:
# Splitting the data into train and test datasets
# test:train = 3:7
from sklearn.cross_validation import train_test_split


df_temp = df_churn
y = df_temp['Churn']
X = df_temp.drop('Churn', axis=1, inplace=False)


X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

In [13]:
# Installing XGBoost (I have already installed it)

!pip install xgboost

[33mDEPRECATION: Python 2.7 will reach the end of its life on January 1st, 2020. Please upgrade your Python as Python 2.7 won't be maintained after that date. A future version of pip will drop support for Python 2.7. More details about Python 2 support in pip, can be found at https://pip.pypa.io/en/latest/development/release-process/#python-2-support[0m


In [14]:
# XGBoost uses an internal data structure DMatrix - which optimizes both memory effieciency and speed
# Hence, rather than using pandas dataframe, we will use data matrix - DMatrix


import xgboost as xgb

dm_train = xgb.DMatrix(X_train, label=y_train)
dm_test = xgb.DMatrix(X_test, label=y_test)

In [15]:
# Building Model

# Ideal case would include an exhaustive gridsearch on all the parameters.
# However, such an approach is computationally intensive.
# Hence, we will focus on few important parameters and tune them sequentially.

# Following are the parameters that we will tune in this process
# max_depth
# min_child_weight
# subsample
# colsample_bytree
# eta
# num_boost_rounds
# early_stopping_rounds

# We will use logistic loss function to assess the accuracy of predictions, as this is a classification problem

In [16]:
# We will set num_boost_rounds to 100, early_stopping_rounds to 10, and objective to binary:logistic.
# All the other values at this stage are default values.
# We will tune our model by chaning the default values.

params = {'max_depth':6, 'min_child_weight':1, 'eta':0.3, 'subsample':1, 
          'colsample_bytree':1, 'objective':'binary:logistic',}

# We will use logloss function to evaluate the model's performance
params['eval_metric'] = "logloss"

xgmodel = xgb.train(params, dtrain = dm_train, num_boost_round = 100, evals = [(dm_test,"Test")], 
                    early_stopping_rounds = 10)

print("Best Logloss: {:.3f} | Rounds: {}".format(xgmodel.best_score,xgmodel.best_iteration+1))

[0]	Test-logloss:0.581809
Will train until Test-logloss hasn't improved in 10 rounds.
[1]	Test-logloss:0.520662
[2]	Test-logloss:0.484648
[3]	Test-logloss:0.459301
[4]	Test-logloss:0.44479
[5]	Test-logloss:0.43489
[6]	Test-logloss:0.428768
[7]	Test-logloss:0.427162
[8]	Test-logloss:0.425635
[9]	Test-logloss:0.423885
[10]	Test-logloss:0.424516
[11]	Test-logloss:0.425703
[12]	Test-logloss:0.426199
[13]	Test-logloss:0.426466
[14]	Test-logloss:0.427974
[15]	Test-logloss:0.428433
[16]	Test-logloss:0.429665
[17]	Test-logloss:0.429645
[18]	Test-logloss:0.42989
[19]	Test-logloss:0.430252
Stopping. Best iteration:
[9]	Test-logloss:0.423885

Best Logloss: 0.424 | Rounds: 10


In [17]:
# Here, we found that the tenth round gave the best result and the results did not improve in the next 10 rounds
# Hence, the iteration stopped at round 19 and we did not reach the maximum number of boosting rounds (100).

# Finding a suitable evidence to stop the iterations is important.
# Stopping the iterations when results do not improve prevents overfittig and the inefficient utilization of resources.

In [18]:
# We will use cross validation to tune the parameters within the params dictionary

In [19]:
# Parameters: max-depth and min_child_weight
# I realized that the optimal values are in the following ranges through multiple iterations

gridsearch_params = [(max_depth, min_child_weight)
                    for max_depth in range(1,4)
                    for min_child_weight in range(17,21)]

In [20]:
logloss_min = float("Inf")
best_params = None

for max_depth, min_child_weight in gridsearch_params:
    
    params['max_depth'] = max_depth
    params['min_child_weight'] = min_child_weight
    
    xg_cvresults = xgb.cv(params, dtrain = dm_train, num_boost_round = 100,
                      seed = 0, nfold=10, metrics = {'logloss'}, early_stopping_rounds = 10,)
    
    logloss_mean = xg_cvresults['test-logloss-mean'].min()
    
    print("max_depth: {} | min_child_weight: {} with Logloss: {:.3}\n".format(max_depth,min_child_weight,logloss_mean))
    
    if logloss_mean < logloss_min:
        logloss_min = logloss_mean
        best_params = (max_depth, min_child_weight)

        
print("Best Parameters: max_depth: {} | min_child_weight: {} with Logloss: {:.3f}". format(best_params[0], 
                                                                                  best_params[1], logloss_min))

max_depth: 1 | min_child_weight: 17 with Logloss: 0.41

max_depth: 1 | min_child_weight: 18 with Logloss: 0.41

max_depth: 1 | min_child_weight: 19 with Logloss: 0.41

max_depth: 1 | min_child_weight: 20 with Logloss: 0.41

max_depth: 2 | min_child_weight: 17 with Logloss: 0.41

max_depth: 2 | min_child_weight: 18 with Logloss: 0.41

max_depth: 2 | min_child_weight: 19 with Logloss: 0.41

max_depth: 2 | min_child_weight: 20 with Logloss: 0.41

max_depth: 3 | min_child_weight: 17 with Logloss: 0.412

max_depth: 3 | min_child_weight: 18 with Logloss: 0.413

max_depth: 3 | min_child_weight: 19 with Logloss: 0.413

max_depth: 3 | min_child_weight: 20 with Logloss: 0.414

Best Parameters: max_depth: 2 | min_child_weight: 19 with Logloss: 0.410


In [21]:
# Updating the parameters with the best values: max_depth = 2 and min_child_weight = 19

params['max_depth'] = 2
params['min_child_weight'] = 19

In [22]:
# Parameters: subsample and colsample_bytree
# I found that the optimal values are in the following ranges through multiple iterations

gridsearch_params = [
    (subsample, colsample)
    for subsample in [i/10. for i in range(7,11)]
    for colsample in [i/10. for i in range(1,5)]
]

In [23]:
logloss_min = float("Inf")
best_params = None

for subsample, colsample in (gridsearch_params):
    
    params['subsample'] = subsample
    params['colsample_bytree'] = colsample
    
    xg_cvresults = xgb.cv(params, dtrain = dm_train, num_boost_round = 100,
                      seed = 0, nfold=10, metrics = {'logloss'}, early_stopping_rounds = 10,)
    
    logloss_mean = xg_cvresults['test-logloss-mean'].min()
    
    print("subsample: {} | colsample: {} with Logloss: {:.3f}\n".format(subsample,colsample,logloss_mean))
    
    if logloss_mean < logloss_min:
        logloss_min = logloss_mean
        best_params = (subsample, colsample)
        
print("Best Parameters: subsample: {} | colsample: {} with Logloss: {:.3f}". format(best_params[0], 
                                                                           best_params[1], logloss_min))

subsample: 0.7 | colsample: 0.1 with Logloss: 0.412

subsample: 0.7 | colsample: 0.2 with Logloss: 0.411

subsample: 0.7 | colsample: 0.3 with Logloss: 0.410

subsample: 0.7 | colsample: 0.4 with Logloss: 0.411

subsample: 0.8 | colsample: 0.1 with Logloss: 0.411

subsample: 0.8 | colsample: 0.2 with Logloss: 0.411

subsample: 0.8 | colsample: 0.3 with Logloss: 0.409

subsample: 0.8 | colsample: 0.4 with Logloss: 0.410

subsample: 0.9 | colsample: 0.1 with Logloss: 0.410

subsample: 0.9 | colsample: 0.2 with Logloss: 0.410

subsample: 0.9 | colsample: 0.3 with Logloss: 0.410

subsample: 0.9 | colsample: 0.4 with Logloss: 0.409

subsample: 1.0 | colsample: 0.1 with Logloss: 0.410

subsample: 1.0 | colsample: 0.2 with Logloss: 0.411

subsample: 1.0 | colsample: 0.3 with Logloss: 0.410

subsample: 1.0 | colsample: 0.4 with Logloss: 0.410

Best Parameters: subsample: 0.9 | colsample: 0.4 with Logloss: 0.409


In [24]:
# Updating the parameters with the best values: subsample = 0.9 and colsample = 0.2

params['subsample'] = 0.9
params['colsample_bytree'] = 0.2

In [25]:
# Parameter: eta

logloss_min = float("Inf")
best_params = None

for eta in [0.3, 0.2, 0.1, 0.05, 0.01, 0.005]:
    
    params['eta'] = eta
    
    xg_cvresults = xgb.cv(params, dtrain = dm_train, num_boost_round = 100,
                      seed = 0, nfold=10, metrics = {'logloss'}, early_stopping_rounds = 10,)
    
    logloss_mean = xg_cvresults['test-logloss-mean'].min()
    print("eta: {} with Logloss: {:.3}\n".format(eta,logloss_mean))
    
    if logloss_mean < logloss_min:
        logloss_min = logloss_mean
        best_params = eta
        
print("Best Parameter: eta: {} with Logloss: {:.3f}". format(best_params, logloss_min))

eta: 0.3 with Logloss: 0.41

eta: 0.2 with Logloss: 0.409

eta: 0.1 with Logloss: 0.411

eta: 0.05 with Logloss: 0.423

eta: 0.01 with Logloss: 0.525

eta: 0.005 with Logloss: 0.582

Best Parameter: eta: 0.2 with Logloss: 0.409


In [26]:
# Updating the eta parameter with the best value

params['eta'] = 0.2

In [27]:
# Setting the optimum paramters

params = {'colsample_bytree': 0.2,
          'eta': 0.2,
          'eval_metric': 'logloss',
          'max_depth': 2,
          'min_child_weight': 19,
          'objective':'binary:logistic',
          'subsample': 0.9}

In [28]:
# Finding the optimal number of rounds for the model with new parameters

xgmodel_tuned = xgb.train(params, dtrain = dm_train, 
                          num_boost_round=100, evals=[(dm_test,"Test")], early_stopping_rounds=10)


print("Best Logloss: {:.3f} in {} rounds". format(xgmodel_tuned.best_score, xgmodel_tuned.best_iteration+1))

[0]	Test-logloss:0.635076
Will train until Test-logloss hasn't improved in 10 rounds.
[1]	Test-logloss:0.587516
[2]	Test-logloss:0.559388
[3]	Test-logloss:0.532871
[4]	Test-logloss:0.511868
[5]	Test-logloss:0.499512
[6]	Test-logloss:0.485791
[7]	Test-logloss:0.476895
[8]	Test-logloss:0.469163
[9]	Test-logloss:0.45966
[10]	Test-logloss:0.455054
[11]	Test-logloss:0.452829
[12]	Test-logloss:0.4477
[13]	Test-logloss:0.444522
[14]	Test-logloss:0.438465
[15]	Test-logloss:0.435882
[16]	Test-logloss:0.434745
[17]	Test-logloss:0.433169
[18]	Test-logloss:0.431758
[19]	Test-logloss:0.430851
[20]	Test-logloss:0.428035
[21]	Test-logloss:0.427202
[22]	Test-logloss:0.426541
[23]	Test-logloss:0.426402
[24]	Test-logloss:0.425321
[25]	Test-logloss:0.423764
[26]	Test-logloss:0.423377
[27]	Test-logloss:0.423043
[28]	Test-logloss:0.422699
[29]	Test-logloss:0.422123
[30]	Test-logloss:0.422191
[31]	Test-logloss:0.422346
[32]	Test-logloss:0.422648
[33]	Test-logloss:0.422762
[34]	Test-logloss:0.42282
[35]	Test

In [29]:
# With the tuned parameters we would need 63 rounds to achieve the best result

# The improvement after parameter tuning is marginal in our case.
    # Logloss of our model decreased from 0.424 to 0.417
# However, we were able to see how parameters can be tuned.

# Here we have used only a few combination of parameters.
# We can further improve the impact of tuning; however, doing so would be computationally more expensive.
# More combination of parameters and wider ranges of values for each of those paramaters would have to be tested.