# XGBOOST model training

In [220]:
# import libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder,OneHotEncoder
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import cross_val_score
import pickle
import warnings
warnings.filterwarnings('ignore')

In [190]:
# load dataset churn modelling
dataset = pd.read_csv('Churn_Modelling.csv')
dataset.head()

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,1,15634602,Hargrave,619,France,Female,42,2,0.0,1,1,1,101348.88,1
1,2,15647311,Hill,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,3,15619304,Onio,502,France,Female,42,8,159660.8,3,1,0,113931.57,1
3,4,15701354,Boni,699,France,Female,39,1,0.0,2,0,0,93826.63,0
4,5,15737888,Mitchell,850,Spain,Female,43,2,125510.82,1,1,1,79084.1,0


In [191]:
# split dataset into train and test
train,test  = train_test_split(dataset,test_size=0.2,random_state=0)

In [192]:
train.shape,test.shape

((8000, 14), (2000, 14))

In [193]:
# saving test data into csv file
test.to_csv('test_file.csv',index=False,encoding='utf-8')

In [194]:
# split train data into train and cross validation 
train_data,cross_val = train_test_split(train,test_size=0.2,random_state=0)

In [195]:
train.shape,cross_val.shape

((8000, 14), (1600, 14))

In [196]:
# seperating features and labels from dataset
# Credit Score through Estimated Salary
x_train = train_data.iloc[:,3:13].values
# Exited
y_train = train_data.iloc[:,13].values

In [197]:
x_train.shape,y_train.shape

((6400, 10), (6400,))

In [198]:
# seperating features and labels from cross val
# Credit Score through Estimated Salary
x_cv = cross_val.iloc[:,3:13].values
# Exited
y_cv = cross_val.iloc[:,13].values

In [199]:
x_cv.shape,y_cv.shape

((1600, 10), (1600,))

In [200]:
def categorical_encode(x):
    # Encoding categorical data country
    labelencoder_x_country = LabelEncoder()
    x[:,1] = labelencoder_x_country.fit_transform(x[:,1])
    # Encode categorical data gender
    labelencoder_x_gender = LabelEncoder()
    x[:,2]  =  labelencoder_x_gender.fit_transform(x[:,2])
    ohe_country = OneHotEncoder(categorical_features=[1])
    x = ohe_country.fit_transform(x).toarray()
    x = x[:, 1:]
    return x

In [201]:
# categorical encoding in train data
x_train = categorical_encode(x_train)

In [202]:
# one hot encoding on cross validation
x_cv = categorical_encode(x_cv)

In [203]:
x_train.shape,y_train.shape,x_cv.shape,y_cv.shape

((6400, 11), (6400,), (1600, 11), (1600,))

In [204]:
# Fitting XGBOOST to training set
classifier = XGBClassifier()
classifier.fit(x_train,y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0,
              learning_rate=0.1, max_delta_step=0, max_depth=3,
              min_child_weight=1, missing=None, n_estimators=100, n_jobs=1,
              nthread=None, objective='binary:logistic', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
              silent=None, subsample=1, verbosity=1)

In [205]:
# make prediction on train data
predict_train = classifier.predict(x_train)

In [206]:
# confusion matrix on train data 
confusion_matrix_train = confusion_matrix(y_train,predict_train)

In [207]:
confusion_matrix_train

array([[4959,  144],
       [ 685,  612]])

In [208]:
# applying k-fold on train data
accuracies = cross_val_score(estimator=classifier, X = x_train, y = y_train, cv=10)
accuracies.mean()

0.8628130344403917

In [209]:
accuracies.std()

0.008359329389419941

In [210]:
# make prediction on cross validated data
predict_cv = classifier.predict(x_cv)

In [211]:
# confusion matrix on cross validation
cv_cf = confusion_matrix(y_cv,predict_cv)

In [212]:
cv_cf

array([[1229,   36],
       [ 172,  163]])

In [216]:
# applying k-fold cross validation
accuracies_cv = cross_val_score(estimator=classifier, X = x_cv, y = y_cv, cv=10)

In [217]:
accuracies_cv.std()

0.02280351899093408

In [218]:
accuracies_cv = accuracies_cv.mean()*100

In [219]:
print('accuracy on cross validation:',accuracies_cv)

accuracy on cross validation: 86.18735106840111


# save model

In [222]:
# save model
if accuracies_cv > 80:
    file_obj = open('model.pkl','wb')
    pickle.dump(classifier,file_obj)
    file_obj.close()
    print('Your model has been saved')

Your model has been saved
