### Importing the libraries


In [2]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

### Importing the dataset

In [3]:
# Load the churn dataset; the path is relative to the notebook's working directory.
df = pd.read_csv('Churn_Modelling.csv')

### Review data

In [4]:
# Summarise column names, non-null counts and dtypes to check for missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 14 columns):
RowNumber          10000 non-null int64
CustomerId         10000 non-null int64
Surname            10000 non-null object
CreditScore        10000 non-null int64
Geography          10000 non-null object
Gender             10000 non-null object
Age                10000 non-null int64
Tenure             10000 non-null int64
Balance            10000 non-null float64
NumOfProducts      10000 non-null int64
HasCrCard          10000 non-null int64
IsActiveMember     10000 non-null int64
EstimatedSalary    10000 non-null float64
Exited             10000 non-null int64
dtypes: float64(2), int64(9), object(3)
memory usage: 1.1+ MB


In [5]:
# Peek at the first rows to confirm the columns were parsed as expected.
df.head()

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,1,15634602,Hargrave,619,France,Female,42,2,0.0,1,1,1,101348.88,1
1,2,15647311,Hill,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,3,15619304,Onio,502,France,Female,42,8,159660.8,3,1,0,113931.57,1
3,4,15701354,Boni,699,France,Female,39,1,0.0,2,0,0,93826.63,0
4,5,15737888,Mitchell,850,Spain,Female,43,2,125510.82,1,1,1,79084.1,0


In [6]:
# Peek at the last rows as well, to make sure the file was read to the end.
df.tail()

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
9995,9996,15606229,Obijiaku,771,France,Male,39,5,0.0,2,1,0,96270.64,0
9996,9997,15569892,Johnstone,516,France,Male,35,10,57369.61,1,1,1,101699.77,0
9997,9998,15584532,Liu,709,France,Female,36,7,0.0,1,0,1,42085.58,1
9998,9999,15682355,Sabbatini,772,Germany,Male,42,3,75075.31,2,1,0,92888.52,1
9999,10000,15628319,Walker,792,France,Female,28,4,130142.79,1,1,0,38190.78,0


### Split data into the independent vs dependent variables

In [7]:
# Features: positional columns 3..12, i.e. CreditScore through EstimatedSalary.
# RowNumber, CustomerId and Surname (columns 0..2) are left out.
feature_frame = df.iloc[:, 3:13]
# Target: the last column (Exited).
target_series = df.iloc[:, -1]

X = feature_frame.values
y = target_series.values

### Encoding categorical data


In [8]:
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

For the Geography column:

In [9]:
# Encoder dedicated to the Geography column.
labelencoder_X_1 = LabelEncoder() 

In [10]:
# Column 1 of X holds Geography: replace its string labels with integer codes.
geography_codes = labelencoder_X_1.fit_transform(X[:, 1])
X[:, 1] = geography_codes

For the Gender column:

In [11]:
# Encoder dedicated to the Gender column.
labelencoder_X_2 = LabelEncoder()

In [12]:
# Column 2 of X holds Gender: replace its string labels with integer codes.
gender_codes = labelencoder_X_2.fit_transform(X[:, 2])
X[:, 2] = gender_codes

In [13]:
# Create dummy variables for the countries column (Geography, column 1).
#
# `OneHotEncoder(categorical_features=[1])` was deprecated in scikit-learn 0.20
# and removed in 0.22, so it fails on current versions. ColumnTransformer is
# the supported way to one-hot encode a subset of columns, and it keeps the
# same layout as before: the dummy columns come first, followed by the
# untouched columns in their original order.
from sklearn.compose import ColumnTransformer

onehotencoder = ColumnTransformer(
    transformers=[('geography', OneHotEncoder(), [1])],
    remainder='passthrough',  # keep every other column as-is
    sparse_threshold=0,       # always return a dense array
)
# X is an object array here (mixed column types), so cast the result to float
# to match what `.toarray()` returned previously.
X = np.asarray(onehotencoder.fit_transform(X), dtype=float)
# Remove the 1st column to avoid dummy variable trap:
X = X[:, 1:]


Note: if a LabelEncoder is used before the OneHotEncoder only to convert the categories to integers, that step is no longer needed — recent scikit-learn versions let the OneHotEncoder encode the categories directly.


### Split data into train and test sets

In [14]:
from sklearn.model_selection import train_test_split

In [15]:
# Hold out 20% of the rows as a test set; the fixed random_state makes the
# split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

### Fitting XGBoost Model to the Training set

In [16]:
from xgboost import XGBClassifier

In [17]:
# Gradient-boosted tree classifier with the library's default hyperparameters.
classifier = XGBClassifier()

In [18]:
# Train on the training split only; the test split stays unseen until evaluation.
classifier.fit(X_train, y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
              max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
              n_jobs=1, nthread=None, objective='binary:logistic',
              random_state=0, reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
              seed=None, silent=True, subsample=1)

### Predicting the Test set results

In [19]:
# Predicted class labels for the held-out test rows.
y_pred = classifier.predict(X_test)

### Making the confusion matrix

In [20]:
from sklearn.metrics import confusion_matrix

In [21]:
# Compare the held-out labels with the model's predictions.
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)

In [22]:
# Display the confusion matrix computed above.
cm

array([[1521,   74],
       [ 197,  208]])

### Applying k-Fold Cross Validation


In [23]:
from sklearn.model_selection import cross_val_score

In [24]:
# Cross-validate the classifier on the training split; yields one score per fold.
accuracies = cross_val_score(
    classifier,
    X_train,
    y=y_train,
    cv=10,  # cv: number of folds we want to create
)

Note: on a large dataset, set the <code>n_jobs</code> parameter (e.g. <code>n_jobs=-1</code>) so the folds are evaluated in parallel on all available CPUs.


In [25]:
# Average of the per-fold scores.
accuracies.mean()

0.8629994451163204

In [26]:
# Spread of the per-fold scores; a small value means the folds agree with each other.
accuracies.std()

0.010677872171663988