# Data Preprocessing

In [1]:
# Importing the libraries
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import tensorflow as tf
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score , confusion_matrix
# Module-level placeholder set to None; no visible cell reads or reassigns it.
# NOTE(review): confirm whether this is still needed.
_CURRENT_SCRATCH_GRAPH = None

#from tensorflow.keras.models import Sequential
#from tensorflow.keras.layers import Dense, Dropout

In [2]:
# Importing the dataset
# Raw string: the Windows path contains backslashes, and sequences such as "\S" or "\M"
# are invalid escape sequences in a normal string literal (a warning today, an error later).
# NOTE(review): absolute, machine-specific path — consider a configurable data directory.
dataset = pd.read_csv(r'G:\Software\Machine learning\Datasets\Deep learning\Churn_Modelling.csv')

In [3]:
# Count the missing values in each column
dataset.isna().sum()

RowNumber          0
CustomerId         0
Surname            0
CreditScore        0
Geography          0
Gender             0
Age                0
Tenure             0
Balance            0
NumOfProducts      0
HasCrCard          0
IsActiveMember     0
EstimatedSalary    0
Exited             0
dtype: int64

In [4]:
# Print the column names, non-null counts and dtypes of the loaded frame
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 14 columns):
RowNumber          10000 non-null int64
CustomerId         10000 non-null int64
Surname            10000 non-null object
CreditScore        10000 non-null int64
Geography          10000 non-null object
Gender             10000 non-null object
Age                10000 non-null int64
Tenure             10000 non-null int64
Balance            10000 non-null float64
NumOfProducts      10000 non-null int64
HasCrCard          10000 non-null int64
IsActiveMember     10000 non-null int64
EstimatedSalary    10000 non-null float64
Exited             10000 non-null int64
dtypes: float64(2), int64(9), object(3)
memory usage: 1.1+ MB


In [5]:
# Splitting the dataset into the feature matrix and the target vector.
# Features: columns 3-11 (CreditScore ... IsActiveMember); target: column 13 (Exited).
# NOTE(review): column 12 (EstimatedSalary) is not selected — confirm that is intended.
x = dataset[dataset.columns[3:12]].values
y = dataset[dataset.columns[13]].values

In [6]:
# Dimensions of the feature matrix as (rows, columns)
x.shape

(10000, 9)

In [7]:
# Dimensions of the target vector
y.shape

(10000,)

## Encoding categorical data

In [8]:
# Replace the Geography strings in column 1 of x with integer codes
labelencoder_X = LabelEncoder().fit(x[:, 1])
x[:, 1] = labelencoder_X.transform(x[:, 1])

In [9]:
# Inspect x after label-encoding column 1
x

array([[619, 0, 'Female', ..., 1, 1, 1],
       [608, 2, 'Female', ..., 1, 0, 1],
       [502, 0, 'Female', ..., 3, 1, 0],
       ...,
       [709, 0, 'Female', ..., 1, 0, 1],
       [772, 1, 'Male', ..., 2, 1, 0],
       [792, 0, 'Female', ..., 1, 1, 0]], dtype=object)

In [10]:
# Replace the Gender strings in column 2 of x with integer codes
labelencoder_X_1 = LabelEncoder().fit(x[:, 2])
x[:, 2] = labelencoder_X_1.transform(x[:, 2])

In [11]:
# Inspect x after label-encoding column 2
x

array([[619, 0, 0, ..., 1, 1, 1],
       [608, 2, 0, ..., 1, 0, 1],
       [502, 0, 0, ..., 3, 1, 0],
       ...,
       [709, 0, 0, ..., 1, 0, 1],
       [772, 1, 1, ..., 2, 1, 0],
       [792, 0, 0, ..., 1, 1, 0]], dtype=object)

In [12]:
# One-hot encode the Geography codes held in column 1 of x.
# `OneHotEncoder(categorical_features=...)` was deprecated in scikit-learn 0.20 and removed
# in 0.22; ColumnTransformer is the supported way to encode a single column.
# The encoded columns come first and the untouched columns follow in their original order,
# which is the same layout the deprecated call produced.
onehotencoder = ColumnTransformer(
    transformers = [('geography', OneHotEncoder(categories = 'auto'), [1])],
    remainder = 'passthrough',   # keep every other column unchanged
    sparse_threshold = 0,        # always return a dense array
)
# x is an object array at this point; cast the stacked result to float like `.toarray()` gave
x = np.asarray(onehotencoder.fit_transform(x), dtype = float)

In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.


In [13]:
# Inspect the feature matrix after one-hot encoding
x

array([[1., 0., 0., ..., 1., 1., 1.],
       [0., 0., 1., ..., 1., 0., 1.],
       [1., 0., 0., ..., 3., 1., 0.],
       ...,
       [1., 0., 0., ..., 1., 0., 1.],
       [0., 1., 0., ..., 2., 1., 0.],
       [1., 0., 0., ..., 1., 1., 0.]])

In [14]:
# Shows x without its first column. The slice is not assigned, so x itself keeps
# every column (including all three one-hot Geography columns).
# NOTE(review): if the first dummy column was meant to be dropped, the later cells
# that assume 11 input features would need to change as well.
x[:,1:]

array([[  0.,   0., 619., ...,   1.,   1.,   1.],
       [  0.,   1., 608., ...,   1.,   0.,   1.],
       [  0.,   0., 502., ...,   3.,   1.,   0.],
       ...,
       [  0.,   0., 709., ...,   1.,   0.,   1.],
       [  1.,   0., 772., ...,   2.,   1.,   0.],
       [  0.,   0., 792., ...,   1.,   1.,   0.]])

In [15]:
# Hold out 25% of the rows as the test set; random_state pins the shuffle
X_train, X_test, y_train, y_test = train_test_split(
    x, y,
    test_size = 0.25,
    random_state = 0,
)

In [16]:
# Inspect the training features before scaling
X_train

array([[0., 1., 0., ..., 3., 0., 0.],
       [1., 0., 0., ..., 2., 1., 0.],
       [0., 0., 1., ..., 2., 1., 1.],
       ...,
       [1., 0., 0., ..., 2., 1., 0.],
       [0., 0., 1., ..., 2., 1., 1.],
       [0., 1., 0., ..., 1., 1., 0.]])

In [17]:
# Feature scaling: learn the statistics on the training set only,
# then apply the same transformation to both sets
sc = StandardScaler().fit(X_train)
X_train = sc.transform(X_train)
X_test = sc.transform(X_test)

In [18]:
# Inspect the training features after scaling
X_train

array([[-1.01558815,  1.76021608, -0.57468161, ...,  2.53503394,
        -1.55362351, -1.03446007],
       [ 0.98465111, -0.56811207, -0.57468161, ...,  0.80424154,
         0.64365658, -1.03446007],
       [-1.01558815, -0.56811207,  1.74009395, ...,  0.80424154,
         0.64365658,  0.96668786],
       ...,
       [ 0.98465111, -0.56811207, -0.57468161, ...,  0.80424154,
         0.64365658, -1.03446007],
       [-1.01558815, -0.56811207,  1.74009395, ...,  0.80424154,
         0.64365658,  0.96668786],
       [-1.01558815,  1.76021608, -0.57468161, ..., -0.92655087,
         0.64365658, -1.03446007]])

In [19]:
# Inspect the test features after scaling
X_test

array([[-1.01558815,  1.76021608, -0.57468161, ..., -0.92655087,
         0.64365658,  0.96668786],
       [ 0.98465111, -0.56811207, -0.57468161, ..., -0.92655087,
         0.64365658, -1.03446007],
       [-1.01558815, -0.56811207,  1.74009395, ..., -0.92655087,
         0.64365658,  0.96668786],
       ...,
       [ 0.98465111, -0.56811207, -0.57468161, ..., -0.92655087,
         0.64365658, -1.03446007],
       [-1.01558815,  1.76021608, -0.57468161, ...,  0.80424154,
         0.64365658, -1.03446007],
       [ 0.98465111, -0.56811207, -0.57468161, ..., -0.92655087,
        -1.55362351, -1.03446007]])

In [20]:
import keras
from keras.models import Sequential
from keras.layers import Dense

Using TensorFlow backend.


## Initializing the ANN

In [21]:
# Start from an empty Sequential model; the layers are added in the following cells
classifier = Sequential()

In [22]:
# Input layer and first hidden layer: 11 input features feeding 6 ReLU units.
# Uses the Keras 2 argument names: `units` (formerly `output_dim`) and
# `kernel_initializer` (formerly `init`).
classifier.add(Dense(units = 6 , kernel_initializer = 'uniform' , activation = 'relu' , input_dim = 11))

  


In [23]:
# Add second hidden layer: 6 ReLU units.
# Uses the Keras 2 argument names: `units` (formerly `output_dim`) and
# `kernel_initializer` (formerly `init`).
classifier.add(Dense(units = 6 , kernel_initializer = 'uniform' , activation = 'relu'))

  


In [24]:
# Add output layer: a single sigmoid unit.
# Uses the Keras 2 argument names: `units` (formerly `output_dim`) and
# `kernel_initializer` (formerly `init`).
classifier.add(Dense(units = 1 , kernel_initializer = 'uniform' , activation = 'sigmoid'))

  


In [25]:
# Compile the model: Adam optimiser, binary cross-entropy loss, accuracy as the reported metric
classifier.compile(
    loss = 'binary_crossentropy',
    optimizer = 'adam',
    metrics = ['accuracy'],
)

In [26]:
# Configure GPU visibility and memory growth.
# These calls only succeed before TensorFlow has initialised the GPUs; afterwards they
# raise RuntimeError, which is caught and printed below.
# NOTE(review): this cell sits after the model is built — consider running it earlier.
gpus = tf.config.experimental.list_physical_devices('GPU')
if gpus:
    try:
        # Restrict TensorFlow to only use the first GPU (gpus[0])
        tf.config.experimental.set_visible_devices(gpus[0], 'GPU')

        # Currently, memory growth needs to be the same across GPUs
        for gpu in gpus:
            tf.config.experimental.set_memory_growth(gpu, True)
        logical_gpus = tf.config.experimental.list_logical_devices('GPU')
        print(len(gpus), "Physical GPUs,", len(logical_gpus), "Logical GPUs")
    except RuntimeError as e:
        # Memory growth must be set before GPUs have been initialized
        print(e)

Physical devices cannot be modified after being initialized


In [27]:
# Train the network on the scaled training set
classifier.fit(x = X_train , y = y_train , epochs = 50 , batch_size = 10)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<keras.callbacks.callbacks.History at 0x24b33863bc8>

In [28]:
# Predict on the test set and threshold the network output at 0.5
y_pred = classifier.predict(X_test) > 0.5

In [29]:
# First 20 thresholded predictions for the test set
y_pred[:20]

array([[False],
       [False],
       [False],
       [False],
       [False],
       [ True],
       [False],
       [False],
       [False],
       [ True],
       [False],
       [False],
       [False],
       [False],
       [ True],
       [False],
       [False],
       [False],
       [False],
       [False]])

In [30]:
"""Predict if the customer with the following informations will leave the bank:
Geography: France
Credit Score: 600
Gender: Male
Age: 40
Tenure: 3
Balance: 60000
Number of Products: 2
Has Credit Card: Yes
Is Active Member: Yes
Estimated Salary: 50000"""


# The input row must follow the column layout the scaler and the network were fitted on:
#   [France, Germany, Spain, CreditScore, Gender, Age, Tenure, Balance,
#    NumOfProducts, HasCrCard, IsActiveMember]
# All three one-hot Geography columns are present (the first one is never dropped) and
# EstimatedSalary is not a feature, because x is built from dataset.iloc[:, 3:12].
# Encodings: France -> [1, 0, 0]; Male -> 1 (Female -> 0).
new_customer = np.array([[1, 0, 0, 600, 1, 40, 3, 60000, 2, 1, 1]])
new_prediction = classifier.predict(sc.transform(new_customer))
new_prediction = (new_prediction > 0.5)

In [33]:
# Display the thresholded prediction for the single customer
new_prediction

array([[False]])

In [31]:
# Share of test rows whose thresholded prediction matches the true label
ac = accuracy_score(y_true = y_test , y_pred = y_pred)
ac

0.8432

In [32]:
# Confusion matrix on the test set: rows are the true classes, columns the predicted classes
confusion_matrix(y_true = y_test , y_pred = y_pred)

array([[1934,   57],
       [ 335,  174]], dtype=int64)