## Week 7: Training NN with Data: Improving the accuracy- churn modelling Data

In [1]:
# Importing the libraries
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
# Importing label encoder for encoding, category data
from sklearn.preprocessing import LabelEncoder

In [2]:
#read the dataset
df = pd.read_csv("Churn_Modelling.csv")
df.head()

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,1,15634602,Hargrave,619,France,Female,42,2,0.0,1,1,1,101348.88,1
1,2,15647311,Hill,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,3,15619304,Onio,502,France,Female,42,8,159660.8,3,1,0,113931.57,1
3,4,15701354,Boni,699,France,Female,39,1,0.0,2,0,0,93826.63,0
4,5,15737888,Mitchell,850,Spain,Female,43,2,125510.82,1,1,1,79084.1,0


In [3]:
df.shape

(10000, 14)

In [4]:
df.columns

Index(['RowNumber', 'CustomerId', 'Surname', 'CreditScore', 'Geography',
       'Gender', 'Age', 'Tenure', 'Balance', 'NumOfProducts', 'HasCrCard',
       'IsActiveMember', 'EstimatedSalary', 'Exited'],
      dtype='object')

In [5]:
df.dtypes

RowNumber            int64
CustomerId           int64
Surname             object
CreditScore          int64
Geography           object
Gender              object
Age                  int64
Tenure               int64
Balance            float64
NumOfProducts        int64
HasCrCard            int64
IsActiveMember       int64
EstimatedSalary    float64
Exited               int64
dtype: object

In [6]:
#drop un-necessary columns
df=df.drop(['RowNumber','CustomerId','Surname'], axis=1)

In [7]:
df.shape

(10000, 11)

In [8]:
df.dtypes

CreditScore          int64
Geography           object
Gender              object
Age                  int64
Tenure               int64
Balance            float64
NumOfProducts        int64
HasCrCard            int64
IsActiveMember       int64
EstimatedSalary    float64
Exited               int64
dtype: object

In [9]:
# object dtype means categorical data, see classes of objects
df.Gender.value_counts()

Male      5457
Female    4543
Name: Gender, dtype: int64

In [10]:
df.Geography.value_counts()

France     5014
Germany    2509
Spain      2477
Name: Geography, dtype: int64

In [11]:
#object is categorical data, so convert to numeric by encoding
obj_list= ['Geography','Gender']

for col in obj_list:
    encoder = LabelEncoder()
    encoder.fit(df[col])
    df[col] = encoder.transform(df[col])

In [12]:
#see all dtypes are numeric
df.dtypes

CreditScore          int64
Geography            int32
Gender               int32
Age                  int64
Tenure               int64
Balance            float64
NumOfProducts        int64
HasCrCard            int64
IsActiveMember       int64
EstimatedSalary    float64
Exited               int64
dtype: object

In [13]:
df.head()

Unnamed: 0,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,619,0,0,42,2,0.0,1,1,1,101348.88,1
1,608,2,0,41,1,83807.86,1,0,1,112542.58,0
2,502,0,0,42,8,159660.8,3,1,0,113931.57,1
3,699,0,0,39,1,0.0,2,0,0,93826.63,0
4,850,2,0,43,2,125510.82,1,1,1,79084.1,0


In [14]:
#separate X and target
X = df.iloc[:,:9].values
y = df.iloc[:,-1].values

In [15]:
# apply train test split
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 1)

In [16]:
# Feature Scaling, observe som efeatures of X are with too large values, so scale down to small values
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

In [17]:
print(X_train[0:5])

[[-0.23082038  0.30531046  0.91509065 -0.94449979 -0.70174202  0.58817274
   0.80225696 -1.55337352  0.97725852]
 [-0.25150912 -0.90503826 -1.09278791 -0.94449979 -0.35520275  0.46984886
   0.80225696 -1.55337352 -1.02327069]
 [-0.3963303   0.30531046 -1.09278791  0.77498705  0.33787579  0.85878772
  -0.91150957  0.64376017  0.97725852]
 [-0.04462173  0.30531046 -1.09278791  1.25262228  0.33787579  0.56560403
   0.80225696 -1.55337352  0.97725852]
 [ 0.6587954   1.51565919  0.91509065 -0.56239161  1.03095433  0.730395
  -0.91150957 -1.55337352 -1.02327069]]


In [18]:
#build your keras ANN model
import keras
from keras.models import Sequential
from keras.layers import Dense

In [19]:
# Defining the ANN model
model = Sequential()
# Adding the first hidden layer, by passing X with 8 features
model.add(Dense(units = 4, kernel_initializer = 'uniform', activation = 'relu', input_dim = 9))

# Adding the second hidden layer
model.add(Dense(units = 4, kernel_initializer = 'uniform', activation = 'relu'))

# Adding the output layer
model.add(Dense(units = 1, kernel_initializer = 'uniform', activation = 'sigmoid'))

# Compiling the ANN
model.compile(optimizer = 'adam', loss = 'binary_crossentropy', metrics = ['accuracy'])

In [20]:
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 4)                 40        
                                                                 
 dense_1 (Dense)             (None, 4)                 20        
                                                                 
 dense_2 (Dense)             (None, 1)                 5         
                                                                 
Total params: 65
Trainable params: 65
Non-trainable params: 0
_________________________________________________________________


In [21]:
# Fitting the ANN to the Training set
model.fit(X_train, y_train, batch_size = 2, epochs = 5)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x2ce4bdd2d90>

In [22]:
y_pred = model.predict(X_test)




In [23]:
print(y_pred[10:20])

[[0.27108046]
 [0.0594691 ]
 [0.23099574]
 [0.05457293]
 [0.39693034]
 [0.04212695]
 [0.02444886]
 [0.12946196]
 [0.1669751 ]
 [0.34001967]]


In [24]:
print(y_test[10:20])

[0 1 0 0 0 0 0 0 0 0]


# accuracy is bit increased when compared to week 6, why 

In [25]:
                  ##############################################################################