In [257]:
import numpy
import pandas as pd
from keras.models import Sequential
from keras.layers import Dense
from keras.wrappers.scikit_learn import KerasClassifier
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

seed = 7
numpy.random.seed(seed)

In [258]:
# downloaded from https://www.kaggle.com/filippoo/deep-learning-az-ann
churn_df = pd.read_csv('Churn_Modelling.csv')

In [259]:
churn_df.head()

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,1,15634602,Hargrave,619,France,Female,42,2,0.0,1,1,1,101348.88,1
1,2,15647311,Hill,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,3,15619304,Onio,502,France,Female,42,8,159660.8,3,1,0,113931.57,1
3,4,15701354,Boni,699,France,Female,39,1,0.0,2,0,0,93826.63,0
4,5,15737888,Mitchell,850,Spain,Female,43,2,125510.82,1,1,1,79084.1,0


In [260]:
churnWithoutRowNumber = churn_df.drop(churn_df.columns[[0, 2]], axis=1)

In [261]:
# mapping countries to a specific integer
countries = map(lambda x: x[2], churnWithoutRowNumber.values)
unique_countries = set(countries)
country_hash = {}
for i, c in enumerate(unique_countries): country_hash[c] = i # can save this later when getting insight into neural network
integerMapping = map(lambda c: country_hash[c], churnWithoutRowNumber['Geography'])
countriesAsIntegers = churnWithoutRowNumber
countriesAsIntegers['Geography'] = integerMapping

#mapping genders as a specific integer
genders = map(lambda x: x[3], countriesAsIntegers.values)
unique_genders = set(genders)
gender_hash = {}
for i, g in enumerate(unique_genders): gender_hash[g] = i # can save this later when getting insight into neural network
integerMapping = map(lambda g: gender_hash[g], countriesAsIntegers['Gender'])
gendersAsIntegers = countriesAsIntegers
gendersAsIntegers['Gender'] = integerMapping

In [262]:
gendersAsIntegers.head()

Unnamed: 0,CustomerId,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,15634602,619,2,1,42,2,0.0,1,1,1,101348.88,1
1,15647311,608,1,1,41,1,83807.86,1,0,1,112542.58,0
2,15619304,502,2,1,42,8,159660.8,3,1,0,113931.57,1
3,15701354,699,2,1,39,1,0.0,2,0,0,93826.63,0
4,15737888,850,1,1,43,2,125510.82,1,1,1,79084.1,0


In [263]:
clean_dataset = gendersAsIntegers.values

In [264]:
clean_dataset[0]

array([  1.56346020e+07,   6.19000000e+02,   2.00000000e+00,
         1.00000000e+00,   4.20000000e+01,   2.00000000e+00,
         0.00000000e+00,   1.00000000e+00,   1.00000000e+00,
         1.00000000e+00,   1.01348880e+05,   1.00000000e+00])

In [265]:
X = clean_dataset[:,0:11].astype(float)

Y = clean_dataset[:,11]

In [266]:
len(X[0])

11

In [267]:
Y[0]

1.0

In [268]:
# encode class values as integers
encoder = LabelEncoder()
encoder.fit(Y)
encoded_Y = encoder.transform(Y)

In [269]:
encoded_Y

array([1, 0, 1, ..., 1, 1, 0])

In [270]:
# baseline model
def create_baseline():
    # create model
    model = Sequential()
    model.add(Dense(11, input_dim=11, activation='relu'))
    model.add(Dense(1, activation='sigmoid'))
    # Compile model
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

In [271]:
estimator = KerasClassifier(build_fn=create_baseline, nb_epoch=10, batch_size=20, verbose=0)

In [272]:
kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=seed)

In [273]:
results = cross_val_score(estimator, X, encoded_Y, cv=kfold)

In [274]:
print("Baseline: %.2f%% (%.2f%%)" % (results.mean()*100, results.std()*100))

Baseline: 34.27% (23.48%)


In [275]:
# Model with standard scaled data
numpy.random.seed(seed)
estimators = []
estimators.append(('standardize', StandardScaler()))
estimators.append(('mlp', KerasClassifier(build_fn=create_baseline, nb_epoch=10, batch_size=20, verbose=0)))
pipeline = Pipeline(estimators)
kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=seed)
results = cross_val_score(pipeline, X, encoded_Y, cv=kfold)
print("Standardized: %.2f%% (%.2f%%)" % (results.mean()*100, results.std()*100))

Standardized: 85.27% (0.94%)


In [276]:
# smaller net
def create_smaller():
    model = Sequential()
    model.add(Dense(5, input_dim=11, activation='relu'))
    model.add(Dense(1, activation='sigmoid'))
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model
estimators = []
estimators.append(('standardize', StandardScaler()))
estimators.append(('mlp', KerasClassifier(build_fn=create_smaller, nb_epoch=10, batch_size=20, verbose=0)))
pipeline = Pipeline(estimators)
kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=seed)
results = cross_val_score(pipeline, X, encoded_Y, cv=kfold)
print("Smaller: %.2f%% (%.2f%%)" % (results.mean()*100, results.std()*100))

Smaller: 83.95% (1.43%)


In [277]:
def create_larger():
    # create model
    model = Sequential()
    model.add(Dense(11, input_dim=11, activation='relu'))
    model.add(Dense(5, activation='relu'))
    model.add(Dense(1, activation='sigmoid'))
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model
estimators = []
estimators.append(('standardize', StandardScaler()))
estimators.append(('mlp', KerasClassifier(build_fn=create_larger, nb_epoch=10, batch_size=20, verbose=0)))
pipeline = Pipeline(estimators)
kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=seed)
results = cross_val_score(pipeline, X, encoded_Y, cv=kfold)
print("Larger: %.2f%% (%.2f%%)" % (results.mean()*100, results.std()*100))

Larger: 85.27% (0.86%)
