# Your Mission, should you choose to accept it...

To hyperparameter tune and extract every ounce of accuracy out of this telecom customer churn dataset: <https://drive.google.com/file/d/1dfbAsM9DwA7tYhInyflIpZnYs7VT-0AQ/view> 

## Requirements

- Load the data
- Clean the data if necessary (it will be)
- Create and fit a baseline Keras MLP model to the data.
- Hyperparameter tune (at least) the following parameters:
 - batch_size
 - training epochs
 - optimizer
 - learning rate (if applicable to optimizer)
 - momentum (if applicable to optimizer)
 - activation functions
 - network weight initialization
 - dropout regularization
 - number of neurons in the hidden layer
 
 You must use Grid Search and Cross Validation for your initial pass of the above hyperparameters
 
 Try and get the maximum accuracy possible out of this data! You'll save big telecoms millions! Doesn't that sound great?


In [61]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from keras.layers import Dense, Dropout
from keras.models import Sequential
from keras.optimizers import SGD, Adam, Nadam
from keras.wrappers.scikit_learn import KerasClassifier

from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

SEED = 42

In [10]:
# load data
data = pd.read_csv('churn.csv')
data.shape

(7043, 21)

In [14]:
# clean data
features_to_drop = ['customerID', 'gender', 'PhoneService', 'InternetService']
# engineering numeric columns for Cox Proportional Hazard estimation
data = data.drop(features_to_drop, axis=1).copy()

# convert some stuff to integers
data['TotalCharges'] = pd.to_numeric(data['TotalCharges'], errors='coerce')
data['TotalCharges'].fillna(0, inplace=True)

# a lot of variables are encoded as 'Yes' or 'No', lets get these all done at once
binary_features = ['Partner', 'Dependents', 'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport', 
                   'StreamingTV','StreamingMovies', 'PaperlessBilling']
for feat in binary_features:
    data[feat] = data[feat] == 'Yes'
    
# let's one hot encode the remaining categorical features
ohe_features = ['MultipleLines', 'Contract', 'PaymentMethod']
data = pd.get_dummies(data, 
                              drop_first=True,
                              columns=ohe_features)

data.head()

Unnamed: 0,SeniorCitizen,Partner,Dependents,tenure,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,...,MonthlyCharges,TotalCharges,Churn,MultipleLines_No phone service,MultipleLines_Yes,Contract_One year,Contract_Two year,PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check
0,0,True,False,1,False,True,False,False,False,False,...,29.85,29.85,No,1,0,0,0,0,1,0
1,0,False,False,34,True,False,True,False,False,False,...,56.95,1889.5,No,0,0,1,0,0,0,1
2,0,False,False,2,True,True,False,False,False,False,...,53.85,108.15,Yes,0,0,0,0,0,0,1
3,0,False,False,45,True,False,True,True,False,False,...,42.3,1840.75,No,1,0,1,0,0,0,0
4,0,False,False,2,False,False,False,False,False,False,...,70.7,151.65,Yes,0,0,0,0,0,1,0


In [15]:
X = data.copy().drop(columns='Churn')
y = data.Churn == 'Yes'

# train test split
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.2,
                                                    stratify=y)

# scaling
scale = StandardScaler()
X_train = scale.fit_transform(X_train)
X_test = scale.transform(X_test)

  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)
  del sys.path[0]


In [63]:
# model harness
def create_model(dense_layers=2,
                 dense_nodes=5,
                 dropout=False,
                 dropout_pct=0.0,
                 activation='sigmoid',
                 weight_initializer='glorot_uniform',
                 optimizer=SGD,
                 lr=0.0001,
                 input_shape=(X_train.shape[1],)):
    
    model = Sequential()
    
    # add input layer
    model.add(Dense(dense_nodes, 
                    input_shape=input_shape,
                    kernel_initializer=weight_initializer,
                    activation=activation))
    
    # add dense layers and drop out
    for _ in range(dense_layers):
        # dense
        model.add(Dense(dense_nodes,
                        kernel_initializer=weight_initializer,
                        activation=activation))
        # dropout
        if dropout:
            model.add(Dropout(rate=dropout_pct))

    # add final activation layer
    model.add(Dense(1, activation='sigmoid'))
    # optimizer
    optimizer=optimizer(lr=lr)
    # compile model
    model.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=['acc'])
              
    return model

In [46]:
# baseline model
epochs = 10
batch_size = 100
pipe = Pipeline([('nn', KerasClassifier(build_fn=create_model, 
                               epochs=epochs,
                               batch_size=batch_size,
                               verbose=1))])
kfold = StratifiedKFold(n_splits=3, random_state=SEED)
results = cross_val_score(pipe, X_train, y_train, cv=kfold)
print(f"K-fold Cross-Val Results - Mean: {results.mean():.2f} StDev: {results.std():.2f} MSE")

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
K-fold Cross-Val Results - Mean: 0.73 StDev: 0.00 MSE


### Batch Size / Epochs

In [49]:
# create model
model = KerasClassifier(build_fn=create_model, 
                               epochs=epochs,
                               batch_size=batch_size,
                               verbose=0)

# define the grid search parameters
param_grid = {'batch_size': [10, 20, 40, 60, 80, 100],
              'epochs': [20, 30, 40]}

# Create Grid Search
grid = GridSearchCV(estimator=model, param_grid=param_grid, n_jobs=1)
grid_result = grid.fit(X_train, y_train)

# Report Results
print(f"Best: {grid_result.best_score_} using {grid_result.best_params_}")
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print(f"Means: {mean}, Stdev: {stdev} with: {param}") 



Best: 0.7346467924617393 using {'batch_size': 100, 'epochs': 40}
Means: 0.5741924049347946, Stdev: 0.22268390426407317 with: {'batch_size': 10, 'epochs': 20}
Means: 0.734646786807021, Stdev: 0.005789672123798473 with: {'batch_size': 10, 'epochs': 30}
Means: 0.734646786807021, Stdev: 0.005789672123798473 with: {'batch_size': 10, 'epochs': 40}
Means: 0.42296769805080636, Stdev: 0.22171751968686396 with: {'batch_size': 20, 'epochs': 20}
Means: 0.42580759864370576, Stdev: 0.2226839053352862 with: {'batch_size': 20, 'epochs': 30}
Means: 0.5747248860466848, Stdev: 0.22193097946870238 with: {'batch_size': 20, 'epochs': 40}
Means: 0.5834220788941809, Stdev: 0.21939323430382723 with: {'batch_size': 40, 'epochs': 20}
Means: 0.48171813902932786, Stdev: 0.19008432800412237 with: {'batch_size': 40, 'epochs': 30}
Means: 0.5834220788941809, Stdev: 0.21939323430382723 with: {'batch_size': 40, 'epochs': 40}
Means: 0.5770323018169504, Stdev: 0.22171751736556183 with: {'batch_size': 60, 'epochs': 20}
Mea

### Optimizer

Tuning the optimizer used, including learning rate parameter.

In [53]:
# create model
model = KerasClassifier(build_fn=create_model, 
                               epochs=40,
                               batch_size=100,
                               verbose=0)

# define the grid search parameters
param_grid = {'optimizer': [SGD, Adam, Nadam],
              'lr': [.01, .001, .0001, .00001]}

# Create Grid Search
grid = GridSearchCV(estimator=model, param_grid=param_grid, n_jobs=1)
grid_result = grid.fit(X_train, y_train)

# Report Results
print(f"Best: {grid_result.best_score_} using {grid_result.best_params_}")
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print(f"Means: {mean}, Stdev: {stdev} with: {param}") 

Best: 0.7985445464550285 using {'lr': 0.01, 'optimizer': <class 'keras.optimizers.Adam'>}
Means: 0.7346467924617393, Stdev: 0.005789670671755297 with: {'lr': 0.01, 'optimizer': <class 'keras.optimizers.SGD'>}
Means: 0.7985445464550285, Stdev: 0.0025475157022588815 with: {'lr': 0.01, 'optimizer': <class 'keras.optimizers.Adam'>}
Means: 0.7951721685932256, Stdev: 0.007395333414028529 with: {'lr': 0.01, 'optimizer': <class 'keras.optimizers.Nadam'>}
Means: 0.7346467924617393, Stdev: 0.005789670671755297 with: {'lr': 0.001, 'optimizer': <class 'keras.optimizers.SGD'>}
Means: 0.7729854451497057, Stdev: 0.032920188418976884 with: {'lr': 0.001, 'optimizer': <class 'keras.optimizers.Adam'>}
Means: 0.7955271550049204, Stdev: 0.008260612934226969 with: {'lr': 0.001, 'optimizer': <class 'keras.optimizers.Nadam'>}
Means: 0.5741924075082468, Stdev: 0.22268391002777502 with: {'lr': 0.0001, 'optimizer': <class 'keras.optimizers.SGD'>}
Means: 0.5741924075082468, Stdev: 0.22268391002777502 with: {'lr':

### Activation Functions

In [57]:
# create model
model = KerasClassifier(build_fn=create_model, 
                               epochs=40,
                               batch_size=100,
                               verbose=0)

# define the grid search parameters
param_grid = {'activation': ['sigmoid', 'tanh', 'relu'],
             'optimizer' : [Adam],
             'lr' : [0.01]}

# Create Grid Search
grid = GridSearchCV(estimator=model, param_grid=param_grid, n_jobs=1)
grid_result = grid.fit(X_train, y_train)

# Report Results
print(f"Best: {grid_result.best_score_} using {grid_result.best_params_}")
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print(f"Means: {mean}, Stdev: {stdev} with: {param}") 

Best: 0.7958821419244292 using {'activation': 'sigmoid', 'lr': 0.01, 'optimizer': <class 'keras.optimizers.Adam'>}
Means: 0.7958821419244292, Stdev: 0.0027611513303878754 with: {'activation': 'sigmoid', 'lr': 0.01, 'optimizer': <class 'keras.optimizers.Adam'>}
Means: 0.7919772782349442, Stdev: 0.0044408968167433584 with: {'activation': 'tanh', 'lr': 0.01, 'optimizer': <class 'keras.optimizers.Adam'>}
Means: 0.7910898093598339, Stdev: 0.006768076667617337 with: {'activation': 'relu', 'lr': 0.01, 'optimizer': <class 'keras.optimizers.Adam'>}


### Network Weight Initialization

In [59]:
# create model
model = KerasClassifier(build_fn=create_model, 
                               epochs=40,
                               batch_size=100,
                               verbose=0)

# define the grid search parameters
param_grid = {'weight_initializer': ['glorot_uniform', 
                                     'random_uniform', 
                                     'random_normal'],
             'optimizer' : [Adam],
             'lr' : [0.01]}

# Create Grid Search
grid = GridSearchCV(estimator=model, param_grid=param_grid, n_jobs=1)
grid_result = grid.fit(X_train, y_train)

# Report Results
print(f"Best: {grid_result.best_score_} using {grid_result.best_params_}")
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print(f"Means: {mean}, Stdev: {stdev} with: {param}") 

Best: 0.8010294636725364 using {'lr': 0.01, 'optimizer': <class 'keras.optimizers.Adam'>, 'weight_initializer': 'random_normal'}
Means: 0.7933972261245682, Stdev: 0.004347689195903933 with: {'lr': 0.01, 'optimizer': <class 'keras.optimizers.Adam'>, 'weight_initializer': 'glorot_uniform'}
Means: 0.7964146220392059, Stdev: 0.006740088372751573 with: {'lr': 0.01, 'optimizer': <class 'keras.optimizers.Adam'>, 'weight_initializer': 'random_uniform'}
Means: 0.8010294636725364, Stdev: 0.0059188304385675685 with: {'lr': 0.01, 'optimizer': <class 'keras.optimizers.Adam'>, 'weight_initializer': 'random_normal'}


### Dropout Regularization

In [64]:
# create model
model = KerasClassifier(build_fn=create_model, 
                               epochs=40,
                               batch_size=100,
                               verbose=0)

# define the grid search parameters
param_grid = {'weight_initializer': ['random_normal'],
             'optimizer' : [Adam],
             'lr' : [0.01],
             'dropout' : [True, False],
             'dropout_pct' : [0.1, 0.2, 0.3]}

# Create Grid Search
grid = GridSearchCV(estimator=model, param_grid=param_grid, n_jobs=1)
grid_result = grid.fit(X_train, y_train)

# Report Results
print(f"Best: {grid_result.best_score_} using {grid_result.best_params_}")
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print(f"Means: {mean}, Stdev: {stdev} with: {param}") 

Best: 0.7973020912740177 using {'dropout': True, 'dropout_pct': 0.1, 'lr': 0.01, 'optimizer': <class 'keras.optimizers.Adam'>, 'weight_initializer': 'random_normal'}
Means: 0.7973020912740177, Stdev: 0.006091939961195595 with: {'dropout': True, 'dropout_pct': 0.1, 'lr': 0.01, 'optimizer': <class 'keras.optimizers.Adam'>, 'weight_initializer': 'random_normal'}
Means: 0.7914447974854003, Stdev: 0.008137653346817009 with: {'dropout': True, 'dropout_pct': 0.2, 'lr': 0.01, 'optimizer': <class 'keras.optimizers.Adam'>, 'weight_initializer': 'random_normal'}
Means: 0.7965921150863615, Stdev: 0.006693187332666583 with: {'dropout': True, 'dropout_pct': 0.3, 'lr': 0.01, 'optimizer': <class 'keras.optimizers.Adam'>, 'weight_initializer': 'random_normal'}
Means: 0.7942847002470879, Stdev: 0.008389282859244758 with: {'dropout': False, 'dropout_pct': 0.1, 'lr': 0.01, 'optimizer': <class 'keras.optimizers.Adam'>, 'weight_initializer': 'random_normal'}
Means: 0.7926872557344528, Stdev: 0.0075596504847

### Number of Hidden Layers

In [65]:
# create model
model = KerasClassifier(build_fn=create_model, 
                               epochs=40,
                               batch_size=100,
                               verbose=0)

# define the grid search parameters
param_grid = {'weight_initializer': ['random_normal'],
             'optimizer' : [Adam],
             'lr' : [0.01],
             'dropout' : [True],
             'dropout_pct' : [0.1],
             'dense_nodes' : [5, 7, 10]}

# Create Grid Search
grid = GridSearchCV(estimator=model, param_grid=param_grid, n_jobs=1)
grid_result = grid.fit(X_train, y_train)

# Report Results
print(f"Best: {grid_result.best_score_} using {grid_result.best_params_}")
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print(f"Means: {mean}, Stdev: {stdev} with: {param}") 

Best: 0.8022719174782181 using {'dense_nodes': 5, 'dropout': True, 'dropout_pct': 0.1, 'lr': 0.01, 'optimizer': <class 'keras.optimizers.Adam'>, 'weight_initializer': 'random_normal'}
Means: 0.8022719174782181, Stdev: 0.0013975869231506262 with: {'dense_nodes': 5, 'dropout': True, 'dropout_pct': 0.1, 'lr': 0.01, 'optimizer': <class 'keras.optimizers.Adam'>, 'weight_initializer': 'random_normal'}
Means: 0.7928647454808188, Stdev: 0.00534253154807279 with: {'dense_nodes': 7, 'dropout': True, 'dropout_pct': 0.1, 'lr': 0.01, 'optimizer': <class 'keras.optimizers.Adam'>, 'weight_initializer': 'random_normal'}
Means: 0.795882135259373, Stdev: 0.00745896340890036 with: {'dense_nodes': 10, 'dropout': True, 'dropout_pct': 0.1, 'lr': 0.01, 'optimizer': <class 'keras.optimizers.Adam'>, 'weight_initializer': 'random_normal'}


## Stretch Goals:

- Try to implement Random Search Hyperparameter Tuning on this dataset
- Try to implement Bayesian Optimiation tuning on this dataset
- Practice hyperparameter tuning other datasets that we have looked at. How high can you get MNIST? Above 99%?
- Study for the Sprint Challenge
 - Can you implement both perceptron and MLP models from scratch with forward and backpropagation?
 - Can you implement both perceptron and MLP models in keras and tune their hyperparameters with cross validation?

In [101]:
kfold = StratifiedKFold(n_splits=2, random_state=42)
cvscores = []
for train_idx, test_idx in kfold.split(X_train, y_train):

    X_train1, X_test = X_train[train_idx], X_train[test_idx]
    y_train1, y_test = y_train[train_idx], y_train[test_idx]
    
    X_train1[np.isnan(X_train1)] = 0

#     model = Sequential()
#     model.add(Dense(10, input_dim=X_train1.shape[1],  
#                     kernel_initializer='random_normal', 
#                     activation='sigmoid'))
#     model.add(Dense(10, 
#                     kernel_initializer='random_normal', 
#                     activation='sigmoid'))
# #     model.add(Dense(8, activation='relu'))
#     model.add(Dense(1, activation='sigmoid'))
#     opt = Adam(lr=0.0001)
#     model.compile(loss='binary_crossentropy', optimizer=opt, metrics=['accuracy'])

#     model.fit(X_train1, y_train1, epochs=10, batch_size=50, verbose=1)
#     cvscores.append(scores[1])
#     print(f'{numpy.mean(csvscores):.2f} +/- {numpy.std(cvsscores):.2f}')
    model = create_model()
    model.fit(X_train1, y_train1.values, epochs=10, batch_size=50, verbose=1)

Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
https://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate-loc-reindex-listlike
  return self.loc[key]


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [None]:
def create_model(dense_layers=2,
                 dense_nodes=5,
                 dropout=False,
                 dropout_pct=0.0,
                 activation='sigmoid',
                 weight_initializer='glorot_uniform',
                 optimizer=SGD,
                 lr=0.0001,
                 input_shape=(X_train.shape[1],)):
    
    model = Sequential()
    
    # add input layer
    model.add(Dense(dense_nodes, 
                    input_shape=input_shape,
                    kernel_initializer=weight_initializer,
                    activation=activation))
    
    # add dense layers and drop out
    for _ in range(dense_layers):
        # dense
        model.add(Dense(dense_nodes,
                        kernel_initializer=weight_initializer,
                        activation=activation))
        # dropout
        if dropout:
            model.add(Dropout(rate=dropout_pct))

    # add final activation layer
    model.add(Dense(1, activation='sigmoid'))
    # optimizer
    optimizer=optimizer(lr=lr)
    # compile model
    model.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=['acc'])
              
    return model