### Tuning Batch Size and Number of Training Epochs

In [None]:
#set up a grid of hyperparameter values
#Retrain the model for each value of the parameter
#the one which yields highest accuracy will be selected 
#We use scikit-learn to grid search the batch size and epochs
import numpy
from sklearn.model_selection import GridSearchCV
from keras.models import Sequential
from keras.layers import Dense
from keras.wrappers.scikit_learn import KerasClassifier
# Function to create model required for KerasClassifier 
def create_model():
  """Build and compile the network that KerasClassifier trains for each grid point.

  The network is a Sequential stack of a 12-unit ReLU Dense layer taking
  8 input features, followed by a single sigmoid output unit. It is compiled
  with binary cross-entropy loss, the 'adam' optimizer and an accuracy metric.

  Returns:
    The compiled Sequential model.
  """
  hidden_layer = Dense(12, input_dim=8, activation='relu')
  output_layer = Dense(1, activation='sigmoid')
  network = Sequential([hidden_layer, output_layer])
  network.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
  )
  return network
# Seed NumPy's global random number generator.
# NOTE(review): only NumPy is seeded here; confirm whether the Keras backend
# needs its own seed for fully repeatable runs.
seed = 7
numpy.random.seed(seed)
# Read the comma-separated dataset file into a single NumPy array.
dataset = numpy.loadtxt(
  fname='/content/pima-indians-diabetes.data.csv',
  delimiter=",",
)

In [None]:
# Split the array into input (X) and output (Y) variables:
# the first eight columns are the inputs, the ninth column is the output.
X = dataset[:, :8]
Y = dataset[:, 8]
# Wrap the model factory so scikit-learn can drive it as an estimator.
model = KerasClassifier(build_fn=create_model, verbose=0)
# Candidate values to try: 6 batch sizes x 3 epoch counts = 18 combinations.
batch_size = [10, 20, 40, 60, 80, 100]
epochs = [10, 50, 100]
param_grid = {'batch_size': batch_size, 'epochs': epochs}
# Evaluate every combination with 3-fold cross-validation (n_jobs=-1: run in parallel).
grid = GridSearchCV(estimator=model, param_grid=param_grid, n_jobs=-1, cv=3)
grid_result = grid.fit(X, Y)



  model = KerasClassifier(build_fn=create_model, verbose=0)


In [None]:
# Report the best-scoring combination, then the score of every combination tried.
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
cv_results = grid_result.cv_results_
means = cv_results['mean_test_score']
stds = cv_results['std_test_score']
params = cv_results['params']
# Each row is (mean test score, std of the test score, parameter dict).
for row in zip(means, stds, params):
  print("%f (%f) with: %r" % row)

  # This code reports the best batch size together with the best number of epochs, i.e. the best combination: Best: 0.718750 using {'batch_size': 10, 'epochs': 100}

Best: 0.718750 using {'batch_size': 10, 'epochs': 100}
0.625000 (0.050126) with {'batch_size': 10, 'epochs': 10}
0.674479 (0.004872) with {'batch_size': 10, 'epochs': 50}
0.718750 (0.043848) with {'batch_size': 10, 'epochs': 100}
0.584635 (0.033502) with {'batch_size': 20, 'epochs': 10}
0.684896 (0.009744) with {'batch_size': 20, 'epochs': 50}
0.677083 (0.012890) with {'batch_size': 20, 'epochs': 100}
0.539062 (0.073079) with {'batch_size': 40, 'epochs': 10}
0.652344 (0.030425) with {'batch_size': 40, 'epochs': 50}
0.677083 (0.017566) with {'batch_size': 40, 'epochs': 100}
0.574219 (0.026107) with {'batch_size': 60, 'epochs': 10}
0.645833 (0.041134) with {'batch_size': 60, 'epochs': 50}
0.661458 (0.014382) with {'batch_size': 60, 'epochs': 100}
0.514323 (0.096442) with {'batch_size': 80, 'epochs': 10}
0.640625 (0.008438) with {'batch_size': 80, 'epochs': 50}
0.649740 (0.025976) with {'batch_size': 80, 'epochs': 100}
0.541667 (0.081085) with {'batch_size': 100, 'epochs': 10}
0.562500 (0

### Tuning Optimization algorithms/ optimizers

---



In [None]:
# There are many optimizers, such as AdaMax, Adam, RMSprop, Adadelta, AMSGrad, Adagrad (Adaptive Gradient Algorithm), SGD and Nadam,
# but the best-known ones are SGD (stochastic gradient descent), Adam (adaptive moment estimation) and RMSprop (Root Mean Square Propagation).
# The optimizer's objective: reduce the loss function and help the model make accurate predictions.

import numpy
from sklearn.model_selection import GridSearchCV
from keras.models import Sequential
from keras.layers import Dense
from keras.wrappers.scikit_learn import KerasClassifier
# Function to create model required for KerasClassifier 
def create_model(optimizer='adam'):
  """Build and compile the classifier with a configurable optimizer.

  Args:
    optimizer: Passed unchanged to ``compile``; the grid search supplies
      optimizer names such as 'SGD' or 'Adam'. Defaults to 'adam'.

  Returns:
    The compiled Sequential model: a 12-unit ReLU Dense layer on 8 inputs
    followed by a 1-unit sigmoid Dense layer, with binary cross-entropy loss
    and an accuracy metric.
  """
  stack = [
    Dense(12, input_dim=8, activation='relu'),
    Dense(1, activation='sigmoid'),
  ]
  network = Sequential(stack)
  network.compile(loss='binary_crossentropy', optimizer=optimizer, metrics=['accuracy'])
  return network
# Seed NumPy's global random number generator (only NumPy is seeded in this cell).
seed = 7
numpy.random.seed(seed)
# Load the dataset and split it: first eight columns are inputs, the ninth is the output.
dataset = numpy.loadtxt('/content/pima-indians-diabetes.data.csv', delimiter=",")
X = dataset[:, :8]
Y = dataset[:, 8]
# Wrap the model factory; epochs=100 and batch_size=10 are the values picked
# by the earlier batch-size/epochs grid search.
model = KerasClassifier(build_fn=create_model, epochs=100, batch_size=10, verbose=0)
# Optimizer names to compare.
optimizer = ['SGD', 'RMSprop', 'Adagrad', 'Adadelta', 'Adam', 'Adamax', 'Nadam']
param_grid = {'optimizer': optimizer}
grid = GridSearchCV(estimator=model, param_grid=param_grid, n_jobs=-1, cv=3)
grid_result = grid.fit(X, Y)
# Report the best optimizer, then the score of every optimizer tried.
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
cv_results = grid_result.cv_results_
means = cv_results['mean_test_score']
stds = cv_results['std_test_score']
params = cv_results['params']
for row in zip(means, stds, params):
  print("%f (%f) with: %r" % row)

  #Best: 0.723958 using {'optimizer': 'Adam'}

  model = KerasClassifier(build_fn=create_model, epochs=100, batch_size=10, verbose=0) # add our chosen batch size and epochs by grid search


Best: 0.723958 using {'optimizer': 'Adam'}
0.680990 (0.019225) with: {'optimizer': 'SGD'}
0.630208 (0.122714) with: {'optimizer': 'RMSprop'}
0.506510 (0.020256) with: {'optimizer': 'Adagrad'}
0.621094 (0.034499) with: {'optimizer': 'Adadelta'}
0.723958 (0.026557) with: {'optimizer': 'Adam'}
0.670573 (0.016053) with: {'optimizer': 'Adamax'}
0.703125 (0.022326) with: {'optimizer': 'Nadam'}


### Tuning the Activation Function (in what we call neuron activation)

In [None]:
# Common non-linear activation functions include: sigmoid (the logistic function, output in (0, 1)), ReLU (rectified linear unit, output in [0, +inf)), Softmax, Leaky ReLU, Maxout, ELU and tanh (hyperbolic tangent, output in (-1, 1))
# Activation function = a parameter we specify when building a model; it is the function applied to the outputs of the neurons in a layer, during training as well as prediction

import numpy
from sklearn.model_selection import GridSearchCV
from keras.models import Sequential
from keras.layers import Dense
from keras.wrappers.scikit_learn import KerasClassifier
# Function to create model required for KerasClassifier 
def create_model(activation='relu'):
  """Build and compile the classifier with a configurable hidden-layer activation.

  Args:
    activation: Activation used by the 12-unit hidden layer. Defaults to 'relu'.

  Returns:
    The compiled Sequential model. Both Dense layers use the 'uniform' kernel
    initializer; the model is compiled with binary cross-entropy loss, the
    'adam' optimizer and an accuracy metric.
  """
  hidden = Dense(12, input_dim=8, activation=activation, kernel_initializer='uniform')
  # The output unit stays sigmoid; only the hidden activation is being tuned.
  output = Dense(1, activation='sigmoid', kernel_initializer='uniform')
  network = Sequential([hidden, output])
  network.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
  return network
# Seed NumPy's global random number generator (only NumPy is seeded in this cell).
seed = 7
numpy.random.seed(seed)
# Load the dataset and split it: first eight columns are inputs, the ninth is the output.
dataset = numpy.loadtxt('/content/pima-indians-diabetes.data.csv', delimiter=",")
X = dataset[:, :8]
Y = dataset[:, 8]
# Wrap the model factory; epochs=100 and batch_size=10 are the values picked
# by the earlier batch-size/epochs grid search.
model = KerasClassifier(build_fn=create_model, epochs=100, batch_size=10, verbose=0)
# Hidden-layer activation functions to compare.
activation = ['softmax', 'softplus', 'softsign', 'relu', 'tanh', 'sigmoid', 'hard_sigmoid','linear']
param_grid = {'activation': activation}
grid = GridSearchCV(estimator=model, param_grid=param_grid, n_jobs=-1, cv=3)
grid_result = grid.fit(X, Y)
# Report the best activation, then the score of every activation tried.
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
cv_results = grid_result.cv_results_
means = cv_results['mean_test_score']
stds = cv_results['std_test_score']
params = cv_results['params']
for row in zip(means, stds, params):
  print("%f (%f) with: %r" % row)

# the output ==> Best: 0.730469 using {'activation': 'softplus'}

  model = KerasClassifier(build_fn=create_model, epochs=100, batch_size=10, verbose=0) # add our chosen batch size and epochs by grid search


Best: 0.730469 using {'activation': 'softplus'}
0.653646 (0.009207) with: {'activation': 'softmax'}
0.730469 (0.020915) with: {'activation': 'softplus'}
0.682292 (0.017566) with: {'activation': 'softsign'}
0.718750 (0.029232) with: {'activation': 'relu'}
0.673177 (0.032734) with: {'activation': 'tanh'}
0.701823 (0.008027) with: {'activation': 'sigmoid'}
0.691406 (0.005524) with: {'activation': 'hard_sigmoid'}
0.714844 (0.027805) with: {'activation': 'linear'}


### Tuning dropout regularization layer

In [13]:
# Applying dropout to a neural net => drop units/neurons automatically from each layer during the training phase.
# These units are removed randomly and temporarily in order to reduce the computation needed during the training process every epoch, to get good performance, to speed up the training phase and to prevent the neural network from overfitting,
# so that we reach a good model that captures the underlying logic of the dataset (without capturing the noise of the dataset) and never misses the point (= low loss and high accuracy)
import numpy
from sklearn.model_selection import GridSearchCV
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Dropout
from keras.wrappers.scikit_learn import KerasClassifier
from keras.constraints import maxnorm
# Function to create model required for KerasClassifier 
def create_model(dropout_rate=0.0, weight_constraint=0):
  """Build and compile the classifier with tunable dropout and max-norm constraint.

  Args:
    dropout_rate: Rate passed to the Dropout layer placed after the hidden
      layer. Defaults to 0.0.
    weight_constraint: Max-norm limit applied to the hidden layer's kernel.
      A value of 0 (the default), None or a negative number means "no
      constraint". Positive values are passed to ``maxnorm`` unchanged.

  Returns:
    The compiled Sequential model: a 12-unit softplus Dense layer on 8 inputs,
    a Dropout layer and a 1-unit sigmoid Dense layer, compiled with binary
    cross-entropy loss, the 'adam' optimizer and an accuracy metric.
  """
  # maxnorm(0) would clip the norm of every hidden kernel column to 0, i.e.
  # force the hidden weights to zero, so the default must not build a
  # constraint at all.
  if weight_constraint and weight_constraint > 0:
    kernel_constraint = maxnorm(weight_constraint)
  else:
    kernel_constraint = None
  model = Sequential()
  model.add(Dense(12, input_dim=8, activation='softplus', kernel_initializer='uniform', kernel_constraint=kernel_constraint))
  model.add(Dropout(dropout_rate))
  # Single sigmoid output unit for the binary target.
  model.add(Dense(1, activation='sigmoid', kernel_initializer='uniform'))
  # Compile model
  model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
  return model
# Seed NumPy's global random number generator (only NumPy is seeded in this cell).
seed = 7
numpy.random.seed(seed)
# Load the dataset and split it: first eight columns are inputs, the ninth is the output.
dataset = numpy.loadtxt('/content/pima-indians-diabetes.data.csv', delimiter=",")
X = dataset[:, :8]
Y = dataset[:, 8]
# Wrap the model factory; epochs=100 and batch_size=10 are the values picked
# by the earlier batch-size/epochs grid search.
model = KerasClassifier(build_fn=create_model, epochs=100, batch_size=10, verbose=0)
# Candidate max-norm limits and dropout rates (5 x 10 = 50 combinations).
weight_constraint = [1, 2, 3, 4, 5]
# The dropout rate is between 0 and 1: the fraction of units we intend to drop.
dropout_rate = [0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]
param_grid = {'dropout_rate': dropout_rate, 'weight_constraint': weight_constraint}
grid = GridSearchCV(estimator=model, param_grid=param_grid, n_jobs=-1, cv=3)
grid_result = grid.fit(X, Y)
# Report the best combination, then the score of every combination tried.
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
cv_results = grid_result.cv_results_
means = cv_results['mean_test_score']
stds = cv_results['std_test_score']
params = cv_results['params']
for row in zip(means, stds, params):
  print("%f (%f) with: %r" % row)

# the Best result: 0.740885 using {'dropout_rate': 0.0, 'weight_constraint': 5}


  model = KerasClassifier(build_fn=create_model, epochs=100, batch_size=10, verbose=0) # add our chosen batch size and epochs by grid search


Best: 0.740885 using {'dropout_rate': 0.0, 'weight_constraint': 5}
0.725260 (0.011201) with: {'dropout_rate': 0.0, 'weight_constraint': 1}
0.727865 (0.020256) with: {'dropout_rate': 0.0, 'weight_constraint': 2}
0.731771 (0.025780) with: {'dropout_rate': 0.0, 'weight_constraint': 3}
0.731771 (0.027126) with: {'dropout_rate': 0.0, 'weight_constraint': 4}
0.740885 (0.041626) with: {'dropout_rate': 0.0, 'weight_constraint': 5}
0.725260 (0.021236) with: {'dropout_rate': 0.1, 'weight_constraint': 1}
0.733073 (0.019225) with: {'dropout_rate': 0.1, 'weight_constraint': 2}
0.710938 (0.019401) with: {'dropout_rate': 0.1, 'weight_constraint': 3}
0.730469 (0.032369) with: {'dropout_rate': 0.1, 'weight_constraint': 4}
0.726562 (0.008438) with: {'dropout_rate': 0.1, 'weight_constraint': 5}
0.723958 (0.023073) with: {'dropout_rate': 0.2, 'weight_constraint': 1}
0.714844 (0.019401) with: {'dropout_rate': 0.2, 'weight_constraint': 2}
0.729167 (0.019488) with: {'dropout_rate': 0.2, 'weight_constraint': 

### Tuning Number of Neurons

In [15]:
#the number of neurons in a layer is an important parameter to tune. Generally the number of neurons in a layer controls the representational capacity
# of the network at least at that point in the topology
#Also, generally a large enough single layer network can approximate any other neural network at least in theory
# In this example, we will look at tuning the number of neurons in a single hidden layer, we will try values from 1 to 30 in steps of 5
#use scikit learn to grid search the number of neurons
import numpy
from sklearn.model_selection import GridSearchCV
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Dropout
from keras.wrappers.scikit_learn import KerasClassifier
from keras.constraints import maxnorm
# Function to create model required for KerasClassifier 
def create_model(neurons=1):
  """Build and compile the classifier with a configurable hidden-layer width.

  Args:
    neurons: Number of units in the hidden Dense layer. Defaults to 1.

  Returns:
    The compiled Sequential model: a softplus Dense layer of ``neurons`` units
    on 8 inputs (kernel limited by ``maxnorm(5)``), a Dropout layer with rate
    0.0 and a 1-unit sigmoid Dense layer, compiled with binary cross-entropy
    loss, the 'adam' optimizer and an accuracy metric.
  """
  hidden = Dense(
    neurons,
    input_dim=8,
    activation='softplus',
    kernel_initializer='uniform',
    kernel_constraint=maxnorm(5),
  )
  # Rate 0.0 and max-norm 5 are the values reported as best by the dropout search.
  dropout = Dropout(0.0)
  output = Dense(1, activation='sigmoid', kernel_initializer='uniform')
  network = Sequential([hidden, dropout, output])
  network.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
  return network
# Seed NumPy's global random number generator (only NumPy is seeded in this cell).
seed = 7
numpy.random.seed(seed)
# Load the dataset and split it: first eight columns are inputs, the ninth is the output.
dataset = numpy.loadtxt('/content/pima-indians-diabetes.data.csv', delimiter=",")
X = dataset[:, :8]
Y = dataset[:, 8]
# Wrap the model factory; epochs=100 and batch_size=10 are the values picked
# by the earlier batch-size/epochs grid search.
model = KerasClassifier(build_fn=create_model, epochs=100, batch_size=10, verbose=0)
# Hidden-layer sizes to compare.
neurons = [1, 5, 10, 15, 20, 25, 30]
param_grid = {'neurons': neurons}
grid = GridSearchCV(estimator=model, param_grid=param_grid, n_jobs=-1, cv=3)
grid_result = grid.fit(X, Y)
# Report the best hidden-layer size, then the score of every size tried.
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
cv_results = grid_result.cv_results_
means = cv_results['mean_test_score']
stds = cv_results['std_test_score']
params = cv_results['params']
for row in zip(means, stds, params):
  print("%f (%f) with: %r" % row)

# the Best result: 0.746094 using {'neurons': 25}

  model = KerasClassifier(build_fn=create_model, epochs=100, batch_size=10, verbose=0) # add our chosen batch size and epochs by grid search


Best: 0.746094 using {'neurons': 25}
0.666667 (0.040386) with: {'neurons': 1}
0.710938 (0.014616) with: {'neurons': 5}
0.730469 (0.024910) with: {'neurons': 10}
0.742188 (0.015947) with: {'neurons': 15}
0.743490 (0.024360) with: {'neurons': 20}
0.746094 (0.019401) with: {'neurons': 25}
0.744792 (0.022628) with: {'neurons': 30}


### learning rate and momentum tuning

In [16]:
# This is the most important part in the hyperparameter tuning operations 
# Learning rate: how fast or how slowly my network is learning
# how does learning rate impact training 
# a high learning rate == > the network training will likely be unstable (or diverge entirely), performance of the model will oscillate over training epochs 
# a low learning rate == > training is more reliable, optimization will take a lot of time

# Momentum = a fraction of the previous weight update that is added to the current weight update; it accelerates SGD in the relevant direction
# Like the learning rate, it is used when the model is trained with a stochastic gradient descent optimizer

import numpy
from sklearn.model_selection import GridSearchCV
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Dropout
from keras.wrappers.scikit_learn import KerasClassifier
from keras.constraints import maxnorm
from keras.optimizers import SGD
# Function to create model required for KerasClassifier 
def create_model(learn_rate=0.01, momentum=0):
  """Build and compile the classifier trained with SGD at a tunable learning rate and momentum.

  Args:
    learn_rate: Learning rate handed to the SGD optimizer. Defaults to 0.01.
    momentum: Momentum handed to the SGD optimizer. Defaults to 0.

  Returns:
    The compiled Sequential model: a 25-unit softplus Dense layer on 8 inputs
    (kernel limited by ``maxnorm(5)``), a Dropout layer with rate 0.0 and a
    1-unit sigmoid Dense layer, compiled with binary cross-entropy loss, the
    configured SGD optimizer and an accuracy metric.
  """
  model = Sequential()
  model.add(Dense(25, input_dim=8, activation='softplus', kernel_initializer='uniform', kernel_constraint=maxnorm(5)))
  model.add(Dropout(0.0))
  # Single sigmoid output unit for the binary target.
  model.add(Dense(1, activation='sigmoid', kernel_initializer='uniform'))
  # `lr` is the deprecated spelling of this argument and triggers a warning;
  # `learning_rate` is the supported keyword.
  optimizer = SGD(learning_rate=learn_rate, momentum=momentum)
  # Compile model
  model.compile(loss='binary_crossentropy', optimizer=optimizer, metrics=['accuracy'])
  return model
# Seed NumPy's global random number generator (only NumPy is seeded in this cell).
seed = 7
numpy.random.seed(seed)
# Load the dataset and split it: first eight columns are inputs, the ninth is the output.
dataset = numpy.loadtxt('/content/pima-indians-diabetes.data.csv', delimiter=",")
X = dataset[:, :8]
Y = dataset[:, 8]
# Wrap the model factory; epochs=100 and batch_size=10 are the values picked
# by the earlier batch-size/epochs grid search.
model = KerasClassifier(build_fn=create_model, epochs=100, batch_size=10, verbose=0)
# Candidate learning rates and momentum values (5 x 6 = 30 combinations).
learn_rate = [0.001, 0.01, 0.1, 0.2, 0.3]
momentum = [0.0, 0.2, 0.4, 0.6, 0.8, 0.9]
param_grid = {'learn_rate': learn_rate, 'momentum': momentum}
grid = GridSearchCV(estimator=model, param_grid=param_grid, n_jobs=-1, cv=3)
grid_result = grid.fit(X, Y)
# Report the best combination, then the score of every combination tried.
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
cv_results = grid_result.cv_results_
means = cv_results['mean_test_score']
stds = cv_results['std_test_score']
params = cv_results['params']
for row in zip(means, stds, params):
  print("%f (%f) with: %r" % row)

# the Best match : Best: 0.707031 using {'learn_rate': 0.001, 'momentum': 0.4}



  model = KerasClassifier(build_fn=create_model, epochs=100, batch_size=10, verbose=0) # add our chosen batch size and epochs by grid search
  super().__init__(name, **kwargs)


Best: 0.707031 using {'learn_rate': 0.001, 'momentum': 0.4}
0.690104 (0.011201) with: {'learn_rate': 0.001, 'momentum': 0.0}
0.662760 (0.035849) with: {'learn_rate': 0.001, 'momentum': 0.2}
0.707031 (0.033146) with: {'learn_rate': 0.001, 'momentum': 0.4}
0.704427 (0.017566) with: {'learn_rate': 0.001, 'momentum': 0.6}
0.695312 (0.032369) with: {'learn_rate': 0.001, 'momentum': 0.8}
0.660156 (0.006379) with: {'learn_rate': 0.001, 'momentum': 0.9}
0.622396 (0.055732) with: {'learn_rate': 0.01, 'momentum': 0.0}
0.664062 (0.030425) with: {'learn_rate': 0.01, 'momentum': 0.2}
0.669271 (0.018688) with: {'learn_rate': 0.01, 'momentum': 0.4}
0.649740 (0.026557) with: {'learn_rate': 0.01, 'momentum': 0.6}
0.649740 (0.026557) with: {'learn_rate': 0.01, 'momentum': 0.8}
0.651042 (0.024774) with: {'learn_rate': 0.01, 'momentum': 0.9}
0.651042 (0.024774) with: {'learn_rate': 0.1, 'momentum': 0.0}
0.651042 (0.024774) with: {'learn_rate': 0.1, 'momentum': 0.2}
0.651042 (0.024774) with: {'learn_rate':