# Gradient Descent Optimization Algorithms

In [1]:
# Widen the notebook's main container to 90% of the browser window.
# `display` and `HTML` are imported from the public `IPython.display` module;
# importing them from `IPython.core.display` is deprecated.
from IPython.display import display, HTML
display(HTML("<style>.container { width:90% !important; }</style>"))

In [2]:
# Uncomment and run once if `tensorflow_addons` (imported below) is not installed:
#pip install tensorflow-addons

In [3]:
import pandas as pd
import datetime, os
import numpy as np
import numpy.random as npr
from pylab import plt, mpl
import time

from scipy.stats import norm
from scipy import optimize
import scipy.integrate as integrate
import scipy.special as special 

import tensorflow as tf
from tensorflow.keras.callbacks import EarlyStopping, TensorBoard
from tensorboard.plugins.hparams import api as hp
from tensorflow import keras
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler

import tensorflow_addons as tfa

import matplotlib.pyplot as plt
import seaborn as sns

from keras import backend as BK

# Load the TensorBoard notebook extension
%load_ext tensorboard

Using TensorFlow backend.


Let's read the data from the CSV files, so we don't need to generate it again but can simply load it when needed.

In [4]:
# Load the option inputs and outputs from their CSV files.
input_csv_path = r"/Users/Marcklein/Desktop/Master Thesis/Option pricing using Neural Networks/Python/Heston/Options_input.csv"
output_csv_path = r"/Users/Marcklein/Desktop/Master Thesis/Option pricing using Neural Networks/Python/Heston/Options_output.csv"

raw_Options_input = pd.read_csv(input_csv_path)
raw_Options_output = pd.read_csv(output_csv_path)

# Reading the files yields an extra 'Unnamed: 0' column at the front; remove it from both frames.
for loaded_frame in (raw_Options_input, raw_Options_output):
    del loaded_frame['Unnamed: 0']


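We take the first 5k input/output samples as a smaller subset that can be used to speed up the process. Note that the cells below copy the full dataset, so the runs reported in this notebook train on all samples rather than on this subset.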

In [5]:
# Keep the first 5000 rows of inputs and outputs as a small subset for quick experiments.
subset_rows = 5000
raw_Options_inputz = raw_Options_input.iloc[:subset_rows]
raw_Options_outputz = raw_Options_output.iloc[:subset_rows]

In [6]:
# Work on deep copies so the frames loaded from disk stay untouched.
Options_input = raw_Options_input.copy(deep=True)
Options_output = raw_Options_output.copy(deep=True)

Since the standard deviation is computed from the squared deviations from the mean, a zero standard deviation is only possible when all the values of a variable are the same (all equal to the mean). In this case, the variable has no discriminative power and can be removed from the analysis: it cannot improve any classification, clustering or regression task. Many implementations will remove such variables for you or throw an error about a matrix calculation.

### **Data preparation**

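We split our dataset into a training set (90%) and a test set (10%). A validation set can be taken from the training set during `model.fit` via `validation_split`; that option is currently commented out in the training cell.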

In [7]:
# 90% for training and validating, the remaining 10% for testing.
# Inputs and labels must stay row-aligned, so the random split is drawn once (on the
# inputs) and the labels are selected with that same index instead of being sampled
# separately with a matching seed.
assert Options_input.index.equals(Options_output.index), "inputs and labels are not row-aligned"

train_dataset = Options_input.sample(frac=0.9, random_state=42)
test_dataset = Options_input.drop(train_dataset.index)

train_labels = Options_output.loc[train_dataset.index]
test_labels = Options_output.drop(train_dataset.index)

Check the overall statistics

In [8]:
# Summary statistics of the training inputs, transposed so each row describes one input column.
train_stats = train_dataset.describe().transpose()

In [9]:
#normalize the data
# NOTE: this function shadows `norm` imported from scipy.stats in the import cell.
def norm(x, stats=None):
    """Standardise ``x`` column-wise: subtract the mean, divide by the standard deviation.

    Parameters
    ----------
    x : pandas.DataFrame
        Frame whose columns match the index of ``stats``.
    stats : pandas.DataFrame, optional
        Per-column statistics with ``'mean'`` and ``'std'`` columns (the layout of
        ``DataFrame.describe().T``). When omitted, the global ``train_stats`` is used.

    Returns
    -------
    pandas.DataFrame
        The standardised frame. Columns with zero standard deviation are only
        centred (they become all zeros) instead of producing NaN/inf values.
    """
    if stats is None:
        stats = train_stats
    std = stats['std']
    # A constant column has std == 0; dividing by it would give NaN/inf features.
    safe_std = std.where(std != 0, 1.0)
    return (x - stats['mean']) / safe_std
# Standardise the training and test inputs and keep them as plain NumPy arrays.
normed_train_data = norm(train_dataset).to_numpy()
normed_test_data = norm(test_dataset).to_numpy()

# Convert the labels to NumPy arrays as well, matching the normalised inputs.
train_labels, test_labels = np.asarray(train_labels), np.asarray(test_labels)

# Sanity check: report the shapes of inputs and outputs for both splits.
print("Input train data:", normed_train_data.shape, " Output train data:", train_labels.shape)
print("Input test data:", normed_test_data.shape, " Output test data:", test_labels.shape)

Input train data: (90000, 7)  Output train data: (90000, 10)
Input test data: (10000, 7)  Output test data: (10000, 10)


## **Gradient Descent optimization algorithms**

We look at the following gradient descent optimization algorithms:
- Momentum optimization
- Nesterov Accelerated Gradient
- AdaGrad
- RMSProp
- Adam

In [10]:
# Clear any logs from previous runs.
# This is a shell command: it recursively deletes the ./logs/ directory relative to the
# notebook's working directory, so the TensorBoard comparison below only shows fresh runs.
!rm -rf ./logs/

In [11]:
# One optimizer instance per algorithm under comparison.
# All of them use the same learning rate; the adaptive ones share the same epsilon.
shared_learning_rate = 0.001
shared_epsilon = 1e-10

Optimizer_momentum = tf.keras.optimizers.SGD(
    learning_rate=shared_learning_rate,
    momentum=0.9,
    nesterov=False,
    name='Momentum',
)
Optimizer_nesterov = tf.keras.optimizers.SGD(
    learning_rate=shared_learning_rate,
    momentum=0.9,
    nesterov=True,
    name='Nesterov',
)
Optimizer_adagrad = tf.keras.optimizers.Adagrad(
    learning_rate=shared_learning_rate,
    initial_accumulator_value=0.1,
    epsilon=shared_epsilon,
    name='Adagrad',
)
Optimizer_rmsprop = tf.keras.optimizers.RMSprop(
    learning_rate=shared_learning_rate,
    rho=0.9,
    momentum=0.0,
    epsilon=shared_epsilon,
    centered=False,
    name='RMSprop',
)
Optimizer_adam = tf.keras.optimizers.Adam(
    learning_rate=shared_learning_rate,
    beta_1=0.9,
    beta_2=0.999,
    epsilon=shared_epsilon,
    amsgrad=False,
    name='Adam',
)

In [12]:
# Network / training settings shared by every optimizer run.
# (The input width is read from the training frame inside train_val_model.)
n_epochs = 100              # epochs passed to model.fit
hidden_layer_size = 1000    # units in each of the two hidden Dense layers
output_size = 10            # units in the linear output layer

# Kernel initializer handed to both hidden layers.
# Alternative that was tried: tf.random_normal_initializer(mean=0.0, stddev=0.05, seed=None)
weights_initializer = keras.initializers.GlorotUniform()

# Builds, compiles and fits a fresh network for one optimizer and returns the Keras History.
def train_val_model(run_dir, hparams):
    """Train a two-hidden-layer regression network and log the run to TensorBoard.

    Parameters
    ----------
    run_dir : str
        Base log directory of the run; TensorBoard events go to ``run_dir + "/Heston"``.
    hparams : dict
        Must contain the key ``'optimizer'``, which is passed to ``model.compile``.

    Returns
    -------
    The ``History`` object returned by ``model.fit``.

    Notes
    -----
    Reads the notebook globals ``train_dataset``, ``normed_train_data``, ``train_labels``,
    ``hidden_layer_size``, ``output_size``, ``n_epochs`` and ``weights_initializer``.
    No validation data is used: ``validation_split`` is not passed to ``fit``.
    Disabled extras that were tried: a BatchNormalization layer after each hidden layer,
    ``tfa.callbacks.TimeStopping(seconds=5*60, verbose=1)`` and ``validation_split=0.2``.
    """
    n_features = len(train_dataset.keys())

    # Assemble the network layer by layer: input -> 2 x Dense(relu) -> Dense(linear).
    model = keras.models.Sequential()
    model.add(keras.layers.InputLayer(input_shape=[n_features]))
    for layer_name in ('Layer_1', 'Layer_2'):
        model.add(keras.layers.Dense(hidden_layer_size, activation='relu',
                                     kernel_initializer=weights_initializer,
                                     name=layer_name))
    # Linear output activation because this is a regression problem.
    model.add(keras.layers.Dense(output_size, activation='linear', name='Output_layer'))

    # MSE serves both as the training loss and as the reported metric.
    model.compile(optimizer=hparams['optimizer'],
                  loss='mean_squared_error',
                  metrics=['mean_squared_error'])

    # Log to TensorBoard after every batch, under the run's own directory.
    tensorboard_callback = tf.keras.callbacks.TensorBoard(run_dir + "/Heston", update_freq='batch')

    return model.fit(normed_train_data, train_labels,
                     epochs=n_epochs,
                     batch_size=256,
                     verbose=2,
                     callbacks=[tensorboard_callback])

In [13]:
# Train one fresh model per optimizer, in a fixed order. Each run logs to its own
# directory under logs/, and every returned History is kept (keyed by run name)
# instead of being discarded.
optimizer_runs = {
    "momentum": Optimizer_momentum,
    "nesterov": Optimizer_nesterov,
    "adagrad": Optimizer_adagrad,
    "rmsprop": Optimizer_rmsprop,
    "adam": Optimizer_adam,
}

histories = {
    run_name: train_val_model("logs/" + run_name, {'optimizer': optimizer})
    for run_name, optimizer in optimizer_runs.items()
}

Epoch 1/100
352/352 - 13s - loss: 0.0570 - mean_squared_error: 0.0570
Epoch 2/100
352/352 - 13s - loss: 0.0073 - mean_squared_error: 0.0073
Epoch 3/100
352/352 - 13s - loss: 0.0055 - mean_squared_error: 0.0055
Epoch 4/100
352/352 - 13s - loss: 0.0048 - mean_squared_error: 0.0048
Epoch 5/100
352/352 - 9s - loss: 0.0044 - mean_squared_error: 0.0044
Epoch 6/100
352/352 - 9s - loss: 0.0040 - mean_squared_error: 0.0040
Epoch 7/100
352/352 - 9s - loss: 0.0037 - mean_squared_error: 0.0037
Epoch 8/100
352/352 - 8s - loss: 0.0034 - mean_squared_error: 0.0034
Epoch 9/100
352/352 - 7s - loss: 0.0032 - mean_squared_error: 0.0032
Epoch 10/100
352/352 - 8s - loss: 0.0029 - mean_squared_error: 0.0029
Epoch 11/100
352/352 - 8s - loss: 0.0027 - mean_squared_error: 0.0027
Epoch 12/100
352/352 - 8s - loss: 0.0026 - mean_squared_error: 0.0026
Epoch 13/100
352/352 - 8s - loss: 0.0024 - mean_squared_error: 0.0024
Epoch 14/100
352/352 - 7s - loss: 0.0022 - mean_squared_error: 0.0022
Epoch 15/100
352/352 - 7s

Epoch 10/100
352/352 - 8s - loss: 0.0030 - mean_squared_error: 0.0030
Epoch 11/100
352/352 - 8s - loss: 0.0028 - mean_squared_error: 0.0028
Epoch 12/100
352/352 - 8s - loss: 0.0026 - mean_squared_error: 0.0026
Epoch 13/100
352/352 - 10s - loss: 0.0024 - mean_squared_error: 0.0024
Epoch 14/100
352/352 - 10s - loss: 0.0023 - mean_squared_error: 0.0023
Epoch 15/100
352/352 - 8s - loss: 0.0021 - mean_squared_error: 0.0021
Epoch 16/100
352/352 - 8s - loss: 0.0020 - mean_squared_error: 0.0020
Epoch 17/100
352/352 - 7s - loss: 0.0019 - mean_squared_error: 0.0019
Epoch 18/100
352/352 - 8s - loss: 0.0018 - mean_squared_error: 0.0018
Epoch 19/100
352/352 - 8s - loss: 0.0017 - mean_squared_error: 0.0017
Epoch 20/100
352/352 - 8s - loss: 0.0016 - mean_squared_error: 0.0016
Epoch 21/100
352/352 - 8s - loss: 0.0015 - mean_squared_error: 0.0015
Epoch 22/100
352/352 - 8s - loss: 0.0014 - mean_squared_error: 0.0014
Epoch 23/100
352/352 - 8s - loss: 0.0013 - mean_squared_error: 0.0013
Epoch 24/100
352/3

Epoch 19/100
352/352 - 8s - loss: 0.0053 - mean_squared_error: 0.0053
Epoch 20/100
352/352 - 13s - loss: 0.0051 - mean_squared_error: 0.0051
Epoch 21/100
352/352 - 9s - loss: 0.0050 - mean_squared_error: 0.0050
Epoch 22/100
352/352 - 8s - loss: 0.0049 - mean_squared_error: 0.0049
Epoch 23/100
352/352 - 8s - loss: 0.0048 - mean_squared_error: 0.0048
Epoch 24/100
352/352 - 10s - loss: 0.0047 - mean_squared_error: 0.0047
Epoch 25/100
352/352 - 10s - loss: 0.0046 - mean_squared_error: 0.0046
Epoch 26/100
352/352 - 8s - loss: 0.0046 - mean_squared_error: 0.0046
Epoch 27/100
352/352 - 8s - loss: 0.0045 - mean_squared_error: 0.0045
Epoch 28/100
352/352 - 8s - loss: 0.0044 - mean_squared_error: 0.0044
Epoch 29/100
352/352 - 9s - loss: 0.0043 - mean_squared_error: 0.0043
Epoch 30/100
352/352 - 8s - loss: 0.0042 - mean_squared_error: 0.0042
Epoch 31/100
352/352 - 8s - loss: 0.0041 - mean_squared_error: 0.0041
Epoch 32/100
352/352 - 12s - loss: 0.0041 - mean_squared_error: 0.0041
Epoch 33/100
352

352/352 - 13s - loss: 5.3298e-05 - mean_squared_error: 5.3298e-05
Epoch 33/100
352/352 - 10s - loss: 5.1261e-05 - mean_squared_error: 5.1261e-05
Epoch 34/100
352/352 - 11s - loss: 4.9215e-05 - mean_squared_error: 4.9215e-05
Epoch 35/100
352/352 - 11s - loss: 4.7131e-05 - mean_squared_error: 4.7131e-05
Epoch 36/100
352/352 - 10s - loss: 4.5453e-05 - mean_squared_error: 4.5453e-05
Epoch 37/100
352/352 - 11s - loss: 4.3510e-05 - mean_squared_error: 4.3510e-05
Epoch 38/100
352/352 - 12s - loss: 4.2144e-05 - mean_squared_error: 4.2144e-05
Epoch 39/100
352/352 - 11s - loss: 4.0531e-05 - mean_squared_error: 4.0531e-05
Epoch 40/100
352/352 - 10s - loss: 3.8786e-05 - mean_squared_error: 3.8786e-05
Epoch 41/100
352/352 - 10s - loss: 3.7558e-05 - mean_squared_error: 3.7558e-05
Epoch 42/100
352/352 - 11s - loss: 3.6506e-05 - mean_squared_error: 3.6506e-05
Epoch 43/100
352/352 - 12s - loss: 3.5630e-05 - mean_squared_error: 3.5630e-05
Epoch 44/100
352/352 - 11s - loss: 3.4591e-05 - mean_squared_erro

Epoch 37/100
352/352 - 11s - loss: 4.5936e-06 - mean_squared_error: 4.5936e-06
Epoch 38/100
352/352 - 12s - loss: 4.1626e-06 - mean_squared_error: 4.1626e-06
Epoch 39/100
352/352 - 11s - loss: 4.2833e-06 - mean_squared_error: 4.2833e-06
Epoch 40/100
352/352 - 11s - loss: 5.4631e-06 - mean_squared_error: 5.4631e-06
Epoch 41/100
352/352 - 12s - loss: 2.1488e-06 - mean_squared_error: 2.1488e-06
Epoch 42/100
352/352 - 13s - loss: 4.5590e-06 - mean_squared_error: 4.5590e-06
Epoch 43/100
352/352 - 12s - loss: 3.2032e-06 - mean_squared_error: 3.2032e-06
Epoch 44/100
352/352 - 14s - loss: 3.9798e-06 - mean_squared_error: 3.9798e-06
Epoch 45/100
352/352 - 11s - loss: 3.2902e-06 - mean_squared_error: 3.2902e-06
Epoch 46/100
352/352 - 9s - loss: 4.4583e-06 - mean_squared_error: 4.4583e-06
Epoch 47/100
352/352 - 10s - loss: 2.4706e-06 - mean_squared_error: 2.4706e-06
Epoch 48/100
352/352 - 13s - loss: 3.8112e-06 - mean_squared_error: 3.8112e-06
Epoch 49/100
352/352 - 14s - loss: 2.7666e-06 - mean_

<tensorflow.python.keras.callbacks.History at 0x7fb4a26f8c50>

In [14]:
# Open TensorBoard inline with one named entry per optimizer run, so the training
# curves of all five optimizers can be compared side by side.
%tensorboard --logdir_spec=Momentum:logs/momentum,Nesterov:logs/nesterov,AdaGrad:logs/adagrad,RMSProp:logs/rmsprop,Adam:logs/adam
# Alternative: inspect a single run only.
#%tensorboard --logdir logs/momentum

Reusing TensorBoard on port 6008 (pid 12945), started 1:47:49 ago. (Use '!kill 12945' to kill it.)