# Physics 494/594
## Introduction to Keras

In [None]:
# %load ./include/header.py
# Notebook header: numerical / plotting imports followed by display styling.
import numpy as np
import matplotlib.pyplot as plt
import sys
from tqdm import trange,tqdm
# add ./include to the module search path before importing ml4s
# (presumably that is where the ml4s helper module lives -- confirm)
sys.path.append('./include')
import ml4s

# IPython magics: draw figures inline in the notebook, rendered as SVG
%matplotlib inline
%config InlineBackend.figure_format = 'svg'
# matplotlib style sheet shipped alongside the notebook
plt.style.use('./include/notebook.mplstyle')
# print numpy arrays with up to 120 characters per line
np.set_printoptions(linewidth=120)
# NOTE(review): ml4s is not visible here; judging by the name this applies a CSS theme
# to the notebook -- confirm against ./include/ml4s.py
ml4s.set_css_style('./include/bootstrap.css')
# colors of the active matplotlib property cycle, indexed by the comparison plots further down
colors = plt.rcParams['axes.prop_cycle'].by_key()['color']

## Last Time

### [Notebook Link: 16_Training_Neural_Network.ipynb](./16_Training_Neural_Network.ipynb)

- Combine feed forward with backpropagation for supervised learning
- Training our deep neural network to *learn* a 2D shape

## Today

- Learn how to use the `keras` and `tensorflow` libraries to build sequential deep neural networks.
- Learn a simple 2D logical function

### Import tensorflow

In [None]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
import datetime

### We've already seen `AND`; now let's learn a simple cross ("+") function


In [None]:
def cross(x,y):
    '''Indicator of a "+" (cross) shape centred on the origin.

    A point belongs to the cross when it lies in the vertical bar |x| < 0.25
    or in the horizontal bar |y| < 0.25 (strict inequalities).

    Inputs:
    x, y : scalars or numpy arrays that broadcast against each other.

    Outputs: a boolean (numpy bool, or boolean array of the broadcast shape)
    that is True inside the cross and False outside.
    '''
    # explicit logical operators: with plain python scalars `*` and `+` act
    # arithmetically on bools, so the central square would evaluate to 2
    in_vertical_bar = np.logical_and(x > -0.25, x < 0.25)
    in_horizontal_bar = np.logical_and(y > -0.25, y < 0.25)
    return np.logical_or(in_vertical_bar, in_horizontal_bar)

### Visualize the function to be learned

In [None]:
# The domain over which we plot, [xmin,xmax,ymin,ymax], and the number of
# grid points along each axis
extent = [-1.0, 1.0, -1.0, 1.0]
grid_size = 41

# X[0] holds the x-coordinate and X[1] the y-coordinate of every grid point
x0_axis = np.linspace(extent[0],extent[1],grid_size)
x1_axis = np.linspace(extent[2],extent[3],grid_size)
X = np.meshgrid(x0_axis,x1_axis)
batch_size = grid_size**2

# the same grid as a (batch_size, 2) array with one (x0, x1) point per row
aₒ = np.zeros([batch_size,2])
for column,coordinate in enumerate(X):
    aₒ[:,column] = coordinate.flatten()

# Evaluate the target function on the grid; *X unpacks the two coordinate
# arrays (https://docs.python.org/3.7/tutorial/controlflow.html#unpacking-argument-lists)
result = cross(*X)

plt.imshow(result, extent=extent, origin='lower', interpolation='nearest',
           cmap='Spectral_r', rasterized=True)
plt.xlabel(r'$x_0$')
plt.ylabel(r'$x_1$');

#### Make Training Batches

In [None]:
def make_batch(n,batch_size,extent,func):
    '''Draw a random mini-batch of inputs and evaluate the target on it.

    Inputs:
    n         : number of neurons in each layer; only n[0], the input dimension, is used
    batch_size: the desired number of samples in the mini-batch
    extent    : [min(xₒ),max(xₒ), min(x₁),max(x₁),…,min(x_{n[0]-1}),max(x_{n[0]-1})]
    func      : the desired target function, called with one 1D array per input dimension

    Outputs: (x, y) where x has shape (batch_size, n[0]) and y is func evaluated
    on the columns of x with a trailing axis of length 1 appended.
    '''

    num_inputs = n[0]
    x = np.zeros([batch_size,num_inputs])

    # sample each input coordinate uniformly from its own [low, high) interval
    for dim in range(num_inputs):
        lower, upper = extent[2*dim], extent[2*dim+1]
        x[:,dim] = np.random.uniform(low=lower,high=upper,size=[batch_size])

    # the rows of x.T are the columns of x, i.e. one argument per input dimension
    targets = func(*x.T)

    # append an axis so that y is a matrix (and not a vector)
    y = targets[:,np.newaxis]

    return x,y

### Setup the deep neural network in `keras`

This is a `sequential` network, as it is made up of a single stack of layers in which each layer has exactly one input and one output.  I encourage you to use the extensive documentation available at [tensorflow.org](https://www.tensorflow.org/guide).  `keras` is the high-level API that makes it very easy to build and train neural networks.

In [None]:
# define network topology (architecture): number of neurons per layer, input layer first
n = [2,10,4,1]

# initialize the model
model = keras.Sequential()

# the input layer is treated differently: it only declares the shape of one sample
model.add(keras.Input(shape=(n[0],)))

# create an initializer for the weights: uniform in [-2, 2] with a fixed seed
# NOTE(review): this single seeded instance is reused for every kernel and every bias below.
# The Keras documentation states that an initializer seeded with an integer returns the same
# random values on each call, so the initial parameters may be correlated -- confirm this is intended.
initializer = tf.keras.initializers.RandomUniform(minval=-2.0,maxval=2.0, seed=42)

# construct and initialize the network: one sigmoid Dense layer for each entry of n after the input
for nℓ in n[1:]:
    model.add(layers.Dense(nℓ, activation='sigmoid',kernel_initializer=initializer,
                           bias_initializer=initializer))

# setup the properties of the 'model': stochastic gradient descent with
# learning rate η, minimizing the mean squared error
η = 0.9
SGD = keras.optimizers.SGD(learning_rate=η)
model.compile(loss='mean_squared_error', optimizer=SGD)

You can get a glimpse of your network via `model.summary()`

In [None]:
# show the textual summary keras generates for the network built above
model.summary()

Or in graph form

In [None]:
# collect the parameters of every layer as numpy arrays:
# layer.weights[0] goes into `weights`, layer.weights[1] into `biases`
weights, biases = [], []
for layer in model.layers:
    weights.append(layer.weights[0].numpy())
    biases.append(layer.weights[1].numpy())

ml4s.draw_network(n, weights=weights, biases=biases, zero_index=True)

#### Feed forward is just evaluating the model on data

In [None]:
# evaluate the current model on every grid point, then fold the flat list of
# outputs back onto the 2D plotting grid
network_output = model(aₒ).numpy()
aL = network_output.reshape(grid_size,grid_size)

plt.imshow(aL, cmap='Spectral_r', extent=extent, origin='lower', aspect='equal',
           interpolation='nearest', rasterized=True)
plt.xlabel(r'$x_0$')
plt.ylabel(r'$x_1$');

In [None]:
batch_size = 500
num_steps = 5000

# redraw the figure about 50 times over the whole run; the stride is clamped
# to 1 so that `j % plot_ratio` cannot divide by zero when num_steps < 50
plot_ratio = max(1, num_steps // 50)

# the target on the plotting grid never changes, so evaluate it only once
target = cross(*X)

# costs[j] stores the value returned by train_on_batch at step j
costs = np.zeros(num_steps)
for j in range(num_steps):

    # every step trains on a freshly drawn random mini-batch
    x,y = make_batch(n,batch_size,extent,cross)
    costs[j] = model.train_on_batch(x,y)

    # we plot every plot_ratio steps, and always after the final step
    if not j % plot_ratio or j == num_steps-1:

        aL = model(aₒ).numpy().reshape(X[0].shape)
        fig,ax = ml4s.plot_training_2D(aL,target,costs)


## We can also have keras train on auto-generated batches

#### 1. Load and process all the data; we will have `keras` make the train/test split

In [None]:
# draw 10000 random points in the plotting domain together with their cross() labels;
# make_batch only reads the first entry (the input dimension) of the layer list
x,y = make_batch(n=[2,1], batch_size=10000, extent=extent, func=cross)

<!-- The following code is used to interrogate our model while training; it is not needed
!rm -rf ./logs/
log_dir = "logs/fit/" + datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir=log_dir, histogram_freq=1)
-->

#### 2. Setup our model and define the architecture

We can auto-generate the input layer using `input_shape` in our first hidden layer.  A list of possible activation functions can be found [here](https://www.tensorflow.org/api_docs/python/tf/keras/activations).

In [None]:
# assemble the network one layer at a time: 2 inputs → 4 → 20 → 4 → 1 output
model = keras.Sequential()

# input_shape on the first hidden layer stands in for an explicit input layer
model.add(layers.Dense(4,input_shape=(2,),activation='relu'))
model.add(layers.Dense(20, activation='relu'))
model.add(layers.Dense(4, activation='relu'))

# a single sigmoid output neuron
model.add(layers.Dense(1, activation='sigmoid'))

#### 3. Choose the cost function (loss) and optimizer

In [None]:
# adam optimizer minimizing the mean squared error
model.compile(optimizer='adam', loss='mean_squared_error')

#### 4. Train the model

In [None]:
# every training run in this notebook stores its fit() result here under a descriptive key
training_history = dict()

# 30 epochs, with 20% of the samples set aside by keras for validation
training_history['adam'] = model.fit(
    x=x,
    y=y,
    validation_split=0.2,
    epochs=30,
)

#### 5. Study the success of the model; make sure to include performance on the unseen validation data.

In [None]:
# three panels: cost curves | model prediction on the grid | target function
fig,ax = plt.subplots(ncols=3,nrows=1,figsize=(10,4))
cost_ax, prediction_ax, target_ax = ax

# both images share the same domain, colormap and color scale
image_style = dict(extent=extent, cmap='Spectral_r', rasterized=True,
                   interpolation='nearest', origin='lower', aspect='equal',
                   vmin=0, vmax=1)

# left: training and validation cost; each legend entry quotes the average over the last 10 epochs
adam_history = training_history['adam'].history
train_curve = adam_history['loss']
test_curve = adam_history["val_loss"]
cost_ax.plot(train_curve, label=f'train = {np.average(train_curve[-10:]):.2g}')
cost_ax.plot(test_curve, label=f'test = {np.average(test_curve[-10:]):.2g}')
cost_ax.legend()
cost_ax.set_title("Cost")
cost_ax.set_xlabel("Epoch")

# middle: the trained model evaluated on the plotting grid
aL = model(aₒ).numpy().reshape(grid_size,grid_size)
prediction_ax.axis('off')
img = prediction_ax.imshow(aL, **image_style)
prediction_ax.set_title("Model Prediction")

# right: the function the model is supposed to reproduce
target_ax.axis('off')
target_ax.imshow(cross(*X), **image_style)
target_ax.set_title("Target")

#### 6. Optimize performance for the data set by changing hyperparameters

Let's investigate the difference between SGD and adam for the optimizer

In [None]:
def generate_model():
    '''Build a new, uncompiled dense network 2 → 4 → 20 → 4 → 1.

    The three hidden layers use relu activations and the single output
    neuron uses a sigmoid.  Every call returns a separate keras.Sequential
    instance, so each training run can start from its own model.
    '''
    network = keras.Sequential()

    # the first hidden layer also declares the 2D input via input_shape
    network.add(layers.Dense(4,input_shape=(2,),activation='relu'))

    # remaining hidden layers
    for width in (20, 4):
        network.add(layers.Dense(width, activation='relu'))

    # output layer
    network.add(layers.Dense(1, activation='sigmoid'))
    return network

In [None]:
# learning rates to scan (η now names the whole list of rates)
η = [0.0001,0.001,0.01,0.05,0.1,1]

for cη in tqdm(η):
    # key under which this run is stored in training_history
    run_label = f'SGD η = {cη}'

    # start every run from a newly built model
    model = generate_model()
    SGD = keras.optimizers.SGD(learning_rate=cη)
    model.compile(optimizer=SGD, loss='mean_squared_error')

    training_history[run_label] = model.fit(x=x, y=y, validation_split=0.2,
                                            epochs=30, verbose=0)

In [None]:
epochs = np.arange(30)

# validation curves (dashed) are drawn half an epoch to the right of the training curves (solid)
val_epochs = epochs + 0.5

# adam reference run in gray
adam_run = training_history['adam'].history
plt.plot(epochs, adam_run["loss"], color='gray', linestyle='-', label='adam')
plt.plot(val_epochs, adam_run["val_loss"], color='gray', linestyle='--')

# one color per SGD learning rate
for i,cη in enumerate(η):
    sgd_label = f'SGD η = {cη}'
    sgd_run = training_history[sgd_label].history
    plt.plot(epochs, sgd_run["loss"], color=colors[i], linestyle='-', label=sgd_label)
    plt.plot(val_epochs, sgd_run["val_loss"], color=colors[i], linestyle='--')

plt.legend(loc=(1,0.3))
plt.ylabel("Cost")
plt.xlabel("Epoch")

Or we can compare different cost functions

In [None]:
# new, untrained copy of the relu network (2 → 4 → 20 → 4 → 1); generate_model()
# builds exactly the architecture that used to be spelled out here
model = generate_model()

In [None]:
# same optimizer as the reference run, but minimizing the mean absolute error
model.compile(optimizer='adam', loss='mean_absolute_error')

training_history['absolute'] = model.fit(x=x, y=y, validation_split=0.2,
                                         epochs=30, verbose=0)

In [None]:
# solid lines: training cost; dashed lines: validation cost, drawn half an epoch later
mse_run = training_history['adam'].history
mae_run = training_history['absolute'].history
val_epochs = epochs + 0.5

plt.plot(epochs, mse_run["loss"], color=colors[0], linestyle='-', label='MSE')
plt.plot(val_epochs, mse_run["val_loss"], color=colors[0], linestyle='--')

plt.plot(epochs, mae_run["loss"], color=colors[-1], linestyle='-', label='MAE')
plt.plot(val_epochs, mae_run["val_loss"], color=colors[-1], linestyle='--')

plt.legend()
plt.ylabel("Cost")
plt.xlabel("Epoch")

Or different activation functions.

In [None]:
# same layer sizes as the relu network (2 → 4 → 20 → 4 → 1), but with a
# sigmoid activation on every layer
model = keras.Sequential(
    [
        layers.Dense(4,input_shape=(2,),activation='sigmoid'),
        layers.Dense(20, activation='sigmoid'),
        layers.Dense(4, activation='sigmoid'),
        layers.Dense(1, activation='sigmoid')
    ])

model.compile(loss='mean_squared_error', optimizer='adam')

# use the same 80/20 train/validation split as the relu ('adam') run this is
# compared against, so both runs see the same amount of training data
training_history['sigmoid'] = model.fit(x=x,y=y, epochs=30,validation_split=0.2, verbose=0)

In [None]:
# solid lines: training cost; dashed lines: validation cost.
# The validation curves are shifted by half an epoch, matching the other
# comparison plots in this notebook.
plt.plot(epochs,training_history['adam'].history["loss"], label='relu', color=colors[0], linestyle='-')
plt.plot(epochs+0.5,training_history['adam'].history["val_loss"], color=colors[0], linestyle='--')

plt.plot(epochs,training_history['sigmoid'].history["loss"], label='sigmoid', color=colors[-1], linestyle='-')
plt.plot(epochs+0.5,training_history['sigmoid'].history["val_loss"], color=colors[-1], linestyle='--')

plt.legend()
plt.ylabel("Cost")
plt.xlabel("Epoch")