# Initialization

## Import Libraries

In [1]:
# data processing
import time
import warnings
from collections import defaultdict

import numpy as np
import pandas as pd 

# data visualization
import seaborn as sns
%config InlineBackend.figure_format = 'retina'
%matplotlib inline
from matplotlib import pyplot as plt
from matplotlib import style
sns.set()
import urllib.request


import tensorflow as tf
print(tf.config.list_physical_devices('GPU'))
from tensorflow import keras
from tensorflow.keras.callbacks import ModelCheckpoint, LearningRateScheduler
from tensorflow.keras.callbacks import ReduceLROnPlateau, CSVLogger, EarlyStopping

# Notebook auto reloads code. (Ref: http://stackoverflow.com/questions/1907993/autoreload-of-modules-in-ipython)
%load_ext autoreload
%autoreload 2

[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]


2021-12-12 03:01:07.900997: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:939] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2021-12-12 03:01:07.911932: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:939] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2021-12-12 03:01:07.912610: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:939] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero


## Import Custom Module

In [2]:
# Adjust the working directory so that it is the folder
# "recreating-residual-attention-network" (the `src` package and the relative
# `data/...` paths used below are resolved from there).
import os

# The default is the original author's machine; set the RAN_PROJECT_PATH
# environment variable to point at another checkout instead of editing this cell.
PROJECT_PATH = os.environ.get('RAN_PROJECT_PATH',
                              '/home/ecbm4040/recreating-residual-attention-network')

if os.path.isdir(PROJECT_PATH):
    os.chdir(PROJECT_PATH)
else:
    # Do not abort the whole notebook here (e.g. on Colab, where a later cell
    # sets the directory); just make the situation visible.
    print("PROJECT_PATH %r not found; working directory left at %r"
          % (PROJECT_PATH, os.getcwd()))

In [None]:
# If using Google Colab: mount Drive and switch to the project folder.
# Outside Colab the `google.colab` package is unavailable, so the cell becomes
# a no-op instead of raising ImportError during "Run All".
import os

try:
    from google.colab import drive
except ImportError:
    drive = None

if drive is not None:
    BASE_PATH = '/content/drive'
    drive.mount(BASE_PATH)

    # change directory
    PROJECT_PATH = os.path.join(BASE_PATH, "MyDrive", "ECBM4040", "FinalProject", "recreating-residual-attention-network")
    os.chdir(PROJECT_PATH)

In [3]:
# Import created modules
from src.models.ResidualAttentionNetwork import ResidualAttentionNetwork, Attention56, Attention92, Attention128, Attention164
from src.utils import generate_data

2021-12-12 03:01:08.253039: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2021-12-12 03:01:08.253581: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:939] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2021-12-12 03:01:08.254400: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:939] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2021-12-12 03:01:08.255057: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:939] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA 

## Custom Callbacks

In [4]:
### Callback to save per epoch time
class TimeHistory(tf.keras.callbacks.Callback):
    """Record the wall-clock duration of every training epoch.

    The duration in seconds is appended to ``self.times`` and, when Keras
    passes a ``logs`` dict, also stored under ``logs['per_epoch_time']`` so
    that a logger placed after this callback in the callback list can write it.
    """

    def on_train_begin(self, logs=None):
        # Start each training run with an empty list of durations.
        self.times = []

    def on_epoch_begin(self, epoch, logs=None):
        self.epoch_time_start = time.time()

    def on_epoch_end(self, epoch, logs=None):
        # Measure once so the stored and the logged values are identical.
        elapsed = time.time() - self.epoch_time_start
        self.times.append(elapsed)
        # Save epoch time directly on the csv_logger
        if logs is not None:
            logs['per_epoch_time'] = elapsed

### Callback to stop when certain accuracy is reached
class EarlyStoppingByAccuracy(tf.keras.callbacks.Callback):
    """Stop training as soon as a monitored metric reaches a threshold.

    Args:
        monitor: key looked up in the per-epoch ``logs`` dict.
        value: training stops once ``logs[monitor] >= value``.
        verbose: if greater than 0, print a message when stopping.
    """

    def __init__(self, monitor='accuracy', value=0.98, verbose=0):
        # Plain super() so that the Keras Callback initializer actually runs.
        super().__init__()
        self.monitor = monitor
        self.value = value
        self.verbose = verbose

    def on_epoch_end(self, epoch, logs=None):
        current = (logs or {}).get(self.monitor)
        if current is None:
            warnings.warn("Early stopping requires %s available!" % self.monitor, RuntimeWarning)
            # Nothing to compare against; skip the threshold check this epoch.
            return

        if current >= self.value:
            if self.verbose > 0:
                print("Epoch %05d: early stopping THR" % epoch)
            self.model.stop_training = True

# Modelling CIFAR-10

In [5]:
# Load CIFAR-10 through the project helper. The call yields six values; the
# sixth is discarded.
# NOTE(review): presumably the three datasets are unbatched tf.data pipelines
# (they are batched in a later cell) — confirm in src/utils/generate_data.
INPUT_SHAPE, NUM_CLASS, train_ds, val_ds, test_ds, _ = generate_data.get_cifar10()

In [6]:
# Mini-batch size used when batching the datasets; the shuffle buffer is
# derived from it as BATCH_SIZE*4.
BATCH_SIZE = 256

In [7]:
# Input pipelines: batch and prefetch every split, but shuffle only the
# training split. Shuffling validation/test data does not change aggregate
# metrics, costs time, and makes the example order differ between passes
# (which breaks any later pairing of predictions with labels).
# NOTE: this cell rebinds train_ds/val_ds/test_ds, so run it only once per
# kernel session — re-running it would batch the already batched datasets.
train_ds = train_ds.shuffle(buffer_size=BATCH_SIZE*4).batch(BATCH_SIZE).prefetch(buffer_size=tf.data.AUTOTUNE)
val_ds = val_ds.batch(BATCH_SIZE).prefetch(buffer_size=tf.data.AUTOTUNE)
test_ds = test_ds.batch(BATCH_SIZE).prefetch(buffer_size=tf.data.AUTOTUNE)

In [8]:
# Length of the batched training dataset, i.e. batches per epoch. The learning
# rate schedules below multiply epoch numbers by this value.
STEP_SIZE = len(train_ds)

## Naive Attention Learning

In this part, the experiments are run using the Naive Attention Learning (NAL) mechanism (`learning_type='nal'`).

### Attention-56

In [9]:
# Number of training epochs and base learning rate for this run.
N_EPOCH, LR = 100, 1e-3

In [10]:
# Attention-56 with naive attention learning. The model is called once on a
# symbolic input before summary() is printed.
ran_model = Attention56(
    input_shape=INPUT_SHAPE,
    num_class=NUM_CLASS,
    learning_type='nal',
)
inputs = tf.keras.Input(shape=INPUT_SHAPE)
ran_model(inputs)
ran_model.summary()

Model: "attention56"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 sequential_1 (Sequential)   (None, 32, 32, 3)         0         
                                                                 
 conv2d (Conv2D)             multiple                  864       
                                                                 
 batch_normalization (BatchN  multiple                 128       
 ormalization)                                                   
                                                                 
 re_lu (ReLU)                multiple                  0         
                                                                 
 max_pooling2d (MaxPooling2D  multiple                 0         
 )                                                               
                                                                 
 residual_unit (ResidualUnit  multiple                 

In [11]:
## Callbacks
# Per-epoch wall-clock timer.
time_callback = TimeHistory()

# Weights-only checkpoint and per-epoch CSV log for this run.
model_checkpointer = ModelCheckpoint(
    filepath="data/saved_weights/1-a56-nal-cifar10.h5",
    save_weights_only=True,
    save_best_only=False,
    verbose=0,
)
csv_logger = CSVLogger('data/logs/1-a56-nal-cifar10.csv')

# The timer comes before the CSV logger in the list.
callbacks = [model_checkpointer, time_callback, csv_logger]

## Learning rate scheduler
# Piecewise-constant decay; boundaries are epoch numbers times STEP_SIZE.
# NOTE(review): with N_EPOCH = 100 only the epoch-80 boundary lies inside the
# run — confirm the later stages are intentional.
lr_schedule = tf.keras.optimizers.schedules.PiecewiseConstantDecay(
    boundaries=[epoch * STEP_SIZE for epoch in (80, 120, 160, 180)],
    values=[scale * LR for scale in (1, 0.1, 1e-2, 1e-3, 0.5e-3)],
)

our_optimizer = tf.keras.optimizers.Adam(learning_rate=lr_schedule)

In [12]:
# Give the freshly built network a run-specific name, then compile and train.
a56_nal_model = ran_model

a56_nal_model.compile(
    optimizer=our_optimizer,
    loss=tf.keras.losses.CategoricalCrossentropy(),
    metrics=['accuracy'],
)

history = a56_nal_model.fit(
    train_ds,
    epochs=N_EPOCH,
    validation_data=val_ds,
    callbacks=callbacks,
    verbose=1,
)

Epoch 1/100


2021-12-10 01:24:13.362181: I tensorflow/stream_executor/cuda/cuda_dnn.cc:366] Loaded cuDNN version 8200


Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78/100
Epoch 7

Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100


### Attention-92

Gradient clipping is added to the optimizer to avoid exploding gradients.

In [9]:
# Number of training epochs and base learning rate for this run.
N_EPOCH, LR = 100, 1e-3

In [10]:
# Attention-92 with naive attention learning. The model is called once on a
# symbolic input before summary() is printed.
ran_model = Attention92(
    input_shape=INPUT_SHAPE,
    num_class=NUM_CLASS,
    learning_type='nal',
)
inputs = tf.keras.Input(shape=INPUT_SHAPE)
ran_model(inputs)
ran_model.summary()

Model: "attention92"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 sequential_2 (Sequential)   (None, 32, 32, 3)         0         
                                                                 
 conv2d (Conv2D)             multiple                  864       
                                                                 
 batch_normalization (BatchN  multiple                 128       
 ormalization)                                                   
                                                                 
 re_lu (ReLU)                multiple                  0         
                                                                 
 max_pooling2d (MaxPooling2D  multiple                 0         
 )                                                               
                                                                 
 residual_unit (ResidualUnit  multiple                 

In [11]:
## Callbacks
# Per-epoch wall-clock timer.
time_callback = TimeHistory()

# Weights-only checkpoint and per-epoch CSV log for this run.
model_checkpointer = ModelCheckpoint(
    filepath="data/saved_weights/2-a92-nal-cifar10.h5",
    save_weights_only=True,
    save_best_only=False,
    verbose=0,
)
csv_logger = CSVLogger('data/logs/2-a92-nal-cifar10.csv')

# The timer comes before the CSV logger in the list.
callbacks = [model_checkpointer, time_callback, csv_logger]

## Learning rate scheduler
# Piecewise-constant decay; boundaries are epoch numbers times STEP_SIZE.
# NOTE(review): with N_EPOCH = 100 only the epoch-80 boundary lies inside the
# run — confirm the later stages are intentional.
lr_schedule = tf.keras.optimizers.schedules.PiecewiseConstantDecay(
    boundaries=[epoch * STEP_SIZE for epoch in (80, 120, 160, 180)],
    values=[scale * LR for scale in (1, 0.1, 1e-2, 1e-3, 0.5e-3)],
)

# Adam with gradient clipping (clipvalue) against exploding gradients.
our_optimizer = tf.keras.optimizers.Adam(
    learning_rate=lr_schedule,
    clipvalue=0.1,
)

In [12]:
# Give the freshly built network a run-specific name, then compile and train.
a92_nal_model = ran_model

a92_nal_model.compile(
    optimizer=our_optimizer,
    loss=tf.keras.losses.CategoricalCrossentropy(),
    metrics=['accuracy'],
)

history = a92_nal_model.fit(
    train_ds,
    epochs=N_EPOCH,
    validation_data=val_ds,
    callbacks=callbacks,
    verbose=1,
)

Epoch 1/100


2021-12-12 03:02:26.889521: I tensorflow/stream_executor/cuda/cuda_dnn.cc:366] Loaded cuDNN version 8200


Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100


Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78/100
Epoch 79/100
Epoch 80/100
Epoch 81/100
Epoch 82/100
Epoch 83/100
Epoch 84/100
Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100


Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100


### Attention-128

In [34]:
# Number of training epochs and base learning rate for this run.
N_EPOCH, LR = 100, 1e-3

In [35]:
# Attention-128 with naive attention learning. The model is called once on a
# symbolic input before summary() is printed.
ran_model = Attention128(
    input_shape=INPUT_SHAPE,
    num_class=NUM_CLASS,
    learning_type='nal',
)
inputs = tf.keras.Input(shape=INPUT_SHAPE)
ran_model(inputs)
ran_model.summary()

Model: "attention128_4"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 sequential_3 (Sequential)   (None, 32, 32, 3)         0         
                                                                 
 conv2d_1974 (Conv2D)        multiple                  864       
                                                                 
 batch_normalization_1512 (B  multiple                 128       
 atchNormalization)                                              
                                                                 
 re_lu_1512 (ReLU)           multiple                  0         
                                                                 
 max_pooling2d_102 (MaxPooli  multiple                 0         
 ng2D)                                                           
                                                                 
 residual_unit_468 (Residual  multiple              

In [36]:
## Callbacks
# Per-epoch wall-clock timer.
time_callback = TimeHistory()

# Weights-only checkpoint and per-epoch CSV log for this run.
model_checkpointer = ModelCheckpoint(
    filepath="data/saved_weights/3-a128-nal-cifar10.h5",
    save_weights_only=True,
    save_best_only=False,
    verbose=0,
)
csv_logger = CSVLogger('data/logs/3-a128-nal-cifar10.csv')

# The timer comes before the CSV logger in the list.
callbacks = [model_checkpointer, time_callback, csv_logger]

## Learning rate scheduler
# Piecewise-constant decay; boundaries are epoch numbers times STEP_SIZE.
# NOTE(review): with N_EPOCH = 100 only the epoch-80 boundary lies inside the
# run — confirm the later stages are intentional.
lr_schedule = tf.keras.optimizers.schedules.PiecewiseConstantDecay(
    boundaries=[epoch * STEP_SIZE for epoch in (80, 120, 160, 180)],
    values=[scale * LR for scale in (1, 0.1, 1e-2, 1e-3, 0.5e-3)],
)

# Adam with gradient clipping (clipvalue).
our_optimizer = tf.keras.optimizers.Adam(
    learning_rate=lr_schedule,
    clipvalue=0.01,
)

In [37]:
# Give the freshly built network a run-specific name, then compile and train.
a128_nal_model = ran_model

a128_nal_model.compile(
    optimizer=our_optimizer,
    loss=tf.keras.losses.CategoricalCrossentropy(),
    metrics=['accuracy'],
)

history = a128_nal_model.fit(
    train_ds,
    epochs=N_EPOCH,
    validation_data=val_ds,
    callbacks=callbacks,
    verbose=1,
)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100


Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78/100
Epoch 79/100
Epoch 80/100
Epoch 81/100
Epoch 82/100
Epoch 83/100
Epoch 84/100
Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100


Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100


### Attention-164

In [None]:
# Number of training epochs and base learning rate for this run.
N_EPOCH, LR = 100, 1e-3

In [None]:
# Attention-164 with naive attention learning. The model is called once on a
# symbolic input before summary() is printed.
ran_model = Attention164(
    input_shape=INPUT_SHAPE,
    num_class=NUM_CLASS,
    learning_type='nal',
)
inputs = tf.keras.Input(shape=INPUT_SHAPE)
ran_model(inputs)
ran_model.summary()

In [None]:
## Callbacks
# Per-epoch wall-clock timer.
time_callback = TimeHistory()

# Weights-only checkpoint and per-epoch CSV log for this run.
# NOTE(review): the file names say "a192" although this section trains
# Attention-164 — confirm before renaming, other notebooks may load these paths.
model_checkpointer = ModelCheckpoint(
    filepath="data/saved_weights/4-a192-nal-cifar10.h5",
    save_weights_only=True,
    save_best_only=False,
    verbose=0,
)
csv_logger = CSVLogger('data/logs/4-a192-nal-cifar10.csv')

# The timer comes before the CSV logger in the list.
callbacks = [model_checkpointer, time_callback, csv_logger]

## Learning rate scheduler
# Piecewise-constant decay; boundaries are epoch numbers times STEP_SIZE.
# NOTE(review): with N_EPOCH = 100 only the epoch-80 boundary lies inside the
# run — confirm the later stages are intentional.
lr_schedule = tf.keras.optimizers.schedules.PiecewiseConstantDecay(
    boundaries=[epoch * STEP_SIZE for epoch in (80, 120, 160, 180)],
    values=[scale * LR for scale in (1, 0.1, 1e-2, 1e-3, 0.5e-3)],
)

# Adam with gradient clipping (clipvalue).
our_optimizer = tf.keras.optimizers.Adam(
    learning_rate=lr_schedule,
    clipvalue=0.001,
)

In [None]:
# Give the freshly built network a run-specific name, then compile and train.
# NOTE(review): the variable is called "a192" although the model built for
# this section is Attention164 — confirm the intended name.
a192_nal_model = ran_model

a192_nal_model.compile(
    optimizer=our_optimizer,
    loss=tf.keras.losses.CategoricalCrossentropy(),
    metrics=['accuracy'],
)

history = a192_nal_model.fit(
    train_ds,
    epochs=N_EPOCH,
    validation_data=val_ds,
    callbacks=callbacks,
    verbose=1,
)

## Attention-Residual Learning

In this part, the experiments are run using the Attention Residual Learning (ARL) mechanism; the models are built without overriding `learning_type`.

### Attention-56

In [13]:
# Number of training epochs and base learning rate for this run.
N_EPOCH, LR = 100, 1e-3

In [14]:
# Attention-56 with the model's default learning type. The model is called
# once on a symbolic input before summary() is printed.
ran_model = Attention56(
    input_shape=INPUT_SHAPE,
    num_class=NUM_CLASS,
)
inputs = tf.keras.Input(shape=INPUT_SHAPE)
ran_model(inputs)
ran_model.summary()

Model: "attention56_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 sequential_1 (Sequential)   (None, 32, 32, 3)         0         
                                                                 
 conv2d_139 (Conv2D)         multiple                  864       
                                                                 
 batch_normalization_107 (Ba  multiple                 128       
 tchNormalization)                                               
                                                                 
 re_lu_107 (ReLU)            multiple                  0         
                                                                 
 max_pooling2d_7 (MaxPooling  multiple                 0         
 2D)                                                             
                                                                 
 residual_unit_33 (ResidualU  multiple               

In [15]:
## Callbacks
# Per-epoch wall-clock timer.
time_callback = TimeHistory()

# Weights-only checkpoint and per-epoch CSV log for this run.
model_checkpointer = ModelCheckpoint(
    filepath="data/saved_weights/1-a56-cifar10.h5",
    save_weights_only=True,
    save_best_only=False,
    verbose=0,
)
csv_logger = CSVLogger('data/logs/1-a56-cifar10.csv')

# The timer comes before the CSV logger in the list.
callbacks = [model_checkpointer, time_callback, csv_logger]

## Learning rate scheduler
# Piecewise-constant decay; boundaries are epoch numbers times STEP_SIZE.
# NOTE(review): with N_EPOCH = 100 only the epoch-80 boundary lies inside the
# run — confirm the later stages are intentional.
lr_schedule = tf.keras.optimizers.schedules.PiecewiseConstantDecay(
    boundaries=[epoch * STEP_SIZE for epoch in (80, 120, 160, 180)],
    values=[scale * LR for scale in (1, 0.1, 1e-2, 1e-3, 0.5e-3)],
)

our_optimizer = tf.keras.optimizers.Adam(learning_rate=lr_schedule)

In [16]:
# Give the freshly built network a run-specific name, then compile and train.
a56_model = ran_model

a56_model.compile(
    optimizer=our_optimizer,
    loss=tf.keras.losses.CategoricalCrossentropy(),
    metrics=['accuracy'],
)

history = a56_model.fit(
    train_ds,
    epochs=N_EPOCH,
    validation_data=val_ds,
    callbacks=callbacks,
    verbose=1,
)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100


### Attention-92

Gradient clipping is added to the optimizer to avoid exploding gradients.

In [17]:
# Number of training epochs and base learning rate for this run.
N_EPOCH, LR = 100, 1e-3

In [18]:
# Attention-92 with the model's default learning type. The model is called
# once on a symbolic input before summary() is printed.
ran_model = Attention92(
    input_shape=INPUT_SHAPE,
    num_class=NUM_CLASS,
)
inputs = tf.keras.Input(shape=INPUT_SHAPE)
ran_model(inputs)
ran_model.summary()

Model: "attention92_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 sequential_2 (Sequential)   (None, 32, 32, 3)         0         
                                                                 
 conv2d_506 (Conv2D)         multiple                  864       
                                                                 
 batch_normalization_388 (Ba  multiple                 128       
 tchNormalization)                                               
                                                                 
 re_lu_388 (ReLU)            multiple                  0         
                                                                 
 max_pooling2d_26 (MaxPoolin  multiple                 0         
 g2D)                                                            
                                                                 
 residual_unit_120 (Residual  multiple               

In [15]:
## Callbacks
# Per-epoch wall-clock timer.
time_callback = TimeHistory()

# Weights-only checkpoint and per-epoch CSV log for this run.
model_checkpointer = ModelCheckpoint(
    filepath="data/saved_weights/2-a92-cifar10.h5",
    save_weights_only=True,
    save_best_only=False,
    verbose=0,
)
csv_logger = CSVLogger('data/logs/2-a92-cifar10.csv')

# The timer comes before the CSV logger in the list.
callbacks = [model_checkpointer, time_callback, csv_logger]

## Learning rate scheduler
# Piecewise-constant decay; boundaries are epoch numbers times STEP_SIZE.
# NOTE(review): with N_EPOCH = 100 only the epoch-80 boundary lies inside the
# run — confirm the later stages are intentional.
lr_schedule = tf.keras.optimizers.schedules.PiecewiseConstantDecay(
    boundaries=[epoch * STEP_SIZE for epoch in (80, 120, 160, 180)],
    values=[scale * LR for scale in (1, 0.1, 1e-2, 1e-3, 0.5e-3)],
)

# Adam with gradient clipping (clipvalue) against exploding gradients.
our_optimizer = tf.keras.optimizers.Adam(
    learning_rate=lr_schedule,
    clipvalue=0.1,
)

In [16]:
# Give the freshly built network a run-specific name, then compile and train.
a92_model = ran_model

a92_model.compile(
    optimizer=our_optimizer,
    loss=tf.keras.losses.CategoricalCrossentropy(),
    metrics=['accuracy'],
)

history = a92_model.fit(
    train_ds,
    epochs=N_EPOCH,
    validation_data=val_ds,
    callbacks=callbacks,
    verbose=1,
)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100


Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78/100
Epoch 79/100
Epoch 80/100
Epoch 81/100
Epoch 82/100
Epoch 83/100
Epoch 84/100
Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100


Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100


### Attention-128

In [38]:
# Number of training epochs and base learning rate for this run.
N_EPOCH, LR = 100, 1e-3

In [39]:
# Attention-128 with the model's default learning type. The model is called
# once on a symbolic input before summary() is printed.
ran_model = Attention128(
    input_shape=INPUT_SHAPE,
    num_class=NUM_CLASS,
)
inputs = tf.keras.Input(shape=INPUT_SHAPE)
ran_model(inputs)
ran_model.summary()

Model: "attention128_5"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 sequential_3 (Sequential)   (None, 32, 32, 3)         0         
                                                                 
 conv2d_2341 (Conv2D)        multiple                  864       
                                                                 
 batch_normalization_1793 (B  multiple                 128       
 atchNormalization)                                              
                                                                 
 re_lu_1793 (ReLU)           multiple                  0         
                                                                 
 max_pooling2d_121 (MaxPooli  multiple                 0         
 ng2D)                                                           
                                                                 
 residual_unit_555 (Residual  multiple              

In [40]:
## Callbacks
# Per-epoch wall-clock timer.
time_callback = TimeHistory()

# Weights-only checkpoint and per-epoch CSV log for this run.
model_checkpointer = ModelCheckpoint(
    filepath="data/saved_weights/3-a128-cifar10.h5",
    save_weights_only=True,
    save_best_only=False,
    verbose=0,
)
csv_logger = CSVLogger('data/logs/3-a128-cifar10.csv')

# The timer comes before the CSV logger in the list.
callbacks = [model_checkpointer, time_callback, csv_logger]

## Learning rate scheduler
# Piecewise-constant decay; boundaries are epoch numbers times STEP_SIZE.
# NOTE(review): with N_EPOCH = 100 only the epoch-80 boundary lies inside the
# run — confirm the later stages are intentional.
lr_schedule = tf.keras.optimizers.schedules.PiecewiseConstantDecay(
    boundaries=[epoch * STEP_SIZE for epoch in (80, 120, 160, 180)],
    values=[scale * LR for scale in (1, 0.1, 1e-2, 1e-3, 0.5e-3)],
)

# Adam with gradient clipping (clipvalue).
our_optimizer = tf.keras.optimizers.Adam(
    learning_rate=lr_schedule,
    clipvalue=0.01,
)

In [41]:
# Give the freshly built network a run-specific name, then compile and train.
a128_model = ran_model

a128_model.compile(
    optimizer=our_optimizer,
    loss=tf.keras.losses.CategoricalCrossentropy(),
    metrics=['accuracy'],
)

history = a128_model.fit(
    train_ds,
    epochs=N_EPOCH,
    validation_data=val_ds,
    callbacks=callbacks,
    verbose=1,
)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100


Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78/100
Epoch 79/100
Epoch 80/100
Epoch 81/100
Epoch 82/100
Epoch 83/100
Epoch 84/100
Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100


Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100


### Attention-164

In [None]:
# Number of training epochs and base learning rate for this run.
N_EPOCH, LR = 100, 1e-3

In [14]:
# Attention-164 with the model's default learning type. The model is called
# once on a symbolic input before summary() is printed.
ran_model = Attention164(
    input_shape=INPUT_SHAPE,
    num_class=NUM_CLASS,
)
inputs = tf.keras.Input(shape=INPUT_SHAPE)
ran_model(inputs)
ran_model.summary()

Model: "attention164"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 sequential_4 (Sequential)   (None, 32, 32, 3)         0         
                                                                 
 conv2d_759 (Conv2D)         multiple                  864       
                                                                 
 batch_normalization_582 (Ba  multiple                 128       
 tchNormalization)                                               
                                                                 
 re_lu_582 (ReLU)            multiple                  0         
                                                                 
 max_pooling2d_39 (MaxPoolin  multiple                 0         
 g2D)                                                            
                                                                 
 residual_unit_180 (Residual  multiple                

In [None]:
## Callbacks
# Per-epoch wall-clock timer.
time_callback = TimeHistory()

# Weights-only checkpoint and per-epoch CSV log for this run.
# NOTE(review): the file names say "a192" although this section trains
# Attention-164 — confirm before renaming, other notebooks may load these paths.
model_checkpointer = ModelCheckpoint(
    filepath="data/saved_weights/4-a192-cifar10.h5",
    save_weights_only=True,
    save_best_only=False,
    verbose=0,
)
csv_logger = CSVLogger('data/logs/4-a192-cifar10.csv')

# The timer comes before the CSV logger in the list.
callbacks = [model_checkpointer, time_callback, csv_logger]

## Learning rate scheduler
# Piecewise-constant decay; boundaries are epoch numbers times STEP_SIZE.
# NOTE(review): with N_EPOCH = 100 only the epoch-80 boundary lies inside the
# run — confirm the later stages are intentional.
lr_schedule = tf.keras.optimizers.schedules.PiecewiseConstantDecay(
    boundaries=[epoch * STEP_SIZE for epoch in (80, 120, 160, 180)],
    values=[scale * LR for scale in (1, 0.1, 1e-2, 1e-3, 0.5e-3)],
)

# Adam with gradient clipping (clipvalue).
our_optimizer = tf.keras.optimizers.Adam(
    learning_rate=lr_schedule,
    clipvalue=0.001,
)

In [17]:
# Give the freshly built network a run-specific name, then compile and train.
a164_model = ran_model

a164_model.compile(optimizer=our_optimizer, 
                   loss=tf.keras.losses.CategoricalCrossentropy(), 
                   metrics=['accuracy'])


# Pass the callbacks prepared for this run, as every other experiment in this
# notebook does; without them no checkpoint, timing or CSV log is written.
history = a164_model.fit(train_ds,
                         validation_data=val_ds,
                         callbacks=callbacks,
                         epochs=N_EPOCH, verbose=1)

Epoch 1/10
Epoch 2/10
Epoch 3/10


KeyboardInterrupt: ignored