In [1]:
import tensorflow as tf 
import tensorflow.keras as keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Flatten
from tensorflow.keras.optimizers import Adagrad, Adadelta, RMSprop, Adam, Nadam
from tensorflow.keras.utils import to_categorical
import matplotlib.pyplot as plt
import numpy as np
import time
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '0' 

from pyimagesearch.minigooglenet import MiniGoogLeNet
from pyimagesearch.clr_callback import CyclicLR
from pyimagesearch import config
from pyimagesearch.learningratefinder import LearningRateFinder
from sklearn.preprocessing import LabelBinarizer
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.optimizers import SGD
from tensorflow.keras.datasets import fashion_mnist
import cv2

## 2.3

In [2]:
img_witdth, img_height = 32, 32

((trainX, trainY), (testX, testY)) = fashion_mnist.load_data()
# ((trainX, trainY), (testX, testY)) = cifar10.load_data()
trainX = trainX.astype("float")
testX = testX.astype("float")

# apply mean subtraction to the data
mean = np.mean(trainX, axis=0)
trainX -= mean
testX -= mean

# Fashion MNIST images are 28x28 but the network we will be training
# is expecting 32x32 images
trainX = np.array([cv2.resize(x, (img_witdth, img_height)) for x in trainX])
testX = np.array([cv2.resize(x, (img_witdth, img_height)) for x in testX])

# scale the pixel intensities to the range [0, 1]
trainX = trainX.astype("float") / 255.0
testX = testX.astype("float") / 255.0


# reshape the data matrices to include a channel dimension (required
# for training)

trainX = trainX.reshape((trainX.shape[0], img_witdth, img_height, 1))
testX = testX.reshape((testX.shape[0], img_witdth, img_height, 1))


# convert the labels from integers to vectors
lb = LabelBinarizer()
trainY = lb.fit_transform(trainY)
testY = lb.transform(testY)

# construct the image generator for data augmentation
aug = ImageDataGenerator(
    width_shift_range=0.1,
    height_shift_range=0.1,
    horizontal_flip=True,
    fill_mode="nearest",
)

In [3]:
train_size = trainX.shape[0]
train_size

60000

In [4]:
# train_x_dataset = tf.data.Dataset.from_tensor_slices(trainX)
# train_y_dataset = tf.data.Dataset.from_tensor_slices(trainY)
# test_x_dataset = tf.data.Dataset.from_tensor_slices(testX)
# test_y_dataset = tf.data.Dataset.from_tensor_slices(testY)

In [5]:
# def loss(model, x, y):
#     y_ = model(x)
#     return keras.losses.categorical_crossentropy(y_true=y, y_pred=y_)

# def grad(model, x, y):
#     with tf.GradientTape() as tape:
#         loss_value = loss(model, x, y)
#     return loss_value, tape.gradient(loss_value, model.trainable_variables)

In [6]:
opt = SGD(1e-1, momentum=0.9)
model = MiniGoogLeNet.build(width=img_witdth, height=img_height, depth=1, classes=10)
history = {'loss': [], 'accuracy': [], 'val_loss': [], 'val_accuracy': [], 'batch_size': []}

batch_size = 32
loss_object = tf.keras.losses.CategoricalCrossentropy()

for ep in range(20):
    epoch_loss = tf.keras.metrics.Mean()
    epoch_accuracy = tf.keras.metrics.CategoricalAccuracy()

    for i in range(int(np.ceil(train_size / batch_size))):
        current_batch = trainX[i*batch_size:min(train_size, (i+1)*batch_size)]
        current_label = trainY[i*batch_size:min(train_size, (i+1)*batch_size)]
        # accum_gradient = [tf.Variable(tf.zeros_like(v, dtype=tf.float32), trainable=False) for v in model.trainable_variables]
        for j in range(int(batch_size / 8)):
            x = current_batch[i*8:min(batch_size, (i+1)*8)]
            y = current_label[i*8:min(batch_size, (i+1)*8)]


            # loss_value, grads = grad(model, x, y)
            with tf.GradientTape() as tape:
                prediction = model(x, training = True)
                print(4)
                loss_value = loss_object(y_true=y, y_pred=prediction)
                print(5)
            
            # print(7)
            # epoch_loss.update_state(loss_value) 
            # epoch_accuracy.update_state(y_true=y, y_pred=y_)
            grads = tape.gradient(loss_value, model.trainable_variables)
            print(5)
            # for i in range(len(accum_gradient)):
            #     accum_gradient[i].assign_add(grads[i])
        
        # accum_gradient = [this_grad/(batch_size / 32) for this_grad in accum_gradient]
        # opt.apply_gradients(zip(accum_gradient, model.trainable_variables))
        
    # history['loss'].append(epoch_loss)
    # history['accuracy'].append(epoch_accuracy.result())
    # history['batch_size'].append(batch_size)

    # epoch_val_loss = tf.keras.metrics.Mean()
    # epoch_val_accuracy = tf.keras.metrics.CategoricalAccuracy()

    # for _ in range(int(np.ceil(len(testX) / batch_size))):
    #     predicted = []
    #     true_labels = []
    #     for _ in range(int(batch_size / 32)):
    #         try:
    #             x = a.next()
    #             y = b.next()
    #         except StopIteration:
    #             break

    #         y_ = model(x)
    #         epoch_loss.update_state(keras.losses.categorical_crossentropy(y_true=y, y_pred=y_)) 
    #         epoch_accuracy.update_state(y_true=y, y_pred=y_)

    # history['val_loss'].append(epoch_val_loss)
    # history['val_accuracy'].append(epoch_val_accuracy.result())

    if ep%10 == 9:
        batch_size *= 2
        print(ep, batch_size)

2022-11-09 18:04:55.976846: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:305] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2022-11-09 18:04:55.976992: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:271] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)


Metal device set to: Apple M1 Max

systemMemory: 64.00 GB
maxCacheSize: 24.00 GB

[<tf.Variable 'conv2d/kernel:0' shape=(3, 3, 1, 96) dtype=float32, numpy=
array([[[[-0.04024583,  0.07023236,  0.01231268,  0.05246894,
           0.01729498, -0.08155982, -0.07855324, -0.02086258,
           0.06944911, -0.04135326,  0.07815564, -0.03839624,
           0.06725414, -0.03657682, -0.01275507,  0.01205456,
          -0.05488753,  0.08085696,  0.04139505,  0.05291864,
          -0.0746337 ,  0.0122546 , -0.01350466, -0.06046361,
           0.02179006, -0.0610294 ,  0.02633017,  0.02099525,
          -0.08094639, -0.0121751 ,  0.04624033, -0.05897267,
          -0.03394994, -0.07591308,  0.02787472,  0.01307666,
           0.05991745,  0.0112219 ,  0.05701442, -0.07357065,
          -0.0209321 , -0.01888042,  0.03746057, -0.02795456,
          -0.05248944, -0.00087717,  0.02508041, -0.01758304,
           0.05107524, -0.05827594,  0.04682577, -0.04182708,
           0.08145563,  0.00954646,  0

: 

: 

In [None]:
i = 0
trainY[i*batch_size:min(train_size, (i+1)*batch_size)].shape

(32, 10)

## 3.1 AlexNet

Conv-1: $(11\times11)\times3\times96 + 96 = 34944$

Conv-2: $(5\times5)\times96\times256 + 256 = 614656$

Conv-3: $(3\times3)\times256\times384 + 384 = 885120$

Conv-4: $(3\times3)\times384\times384 + 384 = 1327488$

Conv-5: $(3\times3)\times384\times256 + 256 = 884992$

FC-1: $(6\times6)\times256\times4096 + 4096 = 37752832$

FC-2: $4096\times4096 + 4096 = 16781312$

FC-3: $4096\times1000 + 1000 = 4097000$

Total: $62378344$ parameters

## 3.2 VGG19

| Layer | Number of Activations (Memory) | Parameters (Compute) |
|---|---|---|
| Input | $224*224*3=150K$ | 0 | 
| CONV3-64 | $224*224*64=3.2M$ | $(3*3*3)*64 = 1,728$ |
| CONV3-64 | $224*224*64=3.2M$ | $(3*3*64)*64 = 36,864$ |
| POOL2 | $112*112*64=800K$ | 0 |
| CONV3-128 |  |  |
| CONV3-128 |  |  |
| POOL2 | $56*56*128=400K$ | 0 |
| CONV3-256 |  |  |
| CONV3-256 | $56*56*256=800K$ | $(3*3*256)*256 = 589,824$ |
| CONV3-256 |  |  |
| CONV3-256 |  |  |
| POOL2 |  | 0 |
| CONV3-512 | $28*28*512=400K$ | $(3*3*256)*512 = 1,179,648$ |
| CONV3-512 |  |  |
| CONV3-512 | $28*28*512=400K$ |  |
| CONV3-512 |  |  |
| POOL2 |  | 0 |
| CONV3-512 |  |  |
| CONV3-512 |  |  |
| CONV3-512 |  |  |
| CONV3-512 |  |  |
| POOL2 |  | 0 |
| FC | 4096 |  |
| FC | 4096 | $4096*4096 = 16,777,216$ |
| FC | 1000 | $4096*1000 = 4096000$ |
| Total |  |  |

## 3.3

Assume stride is 1 and padding is 0.

For the 1st layer, the filter of size $F × F$ looks at $F × F$ of input, (i.e. receptive field $F × F$). The 2nd layer looks at $F × F$ of 1st layer, due to overlap of receptive fields the filter looking at, the first cell looks at $F × F$ of input, end each cell looks at an extra size equal to stride (i.e. $2×1$ will look at $(F + 1) × F$ of input)

Since stride is 1, $F × F$ of 1st layer is equal to $(F + (F - 1)*1) × (F + (F - 1)*1)$ of input. The 3rd layer looks at $F × F$ of 2nd layer, which is equal to $(F + (F - 1)*1) × (F + (F - 1)*1)$ of 1st layer, which is equal to $(F + (2F - 2)*1) × (F + (2F - 2)*1)$ of input. 

So Nth layer fillter will look at $F × F$ of N-1th layer, which is equal to $(2F - 1) × (2F - 1)$ of N-2th layer, which is equal to $(3F - 2) × (3F - 2)$ of N-3th layer, ..., which is equal to $(NF - N + 1) × (NF - N + 1)$ of input. So stack of N convolution layers each of filter size $F × F$ has the same receptive field as one convolution layer with filter of size $(NF − N + 1) × (NF − N + 1)$. 

## 3.4

### a)