# Short Introduction to Neural Networks and Deep Learning with Keras and PyTorch

In [None]:
import os


import matplotlib.pyplot as plt
import numpy as np

from tqdm.auto import tqdm

# keras supports tensorflow, torch and jax
os.environ["KERAS_BACKEND"] = "torch"

import keras
from keras import layers
from keras import ops

In [None]:
# IPython magic: select the interactive "widget" backend for matplotlib
%matplotlib widget

# use constrained layout for every figure by default
plt.rcParams["figure.constrained_layout.use"] = True

# How to define a Neural Network Architecture in Keras

To declare a new Network architecture, we create an instance of [`keras.Sequential`](https://keras.io/guides/sequential_model/)

We can define the layers that are applied *in sequence*. 

Keras completely takes care of the gradient computation using backpropagation for us.

In [None]:
# Two ReLU hidden layers with 128 units each, followed by a 10-unit
# softmax output layer; the layers are applied in list order.
dense_stack = [
    layers.Dense(128, activation="relu", name="hidden-1"),
    layers.Dense(128, activation="relu", name="hidden-2"),
    layers.Dense(10, activation="softmax", name="output"),
]
model = keras.Sequential(layers=dense_stack, name="fully-connected")

model.summary()

Observe that the input shape is not yet fixed. It will be determined once the model is applied to data for the first time:

In [None]:
# a single all-zero dummy sample with 8 * 8 = 64 features
dummy = np.zeros((1, 8 * 8))
# first call of the model on data; afterwards the summary is printed again
model(dummy)
model.summary()

Now we are building a more flexible model, where we can pass some options:

In [None]:
def create_model(n_classes, n_hidden, dropout=0.25, activation="leaky_relu"):
    """Build a fully connected classifier with two hidden layers.

    Parameters
    ----------
    n_classes
        Number of units of the softmax output layer.
    n_hidden
        Number of units in each of the two hidden layers.
    dropout
        Rate passed to the ``Dropout`` layer that follows each hidden layer.
    activation
        Activation used by both hidden layers.

    Returns
    -------
    keras.Sequential
        The assembled model; it is not compiled here.
    """
    # flatten and normalize input
    stack = [layers.Flatten(), layers.BatchNormalization()]

    # two identical hidden blocks: dense -> batch normalization -> dropout
    for _ in range(2):
        stack.append(layers.Dense(n_hidden, activation=activation))
        stack.append(layers.BatchNormalization())
        stack.append(layers.Dropout(dropout))

    # output layer
    stack.append(layers.Dense(n_classes, activation="softmax"))

    return keras.Sequential(layers=stack)


model = create_model(n_classes=10, n_hidden=128)
# call the model once on an all-zero 28 x 28 dummy input before printing the summary
model(np.zeros((1, 28, 28)))
model.summary()

# Training

Keras comes with default fit / evaluate functions.

We could roll our own, but for these simple examples, we are going to use the defaults.

# MNIST

In [None]:
# the loader returns a (training, test) pair of (images, labels) tuples
(x_train, y_train), (x_test, y_test) = keras.datasets.mnist.load_data()

# display the shapes of the training images and labels
x_train.shape, y_train.shape

In [None]:
# Show the first ten training images in a 2 x 5 grid.
fig, axs = plt.subplots(2, 5, figsize=(9, 3), constrained_layout=True)

for ax, image in zip(axs.flat, x_train):
    ax.imshow(image, cmap="gray_r")

Now we need to compile the model, which also defines the loss function, the optimizer and the metrics we want to evaluate:

In [None]:
# start from a freshly created model
model = create_model(n_classes=10, n_hidden=128)

model.compile(
    loss=keras.losses.SparseCategoricalCrossentropy(),
    optimizer=keras.optimizers.Adam(learning_rate=1e-3),
    metrics=[
        # named "accuracy": plot_losses reads the "accuracy" / "val_accuracy" history keys
        keras.metrics.SparseCategoricalAccuracy(name="accuracy"),
    ],
)

In [None]:
batch_size = 128
epochs = 20

# train on the training set, using 15 % of it for validation
history = model.fit(
    x_train,
    y_train,
    batch_size=batch_size,
    epochs=epochs,
    validation_split=0.15,
)

# Evaluate on the separate test set. The bare expression at the end of the
# cell displays the result, as the CIFAR-10 cells further below do; without
# it the MNIST test score would be computed but never shown.
score = model.evaluate(x_test, y_test, verbose=0)
score

In [None]:
from matplotlib.ticker import IndexLocator


def plot_losses(history):
    """Plot training and validation loss and accuracy over the epochs.

    Creates a new figure with two vertically stacked axes sharing the x axis:
    loss on top, accuracy below.

    Parameters
    ----------
    history
        Object whose ``history`` attribute maps the keys ``"loss"``,
        ``"val_loss"``, ``"accuracy"`` and ``"val_accuracy"`` to one value
        per epoch, e.g. the return value of ``model.fit``.
    """
    fig, (upper, lower) = plt.subplots(2, 1, sharex=True)
    curves = history.history

    # (axes, key of the training curve, key of the validation curve)
    panels = (
        (upper, "loss", "val_loss"),
        (lower, "accuracy", "val_accuracy"),
    )
    for ax, train_key, val_key in panels:
        ax.plot(curves[train_key], label="train")
        ax.plot(curves[val_key], label="validation")
        # the training key doubles as the y axis label
        ax.set(ylabel=train_key)
        ax.legend()

    # the x axis is shared, so only the lower panel gets label and locator
    lower.set(xlabel="epoch")
    lower.xaxis.set_major_locator(IndexLocator(2, 0))

In [None]:
# learning curves of the MNIST training run
plot_losses(history)

The validation loss is lower than the training loss in the beginning, mainly for two reasons:

* The model is learning fast and the train loss is the mean over the epoch while the validation loss is evaluated at the end of the epoch.
* Dropout is active for the training evaluation, but the validation uses the full network

# CIFAR-10

In [None]:
# replaces the MNIST arrays: x_train, y_train, x_test and y_test now hold CIFAR-10
(x_train, y_train), (x_test, y_test) = keras.datasets.cifar10.load_data()

In [None]:
# display the shapes of the training images and labels
x_train.shape, y_train.shape

In [None]:
# Names of the ten CIFAR-10 classes, keyed by their integer label.
cifar10_classes = {
    0: "airplane",
    1: "automobile",
    2: "bird",
    3: "cat",
    4: "deer",
    5: "dog",
    6: "frog",
    7: "horse",
    8: "ship",
    9: "truck",
}

In [None]:
# Show a 4 x 4 grid of randomly chosen training images with their class names.
fig, axs = plt.subplots(4, 4, figsize=(9, 9), constrained_layout=True)

# fixed seed, so the same images are selected on every run
rng = np.random.default_rng(0)
# replace=False: draw distinct indices so that no image is shown twice
indices = rng.choice(len(y_train), size=axs.size, replace=False)

for idx, ax in zip(indices, axs.flat):

    img = x_train[idx]

    # the label array is two-dimensional here, hence the extra index 0
    ax.set_title(cifar10_classes[y_train[idx, 0]])
    ax.imshow(img)
    ax.set_axis_off()

In [None]:
# same fully connected architecture as before, with one output per CIFAR-10 class
model = create_model(n_classes=len(cifar10_classes), n_hidden=128)


model.compile(
    loss=keras.losses.SparseCategoricalCrossentropy(),
    optimizer=keras.optimizers.Adam(learning_rate=1e-3),
    metrics=[
        # named "accuracy": plot_losses reads the "accuracy" / "val_accuracy" history keys
        keras.metrics.SparseCategoricalAccuracy(name="accuracy"),
    ],
)

In [None]:
batch_size = 128
epochs = 30

# train on the training set, using 15 % of it for validation
history = model.fit(
    x_train,
    y_train,
    batch_size=batch_size,
    epochs=epochs,
    validation_split=0.15,
)
# evaluate on the separate test set and display the result
score = model.evaluate(x_test, y_test, verbose=0)
score

In [None]:
# learning curves of the fully connected network on CIFAR-10
plot_losses(history)

We do not get much better than 50 % accuracy with a fully connected network.

Let's try a convolutional network. First a relatively simple one, based on the Keras examples:

In [None]:
# shape of a single image, taken from the first training sample
input_shape = x_train[0].shape
n_classes = len(cifar10_classes)

model = keras.Sequential(
    [
        keras.Input(shape=input_shape),
        # normalize the raw input
        layers.BatchNormalization(),
        # two stages of 3x3 convolution followed by 2x2 max pooling
        layers.Conv2D(32, kernel_size=(3, 3), activation="relu"),
        layers.MaxPooling2D(pool_size=(2, 2)),
        layers.Conv2D(64, kernel_size=(3, 3), activation="relu"),
        layers.MaxPooling2D(pool_size=(2, 2)),
        # flatten the feature maps, then dropout and batch normalization
        layers.Flatten(),
        layers.Dropout(0.5),
        layers.BatchNormalization(),
        # output layer
        layers.Dense(n_classes, activation="softmax"),
    ]
)

# same loss, optimizer and metric as for the fully connected models
model.compile(
    loss=keras.losses.SparseCategoricalCrossentropy(),
    optimizer=keras.optimizers.Adam(learning_rate=1e-3),
    metrics=[
        keras.metrics.SparseCategoricalAccuracy(name="accuracy"),
    ],
)

model.summary()

In [None]:
batch_size = 128
epochs = 20

# train the simple convolutional network, using 15 % of the training data for validation
history = model.fit(
    x_train,
    y_train,
    batch_size=batch_size,
    epochs=epochs,
    validation_split=0.15,
)
# evaluate on the separate test set and display the result
score = model.evaluate(x_test, y_test, verbose=0)
score

In [None]:
# learning curves of the simple convolutional network
plot_losses(history)

Let's try another architecture, from https://arxiv.org/abs/1409.1556

> Very Deep Convolutional Networks for Large-Scale Image Recognition  
> Karen Simonyan, Andrew Zisserman

> In this work we investigate the effect of the convolutional network depth on its accuracy in the large-scale image recognition setting. Our main contribution is a thorough evaluation of networks of increasing depth using an architecture with very small (3x3) convolution filters, which shows that a significant improvement on the prior-art configurations can be achieved by pushing the depth to 16-19 weight layers. These findings were the basis of our ImageNet Challenge 2014 submission, where our team secured the first and the second places in the localisation and classification tracks respectively. We also show that our representations generalise well to other datasets, where they achieve state-of-the-art results. We have made our two best-performing ConvNet models publicly available to facilitate further research on the use of deep visual representations in computer vision. 

In [None]:
# Three convolutional stacks with 32, 64 and 128 filters, each made of two
# 3x3 "same"-padded convolutions, batch normalization, 2x2 max pooling and
# dropout, followed by a small fully-connected part.
vgg_layers = [keras.Input(shape=input_shape)]

for n_filters in (32, 64, 128):
    vgg_layers.extend(
        [
            layers.Conv2D(n_filters, kernel_size=(3, 3), activation="leaky_relu", padding="same"),
            layers.Conv2D(n_filters, kernel_size=(3, 3), activation="leaky_relu", padding="same"),
            layers.BatchNormalization(),
            layers.MaxPooling2D(pool_size=(2, 2)),
            layers.Dropout(0.25),
        ]
    )

# fully-connected part
vgg_layers.extend(
    [
        layers.Flatten(),
        layers.Dense(128, activation="leaky_relu"),
        layers.Dropout(0.25),
        layers.Dense(n_classes, activation="softmax"),
    ]
)

model = keras.Sequential(vgg_layers)

model.compile(
    loss=keras.losses.SparseCategoricalCrossentropy(),
    optimizer=keras.optimizers.Adam(learning_rate=1e-3),
    metrics=[keras.metrics.SparseCategoricalAccuracy(name="accuracy")],
)

model.summary()

In [None]:
batch_size = 64
epochs = 30

# train the deeper convolutional network, using 15 % of the training data for validation
history = model.fit(
    x_train,
    y_train,
    batch_size=batch_size,
    epochs=epochs,
    validation_split=0.15,
)

In [None]:
# evaluate on the test set; the result unpacks into the loss and the single compiled metric
test_loss, test_accuracy = model.evaluate(x_test, y_test, verbose=0)

In [None]:
# learning curves of the deeper convolutional network
plot_losses(history)

In [None]:
# softmax output of the network for every test image
predicted_score = model.predict(x_test)
# index of the highest score along axis 1 is taken as the predicted class
prediction = np.argmax(predicted_score, axis=1)

In [None]:
# display the predicted class indices
prediction

In [None]:
# display the accuracy obtained on the test set
test_accuracy

In [None]:
from sklearn.metrics import confusion_matrix


classes = list(cifar10_classes.values())


# rows correspond to the true class, columns to the predicted class
matrix = confusion_matrix(y_test, prediction)
# Normalize each row by the number of test samples of its true class.
# keepdims=True keeps the row sums as a column vector, so broadcasting divides
# row i by sum i; a flat vector of sums would divide column j by sum j instead.
matrix = matrix / matrix.sum(axis=1, keepdims=True)

fig, ax = plt.subplots()

mat = ax.matshow(matrix)
ax.set_xticks(np.arange(len(classes)))
ax.set_xticklabels(classes, rotation=90)
ax.set_xlabel("predicted class")

ax.set_yticks(np.arange(len(classes)))
ax.set_yticklabels(classes)
ax.set_ylabel("true class")

fig.colorbar(mat)

# suppress the textual output of the last expression in the cell
None

## Fashion-MNIST

In [None]:
# replaces the CIFAR-10 arrays: x_train, y_train, x_test and y_test now hold Fashion-MNIST
(x_train, y_train), (x_test, y_test) = keras.datasets.fashion_mnist.load_data()

In [None]:
# display the shapes of the training images and labels
x_train.shape, y_train.shape

In [None]:
# Names of the ten Fashion-MNIST classes; the position in the sequence
# is the integer label used as dictionary key.
fashion_mnist_classes = dict(
    enumerate(
        (
            "T-shirt/top",
            "Trouser",
            "Pullover",
            "Dress",
            "Coat",
            "Sandal",
            "Shirt",
            "Sneaker",
            "Bag",
            "Ankle boot",
        )
    )
)

In [None]:
# Show a 4 x 4 grid of randomly chosen training images with their class names.
fig, axs = plt.subplots(4, 4, figsize=(9, 9), constrained_layout=True)

# fixed seed, so the same images are selected on every run
rng = np.random.default_rng(0)
# replace=False: draw distinct indices so that no image is shown twice
indices = rng.choice(len(y_train), size=axs.size, replace=False)

for idx, ax in zip(indices, axs.flat):

    img = x_train[idx]

    ax.set_title(fashion_mnist_classes[y_train[idx]])
    ax.imshow(img, cmap="gray_r")
    ax.set_axis_off()

In [None]:
# Append a trailing axis of length one that plays the role of the
# "color channels" dimension.
x_train = np.expand_dims(x_train, axis=-1)
x_test = np.expand_dims(x_test, axis=-1)

In [None]:
# shape of a single image, taken from the first training sample
input_shape = x_train[0].shape
n_classes = len(fashion_mnist_classes)


# Three convolutional stacks with 32, 64 and 128 filters followed by a small
# fully-connected part.
model = keras.Sequential(
    [
        keras.Input(shape=input_shape),
        # first convolutional stack
        layers.Conv2D(32, kernel_size=(3, 3), activation="leaky_relu", padding="same"),
        layers.Conv2D(32, kernel_size=(3, 3), activation="leaky_relu", padding="same"),
        layers.BatchNormalization(),
        layers.MaxPooling2D(pool_size=(2, 2)),
        layers.Dropout(0.25),
        # second convolutional stack
        layers.Conv2D(64, kernel_size=(3, 3), activation="leaky_relu", padding="same"),
        layers.Conv2D(64, kernel_size=(3, 3), activation="leaky_relu", padding="same"),
        layers.BatchNormalization(),
        layers.MaxPooling2D(pool_size=(2, 2)),
        layers.Dropout(0.25),
        # third convolutional stack
        layers.Conv2D(128, kernel_size=(3, 3), activation="leaky_relu", padding="same"),
        layers.Conv2D(128, kernel_size=(3, 3), activation="leaky_relu", padding="same"),
        layers.BatchNormalization(),
        layers.MaxPooling2D(pool_size=(2, 2)),
        layers.Dropout(0.25),
        # fully-connected part
        layers.Flatten(),
        layers.Dense(128, activation="leaky_relu"),
        layers.Dropout(0.25),
        layers.Dense(n_classes, activation="softmax"),
    ]
)

# same loss, optimizer and metric as for the other models in this notebook
model.compile(
    loss=keras.losses.SparseCategoricalCrossentropy(),
    optimizer=keras.optimizers.Adam(learning_rate=1e-3),
    metrics=[
        keras.metrics.SparseCategoricalAccuracy(name="accuracy"),
    ],
)

model.summary()

In [None]:
batch_size = 64
epochs = 30

# train on Fashion-MNIST, using 15 % of the training data for validation
history = model.fit(
    x_train,
    y_train,
    batch_size=batch_size,
    epochs=epochs,
    validation_split=0.15,
)

Further links:
* [Keras Documentation](https://keras.io)
* [Keras Quickstart Tutorial](https://keras.io/getting_started/intro_to_keras_for_engineers/)
* [Keras Examples](https://keras.io/examples/)

The best current performance claimed on CIFAR-10 is 99.5 % accuracy:

https://en.wikipedia.org/wiki/CIFAR-10#Research_papers_claiming_state-of-the-art_results_on_CIFAR-10