# LAB 3.1 - CNS (Sequential MNIST)

In [1]:
!mkdir sequential_mnist
!mkdir sequential_mnist / variables

mkdir: cannot create directory ‘sequential_mnist’: File exists
mkdir: cannot create directory ‘sequential_mnist’: File exists
mkdir: cannot create directory ‘/’: File exists
mkdir: cannot create directory ‘variables’: File exists


Import of libraries, fix of random seed and device.


In [2]:
import json
import os
from pathlib import Path
import random
import numpy as np
import matplotlib.pyplot as plt
import itertools
from typing import Callable
from tqdm.notebook import tqdm
import torch
from torchvision import datasets

seed = 0
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)

device = 'cuda'

# Bonus track 2 & 4 - Sequential MNIST classification task & benchmarking RNN models on the sequential MNIST task

Function able to download and get tensors related to MNIST data and labels of train and test set.

In [3]:
def download_mnist() -> tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
    """
    Function able to download MNIST dataset and return it.

    returns:
        tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]: Training data and labels and test data and labels of MNIST dataset.
    """
    mnist_dir = 'MNIST/'
    if not Path(mnist_dir).exists():
        os.mkdir(mnist_dir)
    TR_MNIST = datasets.MNIST(root=f'{mnist_dir}', train=True, download=True, transform=None)
    TS_MNIST = datasets.MNIST(root=f'{mnist_dir}', train=False, download=True, transform=None)
    TR_DATA_MNIST = TR_MNIST.train_data.reshape(28 * 28, -1, 1).type(torch.float32).to(device)
    TS_DATA_MNIST = TS_MNIST.test_data.reshape(28 * 28, -1, 1).type(torch.float32).to(device)
    TR_LABELS_MNIST = torch.nn.functional.one_hot(TR_MNIST.train_labels).type(torch.float32).to(device)
    TS_LABELS_MNIST = torch.nn.functional.one_hot(TS_MNIST.test_labels).type(torch.float32).to(device)
    return TR_DATA_MNIST, TR_LABELS_MNIST, TS_DATA_MNIST, TS_LABELS_MNIST


TR_DATA_MNIST, TR_LABELS_MNIST, TS_DATA_MNIST, TS_LABELS_MNIST = download_mnist()

TR_DATA_MNIST.shape, TR_LABELS_MNIST.shape, TS_DATA_MNIST.shape, TS_LABELS_MNIST.shape



(torch.Size([784, 60000, 1]),
 torch.Size([60000, 10]),
 torch.Size([784, 10000, 1]),
 torch.Size([10000, 10]))

Function able to compute the accuracy metric.

In [4]:
def accuracy(out: torch.Tensor, pred: torch.Tensor) -> float:
    """
    Function that compute accuracy given an output and prediction tensor.

    out: Output tensor.
    pred: Prediction tensor.

    returns:
        float: Computed accuracy value.
    """
    return (sum(pred.argmax(-1) - out.argmax(-1) == 0) / len(out)).item()

Train function able to fit a model given in input.

In [5]:
def train(
        model: torch.nn.Module,
        TR: tuple[torch.Tensor, torch.Tensor],
        TS: tuple[torch.Tensor, torch.Tensor],
        epochs: int = 10,
        batch_size: int = 64,
        sgd_config: dict = {},
        tqdm=None,
) -> tuple:
    """
    Function able to train a given model.

    model: Model to train.
    TR: Tuple composed by X train and Y train torch tensors.
    TS: Tuple composed by X test and Y test torch tensors.
    epochs: Number of epochs of training.
    batch_size: Dimension of batch.
    sgd_config: Dictionary containing sgd configurations (lr and momentum).
    tqdm: TQDM object to show the progressbar. It is None when progressbar is not shown.

    returns:
        tuple: Results of training. In particular the tuple is composed by 2 variables:
            - train_accuracy: List of accuracy of training set computed for each epoch.
            - test_accuracy: List of accuracy of test set computed for each epoch.
    """
    optimizer = torch.optim.SGD(model.parameters(), **sgd_config)
    criterion = torch.nn.MSELoss()
    train_accuracy, test_accuracy = None, None
    X_TR, Y_TR = TR
    X_TS, Y_TS = TS
    model.eval()

    iterable = range(epochs)
    if tqdm is not None:
        iterable = tqdm(iterable)
    for _ in iterable:

        model.train()
        train_batch_accuracy = 0
        for i in range(int(X_TR.shape[1] / batch_size)):
            optimizer.zero_grad()
            pred_tr = model(X_TR[:, i * batch_size: (i + 1) * batch_size])
            TR_LABEL_BATCH = Y_TR[i * batch_size: (i + 1) * batch_size]
            loss_tr = criterion(pred_tr, TR_LABEL_BATCH)
            loss_tr.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1)
            optimizer.step()
            train_batch_accuracy += accuracy(pred_tr, TR_LABEL_BATCH)
        train_accuracy = train_batch_accuracy / batch_size

        model.eval()
        test_batch_accuracy = 0
        for i in range(int(X_TS.shape[1] / batch_size)):
            pred_vl = model(X_TS[:, i * batch_size: (i + 1) * batch_size])
            TS_LABEL_BATCH = Y_TS[i * batch_size: (i + 1) * batch_size]
            test_batch_accuracy += accuracy(pred_vl, TS_LABEL_BATCH)
        test_accuracy = test_batch_accuracy / batch_size

    return train_accuracy, test_accuracy

Gridsearch function able to find the best configuration for a model created in a `train_func` function callback, train the model with the best configuration and test it on test set.

In [6]:
def gridsearch(
        train_func: Callable,
        configs: dict,
        TR: tuple[torch.Tensor, torch.Tensor],
        TS: tuple[torch.Tensor, torch.Tensor],
        epochs: int = 100,
        vl_portion: float = 0.2,
        batch_size: int = 64,
        attempts_for_config: int = 1,
        Ng: int = 1,
) -> tuple:
    """
    Gridsearch function able to find the best hyperparameters configuration, train the model with the best config and test it.

    train_func: Function able to create a model and train it given a config, a train and validation set and a number of epochs.
    configs: Hyperparameters configurations to investigate to find the best one that minimizes the loss on validation set. In particular this is a dictionary of lists for each hyperparam to investigate that is transformed by this function in a list of dictionaries.
    TR: Training set data (X, Y).
    TS: test set data (X, Y).
    epochs: Number of epochs of training both for model selection and model evaluation.
    vl_portion: Portion of example to use in validation set of model selection phase. It is useful to split training set in training and validation set.
    attempts_for_config: Number of attempts to do for each configuration. The loss that it's minimized is the mean of each loss of each attempt.
    Ng: Number of attempts in model assessment.

    returns: A tuple of 4 variables related to the result of training function during the model evaluation phase (mean and std of training and ts accuracy).
    """
    if isinstance(configs, dict):
        configs = [dict(zip(configs.keys(), t)) for t in itertools.product(*configs.values())]
    best_config = {}
    best_accuracy = None
    X_TR, Y_TR = TR
    vl_size = int(X_TR.shape[1] * vl_portion)
    for i, config in enumerate(tqdm(configs, desc='model evaluation')):
        vl_accuracy = 0
        for j in range(attempts_for_config):
            _, eval_accuracy = train_func(
                config,
                (X_TR[:, :-vl_size], Y_TR[:-vl_size]),
                (X_TR[:, -vl_size:], Y_TR[-vl_size:]),
                epochs=epochs,
                batch_size=batch_size,
            )
            vl_accuracy += eval_accuracy
        vl_accuracy /= attempts_for_config
        print(f'{i + 1}/{len(configs)} - Tried config {config} with accuracy {vl_accuracy}')
        if best_accuracy is None or vl_accuracy > best_accuracy:
            best_config = config
            best_accuracy = vl_accuracy
    print(f'Best config: {best_config} with accuracy {best_accuracy}')

    print('Retraining...')
    train_accuracies, test_accuracies = [], []
    for i in tqdm(range(Ng), desc='model assessment'):
        tr_accuracy, ts_accuracy = train_func(
            best_config,
            TR,
            TS,
            epochs=epochs,
            batch_size=batch_size,
        )
        train_accuracies.append(tr_accuracy)
        test_accuracies.append(ts_accuracy)
    train_accuracy_mean = np.mean(train_accuracies)
    train_accuracy_std = np.std(train_accuracies)
    test_accuracy_mean = np.mean(test_accuracies)
    test_accuracy_std = np.std(test_accuracies)

    return train_accuracy_mean, train_accuracy_std, test_accuracy_mean, test_accuracy_std

### RNN Model


Antisymmetric rnn layer built as a torch module used to construct an antisymmetric recurrent neural network.

In [7]:
class AntisymmetricRNNLayer(torch.nn.Module):
    """
    Antisymmetric rnn layer class.
    """

    def __init__(
            self,
            input_size: int,
            hidden_size: int,
            diffusion_coef: float = 0.001,
            num_layers: int = 1,
            bidirectional: bool = False
    ) -> None:
        """
        Antisymmetric rnn layer constructor.

        input_size: Input size.
        hidden_size: Hidden size.
        diffusion_coef: Diffusion coefficient of antisymmetric layer.
        num_layers: Number of layers to have a deep version of the model.
        bidirectional: Flag to create the bidirectional version of the model.
        """
        super(AntisymmetricRNNLayer, self).__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.bidirectional = bidirectional
        self.D = 2 if self.bidirectional else 1
        self.diffusion = diffusion_coef * torch.eye(hidden_size).to(device)
        self.weight_in, self.weight_hh, self.bias = self.__init_weights()

    def __init_weights(self) -> tuple[list, list, list]:
        """
        Private method able to initialize model layers.

        returns:
          tuple[list, list, list]: Lists of layers:
            - weight_in: Input weights.
            - weight_hh: Hidden weights.
            - bias: Bias.
        """
        weight_in = [
            torch.Tensor(self.input_size if i < self.D else self.hidden_size, self.hidden_size).to(device)
            for i in range(self.num_layers * self.D)
        ]
        weight_hh = [
            torch.Tensor(self.hidden_size, self.hidden_size).to(device)
            for _ in range(self.num_layers * self.D)
        ]
        bias = [
            torch.Tensor(1, self.hidden_size).to(device)
            for _ in range(self.num_layers * self.D)
        ]
        return weight_in, weight_hh, bias

    def forward(
            self,
            ts: torch.Tensor,
            H: torch.Tensor = None
    ) -> tuple[torch.Tensor, torch.Tensor]:
        """
        Forward function used to the forward phase of pytorch module.

        ts: Time series input data.
        H: Previous hidden state.

        returns:
          tuple[torch.Tensor, torch.Tensor]: Output data:
            - output: Output states of last layer.
            - hidden: Hidden states of last time steps of each layer.
        """
        layer_states = None
        if H is None:
            H = torch.rand(self.num_layers * self.D, ts.shape[1], self.hidden_size).to(device)
        if self.bidirectional:
            ts = torch.cat((ts, ts), dim=-1)
        for l in range(0, self.num_layers, self.D):
            dim_split = int(ts.shape[-1] / self.D)
            layer_states = self.__forward_layer(ts[:, :, :dim_split], H, l)
            if self.bidirectional:
                layer_states_2 = self.__forward_layer(ts[:, :, dim_split:].flip(0), H, l + 1)
                layer_states = torch.cat((layer_states, layer_states_2.flip(0)), dim=-1)
            ts = layer_states
        return layer_states, H

    def __forward_layer(
            self,
            ts: torch.Tensor,
            H: torch.Tensor,
            l: int
    ) -> torch.Tensor:
        """
        Private method able to run. the forward for a single layer.

        ts: Time series in input.
        H: hidden states.
        l: number of layer.

        returns:
          torch.Tensor: Hidden states computed.
        """
        layer_states = []
        for x in ts:
            H[l] = torch.nn.functional.tanh(
                x @ self.weight_in[l] + H[l] @ (self.weight_hh[l] - self.weight_hh[l].T - self.diffusion) + self.bias[l]
            )
        layer_states.append(H[l])
        return torch.stack(layer_states)

RNN pytorch model. This is able to take in input the parameter `recurrent_layer` that defines the type of recurrent layer. In this notebook are used:
- torch.nn.RNN
- torch.nn.LSTM
- torch.nn.GRU
- AntisymmetricRNNLayer (custom layer)

In [8]:
class RNN(torch.nn.Module):
    """
    Class of RNN model.
    """

    def __init__(
            self,
            input_size: int,
            hidden_size: int,
            output_size: int,
            recurrent_layer: torch.nn.RNN = torch.nn.RNN,
            stateful: bool = True,
            n_layers: int = 1,
            bidirectional=False,
            device: str = 'cpu',
    ) -> None:
        """
        RNN constructor method.

        input_size: Size of input value.
        hidden_size: Size of hidden state.
        output_size: Size of output value.
        stateful: Boolean set to true if it's want to use the final training hidden state as initial hidden state of evaluation.
        n_layers: Number of hidden layers. Default this is 1.
        device: Name of device to use for computation.
        """
        super(RNN, self).__init__()
        self.recoursive_layer = recurrent_layer(
            input_size,
            hidden_size,
            num_layers=n_layers,
            bidirectional=bidirectional,
        ).to(device)
        D = 2 if bidirectional else 1
        self.output_layer = torch.nn.Linear(hidden_size * D, output_size).to(device)
        self.stateful = stateful
        self.recurrent_states = None

    def forward(self, X: torch.Tensor) -> torch.Tensor:
        """
        Forward function used to the forward phase of pytorch module.

        X: Input data.

        returns:
            torch.Tensor: Output data.
        """
        out_state, recurrent_states = self.recoursive_layer(X, self.recurrent_states)
        if self.stateful and self.training:
            if type(recurrent_states) == tuple:
                self.recurrent_states = (
                    recurrent_states[0].detach(),
                    recurrent_states[1].detach()
                )
            else:
                self.recurrent_states = recurrent_states.detach()
        return self.output_layer(out_state[-1])


RNN(1, 100, 1)

RNN(
  (recoursive_layer): RNN(1, 100)
  (output_layer): Linear(in_features=100, out_features=1, bias=True)
)

RNN train function.

In [9]:
def train_rnn(
        config: dict,
        TR: tuple[torch.Tensor, torch.Tensor],
        TS: tuple[torch.Tensor, torch.Tensor],
        epochs: int = 10,
        batch_size: int = 64,
        tqdm=None
) -> tuple:
    """
    Function used to train the RNN model. It wraps the general train function.

    config: Dictionary of hyperparameters.
    TR: Training set.
    TS: Test set.
    epochs: Number of epochs.
    batch_size: Dimension of a batch.
    tqdm: Object used to show the progressbar.

    returns:
        tuple: Train results.
    """
    model = RNN(TR[0].shape[-1], config['hidden_size'], TR[1].shape[-1], recurrent_layer=config['recurrent_layer'],
                n_layers=config['n_layers'], bidirectional=config['bidirectional'], device=device)
    return train(model, TR, TS, epochs=epochs, tqdm=tqdm, batch_size=batch_size, sgd_config={
        'lr': config['lr'],
        'momentum': config['momentum'],
    })

Function able to perform RNN gridsearch and plot of results related to MSE loss and accuracy.

In [10]:
def perform_rnn_gs_and_plot(
        TR: tuple[torch.Tensor, torch.Tensor],
        TS: tuple[torch.Tensor, torch.Tensor],
        recurrent_layer: torch.nn.Module,
        save_name: str,
) -> None:
    """
    Function able to perform RNN gridsearch and plot of results related to MSE loss and accuracy.

    TR: Training set.
    TS: Test set.
    recurrent_layer: Recurrent layer module.
    save_name: Name given to the file where is saved the model.
    """
    tr_acc_mean, tr_acc_std, ts_acc_mean, ts_acc_std = gridsearch(
        train_func=train_rnn,
        configs=dict(
            hidden_size=[100],
            n_layers=[2, 1],
            lr=[0.1],
            momentum=[0.9],
            bidirectional=[True, False],
            recurrent_layer=[recurrent_layer],
        ),
        batch_size=512,
        TR=TR,
        TS=TS,
        epochs=30,
        vl_portion=0.2,
        attempts_for_config=1,
        Ng=5,
    )
    accuracy_results = dict(
        training_accuracy_mean=tr_acc_mean,
        training_accuracy_std=tr_acc_std,
        test_accuracy_mean=ts_acc_mean,
        test_accuracy_std=ts_acc_std,
    )
    print('Accuracy results')
    print(json.dumps(accuracy_results, indent=2))
    table_path = f'sequential_mnist/variables/accuracy_table.json'
    if os.path.exists(table_path):
        with open(table_path) as file:
            accuracy_table = json.load(file)
    else:
        accuracy_table = {}
    accuracy_table[save_name] = accuracy_results
    with open(table_path, 'w') as file:
        json.dump(accuracy_table, file)



## Sequential MNIST model selection and model evaluation results

In [11]:
seq_mnist_table_results = []

### Vanilla RNN

In [None]:
perform_rnn_gs_and_plot(
    (TR_DATA_MNIST, TR_LABELS_MNIST),
    (TS_DATA_MNIST, TS_LABELS_MNIST),
    recurrent_layer=torch.nn.RNN,
    save_name='rnn_seq_mnist'
)

model evaluation:   0%|          | 0/4 [00:00<?, ?it/s]

### LSTM

In [None]:
perform_rnn_gs_and_plot(
    (TR_DATA_MNIST, TR_LABELS_MNIST),
    (TS_DATA_MNIST, TS_LABELS_MNIST),
    recurrent_layer=torch.nn.LSTM,
    save_name='lstm_seq_mnist'
)

### GRU

In [None]:
perform_rnn_gs_and_plot(
    (TR_DATA_MNIST, TR_LABELS_MNIST),
    (TS_DATA_MNIST, TS_LABELS_MNIST),
    recurrent_layer=torch.nn.GRU,
    save_name='gru_seq_mnist'
)

### Antisymmetric RNN

In [None]:
perform_rnn_gs_and_plot(
    (TR_DATA_MNIST, TR_LABELS_MNIST),
    (TS_DATA_MNIST, TS_LABELS_MNIST),
    recurrent_layer=AntisymmetricRNNLayer,
    save_name='antisymmetric_rnn_seq_mnist'
)

## Permuted sequential MNIST model selection and model evaluation results

MNIST dataset permutation of data to perform permuted sequential MNSIT.

In [None]:
permutations = np.random.permutation(28 * 28)

TR_DATA_PMNIST = TR_DATA_MNIST[permutations]
TS_DATA_PMNIST = TS_DATA_MNIST[permutations]

TR_DATA_PMNIST.shape, TS_DATA_PMNIST.shape

### Vanilla RNN

In [None]:
perform_rnn_gs_and_plot(
    (TR_DATA_PMNIST, TR_LABELS_MNIST),
    (TS_DATA_PMNIST, TS_LABELS_MNIST),
    recurrent_layer=torch.nn.RNN,
    save_name='rnn_permuted_mnist'
)

### LSTM

In [None]:
perform_rnn_gs_and_plot(
    (TR_DATA_PMNIST, TR_LABELS_MNIST),
    (TS_DATA_PMNIST, TS_LABELS_MNIST),
    recurrent_layer=torch.nn.LSTM,
    save_name='lstm_permuted_mnist'
)

### GRU

In [None]:
perform_rnn_gs_and_plot(
    (TR_DATA_PMNIST, TR_LABELS_MNIST),
    (TS_DATA_PMNIST, TS_LABELS_MNIST),
    recurrent_layer=torch.nn.GRU,
    save_name='gru_permuted_mnist'
)

### Antisymmetric RNN

In [None]:
perform_rnn_gs_and_plot(
    (TR_DATA_PMNIST, TR_LABELS_MNIST),
    (TS_DATA_PMNIST, TS_LABELS_MNIST),
    recurrent_layer=AntisymmetricRNNLayer,
    save_name='antisymmetric_rnn_permuted_mnist'
)