# Intro

This notebook is only used to train the network; the data generation is done by the Python scripts provided in the GitHub repository. To run the whole notebook, you need a folder called `data` containing the files `hits.txt` and `parameters.txt`. To save the model, you also need a folder named `model`.

```
- notebook.ipynb
- data
  - parameters.txt
  - hits.txt
- model
```

We start by doing all of our imports and defining some constants. Then we select the device so that the GPU can be used when it is available, and we load the dataset.



In [38]:
import tqdm
import torch
from torch import nn
import os
import math
import numpy as np
import pandas as pd
from torch import Tensor
from torch.nn import TransformerEncoder, TransformerEncoderLayer
from torch.utils.data import Dataset, DataLoader, random_split
from timeit import default_timer as timer
from torch.nn.utils.rnn import pad_sequence
import matplotlib.pyplot as plt
# from utils import custom_collate, create_mask_src, create_output_pred_mask, sort_by_angle, load_variable_len_data, get_labels_2d, get_labels_3d, normalize_data

In [39]:
# Detector identifiers; the code shown in this notebook only uses their count.
DETECTORS = [1, 2, 3, 4, 5]
NR_DETECTORS = len(DETECTORS)
# Dimensionality of the data; selects the 2D or 3D code path everywhere below.
DIM = 2
DATA_FILENAME = "data/hits.txt"
LABEL_FILENAME = "data/parameters.txt"
# Batch size for the train/validation loaders and for the test loader.
BATCH_SIZE = 32
TEST_BATCH_SIZE = 2
# Lengths to which custom_collate pads the first hit sequence / first label
# tensor of every batch.
PADDING_LEN_INPUT = 100
PADDING_LEN_LBL = 20
# Fill value used for all padding (and for the z coordinate when DIM == 2).
PAD_TOKEN = 50
# Number of consecutive epochs without validation improvement before train() stops.
EARLY_STOPPING = 6
LOSS_FN = nn.MSELoss(reduction='mean')
# Use the first CUDA device when available, otherwise fall back to the CPU.
DEVICE = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

# utility functions

The following section includes a few functions that are used later on.

In [40]:
def custom_collate(batch):
    """Collate dataset items into one padded batch.

    Every item of ``batch`` is indexed as ``(event_id, x, y, z, labels)``.
    The first coordinate sequence of the batch is padded to
    ``PADDING_LEN_INPUT`` and the first label tensor to ``PADDING_LEN_LBL``;
    ``pad_sequence`` then brings every other entry up to the longest one.
    All padding uses ``PAD_TOKEN``.

    Returns:
        ``(event_ids, x, x_lens, labels_pad, lbl_lens)`` where ``x`` stacks
        the padded x, y and z sequences on a trailing axis (sequence-first
        layout, since ``batch_first=False``), and ``x_lens`` / ``lbl_lens``
        hold the unpadded lengths.
    """
    # split the items into one list per field
    event_ids = [item[0] for item in batch]
    xs = [item[1] for item in batch]
    ys = [item[2] for item in batch]
    zs = [item[3] for item in batch]
    labels = [item[4] for item in batch]

    # remember the true lengths before anything gets padded
    x_lens = [len(seq) for seq in xs]
    lbl_lens = [len(lbl) for lbl in labels]

    # stretch the first label tensor so the batch is at least PADDING_LEN_LBL long
    lbl_missing = PADDING_LEN_LBL - labels[0].shape[0]
    if DIM == 2:
        labels[0] = nn.ConstantPad1d((0, lbl_missing), PAD_TOKEN)(labels[0])
    if DIM == 3:
        # 2-D labels: pad only along the first (track) axis
        labels[0] = nn.ConstantPad2d((0, 0, 0, lbl_missing), PAD_TOKEN)(labels[0])
    labels_pad = pad_sequence(labels, batch_first=False, padding_value=PAD_TOKEN)

    def pad_axis(seqs):
        # same trick for the coordinates: fix the first entry, let pad_sequence do the rest
        missing = PADDING_LEN_INPUT - seqs[0].shape[0]
        seqs[0] = nn.ConstantPad1d((0, missing), PAD_TOKEN)(seqs[0])
        return pad_sequence(seqs, batch_first=False, padding_value=PAD_TOKEN)

    xs_pad = pad_axis(xs)
    ys_pad = pad_axis(ys)
    zs_pad = pad_axis(zs)

    # stack the three coordinates, then move them to the last axis
    coords = torch.stack((xs_pad, ys_pad, zs_pad), dim=1)
    return event_ids, coords.transpose(1, 2), x_lens, labels_pad, lbl_lens


def load_variable_len_data(path):
    """Read a comma-separated file whose rows have different numbers of fields.

    The file is scanned once to find the widest row, then parsed with
    integer column names ``0 .. width - 1`` so that shorter rows are filled
    with NaN.

    Approach from https://stackoverflow.com/questions/27020216/import-csv-with-different-number-of-columns-per-row-using-pandas
    """
    # first pass: how many fields does the widest row have?
    with open(path, 'r') as f:
        widest = max(len(line.split(",")) for line in f)

    # second pass: parse with one column name per possible field
    return pd.read_csv(path, header=None, delimiter=",", names=list(range(widest)))


def create_mask_src(src, device):
    """Build the attention mask and the key-padding mask for ``src``.

    Returns:
        ``(src_mask, src_padding_mask)``. ``src_mask`` is an all-False
        square boolean matrix sized by ``src.shape[0]``. ``src_padding_mask``
        is True where every entry along the last axis of ``src`` equals
        ``PAD_TOKEN``.
        NOTE(review): assumes ``src`` is 3-D with the sequence on axis 0 -
        confirm against the caller.
    """
    seq_len = src.shape[0]

    # nothing is masked out position-to-position
    attention_mask = torch.zeros((seq_len, seq_len), device=device).type(torch.bool)

    # a position counts as padding only if all of its features are PAD_TOKEN
    pad_row = torch.full((seq_len,), PAD_TOKEN, device=device)
    is_pad_value = src.transpose(0, 2) == pad_row
    key_padding_mask = is_pad_value.all(dim=0)

    return attention_mask, key_padding_mask


def create_output_pred_mask(tensor, indices):
    """Return a boolean mask marking the leading entries of every column.

    The mask has shape ``(tensor.shape[0], tensor.shape[1])``; entry
    ``[i, j]`` is True exactly when ``i < indices[j]``.
    """
    n_rows, n_cols = tensor.shape[0], tensor.shape[1]
    # one limit per column, taken from the first n_cols entries of indices
    limits = np.array(indices)[np.arange(n_cols)]
    row_positions = np.arange(n_rows)[:, np.newaxis]
    return row_positions < limits


def get_labels_2d(event_labels, sort=True):
    """Extract the label values of one event for the 2D case.

    Takes every second entry of ``event_labels`` starting at index 2 and
    drops NaN entries. Returns a sorted numpy array when ``sort`` is truthy,
    otherwise a plain list in file order.
    """
    angles = []
    for raw in event_labels[2::2]:
        if math.isnan(raw):
            continue
        angles.append(float(raw))
    return np.sort(angles) if sort else angles


def get_labels_3d(event_labels, sort=True):
    """Extract the (angle1, angle2) label pairs of one event for the 3D case.

    Takes every second entry of ``event_labels`` starting at index 2. Each
    kept entry is a string of the form ``"a;b"``; entries that are floats
    (this is how missing values / NaN show up) are skipped.

    Args:
        event_labels: one row of the label table, as a list.
        sort: when truthy, order the pairs by first angle, then second angle.

    Returns:
        A numpy array of shape (n_pairs, 2) when ``sort`` is truthy,
        otherwise a list of ``(float, float)`` tuples in file order.
    """
    pairs = []
    for angles in event_labels[2::2]:
        # if there is a nan, it's a float, so we should skip it
        if isinstance(angles, float):
            continue
        parts = angles.split(';')
        pairs.append((float(parts[0]), float(parts[1])))

    if sort:
        # Sort whole rows. np.sort on the 2-D array would sort along the last
        # axis, i.e. swap the two angles inside a pair instead of ordering
        # the pairs.
        pairs = np.array(sorted(pairs))
    return pairs


def normalize_data(data):
    """Scale ``data`` by its largest absolute value.

    ``data`` must offer ``.abs().max()`` and division.
    """
    return data / data.abs().max()


def cartesian2cylindrical(x, y, z=None):
    """Convert cartesian coordinates to cylindrical ones.

    Returns ``(rho, phi)`` when ``z`` is None, otherwise ``(rho, phi, z)``
    with ``z`` passed through unchanged.
    """
    radius = np.sqrt(x ** 2 + y ** 2)
    angle = np.arctan2(y, x)
    if z is None:
        return (radius, angle)
    return (radius, angle, z)


def sort_by_angle(x, y, z = None):
    """Order the hits by rounded distance from the origin, then by angle phi.

    The distance is the Euclidean norm of the full coordinate tuple
    (``(x, y)`` for ``DIM == 2``, ``(x, y, z)`` for ``DIM == 3``), rounded
    to the nearest integer; ties are broken by the azimuthal angle.

    Returns:
        ``(x, y, z)`` as tuples in the new order. For ``DIM == 2`` the
        ``z`` argument is returned untouched.
    """
    # combine the separate lists into one list of coordinate tuples
    n_hits = len(x)
    if DIM == 2:
        points = [(x[i], y[i]) for i in range(n_hits)]
    elif DIM == 3:
        points = [(x[i], y[i], z[i]) for i in range(n_hits)]
    else:
        points = []

    # primary key: rounded distance, secondary key: phi
    rounded_dist = np.round(np.linalg.norm(np.array(points), axis=1))
    phis = [cartesian2cylindrical(*point)[1] for point in points]
    # np.lexsort treats the LAST key as the primary one
    order = np.lexsort((phis, rounded_dist))
    ordered = [points[i] for i in order]

    # split back into separate sequences
    if DIM == 2:
        x, y = zip(*ordered)
    elif DIM == 3:
        x, y, z = zip(*ordered)

    return x, y, z


# The Dataset
The following code block defines the dataset which is used by the data loader later on.

In [41]:
class TrajectoryDataset(Dataset):
    """Dataset of detector hits per event together with the event's labels.

    Both files are read eagerly with ``load_variable_len_data``. The first
    field of a hit row is the event id, which is also used as the positional
    row index into the label table. The remaining fields are read as
    interleaved coordinates with a stride of ``DIM + 1``.
    """

    def __init__(self, data_filename, labels_filename, normalize=False):
        self.data = load_variable_len_data(data_filename)
        self.labels = load_variable_len_data(labels_filename)
        self.total_nr_events = len(self.data)
        self.normalize = normalize

    def __len__(self):
        return self.total_nr_events

    @staticmethod
    def _drop_nan(values):
        # rows are NaN-padded to a common width; keep only the real entries
        return [float(value) for value in values if not math.isnan(value)]

    def __getitem__(self, idx):
        """Return ``(event_id, x, y, z, labels)`` with float tensors for all but the id."""
        row = self.data.iloc[[idx]].values.tolist()[0]
        event_id = int(row[0])

        # look up the labels belonging to this event
        label_row = self.labels.iloc[[event_id]].values.tolist()[0]
        labels = None
        if DIM == 2:
            labels = get_labels_2d(label_row)
        elif DIM == 3:
            labels = get_labels_3d(label_row)

        # de-interleave the coordinates
        stride = DIM + 1
        x = self._drop_nan(row[1::stride])
        y = self._drop_nan(row[2::stride])

        # 2d data has no z: fill it with padding; 3d data carries a real z
        z = None
        if DIM == 2:
            z = [PAD_TOKEN] * len(x)
        elif DIM == 3:
            z = self._drop_nan(row[3::stride])

        # normalisation is not available yet
        if self.normalize:
            raise NotImplementedError() #TODO

        # bring the hits into a canonical order
        if DIM == 2:
            x, y, _ = sort_by_angle(x, y)
        elif DIM == 3:
            x, y, z = sort_by_angle(x, y, z)

        # hand everything back as float tensors
        x = torch.tensor(x).float()
        y = torch.tensor(y).float()
        z = torch.tensor(z).float()
        labels = torch.tensor(labels).float()

        return event_id, x, y, z, labels

In [42]:
# Build the dataset; this reads both text files via load_variable_len_data.
dataset = TrajectoryDataset(DATA_FILENAME, LABEL_FILENAME)

# Training Functions
Here we define the functions for training and evaluating the model.

In [51]:
def evaluate(transformer, batch_size, loader):
    """Return the mean per-batch loss of ``transformer`` over ``loader``.

    The model is put into eval mode and run without gradients. Labels are
    zeroed where they equal ``PAD_TOKEN``; predictions are zeroed beyond
    ``round(n_hits / NR_DETECTORS)`` entries per event before ``LOSS_FN``
    is applied.

    Args:
        transformer: the model; called as ``transformer(x, src_mask, src_padding_mask)``.
        batch_size: kept for backward compatibility. It is no longer needed
            because the number of batches is taken from ``len(loader)``.
        loader: data loader yielding the tuples produced by ``custom_collate``.

    Returns:
        Sum of the batch losses divided by the number of batches.
    """
    transformer.eval()
    # len(loader) also counts a final partial batch; floor(N / batch_size)
    # did not, which left the progress bar without a valid total.
    progress_bar = tqdm.tqdm(loader, total=len(loader))

    with torch.no_grad():
        losses = 0
        for data in progress_bar:
            # get the data and labels on the device
            _, x, src_len, labels, _ = data
            x = x.to(DEVICE)
            if labels is not None:
                labels = labels.to(DEVICE)

            # create masks for the data
            src_mask, src_padding_mask = create_mask_src(x, DEVICE)

            # run model
            pred = transformer(x, src_mask, src_padding_mask)

            # zero out the padded label entries
            mask = (labels != PAD_TOKEN).float()
            # presumably the number of tracks per event, assuming one hit per
            # detector and track - TODO confirm
            padding_len = np.round(np.divide(src_len, NR_DETECTORS))
            labels = labels * mask

            # calculate loss for 2d data
            if DIM == 2:
                pred = pred.transpose(0, 1)
                pred_mask = create_output_pred_mask(pred, padding_len)
                pred = pred * torch.tensor(pred_mask, device=DEVICE).float()
                loss = LOSS_FN(pred, labels)

            # calculate loss for 3d data
            elif DIM == 3:
                pred = pred[0].transpose(0, 1), pred[1].transpose(0, 1)
                pred = torch.stack([pred[0], pred[1]])
                # mask each of the two outputs separately
                for slice_ind in range(pred.shape[0]):
                    slice_mask = create_output_pred_mask(pred[slice_ind, :, :], padding_len)
                    pred[slice_ind, :, :] = pred[slice_ind, :, :] * torch.tensor(slice_mask, device=DEVICE).float()
                # move the output axis to the end so pred lines up with labels
                pred = pred.transpose(0, 2)
                pred = pred.transpose(1, 0)
                loss = LOSS_FN(pred, labels)

            # update the progress bar
            progress_bar.set_description("loss = %.8f" % loss.item())
            losses += loss.item()

    return losses / len(loader)

In [52]:
def train(t_loader, v_loader, transformer, optimizer, batch_size):
    """Train ``transformer`` for up to ``EPOCHS`` epochs with early stopping.

    After every epoch the model is scored on ``v_loader`` with ``evaluate``.
    If the validation loss improved, a checkpoint is written to
    ``model/transformer_encoder_best``; otherwise one is written to
    ``model/transformer_encoder_last``. Training stops once
    ``EARLY_STOPPING`` consecutive epochs brought no improvement.

    Args:
        t_loader: training data loader (batches from ``custom_collate``).
        v_loader: validation data loader.
        transformer: the model to train.
        optimizer: optimizer over the model's parameters.
        batch_size: forwarded to ``evaluate``. It is no longer needed for the
            progress bar, whose total now comes from ``len(t_loader)``.
    """
    train_losses, val_losses = [], []
    min_val_loss = np.inf
    count = 0  # consecutive epochs without validation improvement

    print("Starting training...")

    for epoch in range(EPOCHS):
        start_time = timer()

        # we set the model to train mode and enable gradients
        torch.set_grad_enabled(True)
        transformer.train()

        # len(t_loader) also counts a final partial batch; floor(N / batch_size)
        # did not, which left the progress bar without a valid total.
        progress_bar = tqdm.tqdm(t_loader, total=len(t_loader))

        losses = 0

        for data in progress_bar:
            # get the data and labels on the device
            _, x, src_len, labels, _ = data
            x = x.to(DEVICE)
            if labels is not None:
                labels = labels.to(DEVICE)

            # create masks for the data
            src_mask, src_padding_mask = create_mask_src(x, DEVICE)

            # run model
            pred = transformer(x, src_mask, src_padding_mask)
            optimizer.zero_grad()

            # zero out the padded label entries
            mask = (labels != PAD_TOKEN).float()
            # presumably the number of tracks per event, assuming one hit per
            # detector and track - TODO confirm
            padding_len = np.round(np.divide(src_len, NR_DETECTORS))
            labels = labels * mask

            # calculate loss for 2d data
            if DIM == 2:
                pred = pred.transpose(0, 1)
                pred_mask = create_output_pred_mask(pred, padding_len)
                pred = pred * torch.tensor(pred_mask, device=DEVICE).float()
                loss = LOSS_FN(pred, labels)

            # calculate loss for 3d data
            elif DIM == 3:
                pred = pred[0].transpose(0, 1), pred[1].transpose(0, 1)
                pred = torch.stack([pred[0], pred[1]])
                # mask each of the two outputs separately
                for slice_ind in range(pred.shape[0]):
                    slice_mask = create_output_pred_mask(pred[slice_ind, :, :], padding_len)
                    pred[slice_ind, :, :] = pred[slice_ind, :, :] * torch.tensor(slice_mask, device=DEVICE).float()
                # move the output axis to the end so pred lines up with labels
                pred = pred.transpose(0, 2)
                pred = pred.transpose(1, 0)
                loss = LOSS_FN(pred, labels)

            # compute gradients and do backpropagation
            loss.backward()
            optimizer.step()

            # update the progress bar
            progress_bar.set_description("loss = %.8f" % loss.item())
            losses += loss.item()

        # calculate final loss
        train_loss = losses / len(t_loader)
        end_time = timer()

        # we evaluate the model on the validation set
        val_loss = evaluate(transformer, batch_size, v_loader)
        print("\nEpoch: ", epoch, ", Train loss: ", train_loss,
              ", Val loss: ", val_loss, ", Epoch time: ", end_time - start_time, "\n")

        train_losses.append(train_loss)
        val_losses.append(val_loss)

        # everything needed to resume or inspect this epoch later
        checkpoint = {
            'epoch': epoch,
            'model_state_dict': transformer.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
            'train_losses': train_losses,
            'val_losses': val_losses,
        }

        # save as "best" on improvement, as "last" otherwise, and track patience
        if val_loss < min_val_loss:
            min_val_loss = val_loss
            print("Saving best model with val_loss: {}".format(val_loss))
            torch.save(checkpoint, "model/transformer_encoder_best")
            count = 0
        else:
            print("Saving last model with val_loss: {}".format(val_loss))
            torch.save(checkpoint, "model/transformer_encoder_last")
            count += 1

        # early stopping criterion
        if count >= EARLY_STOPPING:
            print("Early stopping")
            break

    print("Final best loss: ", min_val_loss)

# The Model

The following class describes the model itself. The code is mostly based on the paper "Artificial intelligence for improved fitting of trajectories of elementary particles in inhomogeneous dense materials immersed in a magnetic field", for which code was provided.

In [53]:
class FittingTransformer(nn.Module):
    """Transformer encoder with linear heads for fitting track parameters.

    The input is projected to ``d_model`` features, passed through a stack
    of encoder layers, averaged over axis 0 and decoded by one linear head
    (``DIM == 2``) or two linear heads (``DIM == 3``).

    ``seq_len`` is accepted but not used, and the ``dropout`` module created
    here is not applied in ``forward``; dropout inside the encoder layers is
    still controlled by the ``dropout`` argument.
    """

    def __init__(self,
                 num_encoder_layers: int,
                 d_model: int,
                 n_head: int,
                 input_size: int,
                 output_size: int,
                 dim_feedforward: int,
                 dropout: float = 0.1,
                 seq_len: int = 20):
        super().__init__()

        # The creation order of the sub-modules is kept on purpose: it
        # determines the parameter order and the random numbers drawn at init.
        single_layer = TransformerEncoderLayer(
            d_model=d_model,
            nhead=n_head,
            dim_feedforward=dim_feedforward,
            dropout=dropout,
        )
        self.transformer_encoder = TransformerEncoder(single_layer, num_encoder_layers)
        self.proj_input = nn.Linear(input_size, d_model)
        self.decoder_angle1 = nn.Linear(d_model, output_size)
        self.decoder_angle2 = nn.Linear(d_model, output_size)
        self.dropout = nn.Dropout(dropout)
        self.init_weights()

    def init_weights(self, init_range=0.1) -> None:
        """Zero the biases and draw the weights uniformly from ``[-init_range, init_range]``."""
        for linear in (self.proj_input, self.decoder_angle1, self.decoder_angle2):
            linear.bias.data.zero_()
            linear.weight.data.uniform_(-init_range, init_range)

    def forward(self,
                src: Tensor,
                mask: Tensor,
                src_key_padding_mask: Tensor):
        """Run the model; returns one tensor for ``DIM == 2``, a pair for ``DIM == 3``."""
        # project the raw input features to the model width
        embedded = self.proj_input(src)

        # encode, then pool by averaging over axis 0
        encoded = self.transformer_encoder(src=embedded, mask=mask,
                                           src_key_padding_mask=src_key_padding_mask)
        pooled = encoded.mean(dim=0)

        # one output head for 2 dimensions, two output heads for 3 dimensions
        if DIM == 2:
            return self.decoder_angle1(pooled)
        if DIM == 3:
            return self.decoder_angle1(pooled), self.decoder_angle2(pooled)


# Setting up the data

We define the parameters for the model in order to easily change them for grid search. Then we split the data, create the dataloaders and initialise the model.

In [54]:
# Model hyperparameters, passed to FittingTransformer below.
D_MODEL = 128
NUM_ENCODER_LAYERS = 6
N_HEAD = 8
DIM_FEEDFORWARD = 128
DROPOUT = 0.1
# Upper bound on the number of epochs run by train().
EPOCHS = 60

In [55]:
# split data into train=0.6, val=0.2 and test=0.2
proportions = [.6, .2, .2]
lengths = [int(p * len(dataset)) for p in proportions]
# give any rounding remainder to the last split so the lengths add up
lengths[-1] = len(dataset) - sum(lengths[:-1])
# fixed seed so the split is the same on every run
train_set, val_set, test_set = random_split(dataset, lengths, generator=torch.Generator().manual_seed(123))

# create train, validation, and test loaders
# NOTE(review): the train loader does not shuffle, so every epoch sees the
# batches in the same order - confirm this is intended.
train_loader = DataLoader(train_set, batch_size=BATCH_SIZE, collate_fn=custom_collate)
val_loader = DataLoader(val_set, batch_size=BATCH_SIZE, collate_fn=custom_collate)
test_loader = DataLoader(test_set, batch_size=TEST_BATCH_SIZE, collate_fn=custom_collate)

# Initialise the transformer model
# input_size is 3 because custom_collate stacks x, y and z.
# output_size has to match the padded label length, so it is tied to
# PADDING_LEN_LBL instead of repeating the literal 20.
transformer = FittingTransformer(num_encoder_layers=NUM_ENCODER_LAYERS,
                                d_model=D_MODEL,
                                n_head=N_HEAD,
                                input_size=3,
                                output_size=PADDING_LEN_LBL,
                                dim_feedforward=DIM_FEEDFORWARD,
                                dropout=DROPOUT)
transformer = transformer.to(DEVICE)

# initialise the optimizer
optimizer = torch.optim.Adam(transformer.parameters(), lr=1e-4)

We do a small sanity check before training to see if the data is what we expect it to be.

In [56]:
# Row counts of the two loaded tables ...
print("Length of data: ", len(dataset.data))
print("Length of labels: ", len(dataset.labels))

# ... and the sizes of the three splits.
print("Length of train set: ", len(train_set))
print("Length of val set: ", len(val_set))
print("Length of test set: ", len(test_set))

Length of data:  50000
Length of labels:  50000
Length of train set:  30000
Length of val set:  10000
Length of test set:  10000


# Training

Here we can finally call the train function.

In [57]:
# Run the training loop; checkpoints are written into the "model" folder.
train(train_loader, val_loader, transformer, optimizer, BATCH_SIZE)

Starting training...


loss = 0.17365369: : 938it [00:53, 17.55it/s]                       
loss = 0.18451875: : 313it [00:13, 23.95it/s]                       



Epoch:  0 , Train loss:  0.3009836396126986 , Val loss:  0.204379329047264 , Epoch time:  53.447647781999876 

Saving best model with val_loss: 0.204379329047264


loss = 0.14299217: : 938it [00:53, 17.66it/s]                       
loss = 0.16244231: : 313it [00:13, 23.48it/s]



Epoch:  1 , Train loss:  0.15552250785169316 , Val loss:  0.17234236389969865 , Epoch time:  53.1367121960011 

Saving best model with val_loss: 0.17234236389969865


loss = 0.14753179: : 938it [00:52, 17.71it/s]                       
loss = 0.12528329: : 313it [00:15, 20.18it/s]



Epoch:  2 , Train loss:  0.13604150081017632 , Val loss:  0.1301801051861181 , Epoch time:  52.9762730500006 

Saving best model with val_loss: 0.1301801051861181


loss = 0.13436401: : 938it [00:52, 18.02it/s]                       
loss = 0.13050485: : 313it [00:13, 24.03it/s]                       



Epoch:  3 , Train loss:  0.11674336541785614 , Val loss:  0.12665230092910912 , Epoch time:  52.04968589300006 

Saving best model with val_loss: 0.12665230092910912


loss = 0.12427669: : 938it [00:51, 18.04it/s]                       
loss = 0.13218747: : 313it [00:13, 23.96it/s]



Epoch:  4 , Train loss:  0.10072082503518062 , Val loss:  0.11938970259869822 , Epoch time:  51.9988669710001 

Saving best model with val_loss: 0.11938970259869822


loss = 0.10605424: : 938it [00:57, 16.20it/s]                       
loss = 0.12929128: : 313it [00:13, 22.73it/s]                       



Epoch:  5 , Train loss:  0.09081489914484116 , Val loss:  0.1151123257538381 , Epoch time:  57.92052442500062 

Saving best model with val_loss: 0.1151123257538381


loss = 0.09368061: : 938it [01:02, 14.94it/s]                       
loss = 0.12290447: : 313it [00:13, 23.67it/s]                       



Epoch:  6 , Train loss:  0.08262375814478788 , Val loss:  0.10571190930260256 , Epoch time:  62.79718450800101 

Saving best model with val_loss: 0.10571190930260256


loss = 0.10215447: : 938it [00:54, 17.32it/s]                       
loss = 0.12672055: : 313it [00:13, 23.07it/s]                       



Epoch:  7 , Train loss:  0.07602848717843545 , Val loss:  0.10257674738193473 , Epoch time:  54.17272152499936 

Saving best model with val_loss: 0.10257674738193473


loss = 0.09310236: : 938it [00:53, 17.63it/s]                       
loss = 0.13077202: : 313it [00:13, 23.56it/s]                       



Epoch:  8 , Train loss:  0.07060574086458445 , Val loss:  0.10546860949823651 , Epoch time:  53.217455426000015 

Saving last model with val_loss: 0.10546860949823651


loss = 0.06552379: : 938it [00:55, 16.96it/s]                       
loss = 0.14498916: : 313it [00:13, 23.75it/s]                       



Epoch:  9 , Train loss:  0.0657695213686238 , Val loss:  0.11329521482792525 , Epoch time:  55.32811396699981 

Saving last model with val_loss: 0.11329521482792525


loss = 0.06577841: : 938it [00:53, 17.60it/s]                       
loss = 0.13184066: : 313it [00:13, 23.53it/s]                       



Epoch:  10 , Train loss:  0.06184101318007212 , Val loss:  0.10247858981497752 , Epoch time:  53.304473030000736 

Saving best model with val_loss: 0.10247858981497752


loss = 0.07664243: : 938it [00:54, 17.13it/s]                       
loss = 0.11474758: : 313it [00:13, 22.70it/s]                       



Epoch:  11 , Train loss:  0.059183748339666234 , Val loss:  0.0876890322961175 , Epoch time:  54.78141136799968 

Saving best model with val_loss: 0.0876890322961175


loss = 0.06745666: : 938it [00:56, 16.54it/s]                       
loss = 0.12532055: : 313it [00:12, 24.29it/s]



Epoch:  12 , Train loss:  0.05699745390508602 , Val loss:  0.08944401023582148 , Epoch time:  56.7273450520006 

Saving last model with val_loss: 0.08944401023582148


loss = 0.06414177: : 938it [00:52, 17.78it/s]                       
loss = 0.14314346: : 313it [00:13, 23.83it/s]                       



Epoch:  13 , Train loss:  0.05395383705128866 , Val loss:  0.10456060328946327 , Epoch time:  52.75055095700009 

Saving last model with val_loss: 0.10456060328946327


loss = 0.06547009: : 938it [00:53, 17.42it/s]                       
loss = 0.14723806: : 313it [00:13, 22.74it/s]                       



Epoch:  14 , Train loss:  0.051645427622568255 , Val loss:  0.11019975770586216 , Epoch time:  53.85564136699941 

Saving last model with val_loss: 0.11019975770586216


loss = 0.05809485: : 938it [00:53, 17.43it/s]                       
loss = 0.12409039: : 313it [00:13, 23.13it/s]                       



Epoch:  15 , Train loss:  0.050219824762026 , Val loss:  0.08789504694338805 , Epoch time:  53.830379545999676 

Saving last model with val_loss: 0.08789504694338805


loss = 0.06242459: : 938it [00:53, 17.52it/s]                       
loss = 0.13502195: : 313it [00:13, 23.16it/s]                       



Epoch:  16 , Train loss:  0.04855647750460962 , Val loss:  0.10933611422967605 , Epoch time:  53.54674596500081 

Saving last model with val_loss: 0.10933611422967605


loss = 0.04482698: : 938it [00:53, 17.52it/s]                       
loss = 0.15941368: : 313it [00:13, 23.89it/s]                       



Epoch:  17 , Train loss:  0.04757158255486536 , Val loss:  0.1252969062556855 , Epoch time:  53.548685972000385 

Saving last model with val_loss: 0.1252969062556855
Early stopping
Final best loss:  0.0876890322961175


# Loading a saved model

If the training has already been done, we can load in a saved model with the following code.

The code from the section "Setting up the data" has to be run before this.

In [None]:
# Restore model and optimizer from the "last" checkpoint written by train().
# That file only exists if at least one epoch did not improve the validation loss.
# map_location lets a checkpoint that was saved on a GPU be loaded on a
# machine without one (and vice versa).
checkpoint = torch.load("model/transformer_encoder_last", map_location=DEVICE)
transformer.load_state_dict(checkpoint['model_state_dict'])
optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
# The values below are for inspection only: train() starts again from epoch 0
# and does not read them.
epoch = checkpoint['epoch'] + 1
train_losses = checkpoint['train_losses']
val_losses = checkpoint['val_losses']
min_val_loss = min(val_losses)
print(epoch, val_losses)

# Evaluating

The final step is calculating the loss over the test set.

In [58]:
# Mean per-batch loss on the test split, using the weights `transformer`
# currently holds.
# NOTE(review): directly after train() these are the final-epoch weights, not
# necessarily those of the best-validation checkpoint - confirm this is intended.
test_loss = evaluate(transformer, TEST_BATCH_SIZE, test_loader)
print("\ntest loss: ", test_loss)

loss = 0.11267737: 100%|██████████| 5000/5000 [00:57<00:00, 86.77it/s]


test loss:  0.12341414264765335



