# Training and testing

In [None]:
import torch, argparse, os, sys, time, itertools, pickle, warnings, logging
from tqdm.notebook import tqdm
from training_and_testing import *
import matplotlib.pyplot as plt, numpy as np, pytorch_lightning as pl

# The notebook trains on a CUDA device, so abort right away when none is visible.
if not torch.cuda.is_available():
    raise RuntimeError('No GPU found.')
print('GPU available: ' + torch.cuda.get_device_name())

# Keep the notebook output clean: ignore every Python warning and disable
# logging calls of all severities.
warnings.filterwarnings('ignore')
logging.disable(sys.maxsize)

Please choose the path to the datasets in `path_data`. <br>
The `name_string_helper` helps to keep track of the different architectures that are trained. <br>
`NUM_TRIALS` specifies how many models of the same architecture will be trained. <br>
`MIN_EPOCHS` and `MAX_EPOCHS` set the boundaries for the training procedure. <br>

## Defining the architecture

The specification of the architecture works as follows: <br>
`kernels` defines the kernel size of the convolutions in the order they are applied to the input. <br>
`channels` specifies the number of output channels of the corresponding convolutions. <br>
Note that the number of input channels does not have to be specified because the lattice configuration's number of channels is hard coded in the `ObsPredictor` class, since it is always four. The other numbers of input channels are already fixed by the preceding output channels. <br>
`dense_sizes` specifies the additional dense layers between the output of the convolutional part and the output of the whole network. The input of the dense part of the network is already specified by the last entry of `channels` and whether or not the output of the convolutional layers should be flattened. The output of the whole network is hard coded in the `ObsPredictor` class, since we always want to predict two values, $n$ and $|\phi|^2$. If one wants the output of the convolutional part of the network with e.g. $48$ channels to be directly connected to the output of the whole network, one would set `channels`$= [\ldots, 48]$ and `dense_sizes`$=[]$. <br>
After every convolution, a Tanh activation function is applied. After every dense layer, except directly before the output, a LeakyReLU activation function is applied.

In [None]:
# Directory that contains the dataset files.
path_data = './datasets'

# Tag identifying the architecture being trained. It becomes part of every
# model name and of the name of the pickle file that stores the results.
name_string_helper = 'test_ref'

# Number of models trained per architecture, each starting from a different
# initialization.
NUM_TRIALS = 3

# Lower and upper bound on the number of training epochs.
MIN_EPOCHS, MAX_EPOCHS = 100, 1000

# ---------------------------------------------------------------------------
# Architecture
# ---------------------------------------------------------------------------

# Kernel size of each convolution, in the order the convolutions are applied.
kernels = [1, 1]
# Number of output channels of the corresponding convolution.
channels = [13, 13]
# Sizes of the additional dense layers after the convolutional part
# (an empty list means there are none).
dense_sizes = []

In the following cell, no changes should be made. It runs a few checks to test whether the variables above have been chosen correctly.

In [None]:
# Dimensions (NT, NX) of the lattice to train on.
NT, NX = 60, 4

# TEST_BATCH_SIZE determines the batch size that is used during testing. A larger value speeds up the testing process.
# The maximum value that can be chosen depends on the graphics card that is used. It is smaller for a larger lattice.
# The total number of test samples has to be divisible by it for the results to be averaged correctly.
TEST_BATCH_SIZE = 500

# Dataset files of the training lattice; the file names encode the lattice size.
train_path = os.path.join(path_data, "dataset-train-{:d}-{:d}.pt".format(NT, NX))
val_path = os.path.join(path_data, "dataset-val-{:d}-{:d}.pt".format(NT, NX))
test_path = os.path.join(path_data, "dataset-test-{:d}-{:d}.pt".format(NT, NX))

# --- checks of the user-defined settings ---

# Every convolution needs both a kernel size and a number of output channels.
if len(kernels) != len(channels):
    raise ValueError('kernels and channels must have the same length.')

if not isinstance(NUM_TRIALS, int):
    raise TypeError('NUM_TRIALS has to be an integer.')

if NUM_TRIALS <= 0:
    raise ValueError('NUM_TRIALS has to be positive.')

if not isinstance(TEST_BATCH_SIZE, int):
    raise TypeError('TEST_BATCH_SIZE has to be an integer.')

if TEST_BATCH_SIZE <= 0:
    raise ValueError('TEST_BATCH_SIZE has to be positive.')

# All three files are loaded by the training cell, so a missing validation or
# test set is reported here instead of failing later while loading.
for split_name, split_path in (('training', train_path), ('validation', val_path), ('test', test_path)):
    if not os.path.isfile(split_path):
        raise FileNotFoundError(
            'There is no {} set of this lattice size under the specified path.'.format(split_name))

## Training and testing on the 60x4 lattice

Here, the training takes place. Its duration depends particularly on the chosen `NUM_TRIALS` and `train_sample_numbers`. Then, the models are tested on the $60 \times 4$ lattice. The models and the test results are saved to a .pickle file. <br>
In the following cell, only `train_sample_numbers` should be modified. It specifies the number of training samples in the respective training sets and has to respect $0 <$ `train_sample_numbers` $\le 20000$.

In [None]:
print('Training will be performed on the {}x{} lattice.'.format(NT, NX))

# Load the training, validation and test sets of the training lattice.
train_data, val_data, test_data = torch.load(train_path), torch.load(val_path), torch.load(test_path)

print("Total training examples: {}".format(len(train_data)))
print("Total validation examples: {}".format(len(val_data)))
print("Total test examples: {}".format(len(test_data)))

if len(test_data) % TEST_BATCH_SIZE != 0:
    raise ValueError(f'The number of test data ({len(test_data)}) has to be a multiple of TEST_BATCH_SIZE ({TEST_BATCH_SIZE}). Please choose the latter accordingly.')

# The different sizes of the training sets are chosen.
# The corresponding validation sets have 10% of the training set's size.
train_sample_numbers = [50, 200, 2000, 20000]
# train_sample_numbers = list(itertools.chain(range(100, 250, 50), range(250, 1000, 250), range(1000, 3000, 500), range(3000, 20001, 1000)))
val_sample_numbers = [n_train // 10 for n_train in train_sample_numbers]

# Reject subset sizes that the loaded datasets cannot provide before any
# training starts, rather than partway through a run.
for n_train, n_val in zip(train_sample_numbers, val_sample_numbers):
    if not 0 < n_train <= len(train_data):
        raise ValueError(f'Every entry of train_sample_numbers has to be positive and at most {len(train_data)}, but got {n_train}.')
    if n_val > len(val_data):
        raise ValueError(f'{n_train} training samples require {n_val} validation samples, but only {len(val_data)} are available.')

# Each subset consists of the first n samples of the corresponding dataset.
train_subsets = [range(n_train) for n_train in train_sample_numbers]
val_subsets = [range(n_val) for n_val in val_sample_numbers]

# Root directory of the TensorBoard logs; created once, up front.
log_path = os.path.join(os.getcwd(), 'training_and_testing_logs')
os.makedirs(log_path, exist_ok=True)

# Outer index: training-set size, inner index: trial.
test_MSEs = []
test_losses = []
results = []

with tqdm(total=len(train_sample_numbers)) as pbar:
    for train_subset, val_subset in zip(train_subsets, val_subsets):

        # --- Datasets ---
        train_data_subset = torch.utils.data.Subset(train_data, train_subset)
        val_data_subset = torch.utils.data.Subset(val_data, val_subset)

        print("Training examples used: {}".format(len(train_data_subset)))
        print("Validation examples used: {}".format(len(val_data_subset)))

        # init hyperparameters
        hparams = argparse.Namespace()

        # lattice size
        hparams.NT = NT
        hparams.NX = NX

        # dataloaders
        hparams.num_workers = 0

        # name of the model
        hparams.name = 'reg_' + name_string_helper + '_{}_training_samples'.format(len(train_data_subset))

        # --- Optimization hyperparameters ---
        # optimizer
        hparams.lr = 1e-2
        hparams.weight_decay = 0.

        # The total number of validation samples has to be divisible by the batch size
        # for the loss function and the MSE losses to be correctly averaged at validation_epoch_end.
        # NOTE(review): with the default train_sample_numbers the validation subsets hold
        # 5, 20, 200 and 2000 samples; 5 and 20 are not multiples of 50. Confirm how
        # ObsPredictor batches the validation data.
        if len(train_data_subset) < 500:
            hparams.batch_size = 50
        else:
            hparams.batch_size = 100

        hparams.test_batch_size = TEST_BATCH_SIZE

        # --- Architecture hyperparameters ---
        hparams.kernels = kernels
        hparams.channels = channels
        hparams.dense_sizes = dense_sizes

        test_MSEs_per_trial = []
        test_losses_per_trial = []
        results_per_trial = []

        for trial in tqdm(range(NUM_TRIALS)):
            # init model
            model = ObsPredictor(hparams, train_data_subset, val_data_subset, test_data)

            # The architecture is the same for all trials, so the number of
            # parameters is printed only once, right after the first model is created.
            if trial == 0:
                print("Number of trainable parameters: {}".format(model.count_parameters()))

            # tensorboard logger
            log_name = hparams.name + "_{:03d}".format(trial)
            tb = pl.loggers.TensorBoardLogger(save_dir=os.path.join(log_path, 'logs_' + name_string_helper), name=log_name)

            # training
            early_stopping = pl.callbacks.EarlyStopping(monitor='val_loss', min_delta=0., patience=25, mode='min')
            checkpoint = pl.callbacks.model_checkpoint.ModelCheckpoint()

            trainer = pl.Trainer(gpus=1, min_epochs=MIN_EPOCHS, max_epochs=MAX_EPOCHS, check_val_every_n_epoch=1, benchmark=True,
                                 weights_summary=None, progress_bar_refresh_rate=0, logger=tb,
                                 early_stop_callback=early_stopping, checkpoint_callback=checkpoint)

            trainer.fit(model)

            # testing: restore the weights stored in the best checkpoint before evaluating
            best_model = torch.load(checkpoint.best_model_path)
            model.load_state_dict(best_model['state_dict'])

            model.eval()
            trainer.test(model)

            # The test results are read from the model's vMSE and vloss attributes.
            test_MSE = model.vMSE
            test_loss = model.vloss
            test_MSEs_per_trial.append(test_MSE.numpy())
            test_losses_per_trial.append(test_loss)

            # saving weights and hyperparameters
            w = model.state_dict().copy()
            result = {'weights': w, 'hparams': model.hparams.copy()}
            results_per_trial.append(result)

        test_MSEs.append(test_MSEs_per_trial)
        test_losses.append(test_losses_per_trial)
        results.append(results_per_trial)

        pbar.update(1)

# save results
pickle_path = os.path.join(os.getcwd(), 'test_pickles')
os.makedirs(pickle_path, exist_ok=True)

filename = name_string_helper + '.pickle'
# If the file already exists, we do not want to overwrite it, but create a new one with a unique name.
# To do this, we choose to prepend the current time.
if os.path.isfile(os.path.join(pickle_path, filename)):
    filename = str(time.time()) + filename
    print('File already existed, timestamp was prepended to filename.')

with open(os.path.join(pickle_path, filename), 'wb') as file:
    pickle.dump([test_losses, test_MSEs, results, train_sample_numbers], file)

## Testing on the other lattice sizes

Now, the testing will also be performed on lattice sizes different from the one on which the training took place. The test results are saved to a .pickle file. The last entry of `train_sample_numbers` specifies the size of the training set. <br>
Only `dims` should be changed, namely by choosing a subset of the available lattice sizes.

In [None]:
# Dimensions (NT, NX) of the lattices the models shall be tested on.
dims = [(50, 2), (60, 4), (100, 5)]  # , (125, 8), (200, 10)]

# Make sure every requested test set exists before any testing starts.
for dim in dims:
    test_path = os.path.join(path_data, "dataset-test-{:d}-{:d}.pt".format(*dim))
    if not os.path.isfile(test_path):
        raise FileNotFoundError(f'There is no test set of this lattice size {(dim)} under the specified path.')

# Start from fresh result lists; outer index: lattice size, inner index: trial.
test_MSEs = []
test_losses = []

# Only the models of the last training-set size (results[-1]) are tested here.
# Their stored hyperparameters are taken from the first trial.
hparams = argparse.Namespace(**results[-1][0]['hparams'])

with tqdm(total=len(dims)) as pbar:
    for dim in dims:
        print('Testing will be performed on the test set of the {}x{} lattice.'.format(*dim))

        test_path = os.path.join(path_data, "dataset-test-{:d}-{:d}.pt".format(*dim))
        test_data = torch.load(test_path)
        print("Total test examples: {}\n".format(len(test_data)))

        # Same requirement as for the test set of the training lattice: the number
        # of test samples has to be a multiple of the test batch size for the
        # results to be averaged correctly.
        if len(test_data) % hparams.test_batch_size != 0:
            raise ValueError(f'The number of test data ({len(test_data)}) of the {dim[0]}x{dim[1]} lattice has to be a multiple of the test batch size ({hparams.test_batch_size}).')

        test_MSEs_per_trial = []
        test_losses_per_trial = []

        for trial, result in enumerate(tqdm(results[-1])):
            w = result['weights']

            # No training or validation data is needed for testing.
            model = ObsPredictor(hparams, None, None, test_data)
            model.load_state_dict(w)

            # All trials share one architecture, so the count is printed only once per lattice.
            if trial == 0:
                print("Number of trained parameters: {}".format(model.count_parameters()))

            trainer = pl.Trainer(gpus=1, logger=False, weights_summary=None)

            trainer.test(model)

            # The test results are read from the model's vMSE and vloss attributes.
            test_MSE = model.vMSE
            test_loss = model.vloss

            test_MSEs_per_trial.append(test_MSE.numpy())
            test_losses_per_trial.append(test_loss)

        test_MSEs.append(test_MSEs_per_trial)
        test_losses.append(test_losses_per_trial)

        pbar.update(1)

# save results
filename = 'ls_' + name_string_helper + '.pickle'
# If the file already exists, we do not want to overwrite it, but create a new one with a unique name.
# To do this, we choose to prepend the current time.
if os.path.isfile(os.path.join(pickle_path, filename)):
    filename = str(time.time()) + filename
    print('File already existed, timestamp was prepended to filename.')

with open(os.path.join(pickle_path, filename), 'wb') as file:
    pickle.dump([test_losses, test_MSEs, dims], file)