In [None]:
import pickle

import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)

import t5
import pandas as pd
import numpy as np
import random
import math

from sklearn.model_selection import train_test_split

import wandb

import torch
from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler

torch.cuda.empty_cache()
from transformers import T5Tokenizer, MT5ForConditionalGeneration, T5ForConditionalGeneration, Adafactor

# Select input data

In [None]:
# --- Dataset variant ---
# Every combination of text length and alignment level; the selected name is
# spliced into the pickle filename when the data is loaded.
datasets = [length + alignment
            for length in ('long', 'short')
            for alignment in ('', '_fully_aligned', '_softly_aligned')]
dataset_version = datasets[4]  # 'short_fully_aligned'


# --- Pretrained checkpoint ---
models = [
    't5-small',
    't5-base',
    't5-large',
    'google/mt5-small',
    'google/mt5-base',
]
model_version = models[0]  # 't5-small'


# --- Spanish character encoding ---
# When True, the splits are passed through encode_unseen_characters.
spa_char_encode = True

## Load Data

In [None]:
# Load the pickled splits for the selected dataset variant.
# NOTE(security): pickle.load can execute arbitrary code while unpickling;
# only open dataset files that come from a trusted source.
dataset_path = '../Datasets/wikimusica_'+dataset_version+'.p'
with open(dataset_path, "rb") as dataset_file:
    # The context manager closes the handle, which the bare open() call leaked.
    data = pickle.load(dataset_file)

# Positional layout of the pickle: inputs (train, test), then outputs (train, test).
input_train = data[0]
input_test = data[1]
output_train = data[2]
output_test = data[3]

## Encode Spanish characters

In [None]:
def encode_unseen_characters(text: str):
    """Swap selected Spanish characters for ASCII ``%x%`` placeholders.

    Only the eight characters listed below are rewritten; every other
    character (including other accented letters) passes through unchanged.
    Presumably these are the characters missing from the tokenizer
    vocabulary - TODO confirm.

    Args:
        text: The string to encode.

    Returns:
        The string with each listed character replaced by its placeholder.
    """
    placeholder_pairs = (
        ('í', '%i%'),
        ('Í', '%I%'),
        ('ú', '%u%'),
        ('Ú', '%U%'),
        ('Á', '%A%'),
        ('Ó', '%O%'),
        ('ñ', '%n%'),
        ('Ñ', '%N%'),
    )

    for character, placeholder in placeholder_pairs:
        text = text.replace(character, placeholder)

    return text

#### #### ####

def decode_unseen_characters(text: str):
    """Turn ``%x%`` placeholders back into the Spanish characters they stand for.

    This is the inverse of the placeholder substitution used when encoding.
    The replacements are applied one after another in the order listed.

    Args:
        text: The string containing placeholders.

    Returns:
        The string with every listed placeholder replaced by its character.
    """
    character_pairs = (
        ('%i%', 'í'),
        ('%I%', 'Í'),
        ('%u%', 'ú'),
        ('%U%', 'Ú'),
        ('%A%', 'Á'),
        ('%O%', 'Ó'),
        ('%n%', 'ñ'),
        ('%N%', 'Ñ'),
    )

    for placeholder, character in character_pairs:
        text = text.replace(placeholder, character)

    return text

#### #### ####

# When enabled, run every text of every split through the placeholder encoding.
if spa_char_encode:
    input_train = list(map(encode_unseen_characters, input_train))
    input_test = list(map(encode_unseen_characters, input_test))
    output_train = list(map(encode_unseen_characters, output_train))
    output_test = list(map(encode_unseen_characters, output_test))

## Show data

In [None]:
# First training input (after the optional placeholder encoding)
input_train[0]

In [None]:
# First training target
output_train[0]

In [None]:
# First test input
input_test[0]

In [None]:
# First test target
output_test[0]

___

In [None]:
# Load the tokenizer published with the selected checkpoint name
tokenizer = T5Tokenizer.from_pretrained(model_version)

In [None]:
def check_token_lengths(tokenizer, input_train, output_train, input_test, output_test):
    """Print the longest tokenised length found in each of the four splits.

    The splits are reported in the order input train, input test,
    output train, output test, with a separator line after the two input
    splits.

    Args:
        tokenizer: Object exposing ``tokenize(text)`` that returns a sized
            sequence of tokens.
        input_train: Iterable of training input texts.
        output_train: Iterable of training target texts.
        input_test: Iterable of test input texts.
        output_test: Iterable of test target texts.

    Raises:
        ValueError: If any split is empty (``max`` of an empty sequence).
    """
    labelled_splits = (
        ('input train', input_train),
        ('input test', input_test),
        ('output train', output_train),
        ('output test', output_test),
    )

    for position, (label, texts) in enumerate(labelled_splits):

        longest = max(len(tokenizer.tokenize(text)) for text in texts)
        print('Max '+label+' token length: ', longest)
        if position == 1:
            # visual break between the input and the output splits
            print('-- --')

####

# Report the longest tokenised text per split for the loaded data
check_token_lengths(tokenizer, input_train, output_train, input_test, output_test)

__

In [None]:
def calculate_val_loss_points(input_train, divisor, min_points=120, max_points=600):
    """Find the number of logging points that evenly splits the scaled input size.

    Computes ``ceil(len(input_train) / divisor)`` and returns the smallest
    integer in ``range(min_points, max_points)`` that divides it exactly.
    The chosen value is also printed.

    Args:
        input_train: Sized collection of training inputs.
        divisor: Number the input size is divided by before the search.
        min_points: Smallest candidate considered (inclusive). Defaults to
            the previously hard-coded 120.
        max_points: Upper bound of the search (exclusive). Defaults to the
            previously hard-coded 600.

    Returns:
        The first candidate that divides the scaled size without remainder.

    Raises:
        ValueError: If no candidate in the range divides the scaled size.
            Falling through used to return ``None`` implicitly, which only
            surfaced later as a TypeError when the value was used in a
            division.
    """
    len_input = math.ceil(len(input_train)/divisor)
    for candidate in range(min_points, max_points):
        if len_input % candidate == 0:
            print('batch divisor: ', candidate)
            return candidate

    raise ValueError(
        f'no value in range({min_points}, {max_points}) evenly divides {len_input}'
    )
        
####
# The training-set size is divided by 2 (rounded up) before the divisor search
divisor=2
batch_divisor = calculate_val_loss_points(input_train, divisor)

# Model

In [None]:
# 1. Open the W&B run and keep its id (the training loop hands it back to wandb.init)
wandb.init(project='NLG WikiMusica')
run_id = wandb.run.id

# 2. Store model inputs and hyperparameters on the run config
config = wandb.config

run_settings = {
    # Batching and schedule
    'TRAIN_BATCH_SIZE': 8,
    'VALID_BATCH_SIZE': 2,
    'EPOCHS': 12,
    # Seed used for the random number generators
    'SEED': 89,
    # Token length limits for inputs and targets
    'MAX_INPUT_LEN': 400,
    'MAX_OUTPUT_LEN': 270,
    # Arguments forwarded to the Adafactor optimizer
    'LEARNING_RATE': 1e-3,
    'EPS': (1e-30, 1e-3),
    'CLIP_THRESHOLD': 1.0,
    'DECAY_RATE': -0.8,
    'BETA1': None,
    'WEIGHT_DECAY': 0.0,
    'RELATIVE_STEP': False,
    'SCALE_PARAMETER': False,
    'WARMUP_INIT': False,
    # Choices made in the selection cell
    'DATASET': dataset_version,
    'MODEL': model_version,
    'ENCODE_SPA_CHAR': spa_char_encode,
}

# Same effect as one `config.NAME = value` assignment per entry, in this order.
for setting_name, setting_value in run_settings.items():
    setattr(config, setting_name, setting_value)

In [None]:
# Check GPU: run nvidia-smi through the IPython shell escape
!nvidia-smi

In [None]:
# Use the first CUDA device when one is available, otherwise fall back to the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print("Running on the GPU" if device.type == "cuda" else "Running on the CPU")

In [None]:
class NLGWikiDataset(Dataset):
    """Dataset that pairs each input text with its target text and tokenises on access.

    Args:
        data_input: Indexable, sized collection of source texts.
        data_output: Indexable collection of target texts, aligned with
            ``data_input`` by position.
        tokenizer: Tokenizer exposing ``batch_encode_plus``.
        source_len: Fixed token length for encoded source texts.
        target_len: Fixed token length for encoded target texts.
    """

    def __init__(self, data_input, data_output, tokenizer, source_len, target_len):

        # This used to read a notebook-level global named `data`, which raised
        # NameError on a clean namespace and otherwise kept an unrelated object
        # alive. The attribute is kept, but now built from the arguments.
        self.data = (data_input, data_output)
        self.tokenizer = tokenizer
        self.source_len = source_len
        self.target_len = target_len
        self.attr = data_input
        self.target = data_output


    def __len__(self):
        """Return the number of source texts."""

        return len(self.attr)


    def _encode(self, text, max_length):
        """Tokenise one text, padded and truncated to exactly ``max_length`` ids."""

        # `padding`/`truncation` replace the deprecated `pad_to_max_length`
        # flag and make the truncation explicit instead of implicit.
        return self.tokenizer.batch_encode_plus([text],
                                                padding='max_length',
                                                truncation=True,
                                                max_length=max_length,
                                                return_tensors='pt')


    def __getitem__(self, index):
        """Return the encoded source/target pair stored at ``index``.

        Returns:
            dict with ``source_ids``, ``source_mask``, ``target_ids`` and
            ``target_ids_y`` (a second copy of the target ids), all as
            ``torch.long`` tensors.
        """

        source = self._encode(self.attr[index], self.source_len)
        target = self._encode(self.target[index], self.target_len)

        # squeeze(0) drops only the batch axis; a bare squeeze() would also
        # collapse the sequence axis when the length limit is 1.
        source_ids = source['input_ids'].squeeze(0)
        source_mask = source['attention_mask'].squeeze(0)
        target_ids = target['input_ids'].squeeze(0)

        return {
            'source_ids': source_ids.to(dtype=torch.long),
            'source_mask': source_mask.to(dtype=torch.long),
            'target_ids': target_ids.to(dtype=torch.long),
            'target_ids_y': target_ids.to(dtype=torch.long)
        }

In [None]:
def train(epoch, tokenizer, model, device, loader, optimizer, batch_divisor):
    """Run one training epoch and log chunk-averaged and full-epoch losses to W&B.

    Args:
        epoch: Zero-based epoch index; reported one-based in the logs.
        tokenizer: Not used in the body; kept so existing calls keep working.
        model: Model called with ``input_ids``, ``attention_mask`` and
            ``labels``; the first element of its output is taken as the loss.
        device: Device the batch tensors are moved to.
        loader: Sized iterable of batches with ``source_ids``,
            ``source_mask`` and ``target_ids`` entries.
        optimizer: Optimizer stepped once per batch.
        batch_divisor: Intended number of loss points logged per epoch.
    """

    model.train()

    epoch=epoch+1
    # Number of batches averaged into one logged point. Clamped to 1 because a
    # loader shorter than batch_divisor/2 rounds to 0 and the modulo below
    # would then raise ZeroDivisionError.
    log_chunk = max(1, round(len(loader)/batch_divisor))
    log_step = 1
    loss_sum = 0

    total_epoch_loss = 0
    for batch_idx, batch in enumerate(loader):

        mask = batch['source_mask'].to(device, dtype = torch.long)
        input_ids = batch['source_ids'].to(device)
        # NOTE(review): presumably these labels still contain padding ids; if
        # padding should not count toward the loss, those positions are usually
        # set to -100 before the forward pass - confirm intent.
        labels = batch['target_ids'].to(device)

        # clear the gradients left over from the previous batch
        optimizer.zero_grad()

        outputs = model(input_ids = input_ids, attention_mask = mask, labels=labels)

        loss = outputs[0]  ## outputs.loss
        loss_sum += loss.item()
        total_epoch_loss += loss.item()

        if (batch_idx+1)%log_chunk == 0:

            loss_mean = (loss_sum/log_chunk)
            # fractional epoch position used as the x axis of the loss curve
            x_axis = round((log_step+(epoch-1)*batch_divisor)/batch_divisor,3)
            print(f'{x_axis}. Epoch: {epoch}, Training Loss:  {loss_mean}')
            wandb.log({"Training Loss": loss_mean, "epoch": x_axis, "epoch_n": epoch})
            log_step += 1
            loss_sum = 0

        # calculating the gradients
        loss.backward()

        # updating the params
        optimizer.step()

    # Save full epoch loss
    full_epoch_loss = total_epoch_loss/len(loader)
    wandb.log({"Training Epoch Loss": full_epoch_loss, "epoch_n": epoch})

In [None]:
def validate(epoch, tokenizer, model, device, loader, batch_divisor):
    """Evaluate the model for one epoch and log chunk-averaged and full-epoch losses to W&B.

    Args:
        epoch: Zero-based epoch index; reported one-based in the logs.
        tokenizer: Not used in the body; kept so existing calls keep working.
        model: Model called with ``input_ids``, ``attention_mask`` and
            ``labels``; the first element of its output is taken as the loss.
        device: Device the batch tensors are moved to.
        loader: Sized iterable of batches with ``source_ids``,
            ``source_mask`` and ``target_ids`` entries.
        batch_divisor: Intended number of loss points logged per epoch.
    """

    model.eval()

    epoch=epoch+1
    # Number of batches averaged into one logged point. Clamped to 1 because a
    # loader shorter than batch_divisor/2 rounds to 0 and the modulo below
    # would then raise ZeroDivisionError.
    log_chunk = max(1, round(len(loader)/batch_divisor))
    log_step = 1
    loss_sum = 0

    total_epoch_loss = 0
    with torch.no_grad():
        for batch_idx, batch in enumerate(loader):

            # The attention mask is passed exactly as in train(); the forward
            # call here previously omitted it.
            mask = batch['source_mask'].to(device, dtype = torch.long)
            input_ids = batch['source_ids'].to(device)
            labels = batch['target_ids'].to(device)

            outputs = model(input_ids = input_ids, attention_mask = mask, labels=labels)

            loss = outputs[0]  ## outputs.loss
            loss_sum += loss.item()
            total_epoch_loss += loss.item()

            if (batch_idx+1)%log_chunk == 0:

                loss_mean = (loss_sum/log_chunk)
                # fractional epoch position used as the x axis of the loss curve
                x_axis = round((log_step+(epoch-1)*batch_divisor)/batch_divisor,3)
                print(f'{x_axis}. Epoch: {epoch}, Validation Loss:  {loss_mean}')
                wandb.log({"Validation Loss": loss_mean, "epoch": x_axis})
                log_step += 1
                loss_sum = 0

        # Save full epoch loss
        full_epoch_loss = total_epoch_loss/len(loader)
        wandb.log({"Validation Epoch Loss": full_epoch_loss, "epoch_n": epoch})

In [None]:
# Set random seeds and deterministic pytorch for reproducibility

random.seed(config.SEED)                  # python random seed (imported above but previously never seeded)
torch.manual_seed(config.SEED)            # pytorch random seed
np.random.seed(config.SEED)               # numpy random seed
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False    # keep cuDNN from auto-tuning kernels, which can vary between runs

In [None]:
# Wrap the train and test splits in datasets that share the tokenizer and the
# configured token length limits.
training_set = NLGWikiDataset(
    data_input=input_train,
    data_output=output_train,
    tokenizer=tokenizer,
    source_len=config.MAX_INPUT_LEN,
    target_len=config.MAX_OUTPUT_LEN,
)

validation_set = NLGWikiDataset(
    data_input=input_test,
    data_output=output_test,
    tokenizer=tokenizer,
    source_len=config.MAX_INPUT_LEN,
    target_len=config.MAX_OUTPUT_LEN,
)

In [None]:
# DataLoader settings per split: configured batch size, no shuffling, no worker processes
train_params = dict(
    batch_size=config.TRAIN_BATCH_SIZE,
    shuffle=False,
    num_workers=0,
)

val_params = dict(
    batch_size=config.VALID_BATCH_SIZE,
    shuffle=False,
    num_workers=0,
)

# Build the loaders consumed by the training and validation stages of the model.
training_loader = DataLoader(training_set, **train_params)
validation_loader = DataLoader(validation_set, **val_params)

print('-- * -- * -- * --')

# Batch sizes
print(f'Batch Size (Train): {config.TRAIN_BATCH_SIZE}')
print(f'Batch Size (Validation): {config.VALID_BATCH_SIZE}')

# Number of batches per epoch
print(f'Length Loader (Train): {len(training_loader)}')
print(f'Length Loader (Validation): {len(validation_loader)}')

# Loader length divided by batch_divisor, i.e. batches per logged loss point
print(f'Batch Divisor (Train): {len(training_loader)/batch_divisor}')
print(f'Batch Divisor (Validation): {len(validation_loader)/batch_divisor}')

In [None]:
# Load pretrained model
# The selectable checkpoints include 'google/mt5-*' names; those are loaded with
# the mT5 class that the import cell already provides instead of the plain T5 class.
model_class = MT5ForConditionalGeneration if 'mt5' in config.MODEL.lower() else T5ForConditionalGeneration
model = model_class.from_pretrained(config.MODEL)
model = model.to(device)

In [None]:
# Optimizer that tunes the network weights during training; every argument comes from the run config.
optimizer = Adafactor(params =  model.parameters(), 
                      lr = config.LEARNING_RATE,
                      eps = config.EPS,
                      clip_threshold = config.CLIP_THRESHOLD,
                      decay_rate = config.DECAY_RATE,
                      beta1 = config.BETA1,
                      weight_decay = config.WEIGHT_DECAY,
                      relative_step = config.RELATIVE_STEP,
                      scale_parameter = config.SCALE_PARAMETER,
                      warmup_init = config.WARMUP_INIT)



# Training & Validation loop
print('Initiating Fine-Tuning for the model on our dataset')

wandb.watch(model, log="all")
for epoch in range(config.EPOCHS):
    
    # NOTE(review): the run is re-initialised with the stored run id on every
    # epoch after the first - confirm this is still the intended way to resume.
    if epoch!=0:
        wandb.init(project='NLG WikiMusica', resume=run_id)
    
    # Log metrics with wandb
    train(epoch, tokenizer, model, device, training_loader, optimizer, batch_divisor)
    validate(epoch, tokenizer, model, device, validation_loader, batch_divisor)
    
    try:
        # Save model for each epoch
        wandb.unwatch()
        torch.save(model, f'../Models/{wandb.run.name}-{epoch+1}.pt')
    except Exception as save_error:
        # Checkpointing stays best-effort so a failed save does not abort
        # training, but the failure is reported instead of silently dropped.
        # Catching Exception (not a bare except) lets KeyboardInterrupt stop the run.
        print(f'Could not save the model for epoch {epoch+1}: {save_error!r}')

___