# HW5: Large Language Model: NanoGPT

# Finetuning Model

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import random
import os
import gc
from typing import Tuple, Dict
from torch.utils.data import Dataset, DataLoader
from datasets import load_dataset
import datasets
import json
import shutil
from collections import Counter
from torch.nn.utils.rnn import pad_sequence, pack_padded_sequence, pad_packed_sequence
from tqdm import tqdm
import torchsummary
import Levenshtein
import numpy as np
import nltk
from rouge import Rouge 
from torch.utils.data.dataset import ConcatDataset
import re
import os
from collections import Counter
import torch.nn.functional as F

# Prefer the GPU when PyTorch can see one; otherwise run everything on the CPU.
_backend_name = 'cuda' if torch.cuda.is_available() else 'cpu'
DEVICE = torch.device(_backend_name)
print(DEVICE)

# Tokenizer

In [None]:
class UpgradeTokenizer2:
    """Lower-casing word/punctuation tokenizer backed by a word -> id dictionary.

    The vocabulary always starts with the five special tokens
    ``[PAD]``, ``[UNK]``, ``[CLS]``, ``[SEP]`` and ``[MASK]`` (ids 0-4).
    ``vocab`` is a plain dict and may be replaced wholesale by the caller.
    """

    DEFAULT_PUNCTUATIONS = ('.', ',', '!', '?', ':', ';', '-', '(', ')')

    # A word (optionally containing one apostrophe) or any single
    # non-word, non-space character.
    _TOKEN_PATTERN = re.compile(r"\b\w+'?\w*|[^\w\s]")

    def __init__(self, max_vocab_size, punctuations=None):
        """Create a tokenizer holding only the special tokens.

        Args:
            max_vocab_size: Upper bound on the vocabulary size, special
                tokens included.
            punctuations: Tokens that are glued to the preceding word when
                decoding. Defaults to a fresh copy of
                ``DEFAULT_PUNCTUATIONS`` so instances never share one list.
        """
        self.vocab = {'[PAD]': 0, '[UNK]': 1, '[CLS]': 2, '[SEP]': 3, '[MASK]': 4}
        self.mask_token = '[MASK]'
        self.max_vocab_size = max_vocab_size
        if punctuations is None:
            punctuations = list(self.DEFAULT_PUNCTUATIONS)
        self.punctuations = punctuations

    def custom_tokenize(self, text):
        """Split lower-cased ``text`` into word and punctuation strings."""
        return self._TOKEN_PATTERN.findall(text.lower())

    def build_vocab(self, corpus):
        """Add the most frequent words of ``corpus`` until the vocab is full.

        Words that already have an id keep it, so calling this method more
        than once never hands out duplicate ids.
        """
        word_counts = Counter(
            word for sentence in corpus for word in self.custom_tokenize(sentence)
        )
        for word, _ in word_counts.most_common():
            if len(self.vocab) >= self.max_vocab_size:
                break
            if word in self.vocab:
                continue
            self.vocab[word] = len(self.vocab)

    def tokenize(self, text):
        """Return the id of every token in ``text``; unknown words map to ``[UNK]``."""
        unk_id = self.vocab['[UNK]']
        return [self.vocab.get(word, unk_id) for word in self.custom_tokenize(text)]

    def convert_tokens_to_string(self, tokens):
        """Decode an iterable of ids back into a readable sentence.

        Args:
            tokens: Iterable of ids; each element only has to support
                ``int()``.

        Returns:
            The words joined by single spaces, with punctuation attached to
            the word before it.

        Raises:
            ValueError: If an id is not present in the vocabulary.
        """
        # Build the reverse mapping once per call instead of scanning the
        # whole vocabulary for every token. setdefault keeps the first word
        # for an id, matching a linear search.
        id_to_word = {}
        for word, idx in self.vocab.items():
            id_to_word.setdefault(idx, word)

        punctuation = set(self.punctuations)
        pieces = []
        last_char = ''  # last character emitted so far ('' while empty)
        for token in tokens:
            idx = int(token)
            if idx not in id_to_word:
                raise ValueError(f"{idx} is not in vocab")
            word = id_to_word[idx]
            if word not in punctuation and last_char not in ('', ' '):
                pieces.append(' ')
                last_char = ' '
            pieces.append(word)
            if word:
                last_char = word[-1]
        return ''.join(pieces)

# Create the fine-tuning tokenizer. It starts with only the special tokens;
# its vocabulary is replaced by the JSON vocabulary in the "Load VOCAB" cell.
tokenizer_fine = UpgradeTokenizer2(max_vocab_size=60006)  # Adjust max_vocab_size as needed

### Load VOCAB

In [None]:
vocab_fine_file = 'vocab60000-latest_fine.json'
# Read the JSON as UTF-8 explicitly so the result does not depend on the
# platform's default text encoding.
with open(vocab_fine_file, 'r', encoding='utf-8') as f:
    VOCAB_FINE = json.load(f)

# Swap the tokenizer's built-in special-token vocabulary for the loaded one.
tokenizer_fine.vocab = VOCAB_FINE

In [None]:
# Print the first 101 vocabulary tokens next to their position in the mapping.
for position, token in enumerate(list(tokenizer_fine.vocab)[:101]):
    print(position, token)


# Load Data

In [None]:
# Load the pre-tokenized CNN and QA corpora (train and validation splits).
# Presumably each file holds an object array of token-id sequences, which is
# why allow_pickle=True is needed — TODO confirm.
# NOTE(review): allow_pickle=True unpickles arbitrary Python objects; only
# load these files from a trusted source.
cnn_train_dataset = np.load('/root/CNN_train_tokenized_60000.npy', allow_pickle=True)
train_qa_dataset     = np.load('/root/sq_train_tokenized_60000.npy', allow_pickle=True)
cnn_val_dataset = np.load('/root/CNN_val_tokenized_60000.npy', allow_pickle=True)
val_qa_dataset     = np.load('/root/sq_val_tokenized_60000.npy', allow_pickle=True)

In [None]:
# Hyper-parameters for the fine-tuning run.
config_fine = {
    'batch_size': 64,
    'epochs': 30,
    'lr': 1e-5,
    'weight_decay': 5e-3,
    'tf_ratio': 1.0,
    'patience': 1,
}

# DataLoader

In [None]:
class DataLoaderForLanguageModeling(torch.utils.data.DataLoader):
    """Batch a corpus of token sequences for next-token prediction.

    All samples are concatenated into one token stream, zero-padded to a
    multiple of ``batch_size`` and reshaped to ``(batch_size, row_len)``.
    Every batch is a window of ``seq_len`` consecutive columns of that
    matrix. The target of a token is the token that follows it in the flat
    stream; the final token of the stream gets target 0.

    Iteration is done entirely in ``__iter__``; ``num_workers`` is only
    forwarded to the parent constructor.
    """

    def __init__(self, dataset, batch_size, num_workers, seq_len=512, shuffle=True, drop_last=False):
        """
        Args:
            dataset: Sequence of 1-D token-id sequences.
            batch_size: Number of rows in every batch.
            num_workers: Forwarded to ``torch.utils.data.DataLoader``.
            seq_len: Number of columns (time steps) per batch.
            shuffle: Visit the samples in a fresh random order each epoch.
            drop_last: Drop the final batch if it is shorter than ``seq_len``.
        """
        super().__init__(
            dataset,
            batch_size=batch_size,
            num_workers=num_workers,
            drop_last=drop_last,
        )
        self.shuffle = shuffle
        self.seq_len = seq_len
        # Total number of tokens; summing lengths avoids materialising the
        # concatenated corpus just to measure it.
        self.l = sum(len(sample) for sample in dataset)
        self.num_batches = len(self)

    def __len__(self):
        """Return the number of batches ``__iter__`` yields."""
        row_len = -(-self.l // self.batch_size)  # ceil division: columns per row after padding
        full_batches, remainder = divmod(row_len, self.seq_len)
        if self.drop_last or remainder == 0:
            return full_batches
        return full_batches + 1

    def __iter__(self):
        """Yield ``(input, target)`` LongTensor pairs of shape ``(batch_size, <=seq_len)``."""
        if len(self.dataset) == 0:
            return
        if self.shuffle:
            # Shuffle through an index permutation so the caller's dataset
            # object is left untouched.
            order = np.random.permutation(len(self.dataset))
            stream = np.concatenate([self.dataset[i] for i in order])
        else:
            stream = np.concatenate(self.dataset)
        if len(stream) == 0:
            return

        padding_size = -len(stream) % self.batch_size
        stream = np.pad(stream, (0, padding_size), mode='constant')

        # Shift the *flat* stream by one so a row's last target is the first
        # token of the next row rather than wrapping around inside the row.
        shifted = np.empty_like(stream)
        shifted[:-1] = stream[1:]
        shifted[-1] = 0

        inputs = stream.reshape(self.batch_size, -1)
        targets = shifted.reshape(self.batch_size, -1)

        for batch_idx in range(len(self)):
            start_idx = batch_idx * self.seq_len
            end_idx = start_idx + self.seq_len  # slicing clamps the last, shorter window
            batch_inputs = torch.tensor(inputs[:, start_idx:end_idx], dtype=torch.long)
            batch_targets = torch.tensor(targets[:, start_idx:end_idx], dtype=torch.long)
            yield batch_inputs, batch_targets

In [None]:
class DataLoaderForLanguageModelingFine(torch.utils.data.DataLoader):
    """Next-token batching over a ``ConcatDataset`` of token sequences.

    The samples of every sub-dataset are pooled, optionally shuffled at
    sample level, concatenated into one token stream, zero-padded to a
    multiple of ``batch_size`` and reshaped to ``(batch_size, row_len)``.
    Every batch is a window of ``seq_len`` consecutive columns. The target
    of a token is the token that follows it in the flat stream; the final
    token of the stream gets target 0.
    """

    def __init__(self, dataset, batch_size, num_workers, seq_len=512, shuffle=True, drop_last=False):
        """
        Args:
            dataset: A ``ConcatDataset`` (anything exposing ``.datasets``)
                or a plain sequence of 1-D token-id sequences.
            batch_size: Number of rows in every batch.
            num_workers: Forwarded to ``torch.utils.data.DataLoader``.
            seq_len: Number of columns (time steps) per batch.
            shuffle: Visit the pooled samples in a fresh random order each epoch.
            drop_last: Drop the final batch if it is shorter than ``seq_len``.
        """
        super().__init__(
            dataset,
            batch_size=batch_size,
            num_workers=num_workers,
            drop_last=drop_last,
        )
        self.shuffle = shuffle
        self.seq_len = seq_len
        self.l = sum(len(sample) for sample in self._samples())
        self.num_batches = len(self)

    def _samples(self):
        """Return every sample of the (possibly concatenated) dataset as a list."""
        parts = getattr(self.dataset, 'datasets', None)
        if parts is None:
            return list(self.dataset)
        return [sample for part in parts for sample in part]

    def __len__(self):
        """Return the number of batches ``__iter__`` yields."""
        row_len = -(-self.l // self.batch_size)  # ceil division: columns per row after padding
        full_batches, remainder = divmod(row_len, self.seq_len)
        if self.drop_last or remainder == 0:
            return full_batches
        return full_batches + 1

    def __iter__(self):
        """Yield ``(input, target)`` LongTensor pairs of shape ``(batch_size, <=seq_len)``."""
        samples = self._samples()
        if not samples:
            return
        if self.shuffle:
            # Mix samples from all sub-datasets; the permutation is actually
            # applied to the stream that gets batched.
            order = np.random.permutation(len(samples))
            samples = [samples[i] for i in order]
        stream = np.concatenate(samples)
        if len(stream) == 0:
            return

        padding_size = -len(stream) % self.batch_size
        stream = np.pad(stream, (0, padding_size), mode='constant')

        # Shift the *flat* stream by one so a row's last target is the first
        # token of the next row rather than wrapping around inside the row.
        shifted = np.empty_like(stream)
        shifted[:-1] = stream[1:]
        shifted[-1] = 0

        inputs = stream.reshape(self.batch_size, -1)
        targets = shifted.reshape(self.batch_size, -1)

        for batch_idx in range(len(self)):
            start_idx = batch_idx * self.seq_len
            end_idx = start_idx + self.seq_len  # slicing clamps the last, shorter window
            batch_inputs = torch.tensor(inputs[:, start_idx:end_idx], dtype=torch.long)
            batch_targets = torch.tensor(targets[:, start_idx:end_idx], dtype=torch.long)
            yield batch_inputs, batch_targets

In [None]:
# Per-task validation loaders: fixed order, incomplete final batch dropped.
_val_loader_options = dict(
    batch_size=config_fine["batch_size"],
    shuffle=False,
    drop_last=True,
    num_workers=64,
)
dl_cnn_val = DataLoaderForLanguageModeling(dataset=cnn_val_dataset, **_val_loader_options)
dl_qa_val = DataLoaderForLanguageModeling(dataset=val_qa_dataset, **_val_loader_options)

# Pool the QA and CNN samples into one dataset per split.
combined_dataset_val = ConcatDataset([val_qa_dataset, cnn_val_dataset])
combined_dataset = ConcatDataset([train_qa_dataset, cnn_train_dataset])

# Loaders over the pooled datasets (shuffle requested for training only).
dataloader_combined = DataLoaderForLanguageModelingFine(
    combined_dataset,
    batch_size=config_fine["batch_size"],
    shuffle=True,
    num_workers=64,
)
dataloader_combined_val = DataLoaderForLanguageModelingFine(
    combined_dataset_val,
    batch_size=config_fine["batch_size"],
    shuffle=False,
    num_workers=64,
)

# Smoke test: report the shape of the first training batch only.
for i, (x, y) in enumerate(dataloader_combined):
    print(f"Batch {i + 1} - x shape: {x.shape}, y shape: {y.shape}")
    break

In [None]:
# Print the shapes of one training batch, then drop the references to it.
for x, y in dataloader_combined:
    print(x.shape, y.shape, sep="\n")
    break
del x, y

In [None]:
def scaled_dot_product_attention(query, key, value, mask=None):
    """Compute softmax(Q K^T / sqrt(d_k)) V.

    Args:
        query, key, value: Tensors whose last two dimensions are
            (sequence, feature).
        mask: Optional tensor broadcastable to the score matrix; positions
            where ``mask == 0`` are filled with -1e9 before the softmax.

    Returns:
        Tuple ``(output, attention_weights)``.
    """
    scale = torch.sqrt(torch.tensor(query.size(-1), dtype=torch.float32))
    logits = torch.matmul(query, key.transpose(-2, -1)) / scale

    if mask is not None:
        logits = logits.masked_fill(mask == 0, -1e9)

    weights = F.softmax(logits, dim=-1)
    return torch.matmul(weights, value), weights
class MultiHeadAttention(nn.Module):
    """Multi-head attention built on ``scaled_dot_product_attention``.

    ``d_model`` is split evenly across ``num_heads`` heads. A dropout layer
    is created but is not applied anywhere in ``forward``.
    """

    def __init__(self, d_model, num_heads, dropout=0.2):
        super().__init__()

        self.d_model = d_model
        self.d_k = d_model // num_heads
        self.h = num_heads

        # Creation order is kept so seeded initialisation stays reproducible.
        self.q_linear = nn.Linear(d_model, d_model)
        self.v_linear = nn.Linear(d_model, d_model)
        self.k_linear = nn.Linear(d_model, d_model)
        self.dropout = nn.Dropout(dropout)
        self.out = nn.Linear(d_model, d_model)

    def forward(self, q, k, v, attn_mask=None):
        """Project q/k/v, attend per head, merge the heads and project back."""
        batch = q.size(0)

        def split_heads(projection, tensor):
            # (batch, seq, d_model) -> (batch, heads, seq, d_k)
            projected = projection(tensor).view(batch, -1, self.h, self.d_k)
            return projected.transpose(1, 2)

        key_heads = split_heads(self.k_linear, k)
        query_heads = split_heads(self.q_linear, q)
        value_heads = split_heads(self.v_linear, v)

        context, _ = scaled_dot_product_attention(query_heads, key_heads, value_heads, attn_mask)

        # (batch, heads, seq, d_k) -> (batch, seq, d_model)
        merged = context.transpose(1, 2).contiguous().view(batch, -1, self.d_model)
        return self.out(merged)

In [None]:
import math

class PositionalEncoding(torch.nn.Module):
    """Add fixed sinusoidal position encodings to a (batch, seq, dim) tensor.

    The table is stored once, as the persistent buffer ``pos_encode``, so it
    follows the module through ``.to(...)`` and keeps its state-dict key.
    """

    def __init__(self, projection_size, max_seq_len=800):
        """
        Args:
            projection_size: Feature dimension of the inputs (odd sizes are
                supported).
            max_seq_len: Longest sequence the table covers.
        """
        super().__init__()
        position = torch.arange(0, max_seq_len, dtype=torch.float32).unsqueeze(1)
        denominator = torch.exp(
            torch.arange(0, projection_size, 2, dtype=torch.float32)
            * -(math.log(10000.0) / projection_size)
        )
        angles = position * denominator

        table = torch.zeros(max_seq_len, projection_size)
        table[:, 0::2] = torch.sin(angles)
        # For an odd projection_size there is one cosine column fewer than
        # sine columns, so trim the angles to fit.
        table[:, 1::2] = torch.cos(angles[:, : projection_size // 2])

        self.register_buffer('pos_encode', table.unsqueeze(0))

    @property
    def pe(self):
        """Alias of the ``pos_encode`` buffer, kept for existing callers."""
        return self.pos_encode

    def forward(self, x):
        """Return ``x`` plus the encodings of its first ``x.size(1)`` positions.

        Raises:
            ValueError: If the sequence is longer than the encoding table.
        """
        seq_len = x.size(1)
        max_len = self.pos_encode.size(1)
        if seq_len > max_len:
            raise ValueError(
                f"sequence length {seq_len} exceeds max_seq_len {max_len} of the positional encoding"
            )
        return x + self.pos_encode[:, :seq_len]
    


class TransformerBlock(torch.nn.Module):
    """Post-norm transformer block: attention + residual, then MLP + residual.

    ``bn1``/``bn2`` are LayerNorm layers; the attribute names are kept so
    existing checkpoints still load.
    """

    def __init__(self, projection_size, hidden_size, num_heads, dropout=0.2):
        """
        Args:
            projection_size: Model (embedding) dimension.
            hidden_size: Width of the feed-forward hidden layer.
            num_heads: Number of attention heads.
            dropout: Dropout probability used by the attention module, inside
                the MLP and on the MLP output. The default matches the value
                that used to be hard-coded in the MLP.
        """
        super().__init__()

        self.attention = MultiHeadAttention(projection_size, num_heads, dropout=dropout)

        self.bn1 = torch.nn.LayerNorm(projection_size)
        self.bn2 = torch.nn.LayerNorm(projection_size)

        # Position-wise feed-forward network.
        self.MLP = torch.nn.Sequential(
            torch.nn.Linear(projection_size, hidden_size),
            torch.nn.ReLU(),
            torch.nn.Dropout(dropout),
            torch.nn.Linear(hidden_size, projection_size),
        )

        self.dropout = nn.Dropout(dropout)

    def forward(self, value, key, query, mask):
        """Apply the block. Note the argument order: value, key, query, mask."""
        attended = self.attention(query, key, value, mask)
        out1 = self.bn1(attended + query)  # residual around attention

        out2 = self.dropout(self.MLP(out1))
        return self.bn2(out2 + out1)  # residual around the MLP

In [None]:
class Decoder(torch.nn.Module):
    """Token embedding, positional encoding, LayerNorm, a stack of
    transformer blocks and a final linear projection to ``output_size``.
    """

    def __init__(self,
                input_size,
                embedding_size,
                hidden_size,
                output_size,
                n_heads,
                tf_blocks,
                dropout):
        super().__init__()

        self.embedding_size = embedding_size
        self.embedding = nn.Embedding(input_size, embedding_size)
        self.positional_encoding = PositionalEncoding(embedding_size)

        # Stack of ``tf_blocks`` identical blocks (each uses its own default dropout).
        stack = []
        for _ in range(tf_blocks):
            stack.append(TransformerBlock(embedding_size, hidden_size, n_heads))
        self.transformer_blocks = torch.nn.ModuleList(stack)

        # The dropout after the embedding is fixed at 0.1; only the one after
        # the positional encoding uses the ``dropout`` argument.
        self.droupout1 = nn.Dropout(0.1)
        self.layer_norm = nn.LayerNorm(embedding_size)
        self.linear = nn.Linear(embedding_size, output_size)
        self.droupout2 = nn.Dropout(dropout)

    def forward(self, x, mask):
        """Map token ids ``x`` to logits; ``mask`` is handed to every block."""
        hidden = self.droupout1(self.embedding(x))
        hidden = self.droupout2(self.positional_encoding(hidden))
        hidden = self.layer_norm(hidden)

        for layer in self.transformer_blocks:
            hidden = layer(hidden, hidden, hidden, mask)

        return self.linear(hidden)

In [None]:
class Transformer(nn.Module):
    """Decoder-only language model with greedy decoding helpers.

    NOTE(review): ``create_mask`` previously returned ``None``, so attention
    was never causal. Checkpoints trained with that behaviour saw future
    tokens during training and may need re-training with the mask in place.
    """

    def __init__(self, input_size, embedding_size, hidden_size, output_size, num_heads, tf_blocks,dropout = 0.1):
        super(Transformer, self).__init__()
        self.decoder = Decoder(input_size, embedding_size, hidden_size, output_size, num_heads, tf_blocks, dropout)

    def forward(self, x):
        """Return the decoder logits for token ids ``x`` under a causal mask."""
        mask = self.create_mask(x.size(1), device=x.device)
        return self.decoder(x, mask)

    def generate(self, input_seq, max_length=150):
        """Greedily append ``max_length`` tokens to ``input_seq``.

        Returns:
            The prompt followed by the generated tokens.
        """
        self.eval()
        generated_seq = input_seq.to(DEVICE)

        with torch.inference_mode():
            for _ in range(max_length):
                next_token = self._greedy_next_token(generated_seq)
                generated_seq = torch.cat((generated_seq, next_token), dim=1)

        return generated_seq

    def predict(self, x):
        """Return the most likely next token for each sequence in ``x``, shape (batch, 1)."""
        self.eval()
        if not torch.is_tensor(x):
            x = torch.tensor(x).long()
        x = x.to(DEVICE)

        with torch.inference_mode():
            return self._greedy_next_token(x)

    def _greedy_next_token(self, tokens):
        """Arg-max over the logits of the last position only.

        log-softmax is monotonic, so the arg-max of the raw logits picks the
        same token without normalising the whole sequence.
        """
        logits = self.forward(tokens)
        return logits[:, -1, :].argmax(dim=-1, keepdim=True)

    def create_mask(self, input_seq_length, device=None):
        """Return a lower-triangular causal mask.

        Entry ``[i, j]`` is 1 when position ``i`` may attend to position
        ``j`` (``j <= i``) and 0 otherwise, which matches the ``mask == 0``
        convention of ``scaled_dot_product_attention``.

        Args:
            input_seq_length: Side length of the square mask.
            device: Device for the mask; defaults to the global ``DEVICE``.
        """
        if device is None:
            device = DEVICE
        return torch.tril(torch.ones(input_seq_length, input_seq_length, device=device))
    



In [None]:
def calc_edit_distance(predictions, y, tokenizer, vocab=VOCAB_FINE, print_example=True):
    """Return the mean Levenshtein distance between decoded predictions and targets.

    ``predictions`` must be two-dimensional (batch, sequence); each row of
    ``predictions`` and ``y`` is decoded with
    ``tokenizer.convert_tokens_to_string`` before comparison. ``vocab`` and
    ``print_example`` are accepted but not used.
    """
    batch_size, _ = predictions.shape

    total = 0
    for row in range(batch_size):
        reference = tokenizer.convert_tokens_to_string(y[row])
        hypothesis = tokenizer.convert_tokens_to_string(predictions[row])
        total += Levenshtein.distance(hypothesis, reference)

    total /= batch_size
    return total
def calculate_loss(criterion, out, target):
    """Apply ``criterion`` to per-token logits and flattened targets.

    Args:
        criterion: Loss callable taking ``(logits_2d, targets_1d)``.
        out: Logits whose last dimension is the class dimension.
        target: Target ids with one entry per logit row.

    Returns:
        Whatever ``criterion`` returns.
    """
    # reshape (unlike view) also accepts non-contiguous logits, e.g. the
    # result of a transpose.
    logits = out.reshape(-1, out.size(-1))
    targets = torch.flatten(target)
    return criterion(logits, targets)

In [None]:
def train(model, dataloader, optimizer, criterion, clip):
    """Run one training epoch and return the mean batch loss.

    Args:
        model: Module mapping a batch of token ids to logits.
        dataloader: Iterable of ``(src, trg)`` tensor pairs.
        optimizer: Optimizer updating ``model``'s parameters.
        criterion: Loss passed to ``calculate_loss``.
        clip: Maximum gradient norm.

    Returns:
        Mean loss over the batches actually processed, or NaN if there
        were none.
    """
    model.train()
    epoch_loss = 0.0
    batches_done = 0
    batch_bar = tqdm(total=len(dataloader), dynamic_ncols=True, leave=False, position=0, desc='Train')

    try:
        for src, trg in dataloader:
            if src.numel() == 0:
                # An empty chunk has no targets to score and would turn the
                # running loss into NaN.
                batch_bar.update()
                continue

            src = src.to(DEVICE)
            trg = trg.to(DEVICE)

            optimizer.zero_grad()

            output = model(src)

            loss = calculate_loss(criterion, output, trg)
            loss.backward()

            # Clip the gradient norm to keep updates bounded.
            torch.nn.utils.clip_grad_norm_(model.parameters(), clip)

            optimizer.step()

            epoch_loss += loss.item()
            batches_done += 1

            batch_bar.set_postfix(
                loss="{:.05f}".format(epoch_loss / batches_done),
                lr="{:.05f}".format(float(optimizer.param_groups[0]['lr'])))
            batch_bar.update()

            del src, trg
            torch.cuda.empty_cache()
    finally:
        # Close the bar even if a batch raises, so the notebook output is
        # not left with a dangling progress line.
        batch_bar.close()

    if batches_done == 0:
        return float('nan')
    return epoch_loss / batches_done



In [None]:
def validate(model, dataloader, criterion):
    """Evaluate ``model`` on ``dataloader`` without gradients; return the mean batch loss.

    Args:
        model: Module mapping a batch of token ids to logits.
        dataloader: Iterable of ``(src, trg)`` tensor pairs.
        criterion: Loss passed to ``calculate_loss``.

    Returns:
        Mean loss over the batches actually processed, or NaN if there
        were none.
    """
    model.eval()
    epoch_loss = 0.0
    batches_done = 0

    batch_bar = tqdm(total=len(dataloader), dynamic_ncols=True, position=0, leave=False, desc="Val")
    try:
        with torch.inference_mode():
            for src, trg in dataloader:
                if src.numel() == 0:
                    # An empty chunk has no targets to score and would turn
                    # the running loss into NaN.
                    batch_bar.update()
                    continue

                src = src.to(DEVICE)
                trg = trg.to(DEVICE)

                output = model(src)
                loss = calculate_loss(criterion, output, trg)

                epoch_loss += loss.item()
                batches_done += 1
                batch_bar.set_postfix(
                    loss="{:.04f}".format(epoch_loss / batches_done))
                batch_bar.update()
                del src, trg
                torch.cuda.empty_cache()
    finally:
        # Close the bar even if a batch raises.
        batch_bar.close()

    if batches_done == 0:
        return float('nan')
    return epoch_loss / batches_done


In [None]:
# Collect unreachable Python objects first so the GPU memory they held is
# back in the caching allocator before the cache is emptied.
gc.collect()
torch.cuda.empty_cache()

In [None]:
class DecoderFinetune(torch.nn.Module):
    """Decoder for fine-tuning that warm-starts from a pretrained model.

    The positional encoding and transformer blocks are shared with
    ``pretrain.decoder``. The embedding and output projection are new
    layers sized for the fine-tuning vocabulary: rows that exist in the
    pretrained layers are copied, and only the extra rows are
    Xavier-initialised.
    """

    def __init__(self,
                pretrain,
                input_size,
                embedding_size,
                hidden_size,
                output_size,
                n_heads,
                tf_blocks,
                dropout):
        """
        Args:
            pretrain: Model exposing ``decoder.embedding``,
                ``decoder.positional_encoding``,
                ``decoder.transformer_blocks`` and ``decoder.linear``.
            input_size: Size of the fine-tuning vocabulary (embedding rows).
            embedding_size: Model dimension; must match the pretrained model.
            hidden_size, n_heads, tf_blocks: Accepted for signature parity;
                the blocks themselves come from ``pretrain``.
            output_size: Number of output classes.
            dropout: Dropout probability for both dropout layers.
        """
        super().__init__()
        source = pretrain.decoder

        self.embedding_size = embedding_size
        self.embedding_layer_finetune = nn.Embedding(input_size, embedding_size)
        self._warm_start_rows(self.embedding_layer_finetune.weight, source.embedding.weight)

        # Reuse the pretrained positional encoding and transformer blocks.
        self.positional_encoding = source.positional_encoding
        self.transformer_blocks = source.transformer_blocks

        self.droupout1 = nn.Dropout(dropout)
        # NOTE(review): this LayerNorm starts from its default initialisation;
        # the pretrained decoder's own layer_norm is not reused — confirm
        # whether that is intended.
        self.layer_norm = nn.LayerNorm(embedding_size)

        self.linear_fine = nn.Linear(embedding_size, output_size)
        self._warm_start_rows(self.linear_fine.weight, source.linear.weight)
        pretrained_bias = source.linear.bias
        if pretrained_bias is not None and self.linear_fine.bias is not None:
            shared = min(self.linear_fine.bias.size(0), pretrained_bias.size(0))
            self.linear_fine.bias.data[:shared] = pretrained_bias.data[:shared]
        self.droupout2 = nn.Dropout(dropout)

    @staticmethod
    def _warm_start_rows(target, pretrained):
        """Copy the rows both matrices share; Xavier-init only target's extra rows."""
        shared = min(target.size(0), pretrained.size(0))
        target.data[:shared] = pretrained.data[:shared]
        if target.size(0) > shared:
            # Slice from ``shared`` onwards so the copied rows are not touched.
            torch.nn.init.xavier_uniform_(target.data[shared:])

    def forward(self, x, mask):
        """Map token ids ``x`` to logits; ``mask`` is handed to every block."""
        output = self.embedding_layer_finetune(x)
        output = self.droupout1(output)
        output = self.positional_encoding(output)
        output = self.droupout2(output)

        output = self.layer_norm(output)

        for block in self.transformer_blocks:
            output = block(output, output, output, mask)

        return self.linear_fine(output)

In [None]:
class TransformerFineTune(nn.Module):
    """Fine-tuning wrapper around ``DecoderFinetune`` with sampling helpers."""

    def __init__(self, pretrain, input_size, embedding_size, hidden_size, output_size, num_heads, tf_blocks,dropout = 0.1):
        super(TransformerFineTune, self).__init__()
        self.decoder = DecoderFinetune(pretrain, input_size, embedding_size, hidden_size, output_size, num_heads, tf_blocks, dropout)

    def forward(self, x):
        """Return the decoder logits for token ids ``x`` under a causal mask."""
        mask = self.create_mask(x.size(1), device=x.device)
        return self.decoder(x, mask)

    def generate(self, input_seq, max_length=150):
        """Append ``max_length`` sampled tokens to ``input_seq``.

        Args:
            input_seq: (batch, seq) token ids, as a tensor or anything
                ``torch.tensor`` accepts.
            max_length: Number of tokens to generate.

        Returns:
            The prompt followed by the sampled tokens.
        """
        self.eval()
        if not torch.is_tensor(input_seq):
            generated_seq = torch.tensor(input_seq).long().to(DEVICE)
        else:
            generated_seq = input_seq.to(DEVICE)

        with torch.inference_mode():
            for _ in range(max_length):
                logits = self.forward(generated_seq)
                next_token = self.random_sampling(logits)
                generated_seq = torch.cat((generated_seq, next_token), dim=1)

        return generated_seq

    def predict(self, x):
        """Return the most likely next token for each sequence in ``x``, shape (batch, 1)."""
        self.eval()
        if not torch.is_tensor(x):
            x = torch.tensor(x).long()
        x = x.to(DEVICE)

        with torch.inference_mode():
            logits = self.forward(x)
            # log-softmax is monotonic, so the arg-max of the raw last-step
            # logits selects the same token.
            next_token = logits[:, -1, :].argmax(dim=-1, keepdim=True)

        return next_token

    def create_mask(self, input_seq_length, device=None):
        """Return a lower-triangular causal mask.

        Entry ``[i, j]`` is 1 when position ``i`` may attend to position
        ``j`` (``j <= i``) and 0 otherwise, which matches the ``mask == 0``
        convention of ``scaled_dot_product_attention``.

        Args:
            input_seq_length: Side length of the square mask.
            device: Device for the mask; defaults to the global ``DEVICE``.
        """
        if device is None:
            device = DEVICE
        return torch.tril(torch.ones(input_seq_length, input_seq_length, device=device))

    def random_sampling(self, logits, temperature = 0.1):
        """Sample one next token per sequence from the last-step distribution.

        Args:
            logits: (batch, seq, vocab) unnormalised scores.
            temperature: Divides the logits; values below 1 sharpen the
                distribution.

        Returns:
            LongTensor of shape (batch, 1) on the same device as ``logits``.
        """
        last_step = logits[:, -1, :] / temperature
        # Pass the scores as ``logits``: Categorical normalises them itself,
        # whereas ``probs`` would require non-negative probabilities.
        categorical_dist = torch.distributions.Categorical(logits=last_step)
        return categorical_dist.sample().unsqueeze(1)


In [None]:
# Architecture settings of the pretrained model; also written to disk as JSON.
model_config = {
    'batch_size': 64,
    'epochs': 1,
    'embedding_size': 512,
    'hidden_size': 512,
    'tf_blocks': 6,
    'vocab_size': 60000,
    'num_heads': 8,
    'tf_ratio': 1.0,
    'patience': 1,
}

with open('./model_config-1.json', 'w') as config_file:
    json.dump(model_config, config_file, indent=4)

In [None]:
# Build the pretrained architecture; input and output vocabularies are the same size.
model = Transformer(
    input_size=model_config["vocab_size"],
    embedding_size=model_config['embedding_size'],
    hidden_size=model_config['hidden_size'],
    output_size=model_config['vocab_size'],
    num_heads=model_config['num_heads'],
    tf_blocks=model_config['tf_blocks'],
)
model = model.to(DEVICE)
print(model)

In [None]:
# map_location lets a checkpoint saved on a GPU be restored on whatever
# DEVICE this session is using (including CPU-only machines).
checkpoint = torch.load("model-1.pth", map_location=DEVICE)
model.load_state_dict(checkpoint['model_state_dict'])
model = model.to(DEVICE)

# Same architecture as the pretrained model; only the vocabulary size
# follows the fine-tuning vocabulary.
model_config_fine = dict (
    batch_size          = 64,
    epochs              = 1,
    embedding_size  = 512,
    hidden_size     = 512,
    tf_blocks               = 6,
    vocab_size              = len(VOCAB_FINE),
    num_heads               = 8,
    tf_ratio                = 1.0,
    patience                = 1,
)

In [None]:
# Wrap the pretrained model in the fine-tuning architecture sized for VOCAB_FINE.
model_fine = TransformerFineTune(
    pretrain=model,
    input_size=model_config_fine["vocab_size"],
    embedding_size=model_config_fine['embedding_size'],
    hidden_size=model_config_fine['hidden_size'],
    output_size=model_config_fine["vocab_size"],
    num_heads=model_config_fine['num_heads'],
    tf_blocks=model_config_fine['tf_blocks'],
)
model_fine = model_fine.to(DEVICE)
print(model_fine)

# Finetuning

In [None]:
# NOTE(review): no ignore_index is passed, so padding targets are scored like
# any other token — confirm whether padding should be excluded from the loss.
# NOTE(review): config_fine['weight_decay'] is defined but not handed to Adam.
criterion_fine = nn.CrossEntropyLoss()  # plain cross-entropy over every target position
optimizer_fine = torch.optim.Adam(model_fine.parameters(), lr=config_fine['lr'])
# Halve the learning rate when the monitored metric stops improving.
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer_fine, mode='min', factor=0.5, patience=1, threshold=0.001)

In [None]:
N_EPOCHS = 3
CLIP = 1

best_valid_loss = float('inf')

for epoch in range(N_EPOCHS):

    train_loss = train(model_fine, dataloader_combined, optimizer_fine, criterion_fine, CLIP)
    valid_loss = validate(model_fine, dataloader_combined_val, criterion_fine)
    # ReduceLROnPlateau only acts when it is stepped with the monitored metric.
    scheduler.step(valid_loss)

    print(f'Epoch: {epoch+1:02}')
    print(f'\tTrain Loss: {train_loss:.5f}')
    print(f'\t Val. Loss: {valid_loss:.5f}')

    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        # Save the model being fine-tuned, not the frozen pretrained one.
        torch.save(model_fine.state_dict(), f'./best-model-{epoch+1}.pth')

    # Full training state for resuming after this epoch.
    torch.save({'model_state_dict':model_fine.state_dict(),
            'optimizer_state_dict':optimizer_fine.state_dict(),
            'scheduler_state_dict':scheduler.state_dict(),
            'valid_loss': valid_loss,
            'epoch': epoch}, f'/root/fine-{epoch+1}-1.pth')

    # Pickled copy of the whole module for this epoch.
    torch.save(model_fine, f"/root/model-fine-{epoch+1}-1.pt")

In [None]:
# map_location lets a checkpoint saved on a GPU be restored on whatever
# DEVICE this session is using.
checkpoint = torch.load("fine.pth", map_location=DEVICE)
model_fine.load_state_dict(checkpoint['model_state_dict'])
model_fine = model_fine.to(DEVICE)

# Evaluation code

In [None]:
# Decode the first row of the first CNN validation batch (input and target).
for x, y in dl_cnn_val:
    for label, batch in (("x: ", x), ("y: ", y)):
        print(label, tokenizer_fine.convert_tokens_to_string(batch[0, :]))
    break

In [None]:
def evaluate(model, test_dataset):
    """Generate text for every batch and decode it alongside the targets.

    Args:
        model: Model exposing ``generate(input_ids)``.
        test_dataset: Iterable of batches, each either an
            ``(input_ids, labels)`` pair or a dict with ``'input_ids'`` and
            ``'labels'`` keys.

    Returns:
        Tuple ``(generated_sequences, actual_sequences)`` of decoded
        strings, one entry per row of every batch.
    """
    model.eval()
    generated_sequences = []
    actual_sequences = []

    for batch in test_dataset:
        if isinstance(batch, dict):
            input_ids, labels = batch['input_ids'], batch['labels']
        else:
            input_ids, labels = batch

        # Generate predictions with the model that was passed in.
        with torch.no_grad():
            outputs = model.generate(input_ids)

        # Decode the generated and the reference token ids row by row.
        generated_sequences.extend(
            tokenizer_fine.convert_tokens_to_string(output) for output in outputs
        )
        actual_sequences.extend(
            tokenizer_fine.convert_tokens_to_string(label) for label in labels
        )

    return (generated_sequences, actual_sequences)

# Evaluate the fine-tuned model on both validation tasks.
Sum_eval = evaluate(model_fine, dl_cnn_val)
QA_eval = evaluate(model_fine, dl_qa_val)

In [None]:
# BLEU score
# Sum_eval is a (generated, actual) pair of lists of decoded strings, so
# walk the two lists in lockstep.
for hypothesis, reference in zip(*Sum_eval):
    print(reference)
    print(hypothesis)
    # sentence_bleu works on token lists; scoring raw strings would compare
    # single characters.
    BLEUscore = nltk.translate.bleu_score.sentence_bleu([reference.split()], hypothesis.split())
    print(BLEUscore)

In [None]:
# Rouge score
# One scorer is enough for the whole loop.
rouge = Rouge()
# QA_eval is a (generated, actual) pair of lists of decoded strings, so walk
# the two lists in lockstep.
for hypothesis, reference in zip(*QA_eval):
    print(reference)
    print(hypothesis)
    # Get the scores
    scores = rouge.get_scores(hypothesis, reference)

    print(scores)