In [None]:
'''
This notebook (as it is) tests the abstractive model on the validation and test sets
In order to train a new model uncomment the line containing the call to the train function (make sure the load variable is set to False)
The usage of a GPU is recommended
'''

'\nThis notebook (as it is) tests the abstractive model on the validation and test sets\nIn order to train a new model uncomment the line containing the call to the train function (make sure the load variable is set to False)\nThe usage of a GPU is recommended\n'

In [None]:
# install packages
!pip install -U torchtext
!pip install Rouge
!pip install datasets

Collecting Rouge
  Downloading rouge-1.0.1-py3-none-any.whl (13 kB)
Installing collected packages: Rouge
Successfully installed Rouge-1.0.1
Collecting datasets
  Downloading datasets-1.11.0-py3-none-any.whl (264 kB)
[K     |████████████████████████████████| 264 kB 5.2 MB/s 
Collecting xxhash
  Downloading xxhash-2.0.2-cp37-cp37m-manylinux2010_x86_64.whl (243 kB)
[K     |████████████████████████████████| 243 kB 42.1 MB/s 
[?25hCollecting fsspec>=2021.05.0
  Downloading fsspec-2021.7.0-py3-none-any.whl (118 kB)
[K     |████████████████████████████████| 118 kB 46.3 MB/s 
[?25hCollecting huggingface-hub<0.1.0
  Downloading huggingface_hub-0.0.16-py3-none-any.whl (50 kB)
[K     |████████████████████████████████| 50 kB 6.2 MB/s 
Installing collected packages: xxhash, huggingface-hub, fsspec, datasets
Successfully installed datasets-1.11.0 fsspec-2021.7.0 huggingface-hub-0.0.16 xxhash-2.0.2


In [None]:
# imports
from google_drive_downloader import GoogleDriveDownloader as gdd
import os, struct
import glob
import random
import csv
from tensorflow.core.example import example_pb2
import torch, torch.nn as nn
from tqdm import tqdm
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
from collections import defaultdict
from torch.nn.utils.rnn import pack_padded_sequence
from rouge import Rouge
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
from nltk.tokenize import RegexpTokenizer
import gc, math
torch.set_printoptions(4)
from datasets import load_dataset

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [None]:
# GPU memory check.
# To train the model with a batch size of 250, 15 GB of GPU memory are required.
# NOTE(review): the wildcard import below puts every public pynvml name into the
# notebook namespace; only nvmlInit, nvmlDeviceGetHandleByIndex and
# nvmlDeviceGetMemoryInfo are used in this cell.
from pynvml import *
nvmlInit()
h = nvmlDeviceGetHandleByIndex(0)  # handle of the device with index 0
info = nvmlDeviceGetMemoryInfo(h)
# The raw values are divided by 10^9 before printing
# (presumably bytes -> GB; confirm against the NVML documentation).
print(f'total    : {info.total/pow(10,9)}')
print(f'free     : {info.free/pow(10,9)}')
print(f'used     : {info.used/pow(10,9)}')

total    : 11.996954624
free     : 11.996954624
used     : 0.0


In [None]:
# from google.colab import drive
# drive.mount('/content/drive')

In [None]:
# Special tokens: out-of-vocabulary word, padding, decoder start and decoder stop.
OOV_WORD, PADDING, START_DEC, STOP_DEC = "[UNK]", "[PAD]", "[START]", "[STOP]"

class Vocab():
  '''
  Vocabulary class
  It contains the vocabulary used by the model to understand and generate the summaries.
  Every word is mapped to an integer id through two dictionaries:
  word2id translates a word (string) into its id, id2word translates an id back into the word.
  The four special tokens always receive the ids 0..3 (in the order [UNK], [PAD], [START], [STOP]).
  Inputs:
    - max_size: maximum size for the vocabulary (special tokens included), None -> unbounded
  '''
  def __init__(self, max_size = None):
    self.count = 0  # next id to assign
    self.word2id = {}
    self.id2word = {}
    # Init the vocabulary with out_of_vocabulary, padding, start and stop ids.
    for w in [OOV_WORD, PADDING, START_DEC, STOP_DEC]:
      self.word2id[w] = self.count
      self.id2word[self.count] = w
      self.count+=1
    self.max_size = max_size

  # get id given a word (the [UNK] id when the word is not in the vocabulary)
  def get_id(self, word):
    return self.word2id.get(word, self.word2id[OOV_WORD])

  # populate the vocabulary during the reading of the dataset
  # Returns the id of the word: its existing id when already known, a freshly
  # assigned id when there is still room, the [UNK] id otherwise.
  def populate_vocab(self, word):
    # A known word keeps its id even when the vocabulary is already full;
    # only genuinely new words fall back to [UNK] once max_size is reached.
    known_id = self.word2id.get(word)
    if known_id is not None:
      return known_id
    if self.max_size is not None and self.get_size() >= self.max_size:
      return self.word2id[OOV_WORD]
    self.word2id[word] = self.count
    self.id2word[self.count] = word
    self.count += 1
    return self.word2id[word]

  # get word given the id (raises KeyError for an unknown id)
  def get_word(self, id):
    return self.id2word[id]

  # get the size of the vocabulary
  def get_size(self):
    return len(self.word2id)

In [None]:
# MAX_TEXT_LENGTH: 2882
# MAX_ABS_LENGTH: 1726
# NOTE(review): the two values above look like the longest article / summary
# measured on the dataset; they are not used by the code.

# Init the maximum number of tokens for the articles and summaries:
# articles are truncated/padded to TRUNCATE_TEXT_LENGTH ids, summaries to
# TRUNCATE_ABSTRACT_LENGTH ids (the summary length includes the start and stop tokens).
TRUNCATE_TEXT_LENGTH = 400
TRUNCATE_ABSTRACT_LENGTH = 102

In [None]:
class Compute_weight():
  '''
  Computes one weight per word, in a TF-IDF-like fashion.
  The weights can be plugged into the loss function to shift the focus of the model.
  Input:
    - documents: the articles in the form [["sent1", "sent2"]...]
    - alpha: multiplicative factor applied to every weight

  For each word:
    weight = (1 - count / total_words) * log10(n_documents / document_count) * alpha
  where the logarithmic factor is dropped when it is exactly 0
  (i.e. the word appears in every document).
  This class has been used for some experiments but then it has been discarded.
  '''
  def __init__(self, documents, alpha=1): # [["sent1", "sent2"]...]
    self.counts = {}           # word -> occurrences over all the documents
    self.document_counts = {}  # word -> number of documents containing it
    self.weight = {}           # word -> final weight
    self.length = 0            # total number of tokens seen
    for document in documents:
      # a document is a list of sentences: merge them, then split on whitespace
      tokens = ' '.join(str(sentence) for sentence in document).split()
      self.length += len(tokens)
      for token in tokens:
        self.counts[token] = self.counts.get(token, 0) + 1
      for token in set(tokens):
        self.document_counts[token] = self.document_counts.get(token, 0) + 1
    for token, occurrences in self.counts.items():
      rarity = 1-(occurrences/self.length)
      idf = math.log(len(documents)/self.document_counts[token], 10)
      if idf == 0:
        self.weight[token] = rarity*alpha
      else:
        self.weight[token] = rarity * idf*alpha

  def get_weight(self, word):
    '''Weight of a single word (KeyError if the word was never seen).'''
    return self.weight[word]

  def get_W(self):
    '''The whole word -> weight dictionary.'''
    return self.weight

In [None]:
class CNN_dailymail(Dataset):
  '''
  The Dataset class
  Manages the articles and their respective summaries.
  The dataset loaded is cnn_dailymail version 3.0.0, which is already divided in
  training, validation and test sets.
  Input:
    - mode: the subset to load ("train", "val"; any other value loads the test set)
    - vocab: the vocabulary object, populated while the samples are processed
    - max_size: maximum number of articles to load (None -> whole subset)

  The class reads and processes the requested subset of the dataset.
  '''
  def __init__(self, mode, vocab, max_size = None):
    split = "train" if mode == "train" else ("validation" if mode == "val" else "test")
    dataset = load_dataset("cnn_dailymail", '3.0.0')[split]

    self.stop_words = set(stopwords.words('english'))
    self.tokenizer = RegexpTokenizer(r'\w+')
    self.vocab = vocab
    self.examples = {}   # index -> (article tensor, summary tensor)
    self.summaries = {}  # index -> raw summary (string, whitespace normalised)

    for index, sample in enumerate(tqdm(dataset, position=0, leave=True)):
      self.examples[index] = self.process(sample["article"], sample["highlights"])
      self.summaries[index] = ' '.join(sample["highlights"].split())
      # stop as soon as max_size samples have been stored
      if max_size is not None and index + 1 >= max_size:
        break

  # Get the item for the dataloader: ((article tensor, summary tensor), raw summary)
  def __getitem__(self, idx):
    return self.examples[idx], self.get_summary(idx)

  # Get the raw summary (string)
  def get_summary(self, idx):
    return self.summaries[idx]

  # Get the length of the dataset
  def __len__(self):
    return len(self.examples)

  # Process the current sample (article, summary)
  # Article: tokenised with the regexp tokenizer (drops punctuation), lower-cased,
  #   every token is offered to the vocabulary, stop words are left out of the
  #   encoded article, the result is cut/padded to TRUNCATE_TEXT_LENGTH ids.
  # Summary: split on whitespace, stop words are kept so that the decoder can learn
  #   to write an understandable summary; encoded as
  #   [START] words... [STOP] and padded to TRUNCATE_ABSTRACT_LENGTH ids.
  # Returns the two id sequences as tensors.
  def process(self, article, summary):
    pad_id = self.vocab.get_id(PADDING)

    article_ids = []
    for token in self.tokenizer.tokenize(article):
      if len(article_ids) >= TRUNCATE_TEXT_LENGTH:
        break
      token = token.lower()
      self.vocab.populate_vocab(token)
      if token not in self.stop_words:
        article_ids.append(self.vocab.get_id(token))
    # a non-positive repeat count yields an empty list, so this is a no-op when full
    article_ids.extend([pad_id] * (TRUNCATE_TEXT_LENGTH - len(article_ids)))

    summary_ids = [self.vocab.get_id(START_DEC)]
    for token in summary.split():
      # leave one slot for the stop token
      if len(summary_ids) >= TRUNCATE_ABSTRACT_LENGTH-1:
        break
      token = token.strip('\n').strip('\t')
      summary_ids.append(self.vocab.populate_vocab(token))
    summary_ids.append(self.vocab.get_id(STOP_DEC))
    summary_ids.extend([pad_id] * (TRUNCATE_ABSTRACT_LENGTH - len(summary_ids)))

    return torch.tensor(article_ids), torch.tensor(summary_ids)

In [None]:
class Encoder(nn.Module):
  '''
  The Encoder: it encodes one input word (one time step) per call.
  The word is embedded and fed to a GRU (gated recurrent unit) layer
  (the LSTM layer has also been tried, with worse results).
  The returned hidden state is meant to be passed back in with the next input word
  (or used as first state of the decoder); the first hidden state is the zero
  tensor produced by init_layers.
  Inputs:
    - input_size: size of the vocabulary
    - emb: embedding size
    - hidden_size: size of the GRU hidden state
    - n_layer: number of stacked GRU layers
    - batch_size: batch size used to build the initial hidden state
    - vocab: vocabulary object, used to look up the id of "[PAD]"
    - dropout: dropout probability applied to the embeddings
  '''
  def __init__(self, input_size, emb, hidden_size, n_layer, batch_size, vocab, dropout):
    super().__init__()
    self.hidden_size = hidden_size
    self.embedding = nn.Embedding(input_size, emb, padding_idx = vocab.get_id("[PAD]"))
    self.dropout = nn.Dropout(dropout)
    self.batch_size = batch_size
    self.n_layer = n_layer
    # GRU dropout only acts between stacked layers: with a single layer it does
    # nothing and PyTorch emits a warning, so it is enabled only when n_layer > 1.
    gru_dropout = 0.4 if n_layer > 1 else 0.0
    self.gru = nn.GRU(emb, hidden_size, num_layers = n_layer, batch_first=True, dropout=gru_dropout)

  def forward(self, x, h):
    # a length-1 sequence axis is inserted for the GRU and squeezed out of its output
    emb = self.dropout(self.embedding(x)).unsqueeze(1)
    encoder_outputs, hidden = self.gru(emb, h)
    return encoder_outputs.squeeze(1), hidden

  def init_layers(self):
    # Zero hidden state of shape (n_layer, batch_size, hidden_size), created on the
    # same device as the model parameters instead of relying on a global variable.
    return torch.zeros(self.n_layer, self.batch_size, self.hidden_size, device=self.embedding.weight.device)

class Decoder(nn.Module):
  '''
  The goal of the decoder is to use the last hidden state of the encoder and the
  attention modelled from the output of the encoder to generate words for the summary.
  It takes in input the last predicted word (initially the [START] token), the last
  hidden state and the output from the encoder (used to compute the attention over
  the article positions).
  The attention is modelled through two linear layers whose result is then used as
  input to a GRU layer. The output is the log-probability of each word of the vocabulary.
  Inputs:
    - output_size: size of the vocabulary
    - hidden_size: size of the embeddings and of the GRU hidden state
    - input_size: number of encoder positions the attention is computed over
    - n_layer: number of stacked GRU layers
    - vocab: vocabulary object, used to look up the id of "[PAD]"
    - batch_size: batch size expected by forward
    - dropout: dropout probability applied to the embeddings
  '''
  def __init__(self, output_size, hidden_size, input_size, n_layer, vocab, batch_size, dropout):
    super().__init__()
    self.device = device
    self.batch_size = batch_size
    self.embedding = nn.Embedding(output_size, hidden_size, padding_idx = vocab.get_id("[PAD]"))
    self.dropout = nn.Dropout(dropout)
    self.attn = nn.Linear(hidden_size*2, input_size)
    self.attn_combine = nn.Linear(hidden_size*2, hidden_size)
    # GRU dropout only acts between stacked layers: with a single layer it does
    # nothing and PyTorch emits a warning, so it is enabled only when n_layer > 1.
    gru_dropout = 0.4 if n_layer > 1 else 0.0
    self.gru = nn.GRU(hidden_size, hidden_size, num_layers = n_layer, batch_first=False, dropout=gru_dropout)
    self.out = nn.Linear(hidden_size, output_size)

  def forward(self, x, hidden, encoder_output):
    emb = self.dropout(self.embedding(x).view(self.batch_size, 1, -1))

    # attention weights over the encoder positions, computed from the embedded
    # input word and the hidden state of the first GRU layer
    att_weights = nn.functional.softmax(self.attn(torch.cat((emb[:,0], hidden[0]), 1)), dim = 1)
    # weighted sum of the encoder outputs
    att = torch.bmm(att_weights.unsqueeze(1), encoder_output)

    # merge the embedded word with the attended context; unsqueeze(0) adds the
    # sequence axis expected by the GRU (batch_first=False)
    output = torch.cat((emb[:,0], att[:,0]), 1)
    output = self.attn_combine(output).unsqueeze(0)

    output = nn.functional.relu(output)
    output, hidden = self.gru(output, hidden)

    # log-probabilities over the vocabulary
    out = nn.functional.log_softmax(self.out(output[0]), dim = 1)

    return out, hidden

In [None]:
def convert_pred(output, vocab):
  '''
  Converts the predicted output (a batch of id sequences) into a list of sentences (strings).
  Each sentence stops at the first stop token; every word is followed by a space.
  A sample that produces no word at all is replaced by "[UNK]".
  Used to compute the Rouge metric.
  '''
  sentences = []
  for sample in output:
    pieces = []
    for token in sample:
      word = vocab.get_word(token.item())
      if word == STOP_DEC:
        break
      pieces.append(word + " ")
    sentence = "".join(pieces)
    sentences.append(sentence if len(sentence) != 0 else "[UNK]")
  return sentences

def print_art(art, vocab):
  '''
  Convert one article (a sequence of ids) into a string (for debugging purposes).
  Every word, the last one included, is followed by a space.
  '''
  return "".join(vocab.get_word(token.item()) + " " for token in art)

def print_pred(sample, vocab):
  '''
  Convert one output (a sequence of ids) into a sentence (for debugging purposes).
  The conversion stops at the first stop token; every word is followed by a space.
  '''
  words = []
  for token in sample:
    word = vocab.get_word(token.item())
    if word == STOP_DEC:
      break
    words.append(word)
  return "".join(word + " " for word in words)
  
def compute_accuracy(outputs, abstracts, vocab):
  '''
  Compute the Rouge scores given the outputs and abstracts of a batch.
  Returns a tensor with 9 values: recall, precision and f1 for
  rouge-1, rouge-2 and rouge-l (in this order), averaged over the batch.
  '''
  scores = Rouge().get_scores(convert_pred(outputs, vocab), abstracts, avg=True)
  return torch.tensor([scores[metric][stat]
                       for metric in ('rouge-1', 'rouge-2', 'rouge-l')
                       for stat in ('r', 'p', 'f')])

def print_accuracies(acc):
  '''
  Print the metrics in a readable way.
  acc is indexed from 0 to 8: recall, precision and f1 for Rouge-1, Rouge-2 and Rouge-l.
  '''
  report = ["----ACC----"]
  for row, label in enumerate(("Rouge-1", "Rouge-2", "Rouge-l")):
    recall, precision, f1 = acc[3*row], acc[3*row + 1], acc[3*row + 2]
    report.append(f"{label}: recall {recall}, precision {precision}, f1 {f1}")
  # the trailing "\n" plus the newline added by print leave one blank line after the report
  print("\n".join(report) + "\n")

In [None]:
def save_model(encoder, decoder, epoch, enc_opt, dec_opt, current_best, best = False):
  '''
  Save the encoder and decoder (and their optimizers) to continue the training later.
  Used in combination with google drive during development.
  Inputs:
    - encoder, decoder: the models whose state_dict is stored
    - epoch: the epoch number stored in the checkpoint
    - enc_opt, dec_opt: the optimizers whose state_dict is stored
    - current_best: current best rouge-1 value, stored in the checkpoint
    - best: selects the destination file (see below)
  '''
  # NOTE(review): the file names look swapped (best=True writes the file WITHOUT
  # the "_best" suffix). This is left as is because load_model reads
  # "/content/model_abstractive.ckp"; confirm the intended naming before changing it.
  if best:
    PATH = "/content/model_abstractive.ckp"
  else:
    PATH = "/content/model_abstractive_best.ckp"
  checkpoint = {
    'epoch': epoch,
    'encoder_state_dict': encoder.state_dict(),
    'decoder_state_dict': decoder.state_dict(),
    'enc_optimizer_state_dict': enc_opt.state_dict(),
    'dec_optimizer_state_dict': dec_opt.state_dict(),
    'current_best': current_best, # current best rouge-1
  }
  torch.save(checkpoint, PATH)

def load_model(v_length, emb, hidden_size, n_layers, batch_size, vocab, learning_rate=0.01, best = False):
  '''
  Load a saved model.
  Builds a fresh encoder/decoder pair with SGD optimizers and restores their state
  from the checkpoint stored in "/content/model_abstractive.ckp".
  Inputs:
    - v_length: size of the vocabulary
    - emb: embedding size of the encoder
    - hidden_size: hidden size of encoder and decoder
    - n_layers: number of GRU layers
    - batch_size: batch size the models are built for
    - vocab: the vocabulary object
    - learning_rate: learning rate of the two SGD optimizers
    - best: currently unused, the same checkpoint file is always loaded
  Returns: encoder, enc_opt, decoder, dec_opt, epoch, current_best
  '''
  encoder = Encoder(v_length, emb, hidden_size, n_layers, batch_size, vocab, dropout = 0.1).to(device)
  decoder = Decoder(v_length, hidden_size, TRUNCATE_TEXT_LENGTH, n_layers, vocab, batch_size, dropout = 0.1).to(device)

  enc_opt = torch.optim.SGD(encoder.parameters(), lr=learning_rate)
  dec_opt = torch.optim.SGD(decoder.parameters(), lr=learning_rate)

  # map_location lets a checkpoint saved on a GPU be loaded on a CPU-only runtime
  checkpoint = torch.load("/content/model_abstractive.ckp", map_location=device)
  encoder.load_state_dict(checkpoint['encoder_state_dict'])
  enc_opt.load_state_dict(checkpoint['enc_optimizer_state_dict'])

  decoder.load_state_dict(checkpoint['decoder_state_dict'])
  dec_opt.load_state_dict(checkpoint['dec_optimizer_state_dict'])

  epoch = checkpoint['epoch']
  # Checkpoints without the best score fall back to 0; only the missing key
  # is handled, any other error is propagated.
  if 'current_best' in checkpoint:
    current_best = checkpoint['current_best']
  else:
    print("Current best set to 0")
    current_best = 0

  return encoder, enc_opt, decoder, dec_opt, epoch, current_best

In [None]:
def download_model(drive_id="1uwqZg86ZIzSqHOaekpamAWz1ggvwQsRe", file_name="model_abstractive.zip"):
  '''
  Download the best model.
  The archive identified by drive_id is downloaded from Google Drive into
  /content/<file_name>, unzipped, and the archive itself is then deleted.
  '''
  archive_path = f"/content/{file_name}"
  gdd.download_file_from_google_drive(file_id=drive_id, dest_path=archive_path, unzip=True)
  os.remove(archive_path)

download_model()

Downloading 1uwqZg86ZIzSqHOaekpamAWz1ggvwQsRe into /content/model_abstractive.zip... Done.
Unzipping...Done.


In [None]:
def train(encoder, decoder, enc_opt, dec_opt, loss_fn, data_loader, check_val, vocab, batch_size, hidden_size, article_size, summary_size, epochs, epoch, current_best, device = "cuda", force_teaching = 0.5):
  '''
  Train the encoder and decoder.
  This function runs the epochs from "epoch" (included) to "epochs" (excluded).
  Input:
    - encoder: the encoder to train
    - decoder: the decoder to train
    - enc_opt: the optimizer of the encoder
    - dec_opt: the optimizer of the decoder
    - loss_fn: loss function
    - data_loader: the dataloader managing the dataset
    - check_val: a dataloader managing a subset of the validation set, evaluated every "print_acc" epochs
    - vocab: the vocabulary object
    - batch_size: the size of each batch
    - hidden_size: the size of the hidden layers (equal for both encoder and decoder)
    - article_size: the truncated size of the article
    - summary_size: the truncated size of the summary
    - epochs: the number of epochs to perform
    - epoch: the epoch where to start (0 or the last epoch if the model was loaded)
    - current_best: best training rouge-1 recall seen so far
    - device: gpu or cpu
    - force_teaching: probability of using teacher forcing for a batch
  A checkpoint is saved at the end of every epoch; an extra one (best = True) is saved
  when the average rouge-1 recall of the epoch improves on current_best.
  '''
  torch.cuda.empty_cache()

  print_acc = 10 # compute and print the accuracy every 10 epochs
  # Put the models in training mode
  encoder.train()
  decoder.train()

  # Epochs
  for e in range(epoch, epochs):
    print(f"---------EPOCH {e}---------")
    avg_loss = torch.zeros(1) # Contains the loss
    c = 0 # number of processed batches
    acc_avg = torch.zeros(9) # Contains the accuracies (Rouge)
    report_accuracy = (e+1)%print_acc == 0

    # For each batch (data = (article, summary), summaries = raw summaries in strings)
    for data, summaries in tqdm(data_loader, position=0, leave=True):
      articles, abstracts = data[0].to(device), data[1].to(device)
      hidden = encoder.init_layers() # init the encoder hidden layer

      encoder_output = torch.zeros(batch_size, article_size, hidden_size).to(device)
      # For each word of the article computes the encoding
      for i in range(article_size):
        encoder_output[:,i], hidden = encoder(articles[:,i].clone().to(device), hidden)

      outputs = torch.zeros(batch_size, summary_size, dtype = torch.int32).to(device)
      decoder_input = abstracts[:,0].clone().to(device) # get start token as first input

      dec_hidden = hidden # the first decoder hidden state is the last one from the encoder

      # The loss is accumulated out of place: an in-place "+=" on a leaf tensor
      # that requires grad raises a RuntimeError when running on the CPU.
      loss = torch.zeros(1, device=device)

      # One draw per batch: teacher forcing feeds the reference word as next
      # decoder input, otherwise the word just predicted is fed back.
      use_teacher_forcing = bool(torch.rand(1) <= force_teaching)

      # generate a word until summary size is reached
      for step in range(0, summary_size):
        # generate a word using the last hidden state
        out, dec_hidden = decoder(decoder_input, dec_hidden, encoder_output)
        predicted = out.argmax(1).detach()
        # save the current generated word
        outputs[:,step] = predicted
        # next decoder input
        decoder_input = abstracts[:,step] if use_teacher_forcing else predicted
        # compute loss for the current generated word
        loss = loss + loss_fn(out, abstracts[:,step])

      enc_opt.zero_grad()
      dec_opt.zero_grad()
      loss.backward() # compute backpropagation
      # clipping
      # torch.nn.utils.clip_grad_norm_(filter(lambda p: p.requires_grad, encoder.parameters()), 2.)
      # torch.nn.utils.clip_grad_norm_(filter(lambda p: p.requires_grad, decoder.parameters()), 2.)
      enc_opt.step()
      dec_opt.step()

      # compute the accuracy every "print_acc" epochs
      if report_accuracy:
        acc_avg += compute_accuracy(outputs, summaries, vocab)

      avg_loss += loss.item()/summary_size

      c += 1

    if report_accuracy:
      epoch_acc = acc_avg/c
      if(epoch_acc[0] > current_best):
        current_best = epoch_acc[0]
        # store the current epoch and the optimizers received as arguments
        save_model(encoder, decoder, e, enc_opt, dec_opt, current_best, best = True)
      print_accuracies(epoch_acc)
      print("ACC val subset:\n")
      test(encoder, decoder, check_val, vocab, batch_size, device) # compute the accuracy in the subset of the validation set
      # test() switches the models to evaluation mode: go back to training mode
      encoder.train()
      decoder.train()
    save_model(encoder, decoder, e, enc_opt, dec_opt, current_best) # save the current models
    print("\nLoss: ", (avg_loss/c).item()) # print loss

In [None]:
def test(encoder, decoder, loader, vocab, batch_size, device = "cuda"):
  '''
  This function aims at testing the model, the dataloader should be the validation or test set.
  The summaries are generated greedily (the most probable word is fed back as next
  decoder input) and the Rouge scores, averaged over the batches, are printed.
  Input:
   - encoder: the trained encoder
   - decoder: the trained decoder
   - loader: the dataloader where to test the models
   - vocab: the vocabulary
   - batch_size: the size of each batch
   - device: cpu or gpu (cuda); "cuda" falls back to "cpu" when no GPU is available
  '''
  # The default device is "cuda": fall back to the CPU instead of crashing
  # when the function is called without a device on a machine with no GPU.
  if device == "cuda" and not torch.cuda.is_available():
    device = "cpu"

  torch.cuda.empty_cache()
  # evaluation mode
  encoder.eval()
  decoder.eval()

  with torch.no_grad():
    c = 0 # number of processed batches
    acc_avg = torch.zeros(9)
    for data, summaries in tqdm(loader, position=0, leave=True): # for each batch
      input_tensor, target_tensor = data[0].to(device), data[1].to(device)

      encoder_hidden = encoder.init_layers()

      input_length = input_tensor.size(1)
      target_length = target_tensor.size(1)

      encoder_outputs = torch.zeros(batch_size, input_length, encoder.hidden_size, device=device)

      # encode the article one word at a time
      for ei in range(input_length):
        encoder_outputs[:,ei], encoder_hidden = encoder(input_tensor[:,ei].clone().to(device), encoder_hidden)

      # the first decoder input is the first token of the reference summary
      decoder_input = target_tensor[:,0].clone().to(device)

      # the first decoder hidden state is the last one from the encoder
      decoder_hidden = encoder_hidden

      output_decoder = torch.zeros(batch_size, target_length, dtype = torch.int32, device=device)

      for di in range(target_length):
        decoder_output, decoder_hidden = decoder(decoder_input, decoder_hidden, encoder_outputs)
        # greedy decoding: keep the most probable word and feed it back
        _, topi = decoder_output.topk(1)
        decoder_input = topi.squeeze().detach()
        output_decoder[:,di] = topi.squeeze().clone()

      acc_avg += compute_accuracy(output_decoder, summaries, vocab)
      c+=1

    print_accuracies(acc_avg/c)

In [None]:
# Init the vocabulary (at most 50000 entries, special tokens included)
vocab = Vocab(max_size=50000)

# Load the datasets
# The same vocabulary object is passed to every dataset and is populated while the
# samples are processed, so the ids depend on the order of the lines below.
t = CNN_dailymail("train", vocab, max_size=5000)        # first 5000 training samples
v = CNN_dailymail("val", vocab)                         # whole validation set
check_val = CNN_dailymail("val", vocab, max_size=1000)  # first 1000 validation samples
# Test set (any mode other than "train"/"val" loads the test split):
# te = CNN_dailymail("te", vocab)

Downloading:   0%|          | 0.00/3.51k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.61k [00:00<?, ?B/s]

Downloading and preparing dataset cnn_dailymail/3.0.0 (download: 558.32 MiB, generated: 1.28 GiB, post-processed: Unknown size, total: 1.82 GiB) to /root/.cache/huggingface/datasets/cnn_dailymail/3.0.0/3.0.0/3cb851bf7cf5826e45d49db2863f627cba583cbc32342df7349dfe6c38060234...


Downloading: 0.00B [00:00, ?B/s]

Downloading: 0.00B [00:00, ?B/s]

Downloading:   0%|          | 0.00/572k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/12.3M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/661k [00:00<?, ?B/s]

0 examples [00:00, ? examples/s]

0 examples [00:00, ? examples/s]

0 examples [00:00, ? examples/s]

Dataset cnn_dailymail downloaded and prepared to /root/.cache/huggingface/datasets/cnn_dailymail/3.0.0/3.0.0/3cb851bf7cf5826e45d49db2863f627cba583cbc32342df7349dfe6c38060234. Subsequent calls will reuse this data.


  2%|▏         | 4999/287113 [00:06<06:34, 715.58it/s]
Reusing dataset cnn_dailymail (/root/.cache/huggingface/datasets/cnn_dailymail/3.0.0/3.0.0/3cb851bf7cf5826e45d49db2863f627cba583cbc32342df7349dfe6c38060234)
100%|██████████| 13368/13368 [00:19<00:00, 702.66it/s]
Reusing dataset cnn_dailymail (/root/.cache/huggingface/datasets/cnn_dailymail/3.0.0/3.0.0/3cb851bf7cf5826e45d49db2863f627cba583cbc32342df7349dfe6c38060234)
  7%|▋         | 999/13368 [00:01<00:15, 784.38it/s]


In [None]:
# init the device to use
device = 'cuda' if torch.cuda.is_available() else 'cpu'
batch_size = 250

# init the dataloaders
# drop_last=True discards the last incomplete batch: the models are built for a
# fixed batch size (see Encoder.init_layers and Decoder.forward).
train_ds = DataLoader(t, batch_size=batch_size, shuffle=True, num_workers=2, pin_memory=True, drop_last=True)
val_ds = DataLoader(v, batch_size=batch_size, shuffle=False, num_workers=2, pin_memory=True, drop_last=True)
# NOTE: from here on check_val is the DataLoader, no longer the dataset object
check_val = DataLoader(check_val, batch_size=batch_size, shuffle=True, num_workers=2, pin_memory=True, drop_last=True)
# test_ds = DataLoader(te, batch_size=batch_size, shuffle=False, num_workers=2, pin_memory=True)

In [None]:
# Model configuration: either build fresh models or restore the last checkpoint.
v_length = vocab.get_size()
emb, hidden_size = 256, 256

load = True # load the last saved model
lr = 0.01 # learning rate
epoch = 0 # starting epoch
force_training = False # when True, restart from epoch 0 even if a trained model was loaded
n_layers = 1 # number of layers for the GRU units
current_best = 0

if(not load):
  # init the models
  encoder = Encoder(v_length, emb, hidden_size, n_layers, batch_size, vocab, dropout = 0.1).to(device)
  # The decoder receives the encoder hidden state as its first hidden state, so it
  # must use the same number of GRU layers (this also matches load_model).
  decoder = Decoder(v_length, hidden_size, TRUNCATE_TEXT_LENGTH, n_layers, vocab, batch_size, dropout = 0.1).to(device)
  # opt_enc = torch.optim.Adam(encoder.parameters(), lr=lr)
  # opt_dec = torch.optim.Adam(decoder.parameters(), lr=lr)
  # init optimizers
  opt_enc = torch.optim.SGD(encoder.parameters(), lr=lr)
  opt_dec = torch.optim.SGD(decoder.parameters(), lr=lr)
else:
  # load the last saved models
  encoder, opt_enc, decoder, opt_dec, epoch, current_best = load_model(vocab.get_size(), emb, hidden_size, n_layers, batch_size, vocab, learning_rate=lr)

# loss function, ignoring the index PAD
loss_fn = nn.NLLLoss(ignore_index = vocab.get_id("[PAD]"))
# loss_fn = nn.CrossEntropyLoss(ignore_index = vocab.get_id("[PAD]"))

if(force_training):
  epoch = 0

print("\nN° Parameters: ", (sum(p.numel() for p in encoder.parameters() if p.requires_grad) + sum(p.numel() for p in decoder.parameters() if p.requires_grad)))

  "num_layers={}".format(dropout, num_layers))



N° Parameters:  39576032


In [None]:
# Train the model
# train(encoder, decoder, opt_enc, opt_dec, loss_fn, train_ds, check_val, vocab, batch_size, hidden_size, TRUNCATE_TEXT_LENGTH, TRUNCATE_ABSTRACT_LENGTH, 6000, epoch, current_best, device)

In [None]:
# Finally, evaluate the model.
# First call: Rouge scores on the training loader (train_ds).
test(encoder, decoder, train_ds, vocab, batch_size)

# Second call: Rouge scores on the validation loader (val_ds).
test(encoder, decoder, val_ds, vocab, batch_size)

100%|██████████| 20/20 [01:02<00:00,  3.10s/it]


----ACC----
Rouge-1: recall 0.14691883325576782, precision 0.23964472115039825, f1 0.17866653203964233
Rouge-2: recall 0.07638422399759293, precision 0.09953910112380981, f1 0.08477441966533661
Rouge-l: recall 0.14080430567264557, precision 0.2297707498073578, f1 0.17126384377479553



100%|██████████| 53/53 [03:06<00:00,  3.52s/it]

----ACC----
Rouge-1: recall 0.0003905662742909044, precision 0.0007817125297151506, f1 0.0004818924062419683
Rouge-2: recall 2.678559030755423e-05, precision 4.822750997846015e-05, f1 3.342979107401334e-05
Rouge-l: recall 0.00036040559643879533, precision 0.0007397409644909203, f1 0.0004500193172134459




