<a href="https://colab.research.google.com/github/NLPetroni/assignment_two/blob/main/solution_2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Imports and downloads



In [1]:
import random

import numpy as np
import pandas as pd
import sys
import os

try:
  import google.colab
  IN_COLAB = True
except:
  IN_COLAB = False

if IN_COLAB:
  %cd /content
  !rm -rf assignment_two &> /dev/null
  !git clone https://github.com/NLPetroni/assignment_two &> /dev/null
  %cd assignment_two
  sys.path.append(os.getcwd())
  !git clone https://gitlab.com/sasso-effe/nlp-assignment-data.git &> /dev/null
  !mv nlp-assignment-data/embedding_matrix.npy res/embedding_matrix.npy
  !rm -rf nlp-assignment-data
  !pip install wandb &> /dev/null
  !pip install torchinfo

In [2]:
import re
import math
from functools import reduce
from typing import List, Callable
import pickle

import wandb
import nltk
from nltk.corpus import stopwords
import torch
from torch import nn
from torchinfo import summary
from sklearn.metrics import f1_score, precision_score, recall_score, accuracy_score

from src import utils

device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

In [3]:
utils.download_data('dataset')
train_set = pd.read_csv("dataset/train_pairs.csv")
val_set = pd.read_csv("dataset/val_pairs.csv")
test_set = pd.read_csv("dataset/test_pairs.csv")

print(train_set.columns)
print("Total rows of the train set: {:d}".format(len(train_set)))
print("Total rows of the validation set: {:d}".format(len(val_set)))
print("Total rows of the test set: {:d}".format(len(test_set)))
print(train_set['Label'].value_counts())

Index(['Unnamed: 0', 'Claim', 'Evidence', 'ID', 'Label'], dtype='object')
Total rows of the train set: 121740
Total rows of the validation set: 7165
Total rows of the test set: 7189
SUPPORTS    89389
REFUTES     32351
Name: Label, dtype: int64


# Dataset pre-processing and conversion

In [4]:
REPLACE_BY_SPACE_RE = re.compile('[/(){}\[\]\|@,;\t-]')
GOOD_SYMBOLS_RE = re.compile('[^0-9a-z #+_]')
BAD_SYMBOLS_RE = re.compile('(-LRB-)|(-RRB-)|(-LSB-)|(-RSB-)')
INSIDE_SQAURE_BRACKETS_RE = re.compile('(-LSB-).*?(-RSB-)')

try:
    STOPWORDS = set(stopwords.words('english'))
except LookupError:
    nltk.download('stopwords')
    STOPWORDS = set(stopwords.words('english'))

def remove_inside_square_brackets(text: str) -> str:
    return INSIDE_SQAURE_BRACKETS_RE.sub('', text)

def remove_bad_symbols(text: str) -> str:
    return BAD_SYMBOLS_RE.sub('', text)

def remove_final_tags(text: str) -> str:
   return re.sub('\.\t.*?$', '', text) 

def lower(text: str) -> str:
    """
    Transforms given text to lower case.
    Example:
    Input: 'I really like New York city'
    Output: 'i really like new your city'
    """

    return text.lower()

def replace_special_characters(text: str) -> str:
    """
    Replaces special characters, such as paranthesis,
    with spacing character
    """

    return REPLACE_BY_SPACE_RE.sub(' ', text)

def replace_br(text: str) -> str:
    """
    Replaces br characters
    """

    return text.replace('</br>', '')

def filter_out_uncommon_symbols(text: str) -> str:
    """
    Removes any special character that is not in the
    good symbols list (check regular expression)
    """

    return GOOD_SYMBOLS_RE.sub('', text)

def remove_stopwords(text: str) -> str:
    return ' '.join([x for x in text.split() if x and x not in STOPWORDS])


def strip_text(text: str) -> str:
    """
    Removes any left or right spacing (including carriage return) from text.
    Example:
    Input: '  This assignment is cool\n'
    Output: 'This assignment is cool'
    """

    return text.strip()

def split_text(text: str) -> List:
  return text.split()

PREPROCESSING_PIPELINE = [
                          remove_inside_square_brackets,
                          remove_bad_symbols,
                          lower,
                          remove_final_tags,
                          replace_special_characters,
                          filter_out_uncommon_symbols,
                          strip_text,
                          split_text
                          ]

# Anchor method

def text_prepare(text: str,
                 filter_methods: List[Callable[[str], str]] = None) -> str:
    """
    Applies a list of pre-processing functions in sequence (reduce).
    Note that the order is important here!
    """

    filter_methods = filter_methods if filter_methods is not None else PREPROCESSING_PIPELINE

    return reduce(lambda txt, f: f(txt), filter_methods, text)

def full_preprocessing(dataset):
    # In the evidences there is an id at the beginning of the sequence which is
    # removed with the splice [:1]
    dataset['Evidence'] = dataset['Evidence'].apply(lambda txt: text_prepare(txt)[1:])
    dataset['Claim'] = dataset['Claim'].apply(lambda txt: text_prepare(txt))
    return dataset


## Padding

In [5]:
class Dataset(torch.utils.data.Dataset):
    """Simple dataset class to use dataloaders (batching) """
    def __init__(self, ids, claims, evidences, labels, majority_vote=False):
        self.majority_vote = majority_vote
        if self.majority_vote:
            self.ids = ids
            self.indexes = list(set(self.ids))
            self.majority_data = [[(claims[i], evidences[i], labels[i]) for i, id in enumerate(self.ids) if id == idx]
                                  for idx in self.indexes]
        else:
            self.claims = claims
            self.evidences = evidences
            self.labels = torch.tensor(labels, dtype=torch.float)

    def __getitem__(self, idx):
        if self.majority_vote:
          item = self.majority_data[idx]
        else:
          item = self.claims[idx], self.evidences[idx], self.labels[idx]        
        return item
    def __len__(self):
        return len(self.indexes) if self.majority_vote else self.claims.shape[0]

In [40]:
def to_tensor_collate(batch):
    claims, evidences, labels = zip(*batch)
    return torch.as_tensor(claims), torch.as_tensor(evidences), torch.as_tensor(labels)


def pad_batch_collate(batch):
    """Used by DataLoader to pad each batch independently"""
    claims, evidences, targets = zip(*batch)

    claims = utils.pad_data(claims)
    evidences = utils.pad_data(evidences)
    targets = torch.as_tensor(targets)

    return claims, evidences, targets


def pad_vote_collate(batch):
    """
    Collate function for the dataloader, used for majority voting evaluation.
    Returns:
        A list of batches, where each batch contains all the pairs claim-evidence for a single claim.
        The list is unpacked in the evaluation method and passed batch-by-batch to the model.
    """
    batch_res = []
    for elem in batch:
        claims, evidences, targets = zip(*elem)
        max_evidence_len = int(np.max([len(evidence) for (_, evidence, _) in elem]))
        claims = utils.pad_data(claims)
        # if 'evidences = torch.as_tensor (evidences)' works, it's faster
        evidences = utils.pad_data(evidences)
        targets = torch.as_tensor(targets)
        batch_res.append([claims, evidences, targets])

    return batch_res

## Glove

In [7]:
def __generate_voc_and_matrix(train_set, val_set, test_set):
  voc_evidence_train = [item for sublist in train_set['Evidence'] for item in sublist]
  voc_claim_train = [item for sublist in train_set['Claim'] for item in sublist]
  TRAIN_VOC = set(voc_evidence_train + voc_claim_train)
  voc_evidence_val = [item for sublist in val_set['Evidence'] for item in sublist]
  voc_claim_val = [item for sublist in val_set['Claim'] for item in sublist]
  VAL_VOC = set(voc_evidence_val + voc_claim_val)
  voc_evidence_test = [item for sublist in test_set['Evidence'] for item in sublist]
  voc_claim_test = [item for sublist in test_set['Claim'] for item in sublist]
  TEST_VOC = set(voc_evidence_test + voc_claim_test)

  inputs = train_set['Evidence'].tolist() + train_set['Claim'].tolist()
  glove_voc, embedding_matrix = utils.get_glove(number_token=False)
  vocabulary, embedding_matrix = utils.add_oov(glove_voc, TRAIN_VOC, embedding_matrix, inputs)
  inputs = val_set['Evidence'].tolist() + val_set['Claim'].tolist()
  vocabulary, embedding_matrix = utils.add_oov(vocabulary, VAL_VOC, embedding_matrix, inputs)
  inputs = test_set['Evidence'].tolist() + test_set['Claim'].tolist()
  vocabulary, embedding_matrix = utils.add_oov(vocabulary, TEST_VOC, embedding_matrix, inputs)

  with open("res/vocabulary.pkl", "wb") as file:
    pickle.dump(vocabulary, file)
  np.save("res/embedding_matrix.npy", embedding_matrix)
  return vocabulary, embedding_matrix


def get_voc_and_matrix(train_set, val_set, test_set):
    if os.path.exists("res/vocabulary.pkl"):
      print('The vocabulary is already present. Loading it...', end=' ')
      with open('res/vocabulary.pkl', 'rb') as f:
        vocabulary = pickle.load(f)
      print('Done!')
      if os.path.exists("res/embedding_matrix.npy"):
        print('The embedding matrix is already present. Loading it...', end=' ')
        embedding_matrix = np.load("res/embedding_matrix.npy", )
        print("Done!")
      else:
        print('The embedding matrix is NOT present. You can download it from https://gitlab.com/sasso-effe/nlp-assignment-data and put it in the res folder. Alternatively you can generate a new embedding matrix, but the process is very long.\n')
        answer = input('Do you want to generate a new embedding matrix (and a new vocabulary)? (Y/n)')
        answer = answer in ['Y', 'y', 'yes', 'Yes']
        if answer:
          vocabulary, embedding_matrix = __generate_voc_and_matrix(train_set, val_set, test_set)
          print("Vocabulary and embedding matrix created! Remember to download the generated files if you are on Colab.")
        else:
          raise Exception('Downaload the embedding matrix from https://gitlab.com/sasso-effe/nlp-assignment-data and rerun this cell')
    else:
      print("The vocabulary and the embedding matrix are NOT present. Creating them...")
      vocabulary, embedding_matrix = __generate_voc_and_matrix(train_set, val_set, test_set)
      print("Vocabulary and embedding matrix created! Remember to download the generated files if you are on Colab.")
    return vocabulary, embedding_matrix


## Tokenization

In [8]:
def tokenize(input: List, vocabulary) -> torch.Tensor:
  result = list(map(lambda x: vocabulary[x], input))
  return result

def detokenize(input: torch.Tensor, inverse_vocabulary) -> List:
  result = input.tolist()
  # FIXME sistemare computazione
  result = list(map(lambda x: inverse_vocabulary[x], result))
  return result

def full_tokenization(data, vocabulary):
    data['Evidence'] = data['Evidence'].map(lambda x: tokenize(x, vocabulary))
    data['Claim'] = data['Claim'].map(lambda x: tokenize(x, vocabulary))
    data['Label'] = data['Label'].map(lambda x: float(1.0) if x == 'SUPPORTS' else float(0.0))
    return data

# Putting everything together

In [10]:
def load_data(padding=False):
    utils.download_data('dataset')
    train_set = pd.read_csv("dataset/train_pairs.csv")
    val_set = pd.read_csv("dataset/val_pairs.csv")
    test_set = pd.read_csv("dataset/test_pairs.csv")

    # preprocessing
    train_set = full_preprocessing(train_set)
    val_set = full_preprocessing(val_set)
    test_set = full_preprocessing(test_set)

    if padding:
        evidence_lens = [len(te) for te in train_set['Evidence']] + [len(ve) for ve in val_set['Evidence']] + [len(te) for te in test_set['Evidence']]
        max_evidence_len = np.quantile(evidence_lens, 0.99, interpolation='nearest')

        claim_lens = [len(te) for te in train_set['Claim']] + [len(ve) for ve in val_set['Claim']] + [len(te) for te in test_set['Claim']]
        max_claim_len = np.quantile(claim_lens, 0.99, interpolation='nearest')

        tr_ev_lens = train_set['Evidence'].map(lambda e: len(e))
        tr_claim_lens = train_set['Claim'].map(lambda c: len(c))
        train_set = train_set.drop(train_set[(tr_ev_lens >= max_evidence_len) | (tr_claim_lens >= max_claim_len)].index).reset_index()
        va_ev_lens = val_set['Evidence'].map(lambda e: len(e))
        va_claim_lens = val_set['Claim'].map(lambda c: len(c))
        val_set = val_set.drop(val_set[(va_ev_lens >= max_evidence_len) | (va_claim_lens >= max_claim_len)].index).reset_index()
        te_ev_lens = test_set['Evidence'].map(lambda e: len(e))
        te_claim_lens = test_set['Claim'].map(lambda c: len(c))
        test_set = test_set.drop(test_set[(te_ev_lens >= max_evidence_len) | (te_claim_lens >= max_claim_len)].index).reset_index()

        train_set.loc[:, 'Evidence'] = train_set['Evidence'].map(lambda x: x + (['<PAD>'] * (max_evidence_len - len(x))))
        val_set.loc[:, 'Evidence'] = val_set['Evidence'].map(lambda x: x + (['<PAD>'] * (max_evidence_len - len(x))))
        test_set.loc[:, 'Evidence'] = test_set['Evidence'].map(lambda x: x + (['<PAD>'] * (max_evidence_len - len(x))))

        train_set.loc[:, 'Claim'] = train_set['Claim'].map(lambda x: x + (['<PAD>'] * (max_claim_len - len(x))))
        val_set.loc[:, 'Claim'] = val_set['Claim'].map(lambda x: x + (['<PAD>'] * (max_claim_len - len(x))))
        test_set.loc[:, 'Claim'] = test_set['Claim'].map(lambda x: x + (['<PAD>'] * (max_claim_len - len(x))))


    vocabulary, embedding_matrix = get_voc_and_matrix(train_set, val_set, test_set)

    # tokenization
    train_set = full_tokenization(train_set, vocabulary)
    val_set = full_tokenization(val_set, vocabulary)
    test_set = full_tokenization(test_set, vocabulary)

    return vocabulary, embedding_matrix, (train_set, val_set, test_set)

# Model definition

In [10]:
# lambda layers

class LambdaLast(torch.nn.Module):
  def forward(self, x):
    return x[:,-1]

class LambdaAvg(torch.nn.Module):
  def forward(self, x):
    return torch.mean(x, dim=1)

class LambdaConcatenation(torch.nn.Module):
  def forward(self, x, y):
    return torch.cat((x, y), dim=1)

class LambdaSum(torch.nn.Module):
  def forward(self, x, y):
    return x + y

class LambdaMean(torch.nn.Module):
  def forward(self, x, y):
    return (x + y) / 2


In [11]:
def get_binary_classifier(name:str,
                    w_in: int,
                    w_hidden: int,
                    n_layers=2
                          ) -> nn.Sequential:
    """Gets a sequential container with a linear+relu+linear classifier

    Args:
        name: the name prefix to append to each layer in the container.
        w_in: the number of the input features.
        w_hidden: the number of internal weights

    Returns: the created sequential.
    """
    assert n_layers >= 2, f'n_layers is {n_layers}, must be >= 2'
    container = nn.Sequential()
    container.add_module(f'{name}_fc1', nn.Linear(in_features=w_in, out_features=w_hidden))    
    container.add_module(f'{name}_ReLU', nn.ReLU())
    for i in range(n_layers-2):
      container.add_module(f'{name}_fc{i+2}', nn.Linear(in_features=w_hidden, out_features=w_hidden))
      container.add_module(f'{name}_ReLU', nn.ReLU())
    container.add_module(f'{name}_fc2', nn.Linear(in_features=w_hidden, out_features=1))
    return container

 
class RNNEncoder(torch.nn.Module):

  def __init__(self, input_size, hidden_size, num_layers, rnn_type='elman', output_fn='last', verbose=False):
    super().__init__()
    if verbose:
      print('Initializing RNNEncoder ')
    types = {'elman': nn.RNN, 'lstm': nn.LSTM, 'gru': nn.GRU}
    output_layers = {'last': LambdaLast, 'avg': LambdaAvg}
    try:
      self.output_layer = output_layers[output_fn]()
    except:
      raise ValueError(f"wrong type '{output_fn}', must be in {list(output_layers.keys())}")

    try:
      rec_module = types[rnn_type]
    except:
      valid_types = list(types.keys())
      raise ValueError(f"wrong type '{rnn_type}', must be in {valid_types}")
    self.rec_module = rec_module(input_size=input_size, hidden_size=hidden_size,
                                 bidirectional=True, batch_first=True,
                                 num_layers=num_layers)
    
  def forward(self, x):
    x, _ = self.rec_module(x)
    x = self.output_layer(x)
    x, _ = torch.nn.utils.rnn.pad_packed_sequence(x, batch_first=True, padding_value=400000)
    return x

class BagOfVectorsEncoder(torch.nn.Module):

  def __init__(self):
    super().__init__()

  def forward(self, x):
    x, _ = torch.nn.utils.rnn.pad_packed_sequence(x, batch_first=True, padding_value=400000)
    # TODO: check if the mean is computed on the right axis
    return torch.mean(x, dim=1)


class MLPEncoder(torch.nn.Module):

  def __init__(self, ws_in, w_out, layers=None, name='mlp'):
    super().__init__()
    self.ws_in = ws_in
    self.name = name
    self.__build_mlps(w_out, layers)

  def forward(self, x):
    batch_size, max_tokens, embedding_dim = x.shape
    in_features = max_tokens * embedding_dim
    x = torch.reshape(x, (batch_size, in_features))
    if in_features == self.ws_in[0]:
        x = self.evidence_mlp(x)
    elif in_features == self.ws_in[1]:
        x = self.claim_mlp(x)
    else:
        raise ValueError(f'wrong input shape {x.shape}, must be in {self.ws_in}')
    return x

  def __build_mlps(self, w_out, layers):
    if layers is None:
        layers = [w_out, w_out]
    else:
        layers = [w_out] + layers

    self.evidence_mlp = nn.Sequential()
    self.evidence_mlp.add_module(f'{self.name}_fc1', nn.Linear(in_features=self.ws_in[0], out_features=layers[0]))
    for i, (w_in, w_out) in enumerate(zip(layers[:-1], layers[1:])):
      self.evidence_mlp.add_module(f'{self.name}_ReLU', nn.ReLU())
      self.evidence_mlp.add_module(f'{self.name}_fc{i+2}', nn.Linear(in_features=w_in, out_features=w_out))

    self.claim_mlp = nn.Sequential()
    self.claim_mlp.add_module(f'{self.name}_fc1', nn.Linear(in_features=self.ws_in[1], out_features=layers[0]))
    for i, (w_in, w_out) in enumerate(zip(layers[:-1], layers[1:])):
      self.claim_mlp.add_module(f'{self.name}_ReLU', nn.ReLU())
      self.claim_mlp.add_module(f'{self.name}_fc{i+2}', nn.Linear(in_features=w_in, out_features=w_out))

In [12]:
class FactChecker(torch.nn.Module):

  def __init__(self, embedding_matrix, encoder, merger, ws_in=None, rnn_type=None, rnn_output=None, rec_size=1, hid_size=50, n_layers_classifier=3, cosine_similarity=False):
    """
      A recurrent network performing Neural Language Inference (Fact Checking).
      Params:
        embedding_matrix: the embedding matrix for word embedding
        encoder: [rnn, mlp, bag], the encoder to compute the sentence embedding
        merger: [concatenation, sum, mean], the multi-input merging strategy
        n_layers_classifier: int, the number of layers in the classifier
        RNNEncoder params, only relevant if encoder==rnn:
          rnn_type: [elman, lstm, gru], the RNN architecure used in the encoder
          rnn_output: [last, avg], the function to compute the sentence encoding from the RNN hidden states
          rec_size: int, the number of layers in the rnn
          hid_size: int, the hidden size of the rnn
    """
    super().__init__()
    self.hid_size = hid_size
    self.cosine_similarity = cosine_similarity

    # Word embedding
    emb_size = embedding_matrix.shape[1]
    self.emb_layer = nn.Embedding.from_pretrained(torch.as_tensor(embedding_matrix))

    # Sentence embedding
    if encoder == 'rnn':
      self.encoder = RNNEncoder(emb_size, hid_size, rec_size, rnn_type=rnn_type, output_fn=rnn_output)
    elif encoder == 'mlp':
      self.encoder = MLPEncoder(ws_in=ws_in, w_out=100)
    elif encoder == 'bag':
      self.encoder = BagOfVectorsEncoder()
    else:
      raise ValueError(f"Wrong encoder '{encoder}', must be in ['rnn', 'mlp', 'bag']")

    # Merging
    merging_strategies = {
        'concatenation': LambdaConcatenation,
        'sum': LambdaSum,
        'mean': LambdaMean
    }
    try:
      merging_layer = merging_strategies[merger]
    except:
      valid_strategies = list(merging_strategies.keys())
      raise ValueError(f"wrong type '{merger}', must be in {valid_strategies}")
    self.merger = merging_layer()

    # Classifier
    classifier_in = self.hid_size * 2 if encoder=='rnn' else 100
    if merger == 'concatenation':
      classifier_in *= 2
    if self.cosine_similarity:
      classifier_in += 1
    self.classifier = get_binary_classifier('classifier', w_in=classifier_in, w_hidden=classifier_in//2, n_layers=n_layers_classifier)

  def forward(self, claim, evidence, debug=False):
    # Word embedding
    claim = self.emb_layer(claim).float()
    evidence = self.emb_layer(evidence).float()
    if debug:
      print("After word embedding")
      print(f"\tclaim.shape: {claim.shape}")
      print(f"\tevidence.shape: {evidence.shape}")
    # Sentence embedding
    claim = self.encoder(claim)
    evidence = self.encoder(evidence)
    if debug:
      print("After phrase encoding")
      print(f"\tclaim.shape: {claim.shape}")
      print(f"\tevidence.shape: {evidence.shape}")
    # Merging
    merged_data = self.merger(claim, evidence)
    if debug:
      print("After merging")
      print(f"\tmerged_data.shape: {merged_data.shape}")
    #Cosine Similarity
    if self.cosine_similarity:
      cs = torch.nn.functional.cosine_similarity(claim,evidence, dim=1)
      cs = torch.unsqueeze(cs, dim=1)
      merged_data = torch.cat((merged_data, cs), dim=1)
    # Classifying
    output = self.classifier(merged_data)

    return output

# Training

In [13]:
def training_step(model, optimizer, loss_fn, data_loader, device):
  model.train()
  log_dict = {'train/loss': [], 'train/accuracy': [], 'train/precision': [], 'train/recall': [], 'train/f1': []}

  for (claims, evidences, labels) in data_loader:
    # forward
    claims = claims.to(device)
    evidences = evidences.to(device)
    labels = labels.to(device)

    outputs = model(claims, evidences)
    outputs = outputs.view(outputs.size(0))
    loss = loss_fn(outputs, labels)
    loss_value = loss.item()

    if not math.isfinite(loss_value):
      print(f"Loss is {loss_value}, stopping training")
      exit(1)

    # backward
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    with torch.no_grad():
        preds = torch.sigmoid(outputs).round()
        labels, preds = labels.cpu().numpy(), preds.cpu().numpy()
        acc = ((labels == preds).sum() / labels.size).item()
        prec = precision_score(labels, preds)
        rec = recall_score(labels, preds)
        f1 = f1_score(labels, preds)


    log_dict['train/loss'].append(loss_value)
    log_dict['train/accuracy'].append(acc)
    log_dict['train/precision'].append(prec)
    log_dict['train/recall'].append(rec)
    log_dict['train/f1'].append(f1)

  return log_dict


def evaluate(model, loss_fn, data_loader, device):
  """
    Evaluate model on the given dataloader.
    Parameters:
      model: torch.nn.Module to evaluate
      loss_fn: torch.nn criterion to use to compute loss, given outputs and targets
      data_loader: torch.utils.data.DataLoader
      device: torch.device where evaluation is performed
    Returns log dict {'valid/loss' : mean loss, 'valid/{metric}': mean metric}
  """
  model.eval()
  assert len(data_loader) == 1 # must be a single batch
  split = 'valid' # TODO implement for test set

  with torch.no_grad():
    data = next(iter(data_loader))

    # Variables for majority voting
    vote_labels = []
    vote_preds = []
    # Variable for standard
    loss_values = []
    std_labels = []
    std_preds = []
    for claims, evidences, labels in data:
      claims = claims.to(device)
      evidences = evidences.to(device)
      labels = labels.to(device)
      outputs = model(claims, evidences)
      outputs = outputs.view(outputs.size(0))
      preds = torch.sigmoid(outputs).round()
      loss_values.append(loss_fn(outputs, labels).cpu().numpy())
      labels = list(labels.cpu())
      preds = list(preds.cpu())
      # Majority voting
      true_label = np.mean(labels).round()
      predicted_label = np.mean(preds).round()
      vote_labels.append(true_label)
      vote_preds.append(predicted_label)
      # Standard
      std_labels += labels
      std_preds += preds
    # Majority voting
    vote_acc = accuracy_score(vote_labels, vote_preds)
    vote_prec = precision_score(vote_labels, vote_preds)
    vote_rec = recall_score(vote_labels, vote_preds)
    vote_f1 = f1_score(vote_labels, vote_preds)
    # Standard
    acc = accuracy_score(std_labels, std_preds)
    prec = precision_score(std_labels, std_preds)
    rec = recall_score(std_labels, std_preds)
    f1 = f1_score(std_labels, std_preds)
  log_dict = {f'{split}/loss': np.mean(np.concatenate(loss_values)),
              f'{split}/accuracy': acc,
              f'{split}/precision': prec,
              f'{split}/recall': rec,
              f'{split}/f1': f1,
              f'{split}/vote_accuracy': vote_acc,
              f'{split}/vote_precision': vote_prec,
              f'{split}/vote_recall': vote_rec,
              f'{split}/vote_f1': vote_f1}
  return log_dict


In [14]:
DEFAULT_PARAMS = {
          'encoder': 'rnn',
          'merger': 'concatenation',
          'rnn_type': 'elman',
          'rnn_output': 'last',
          'rec_size': 1,
          'hid_size': 50,
          'cosine_similarity': False
      }
def train(optimizer, lr, device, n_epochs, verbose, batch_size, test=False, tags=None, **model_params):
  model_params = {**DEFAULT_PARAMS, **model_params}

  cfg_dict = {'epochs': n_epochs, 'batch_size': batch_size, 'optimizer': optimizer, 'lr': lr, 'params': model_params}

  wandb.login(key=utils.get_wandbkey())
  run = wandb.init(project="assignment-two", entity="nlpetroni", reinit=True, config=cfg_dict, tags=tags)
  wandb.define_metric("train_step")
  wandb.define_metric("epoch")
  wandb.define_metric('train/loss', step_metric="train_step", summary="min")
  wandb.define_metric("train/accuracy", step_metric="train_step", summary="max")
  wandb.define_metric("valid/loss", step_metric="epoch", summary="min")
  wandb.define_metric("valid/accuracy", step_metric="epoch", summary="max")
  wandb.define_metric("valid/precision", step_metric="epoch", summary="max")
  wandb.define_metric("valid/recall", step_metric="epoch", summary="max")
  wandb.define_metric("valid/f1", step_metric="epoch", summary="max")
  wandb.define_metric("valid/vote_accuracy", step_metric="epoch", summary="max")
  wandb.define_metric("valid/vote_precision", step_metric="epoch", summary="max")
  wandb.define_metric("valid/vote_recall", step_metric="epoch", summary="max")
  wandb.define_metric("valid/vote_f1", step_metric="epoch", summary="max")

  if model_params['encoder'] == 'mlp':
      # with MLP encoder, every sequence must have the same length, thus we must pad at the dataset level
      padding = True
      collate_fn = to_tensor_collate
  else:
      # with other encoders, pad each batch independently
      padding = False
      collate_fn = pad_batch_collate

  vocabulary, embedding_matrix, dataset = load_data(padding=padding)
  train_set, valid_set, test_set = dataset
  train_ds = Dataset(train_set['ID'], train_set['Claim'], train_set['Evidence'], train_set['Label'])
  valid_ds = Dataset(valid_set['ID'], valid_set['Claim'], valid_set['Evidence'], valid_set['Label'], majority_vote=True)
  test_ds = Dataset(test_set['ID'], test_set['Claim'], test_set['Evidence'], test_set['Label'], majority_vote=True)
  train_dl = torch.utils.data.DataLoader(train_ds, batch_size=batch_size, collate_fn=collate_fn, shuffle=True)
  valid_dl = torch.utils.data.DataLoader(valid_ds, batch_size=len(valid_ds), collate_fn=pad_vote_collate)
  test_dl = torch.utils.data.DataLoader(test_ds, batch_size=len(test_ds), collate_fn=pad_vote_collate)
  if model_params['encoder'] == 'mlp':
      model_params['ws_in'] = (len(train_set.loc[0, 'Evidence'] * embedding_matrix.shape[1]), len(train_set.loc[0, 'Claim'] * embedding_matrix.shape[1]))

  model = FactChecker(embedding_matrix, **model_params)
  model.to(device)
  wandb.watch(model, log_graph=True)
  if verbose:
    print(summary(model))

  params = [p for p in model.parameters() if p.requires_grad]
  if optimizer == 'rmsprop':
    optimizer = torch.optim.RMSprop(params, lr=lr, alpha=0.99, momentum=0.5, weight_decay=0)
  elif optimizer == 'adam':
    optimizer = torch.optim.Adam(params, lr=lr, betas=(0.9, 0.999), weight_decay=0)
  else:
    raise ValueError(f'wrong optim {optimizer}, either rmsprop or adam')

  loss = nn.BCEWithLogitsLoss()
  valid_loss = nn.BCEWithLogitsLoss(reduction='none')
  train_step = 0
  print('STARTING TRAINING')
  print('Evaluation metrics are computed on validation set\n')
  print(f'EPOCHS | TRAIN LOSS | VALID LOSS | ACCURACY | PRECISION | RECALL | {"F1":^5} | V_ACC | V_PREC | V_REC | V_F1')

  for epoch in range(n_epochs):
    log_dict = training_step(model, optimizer, loss, train_dl, device)
    if not test:
      log_dict.update(evaluate(model, valid_loss, valid_dl, device))
      for batch_loss, batch_acc, batch_f1 in zip(log_dict['train/loss'], log_dict['train/accuracy'], log_dict['train/f1']):
        wandb.log({'train_step': train_step, 'epoch': epoch, 'train/loss': batch_loss, 'train/accuracy': batch_acc,
                   'train/f1': batch_f1})
        train_step += 1
      wandb.log({'epoch': epoch, 'valid/loss': log_dict['valid/loss'],
                 'valid/accuracy': log_dict['valid/accuracy'],
                 'valid/precision': log_dict['valid/precision'],
                 'valid/recall': log_dict['valid/recall'],
                 'valid/f1': log_dict['valid/f1'],
                 'valid/vote_accuracy': log_dict['valid/vote_accuracy'],
                 'valid/vote_precision': log_dict['valid/vote_precision'],
                 'valid/vote_recall': log_dict['valid/vote_recall'],
                 'valid/vote_f1': log_dict['valid/vote_f1']})
      print(f'[{epoch:02d}/{n_epochs:02d}]| {np.mean(log_dict["train/loss"]):10.3f} | {log_dict["valid/loss"]:10.3f} | {log_dict["valid/accuracy"]:8.3f} | {log_dict["valid/precision"]:9.3f} | {log_dict["valid/recall"]:6.3f} | {log_dict["valid/f1"]:4.3f} | {log_dict["valid/vote_accuracy"]:5.3f} | {log_dict["valid/vote_precision"]:6.3f} | {log_dict["valid/vote_recall"]:5.3f} | {log_dict["valid/vote_f1"]:4.3f}')
  if test:
    log_dict = evaluate(model, loss, test_dl, device)
    wandb.log()

  run.finish()
  return model

In [18]:
def GridSearch(parameters, optimizer_name='adam', device=None, verbose=True):
  for rnn_type in parameters['rnn_type']:
    for lr in parameters['lr']:
      for batch_size in parameters['batch_size']:
        for num_layer_class in parameters['num_layer_class']:
          for num_layer_rnn in parameters['num_layer_rnn']:
            train(optimizer_name=optimizer_name, lr=lr, device=device, n_epochs=20, verbose=verbose, batch_size=batch_size,
                encoder='rnn', merger='concatenation', rnn_type=rnn_type, rnn_output='last', hid_size=50 , n_layers_classifier=num_layer_class, rec_size=num_layer_rnn)

# Hyperparameters tuning

## Sentence encoding
### Last state of a RNN

In [16]:
out_dir = './models'
if not os.path.isdir(out_dir):
    os.mkdir(out_dir)
lr = 1e-3
epochs = 20
batch_size = 512
rec_size = 2
hid_size = 50
tag = 'encoding choice'

In [17]:
# elman
model = train(optimizer='adam', lr=lr, device=device, n_epochs=epochs, verbose=False, tags=[tag],
              batch_size=batch_size, encoder='rnn', merger='concatenation', rnn_type='elman',
              rnn_output='last', rec_size=rec_size, hid_size=hid_size, cosine_similarity=False)
torch.save(model.state_dict(), os.path.join(out_dir, 'rnn_last.pth'))

[34m[1mwandb[0m: Currently logged in as: [33mdiegochine[0m (use `wandb login --relogin` to force relogin)
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /home/diego/.netrc


The vocabulary is already present. Loading it... Done!
The embedding matrix are already present. Loading it... Done!


[34m[1mwandb[0m: logging graph, to disable use `wandb.watch(log_graph=False)`


STARTING TRAINING
Evaluation metrics are computed on validation set

EPOCHS | TRAIN LOSS | VALID LOSS | ACCURACY | PRECISION | RECALL |  F1   | V_ACC | V_PREC | V_REC | V_F1
[00/20]|      0.508 |      0.645 |    0.652 |     0.623 |  0.785 | 0.695 | 0.649 |  0.619 | 0.778 | 0.689
[01/20]|      0.421 |      0.633 |    0.661 |     0.621 |  0.839 | 0.714 | 0.657 |  0.616 | 0.833 | 0.708
[02/20]|      0.398 |      0.607 |    0.679 |     0.638 |  0.837 | 0.724 | 0.676 |  0.634 | 0.833 | 0.720
[03/20]|      0.382 |      0.592 |    0.679 |     0.662 |  0.740 | 0.699 | 0.677 |  0.659 | 0.732 | 0.693
[04/20]|      0.373 |      0.641 |    0.661 |     0.643 |  0.735 | 0.686 | 0.658 |  0.639 | 0.728 | 0.681
[05/20]|      0.363 |      0.607 |    0.680 |     0.664 |  0.740 | 0.700 | 0.679 |  0.662 | 0.733 | 0.695
[06/20]|      0.354 |      0.608 |    0.684 |     0.649 |  0.813 | 0.722 | 0.681 |  0.645 | 0.805 | 0.716
[07/20]|      0.347 |      0.601 |    0.682 |     0.652 |  0.793 | 0.715 | 0.680 |  

VBox(children=(Label(value=' 0.00MB of 0.00MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
epoch,▁▁▁▁▂▂▂▂▂▂▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▇▇▇▇▇▇████
train/accuracy,▁▃▅▆▅▅▅▆▆▅▆▇▆▆▇▆▇▆▆▆▇▆▇▆▇▇▇▆▇▇▇▇▇▇▇▇▇█▇█
train/f1,▁▂▄▆▅▅▅▆▆▅▆▇▅▆▇▆▇▆▆▆▇▆▇▆▇▆▇▆▇▇▇▆▇▇▇▇▇▇▇█
train/loss,█▇▅▄▄▄▅▄▃▄▃▃▄▃▃▃▃▂▃▃▃▃▂▃▂▃▂▃▂▂▂▂▂▂▁▂▁▂▂▁
train_step,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
valid/accuracy,▁▂▅▅▂▅▆▅▇▇▇▆▆▅▅▇██▆▇
valid/f1,▃▆█▃▁▃▇▆█▅▇█▂▃▂█▇▅▄▅
valid/loss,█▆▃▁█▃▃▂▁▁▃▃▃▄▅▃▆▅▆█
valid/precision,▁▁▃▅▃▆▄▄▅▇▆▄█▆▇▆██▇█
valid/recall,▅██▃▃▃▇▆▆▃▅▇▁▂▁▆▄▃▂▃

0,1
epoch,19.0
train/f1,0.91497
train_step,4759.0
valid/vote_accuracy,0.69344
valid/vote_f1,0.70292
valid/vote_precision,0.68144
valid/vote_recall,0.72579


In [18]:
# lstm
model = train(optimizer='adam', lr=lr, device=device, n_epochs=epochs, verbose=False, tags=[tag],
                        batch_size=batch_size, encoder='rnn', merger='concatenation', rnn_type='lstm',
                        rnn_output='last', rec_size=rec_size, hid_size=hid_size, cosine_similarity=False)
torch.save(model.state_dict(), os.path.join(out_dir, 'lstm_last.pth'))

[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /home/diego/.netrc


The vocabulary is already present. Loading it... Done!
The embedding matrix are already present. Loading it... Done!


[34m[1mwandb[0m: logging graph, to disable use `wandb.watch(log_graph=False)`
  _warn_prf(average, modifier, msg_start, len(result))


STARTING TRAINING
Evaluation metrics are computed on validation set

EPOCHS | TRAIN LOSS | VALID LOSS | ACCURACY | PRECISION | RECALL |  F1   | V_ACC | V_PREC | V_REC | V_F1
[00/20]|      0.513 |      0.575 |    0.694 |     0.652 |  0.842 | 0.735 | 0.692 |  0.649 | 0.836 | 0.731
[01/20]|      0.402 |      0.544 |    0.713 |     0.649 |  0.936 | 0.767 | 0.711 |  0.646 | 0.932 | 0.763
[02/20]|      0.378 |      0.556 |    0.710 |     0.642 |  0.961 | 0.769 | 0.708 |  0.639 | 0.958 | 0.766
[03/20]|      0.361 |      0.539 |    0.726 |     0.657 |  0.955 | 0.778 | 0.724 |  0.653 | 0.952 | 0.775
[04/20]|      0.347 |      0.589 |    0.712 |     0.643 |  0.963 | 0.771 | 0.710 |  0.640 | 0.962 | 0.768
[05/20]|      0.334 |      0.624 |    0.715 |     0.644 |  0.973 | 0.775 | 0.713 |  0.640 | 0.972 | 0.772
[06/20]|      0.324 |      0.593 |    0.722 |     0.652 |  0.960 | 0.777 | 0.722 |  0.650 | 0.959 | 0.775
[07/20]|      0.315 |      0.552 |    0.735 |     0.664 |  0.956 | 0.784 | 0.733 |  

VBox(children=(Label(value=' 0.00MB of 0.00MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
epoch,▁▁▁▁▂▂▂▂▂▂▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▇▇▇▇▇▇████
train/accuracy,▁▃▅▅▅▅▆▆▅▇▅▆▆▆▇▇▆▆▆▆▇▇▇█▆▇█▇▇▇▇▆▇█▇████▇
train/f1,▁▂▅▄▄▅▅▆▅▇▅▆▆▅▇▇▆▆▆▆▇▆▇█▆▇█▆▇▇▇▆▇█▇▇███▆
train/loss,█▆▅▄▄▄▃▄▄▃▄▃▂▃▂▂▃▃▃▃▂▃▂▁▂▂▁▃▂▂▂▂▂▁▂▂▁▁▁▂
train_step,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
valid/accuracy,▁▃▃▅▃▄▅▆▇▅▅▆▇▇▇███▇█
valid/f1,▁▅▅▇▆▆▆▇█▇▇▇▇████▇██
valid/loss,▃▁▂▁▃▅▃▂▂▃▄▃▃▃▆▄▅▆█▇
valid/precision,▃▂▁▃▁▁▃▅▅▄▄▅▅▆▆▇▇█▆▇
valid/recall,▁▆▇▇▇█▇▇▇▇▇▇▇▇▆▆▆▅▆▆

0,1
epoch,19.0
train/f1,0.94118
train_step,4759.0
valid/vote_accuracy,0.74788
valid/vote_f1,0.78762
valid/vote_precision,0.68009
valid/vote_recall,0.93553


In [19]:
# gru
model = train(optimizer='adam', lr=lr, device=device, n_epochs=epochs, verbose=False, tags=[tag],
                       batch_size=batch_size, encoder='rnn', merger='concatenation', rnn_type='gru',
                       rnn_output='last', rec_size=rec_size, hid_size=hid_size, cosine_similarity=False)
torch.save(model.state_dict(), os.path.join(out_dir, 'gru_last.pth'))

[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /home/diego/.netrc


The vocabulary is already present. Loading it... Done!
The embedding matrix are already present. Loading it... Done!


[34m[1mwandb[0m: logging graph, to disable use `wandb.watch(log_graph=False)`


STARTING TRAINING
Evaluation metrics are computed on validation set

EPOCHS | TRAIN LOSS | VALID LOSS | ACCURACY | PRECISION | RECALL |  F1   | V_ACC | V_PREC | V_REC | V_F1
[00/20]|      0.479 |      0.616 |    0.682 |     0.618 |  0.966 | 0.754 | 0.680 |  0.615 | 0.964 | 0.751
[01/20]|      0.390 |      0.571 |    0.702 |     0.635 |  0.963 | 0.765 | 0.700 |  0.631 | 0.961 | 0.762
[02/20]|      0.366 |      0.555 |    0.708 |     0.640 |  0.965 | 0.769 | 0.706 |  0.636 | 0.962 | 0.766
[03/20]|      0.351 |      0.542 |    0.724 |     0.653 |  0.966 | 0.779 | 0.722 |  0.649 | 0.964 | 0.776
[04/20]|      0.339 |      0.626 |    0.708 |     0.637 |  0.978 | 0.772 | 0.706 |  0.634 | 0.976 | 0.769
[05/20]|      0.327 |      0.566 |    0.724 |     0.653 |  0.966 | 0.779 | 0.722 |  0.650 | 0.965 | 0.776
[06/20]|      0.317 |      0.537 |    0.735 |     0.666 |  0.951 | 0.784 | 0.734 |  0.663 | 0.949 | 0.781
[07/20]|      0.307 |      0.550 |    0.729 |     0.658 |  0.961 | 0.781 | 0.727 |  

VBox(children=(Label(value=' 0.00MB of 0.00MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
epoch,▁▁▁▁▂▂▂▂▂▂▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▇▇▇▇▇▇████
train/accuracy,▁▃▄▄▅▆▅▅▅▅▅▆▆▄▆▅▅▆▅▅▇▆▆▆▆▆▇▇▇▆▆▇▇▆█▇▇▆█▇
train/f1,▁▂▃▄▅▆▅▅▄▄▅▆▅▄▆▄▄▅▅▅▆▅▅▅▆▆▇▆▇▆▆▇▇▆█▆▇▆█▇
train/loss,█▇▆▆▅▅▄▅▅▅▄▄▄▅▄▅▅▄▄▄▃▃▃▃▄▃▂▃▂▃▃▂▂▃▂▃▂▃▁▂
train_step,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
valid/accuracy,▁▃▄▆▄▆▇▆▆▇█▇█▇█▇▇▇▇▇
valid/f1,▁▃▄▆▅▆▇▇▆▇█▇█▇▇▇▇▆▇▆
valid/loss,▄▂▂▁▄▂▁▁▃▂▂▃▃▃▄▅▅▄▆█
valid/precision,▁▃▄▅▃▅▇▆▅▇▇▇█▇█▇▇██▇
valid/recall,▇▆▆▆█▆▄▆▆▆▅▅▅▅▄▅▄▁▂▃

0,1
epoch,19.0
train/f1,0.93718
train_step,4759.0
valid/vote_accuracy,0.73367
valid/vote_f1,0.77885
valid/vote_precision,0.66559
valid/vote_recall,0.93856


### Average of all the output states of a RNN

In [20]:
# elman
model = train(optimizer='adam', lr=lr, device=device, n_epochs=epochs, verbose=False, tags=[tag],
                        batch_size=batch_size, encoder='rnn', merger='concatenation', rnn_type='elman',
                        rnn_output='avg', rec_size=rec_size, hid_size=hid_size, cosine_similarity=False)
torch.save(model.state_dict(), os.path.join(out_dir, 'rnn_avg.pth'))

[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /home/diego/.netrc


The vocabulary is already present. Loading it... Done!
The embedding matrix are already present. Loading it... Done!


[34m[1mwandb[0m: logging graph, to disable use `wandb.watch(log_graph=False)`
  _warn_prf(average, modifier, msg_start, len(result))


STARTING TRAINING
Evaluation metrics are computed on validation set

EPOCHS | TRAIN LOSS | VALID LOSS | ACCURACY | PRECISION | RECALL |  F1   | V_ACC | V_PREC | V_REC | V_F1
[00/20]|      0.532 |      0.669 |    0.653 |     0.614 |  0.840 | 0.709 | 0.653 |  0.612 | 0.832 | 0.705
[01/20]|      0.451 |      0.641 |    0.674 |     0.625 |  0.880 | 0.731 | 0.672 |  0.623 | 0.873 | 0.727
[02/20]|      0.422 |      0.620 |    0.702 |     0.679 |  0.775 | 0.724 | 0.700 |  0.678 | 0.763 | 0.718
[03/20]|      0.403 |      0.600 |    0.697 |     0.645 |  0.883 | 0.746 | 0.694 |  0.642 | 0.876 | 0.741
[04/20]|      0.406 |      0.579 |    0.719 |     0.690 |  0.802 | 0.742 | 0.717 |  0.689 | 0.790 | 0.736
[05/20]|      0.384 |      0.582 |    0.711 |     0.683 |  0.795 | 0.735 | 0.708 |  0.681 | 0.783 | 0.728
[06/20]|      0.377 |      0.596 |    0.713 |     0.665 |  0.869 | 0.753 | 0.712 |  0.663 | 0.862 | 0.749
[07/20]|      0.371 |      0.572 |    0.714 |     0.675 |  0.834 | 0.746 | 0.711 |  

VBox(children=(Label(value=' 0.00MB of 0.00MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
epoch,▁▁▁▁▂▂▂▂▂▂▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▇▇▇▇▇▇████
train/accuracy,▁▁▄▄▄▅▅▅▆▆▆▆▅▇▅▇▅▇▅▆█▅▇▆▆▇▆▇▇▇▇▆▇▇▇▇█▆▇█
train/f1,▁▁▄▄▄▅▄▄▅▆▆▆▄▇▄▆▅▇▅▅█▅▇▆▆▆▆▇█▇▇▆▇▇█▇█▆▇█
train/loss,█▇▅▅▅▄▅▄▄▃▃▄▄▃▄▂▅▃▃▃▂▄▂▂▂▃▃▂▁▂▂▃▂▂▁▂▁▃▁▁
train_step,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
valid/accuracy,▁▃▆▅▇▆▇▇▇█▇█▇▇██▇███
valid/f1,▁▄▃▆▆▅▇▆▇▅▆▇▅▇▇█▅▆▇▆
valid/loss,█▆▅▄▃▃▄▃▁▂▂▂▂▂▂▄▁▁▃▂
valid/precision,▁▂▆▃▇▆▅▆▇█▇▇▇▆▇▆▇▇▇█
valid/recall,▅█▁█▃▂▇▅▄▁▂▅▂▄▃▆▂▃▄▂

0,1
epoch,19.0
train/f1,0.91228
train_step,4759.0
valid/vote_accuracy,0.72278
valid/vote_f1,0.73592
valid/vote_precision,0.70223
valid/vote_recall,0.773


In [21]:
# lstm
model = train(optimizer='adam', lr=lr, device=device, n_epochs=epochs, verbose=False, tags=[tag],
                       batch_size=batch_size, encoder='rnn', merger='concatenation', rnn_type='lstm',
                       rnn_output='avg', rec_size=rec_size, hid_size=hid_size, cosine_similarity=False)
torch.save(model.state_dict(), os.path.join(out_dir, 'lstm_avg.pth'))

[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /home/diego/.netrc


The vocabulary is already present. Loading it... Done!
The embedding matrix are already present. Loading it... Done!


[34m[1mwandb[0m: logging graph, to disable use `wandb.watch(log_graph=False)`


STARTING TRAINING
Evaluation metrics are computed on validation set

EPOCHS | TRAIN LOSS | VALID LOSS | ACCURACY | PRECISION | RECALL |  F1   | V_ACC | V_PREC | V_REC | V_F1
[00/20]|      0.522 |      0.712 |    0.624 |     0.574 |  0.986 | 0.726 | 0.623 |  0.571 | 0.985 | 0.723
[01/20]|      0.418 |      0.617 |    0.673 |     0.610 |  0.975 | 0.750 | 0.672 |  0.607 | 0.973 | 0.748
[02/20]|      0.393 |      0.616 |    0.681 |     0.615 |  0.977 | 0.755 | 0.680 |  0.613 | 0.975 | 0.753
[03/20]|      0.379 |      0.584 |    0.699 |     0.631 |  0.966 | 0.764 | 0.697 |  0.629 | 0.963 | 0.761
[04/20]|      0.367 |      0.557 |    0.715 |     0.650 |  0.938 | 0.768 | 0.713 |  0.648 | 0.934 | 0.765
[05/20]|      0.357 |      0.558 |    0.719 |     0.652 |  0.950 | 0.773 | 0.718 |  0.650 | 0.946 | 0.771
[06/20]|      0.346 |      0.569 |    0.717 |     0.650 |  0.950 | 0.772 | 0.715 |  0.647 | 0.947 | 0.769
[07/20]|      0.340 |      0.586 |    0.716 |     0.648 |  0.957 | 0.772 | 0.714 |  

VBox(children=(Label(value=' 0.00MB of 0.00MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
epoch,▁▁▁▁▂▂▂▂▂▂▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▇▇▇▇▇▇████
train/accuracy,▁▁▂▅▃▆▄▅▆▄▆▆▅▆▄▅▇▄▇▆▆▆▆▅▆▆▆▆▆▆▆▆█▇▇▇▇▇▇▇
train/f1,▂▁▁▄▃▅▄▅▅▄▆▆▅▆▄▅▇▃▇▆▆▆▅▅▆▆▆▆▆▆▆▆█▇▇▇▆▆▇▇
train/loss,█▇▇▅▅▄▅▄▃▅▄▃▄▃▄▄▂▅▃▄▃▃▃▃▃▃▂▄▃▃▃▃▁▂▂▂▃▂▂▂
train_step,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
valid/accuracy,▁▄▅▆▇▇▇▇▇███▇▇▇▇▇██▇
valid/f1,▁▄▅▆▇▇▇▇▇████▇▇▇▇▇▇▆
valid/loss,█▄▄▂▁▁▂▂▂▂▂▄▃▄▄▄▄▅▇█
valid/precision,▁▄▄▅▆▇▆▆▇▇▇▇▇▇▇▇▇██▇
valid/recall,█▇▇▆▄▅▅▆▄▅▄▅▅▃▄▄▄▂▂▁

0,1
epoch,19.0
train/f1,0.94669
train_step,4759.0
valid/vote_accuracy,0.7131
valid/vote_f1,0.75676
valid/vote_precision,0.65651
valid/vote_recall,0.89316


In [22]:
# gru
model = train(optimizer='adam', lr=lr, device=device, n_epochs=epochs, verbose=False, tags=[tag],
                      batch_size=batch_size, encoder='rnn', merger='concatenation', rnn_type='gru',
                      rnn_output='avg', rec_size=rec_size, hid_size=hid_size, cosine_similarity=False)
torch.save(model.state_dict(), os.path.join(out_dir, 'gru_avg.pth'))

[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /home/diego/.netrc


The vocabulary is already present. Loading it... Done!
The embedding matrix are already present. Loading it... Done!


[34m[1mwandb[0m: logging graph, to disable use `wandb.watch(log_graph=False)`
  _warn_prf(average, modifier, msg_start, len(result))


STARTING TRAINING
Evaluation metrics are computed on validation set

EPOCHS | TRAIN LOSS | VALID LOSS | ACCURACY | PRECISION | RECALL |  F1   | V_ACC | V_PREC | V_REC | V_F1
[00/20]|      0.492 |      0.632 |    0.672 |     0.609 |  0.973 | 0.749 | 0.670 |  0.606 | 0.970 | 0.746
[01/20]|      0.394 |      0.649 |    0.676 |     0.611 |  0.984 | 0.754 | 0.674 |  0.607 | 0.983 | 0.751
[02/20]|      0.370 |      0.586 |    0.699 |     0.630 |  0.975 | 0.766 | 0.697 |  0.627 | 0.974 | 0.763
[03/20]|      0.357 |      0.563 |    0.711 |     0.642 |  0.963 | 0.771 | 0.709 |  0.639 | 0.960 | 0.767
[04/20]|      0.345 |      0.591 |    0.704 |     0.635 |  0.971 | 0.768 | 0.702 |  0.631 | 0.969 | 0.765
[05/20]|      0.336 |      0.515 |    0.740 |     0.675 |  0.934 | 0.783 | 0.740 |  0.673 | 0.929 | 0.781
[06/20]|      0.327 |      0.539 |    0.732 |     0.664 |  0.946 | 0.781 | 0.730 |  0.662 | 0.942 | 0.778
[07/20]|      0.319 |      0.516 |    0.745 |     0.679 |  0.937 | 0.787 | 0.744 |  

VBox(children=(Label(value=' 0.00MB of 0.00MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
epoch,▁▁▁▁▂▂▂▂▂▂▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▇▇▇▇▇▇████
train/accuracy,▁▃▅▅▄▅▆▅▅▅▅▅▆▆▆▆▇▆▆▆▆▆▇▅▆▆▇▇▇▆▇▇▆▆▇█▇▆▇█
train/f1,▁▃▄▄▄▅▅▅▅▅▅▄▅▅▆▅▇▅▅▆▅▆▆▅▆▆▇▇▆▆▇▆▆▆▆█▆▆▇█
train/loss,█▆▅▅▅▄▄▄▄▄▄▅▃▃▃▄▂▃▄▃▄▃▂▃▃▃▂▂▂▂▂▃▂▃▂▁▂▂▂▁
train_step,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
valid/accuracy,▁▁▃▅▄▇▆█▇▆▆█████▇█▇▇
valid/f1,▁▂▄▅▄▇▆█▇▇▇███▇▇█▇▇▇
valid/loss,▆▇▄▃▄▁▂▁▃▄▄▂▄▄▄▄▇▄▇█
valid/precision,▁▁▃▄▃▇▆▇▆▅▆▇▇▇▇█▆█▇▇
valid/recall,▇█▇▆▇▃▄▄▄▅▅▃▃▄▃▂▅▁▃▂

0,1
epoch,19.0
train/f1,0.94617
train_step,4759.0
valid/vote_accuracy,0.74183
valid/vote_f1,0.77971
valid/vote_precision,0.67964
valid/vote_recall,0.91435


### MLP layer

In [23]:
model = train(optimizer='adam', lr=lr, device=device, n_epochs=epochs, verbose=False,  tags=[tag],
              batch_size=batch_size, encoder='mlp', merger='concatenation', cosine_similarity=False)
torch.save(model.state_dict(), os.path.join(out_dir, 'mlp.pth'))

[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /home/diego/.netrc


The vocabulary is already present. Loading it... Done!
The embedding matrix are already present. Loading it... Done!


[34m[1mwandb[0m: logging graph, to disable use `wandb.watch(log_graph=False)`


STARTING TRAINING
Evaluation metrics are computed on validation set

EPOCHS | TRAIN LOSS | VALID LOSS | ACCURACY | PRECISION | RECALL |  F1   | V_ACC | V_PREC | V_REC | V_F1
[00/20]|      0.491 |      0.611 |    0.638 |     0.593 |  0.891 | 0.712 | 0.639 |  0.592 | 0.887 | 0.710
[01/20]|      0.422 |      0.663 |    0.643 |     0.591 |  0.938 | 0.725 | 0.642 |  0.589 | 0.935 | 0.723
[02/20]|      0.381 |      0.659 |    0.675 |     0.625 |  0.885 | 0.732 | 0.675 |  0.624 | 0.881 | 0.730
[03/20]|      0.350 |      0.603 |    0.690 |     0.662 |  0.784 | 0.718 | 0.690 |  0.661 | 0.777 | 0.714
[04/20]|      0.317 |      0.643 |    0.680 |     0.629 |  0.883 | 0.735 | 0.682 |  0.629 | 0.881 | 0.734
[05/20]|      0.288 |      0.700 |    0.684 |     0.634 |  0.878 | 0.736 | 0.685 |  0.633 | 0.875 | 0.735
[06/20]|      0.265 |      0.737 |    0.691 |     0.652 |  0.827 | 0.729 | 0.691 |  0.650 | 0.822 | 0.726
[07/20]|      0.243 |      0.808 |    0.685 |     0.636 |  0.871 | 0.735 | 0.686 |  

VBox(children=(Label(value=' 0.00MB of 0.00MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
epoch,▁▁▁▁▂▂▂▂▂▂▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▇▇▇▇▇▇████
train/accuracy,▁▂▃▃▄▄▄▄▄▅▆▅▅▅▆▆▆▇▇▆▇▆▇▇██▇▇▆▇█▇██▇▇█▇█▇
train/f1,▁▁▃▃▄▄▄▄▄▅▆▅▅▅▆▅▆▆▇▆▆▆▇▇██▇▇▆▇█▇██▇▇▇▇█▇
train/loss,██▆▆▆▆▅▅▄▄▄▄▄▄▃▃▃▃▃▃▂▃▃▂▂▁▂▂▂▂▁▂▁▁▂▂▁▂▁▂
train_step,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
valid/accuracy,▁▂▆█▆▇█▇▇▇▆▇▅▅▇▆▆▆▅▇
valid/f1,▁▄▆▂▇▇▅▇█▅▅▇▅▆▅▆▃▅▅▆
valid/loss,▁▁▁▁▁▂▂▂▃▃▃▄▅▅▆▆▆▇█▇
valid/precision,▁▁▄█▅▅▇▅▅▅▅▅▃▄▅▅▆▅▄▅
valid/recall,▆█▆▁▆▅▃▅▆▄▅▅▆▆▄▅▃▄▅▅

0,1
epoch,19.0
train/f1,1.0
train_step,4659.0
valid/vote_accuracy,0.68309
valid/vote_f1,0.72953
valid/vote_precision,0.63492
valid/vote_recall,0.85727


### Bag of vectors

In [24]:
model = train(optimizer='adam', lr=lr, device=device, n_epochs=epochs, verbose=False,  tags=[tag],
              batch_size=batch_size, encoder='bag', merger='concatenation', cosine_similarity=False)
torch.save(model.state_dict(), os.path.join(out_dir, 'bow.pth'))

[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /home/diego/.netrc


The vocabulary is already present. Loading it... Done!
The embedding matrix are already present. Loading it... Done!


[34m[1mwandb[0m: logging graph, to disable use `wandb.watch(log_graph=False)`


STARTING TRAINING
Evaluation metrics are computed on validation set

EPOCHS | TRAIN LOSS | VALID LOSS | ACCURACY | PRECISION | RECALL |  F1   | V_ACC | V_PREC | V_REC | V_F1
[00/20]|      0.570 |      0.898 |    0.516 |     0.510 |  0.993 | 0.674 | 0.512 |  0.506 | 0.992 | 0.670
[01/20]|      0.534 |      0.789 |    0.583 |     0.552 |  0.913 | 0.688 | 0.581 |  0.549 | 0.907 | 0.684
[02/20]|      0.524 |      0.788 |    0.599 |     0.565 |  0.885 | 0.690 | 0.598 |  0.563 | 0.879 | 0.686
[03/20]|      0.519 |      0.726 |    0.610 |     0.582 |  0.804 | 0.675 | 0.610 |  0.580 | 0.793 | 0.670
[04/20]|      0.514 |      0.802 |    0.600 |     0.565 |  0.899 | 0.694 | 0.599 |  0.562 | 0.894 | 0.691
[05/20]|      0.508 |      0.749 |    0.613 |     0.579 |  0.855 | 0.690 | 0.612 |  0.576 | 0.847 | 0.686
[06/20]|      0.502 |      0.825 |    0.601 |     0.564 |  0.910 | 0.697 | 0.600 |  0.562 | 0.906 | 0.694
[07/20]|      0.496 |      0.724 |    0.624 |     0.591 |  0.820 | 0.687 | 0.623 |  

VBox(children=(Label(value=' 0.00MB of 0.00MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
epoch,▁▁▁▁▂▂▂▂▂▂▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▇▇▇▇▇▇████
train/accuracy,▂▂▂▁▂▃▄▅▃▃▅▄▆▅▅▅▆▃▂▅▅▅▄▄▄▄▆▆▇▄▆▅▅▅▅▇▆▇█▆
train/f1,▂▂▃▁▁▄▅▅▃▂▅▄▆▅▅▅▆▃▂▄▅▅▄▄▄▃▆▆▆▃▆▄▅▄▅▇▆▇█▅
train/loss,█▇▇██▆▄▄▆▆▅▄▅▄▄▄▃▆▅▄▃▄▅▄▃▅▃▃▃▄▃▄▄▄▄▂▃▂▁▄
train_step,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
valid/accuracy,▁▅▅▆▅▆▅▇▆▆▆▇▆▇▆▇▇▇▇█
valid/f1,▁▄▄▁▅▄▅▃▆▆▅▇▇▇▇▇▇█▇▆
valid/loss,█▄▄▁▄▂▅▁▃▄▅▄▆▅▆▄▅▇▅▂
valid/precision,▁▄▅▆▅▆▅▇▅▅▅▆▅▆▆▇▆▆▆█
valid/recall,█▅▄▁▅▃▅▂▄▄▄▄▅▄▅▄▅▅▄▂

0,1
epoch,19.0
train/f1,0.88274
train_step,4759.0
valid/vote_accuracy,0.64837
valid/vote_f1,0.6988
valid/vote_precision,0.61087
valid/vote_recall,0.81628


The best sentence embedding is taking **the last state of an RNN encoder (LSTM)**. Indeed it reaches the highest valalues between all the runs in F1 score (both standard and group voting) and in accuracy (only group voting, since taking the average of the states of a GRU RNN results in a barely better standard accuracy).

## Merging

In [19]:
out_dir = './models'
if not os.path.isdir(out_dir):
    os.mkdir(out_dir)
lr = 1e-3
epochs = 20
batch_size = 512
rec_size = 2
hid_size = 50
tag = 'merging choice'
encoder = 'rnn'
rnn_type = 'lstm'
rnn_output = 'last'

### Concatenation

In [None]:
model = train(optimizer='adam', lr=lr, device=device, n_epochs=epochs, verbose=False,  tags=[tag],
              batch_size=batch_size, encoder=encoder, rnn_type=rnn_type, rnn_output=rnn_output, merger='concatenate', cosine_similarity=False)

### Sum

In [None]:
model = train(optimizer='adam', lr=lr, device=device, n_epochs=epochs, verbose=False,  tags=[tag],
              batch_size=batch_size, encoder=encoder, rnn_type=rnn_type, rnn_output=rnn_output, merger='sum', cosine_similarity=False)

### Mean

In [None]:
model = train(optimizer='adam', lr=lr, device=device, n_epochs=epochs, verbose=False,  tags=[tag],
              batch_size=batch_size, encoder=encoder, rnn_type=rnn_type, rnn_output=rnn_output, merger='mean', cosine_similarity=False)

The best merging strategy is

## Other hyperparameters tuning

In [None]:
out_dir = './models'
if not os.path.isdir(out_dir):
    os.mkdir(out_dir)
epochs = 20
batch_size = 512
tag = 'hyperparameters tuning'
encoder = ''
rnn_type = ''
rnn_output = ''
merger = ''

lrs = [1e-3, 1e-4, 1e-5]
rec_sizes = [1, 2]
hid_sizes = random.sample(range(25,100), 3)
n_layers_classifiers = [2, 3]

for run in range(10):
    lr = random.uniform(1e-5, 1e-3)
    rec_size = random.randint(1, 2)
    hid_size = random.randint(25, 100)
    n_layers_classifier = random.randint(2,3)
