<a href="https://colab.research.google.com/github/NLPetroni/assignment_two/blob/main/solution_2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Imports and downloads



In [1]:
import numpy as np
import pandas as pd
import sys
import os

# IN_COLAB is True only when the `google.colab` package can be imported.
try:
  import google.colab
  IN_COLAB = True
except ImportError:
  # Only a failed import means "not on Colab"; any other error (or a
  # KeyboardInterrupt) is no longer silently swallowed.
  IN_COLAB = False

# Colab-only bootstrap: fetch the project sources and the pre-computed
# embedding matrix, then install the experiment tracker.
if IN_COLAB:
  # Start from a fresh clone of the assignment repository.
  %cd /content
  !rm -rf assignment_two &> /dev/null
  !git clone https://github.com/NLPetroni/assignment_two &> /dev/null
  %cd assignment_two
  # Make the repository root importable (the next cell imports `src.utils`).
  sys.path.append(os.getcwd())
  # The embedding matrix lives in a separate data repository: clone it, move
  # the file into res/ and delete the rest of the clone.
  !git clone https://gitlab.com/sasso-effe/nlp-assignment-data.git &> /dev/null
  !mv nlp-assignment-data/embedding_matrix.npy res/embedding_matrix.npy
  !rm -rf nlp-assignment-data
  !pip install wandb


In [2]:
import math
import random
import re
from functools import reduce
from typing import Any, Callable, Dict, List, Optional

import nltk
import torch
import wandb
from nltk.corpus import stopwords
from torch import nn
from torchsummary import summary

from src import utils

In [3]:
# Fetch the dataset into ./dataset, then read the three CSV splits.
utils.download_data('dataset')
train_set, val_set, test_set = (
    pd.read_csv(csv_path)
    for csv_path in ("dataset/train_pairs.csv",
                     "dataset/val_pairs.csv",
                     "dataset/test_pairs.csv")
)

In [4]:
# Show the available columns and the number of rows of every split.
print(train_set.columns)
for split_name, split_df in (("train", train_set),
                             ("validation", val_set),
                             ("test", test_set)):
    print(f"Total rows of the {split_name} set: {len(split_df):d}")

Index(['Unnamed: 0', 'Claim', 'Evidence', 'ID', 'Label'], dtype='object')
Total rows of the train set: 121740
Total rows of the validation set: 7165
Total rows of the test set: 7189


In [5]:
# Class balance of the training labels (how many rows carry each Label value).
train_set['Label'].value_counts()

SUPPORTS    89389
REFUTES     32351
Name: Label, dtype: int64

# Dataset pre-processing and conversion

In [6]:
# Patterns are raw strings so that regex escapes such as \[ or \| are not
# parsed as (invalid) Python string escapes, which newer interpreters warn about.

# Characters that are turned into a space (tab included).
REPLACE_BY_SPACE_RE = re.compile(r'[/(){}\[\]\|@,;\t-]')
# Everything that is NOT a digit, a lower-case letter, a space, '#', '+' or '_'.
GOOD_SYMBOLS_RE = re.compile(r'[^0-9a-z #+_]')
# Bracket markers -LRB- / -RRB- / -LSB- / -RSB-.
BAD_SYMBOLS_RE = re.compile(r'(-LRB-)|(-RRB-)|(-LSB-)|(-RSB-)')
# Shortest span going from a -LSB- marker to the next -RSB- marker.
INSIDE_SQAURE_BRACKETS_RE = re.compile(r'(-LSB-).*?(-RSB-)')

# Load the English stop-word list. If the NLTK corpus is not available locally
# (LookupError), download it once and read it again.
try:
    STOPWORDS = set(stopwords.words('english'))
except LookupError:
    nltk.download('stopwords')
    STOPWORDS = set(stopwords.words('english'))

def remove_inside_square_brackets(text: str) -> str:
    """
    Deletes every span that starts at a '-LSB-' marker and ends at the
    nearest following '-RSB-' marker, markers included.
    """

    return re.sub(INSIDE_SQAURE_BRACKETS_RE, '', text)

def remove_bad_symbols(text: str) -> str:
    """
    Deletes the bracket markers matched by BAD_SYMBOLS_RE
    (-LRB-, -RRB-, -LSB-, -RSB-) from the text.
    """

    return re.sub(BAD_SYMBOLS_RE, '', text)

def remove_final_tags(text: str) -> str:
    """
    Cuts the text at the first period that is immediately followed by a tab:
    the period, the tab and everything after them are removed.
    Example:
    Input: 'he was born.\\tBorn\\tBorn (film)'
    Output: 'he was born'
    """

    # Raw string: '\.' is a regex escape, not a (invalid) Python string escape.
    return re.sub(r'\.\t.*?$', '', text)

def lower(text: str) -> str:
    """
    Returns a lower-cased copy of the given text.
    Example:
    Input: 'I really like New York city'
    Output: 'i really like new york city'
    """

    lowered = text.lower()
    return lowered

def replace_special_characters(text: str) -> str:
    """
    Substitutes each character matched by REPLACE_BY_SPACE_RE
    (parentheses, brackets, separators, tabs, ...) with a single space.
    """

    return re.sub(REPLACE_BY_SPACE_RE, ' ', text)

def replace_br(text: str) -> str:
    """
    Drops every '</br>' tag from the text.
    """

    pieces = text.split('</br>')
    return ''.join(pieces)

def filter_out_uncommon_symbols(text: str) -> str:
    """
    Keeps only the characters allowed by GOOD_SYMBOLS_RE;
    every other character is deleted.
    """

    return re.sub(GOOD_SYMBOLS_RE, '', text)

def remove_stopwords(text: str) -> str:
    """
    Splits the text on whitespace, discards the words contained in STOPWORDS
    and joins the remaining words with single spaces.
    """

    # str.split() never yields empty strings, so no extra emptiness check is needed.
    kept_words = (word for word in text.split() if word not in STOPWORDS)
    return ' '.join(kept_words)


def strip_text(text: str) -> str:
    """
    Trims leading and trailing whitespace (newlines included) from the text.
    Example:
    Input: '  This assignment is cool\n'
    Output: 'This assignment is cool'
    """

    return text.lstrip().rstrip()

def split_text(text: str) -> List:
    """
    Breaks the text into a list of whitespace-separated tokens.
    """

    tokens = text.split()
    return tokens

# Default cleaning steps, applied from first to last.
PREPROCESSING_PIPELINE = [
    # Bracketed spans are located through the -LSB-/-RSB- markers, so this has
    # to run before remove_bad_symbols deletes those markers.
    remove_inside_square_brackets,
    remove_bad_symbols,
    # Lower-case before filter_out_uncommon_symbols, whose pattern keeps a-z only.
    lower,
    # Relies on the tab separator, which replace_special_characters turns
    # into a space.
    remove_final_tags,
    replace_special_characters,
    filter_out_uncommon_symbols,
    remove_stopwords,
    strip_text,
    # Last step: turns the cleaned string into a list of tokens.
    split_text,
]

# Anchor method

def text_prepare(text: str,
                 filter_methods: Optional[List[Callable[[Any], Any]]] = None) -> Any:
    """
    Applies a list of pre-processing functions in sequence: the output of each
    function is the input of the next one.
    Note that the order is important here!

    Args:
        text: the raw text to process.
        filter_methods: functions to apply, first to last. When None, the
            module-level PREPROCESSING_PIPELINE is used.

    Returns:
        Whatever the last function returns. With the default pipeline, whose
        last step is split_text, this is a list of tokens (not a string).
    """

    if filter_methods is None:
        filter_methods = PREPROCESSING_PIPELINE

    result = text
    for method in filter_methods:
        result = method(result)
    return result


# Every evidence starts with an id, which ends up as the first token after
# pre-processing: it is dropped with the slice [1:].
for split_df in (train_set, val_set, test_set):
    split_df['Evidence'] = split_df['Evidence'].apply(lambda raw: text_prepare(raw)[1:])
    split_df['Claim'] = split_df['Claim'].apply(text_prepare)

## Padding

In [7]:
# Pad the train set first and the validation set second. Each split is padded
# to its own longest claim / longest evidence.
for split_df in (train_set, val_set):
    max_claim = max(len(tokens) for tokens in split_df['Claim'])
    max_evidence = max(len(tokens) for tokens in split_df['Evidence'])
    split_df['Claim'] = utils.pad_data(split_df['Claim'], max_claim)
    split_df['Evidence'] = utils.pad_data(split_df['Evidence'], max_evidence)

In [9]:
class Dataset(torch.utils.data.Dataset):
    """Simple map-style dataset to use dataloaders (batching).

    Each item is the triple (claim, evidence, label) found at the same
    position of the three collections.

    Args:
        claims: sized, indexable collection of claims (pandas Series, list,
            tensor, ...).
        evidences: collection of evidences aligned with ``claims``.
        labels: collection of labels aligned with ``claims``.

    Raises:
        ValueError: if the three collections do not have the same length.
    """
    def __init__(self, claims, evidences, labels):
        if not (len(claims) == len(evidences) == len(labels)):
            raise ValueError(
                "claims, evidences and labels must have the same length, got "
                f"{len(claims)}, {len(evidences)} and {len(labels)}")
        self.claims = claims
        self.evidences = evidences
        self.labels = labels

    @staticmethod
    def _item_at(collection, idx):
        """Returns the element at position ``idx`` of ``collection``."""
        # Plain [] on a pandas object looks up index *labels*, which differs
        # from the position as soon as the index is not 0..n-1 (e.g. after
        # filtering or shuffling rows); .iloc is always positional.
        positional = getattr(collection, 'iloc', None)
        return collection[idx] if positional is None else positional[idx]

    def __getitem__(self, idx):
        return (self._item_at(self.claims, idx),
                self._item_at(self.evidences, idx),
                self._item_at(self.labels, idx))

    def __len__(self):
        # len() also works for collections without a .shape attribute (lists).
        return len(self.claims)

## GloVe

In [10]:
import pickle

# Load the cached vocabulary / embedding matrix when both files exist,
# otherwise build them from GloVe plus the words of the train and validation
# splits and cache them. Either way the results are exposed as VOCABULARY and
# EMBEDDING_MATRIX, the names used by the rest of the notebook.
if (os.path.exists("res/vocabulary.pkl") and os.path.exists("res/embedding_matrix.npy")):
  print('The vocabulary and the embedding matrix are already present. Loading them...')
  # NOTE(security): pickle.load can execute arbitrary code, only load a
  # vocabulary file that comes from a trusted source.
  with open('res/vocabulary.pkl', 'rb') as f:
    VOCABULARY = pickle.load(f)
  EMBEDDING_MATRIX = np.load("res/embedding_matrix.npy")
  print("Done!")

else:
  print("The vocabulary and the embedding matrix are NOT present. Creating them...")
  # Distinct tokens of each split. TRAIN_VOC used to be computed from an
  # undefined name; it is now built exactly like VAL_VOC.
  # NOTE(review): confirm this is the word set utils.add_oov expects.
  TRAIN_VOC = {token
               for column in ('Evidence', 'Claim')
               for tokens in train_set[column]
               for token in tokens}
  VAL_VOC = {token
             for column in ('Evidence', 'Claim')
             for tokens in val_set[column]
             for token in tokens}

  inputs = train_set['Evidence'].tolist() + train_set['Claim'].tolist()
  glove_voc, embedding_matrix = utils.get_glove(number_token=False)
  vocabulary, embedding_matrix = utils.add_oov(glove_voc, TRAIN_VOC, embedding_matrix, inputs)
  inputs = val_set['Evidence'].tolist() + val_set['Claim'].tolist()
  vocabulary, embedding_matrix = utils.add_oov(vocabulary, VAL_VOC, embedding_matrix, inputs)

  with open("res/vocabulary.pkl", "wb") as file:
    pickle.dump(vocabulary, file)
  np.save("res/embedding_matrix.npy", embedding_matrix)

  # Later cells read the upper-case names, which this branch never defined.
  VOCABULARY = vocabulary
  EMBEDDING_MATRIX = embedding_matrix
  print("Vocabulary and embedding matrix created! Remember to download the generated files.")

The vocabulary and the embedding matrix are already present. Loading them...
Done!


## Tokenization

In [28]:
# Words and their vocabulary values as two parallel lists (same order).
voc_key_list = [*VOCABULARY.keys()]
voc_val_list = [*VOCABULARY.values()]

def tokenize(input: List) -> torch.Tensor:
  """
  Looks up every word of the given list in VOCABULARY and returns the
  resulting values as a tensor. A word missing from VOCABULARY raises KeyError.
  """
  indices = [VOCABULARY[word] for word in input]
  return torch.tensor(indices)

# Reverse lookup (vocabulary value -> word), built once so that detokenize does
# not scan the whole vocabulary for every token. The lists are walked backwards
# so that, should a value appear twice, the first word wins (as list.index did).
_VALUE_TO_WORD = dict(zip(reversed(voc_val_list), reversed(voc_key_list)))

def detokenize(input: torch.Tensor) -> List:
  """
  Inverse of tokenize: maps every value of a 1-D tensor back to its word.

  Args:
    input: tensor of vocabulary values.

  Returns: the list of the corresponding words.

  Raises:
    ValueError: if a value does not belong to the vocabulary.
  """
  words = []
  for value in input.tolist():
    try:
      words.append(_VALUE_TO_WORD[value])
    except KeyError:
      raise ValueError(f"{value} is not in the vocabulary") from None
  return words

# Tokenize the training set first and the validation set second
# (Evidence column before Claim column).
for split_df in (train_set, val_set):
  for column in ('Evidence', 'Claim'):
    split_df[column] = split_df[column].map(tokenize)

# Model definition

In [19]:
def get_binary_classifier(name:str,
                    w_in: int,
                    w_hidden: int) -> nn.Sequential:
    """Builds a linear -> ReLU -> linear -> sigmoid binary classifier.

    Args:
        name: prefix of the name given to each layer of the container.
        w_in: number of input features.
        w_hidden: number of features produced by the first linear layer.

    Returns: the sequential container; it maps w_in features to one value
        squashed by a sigmoid.
    """
    # Layers are listed (and therefore created) in the order they are applied.
    layers = (
        ('fc1', nn.Linear(in_features=w_in, out_features=w_hidden)),
        ('ReLU', nn.ReLU(inplace=True)),
        ('fc2', nn.Linear(in_features=w_hidden, out_features=1)),
        ('sigmoid', nn.Sigmoid()),
    )
    classifier = nn.Sequential()
    for suffix, layer in layers:
        classifier.add_module(f'{name}_{suffix}', layer)
    return classifier

 
class RNNEncoder(torch.nn.Module):
  """
    Sentence encoder based on a bidirectional, batch-first recurrent network.
    Params:
      input_size: number of features of each element of the input sequence
      hidden_size: hidden size of the recurrent module (per direction)
      num_layers: number of stacked recurrent layers
      rnn_type: [elman, lstm, gru], the recurrent module to use
      output_state: [last, avg], how the per-step outputs are reduced to a
        single vector: 'last' takes the output at the last position of
        dimension 1, 'avg' averages over dimension 1
      verbose: if True, prints a message on construction
    Raises ValueError if rnn_type or output_state is not a valid option.
    For an input of shape (batch, seq, input_size) the output has shape
    (batch, 2 * hidden_size), the module being bidirectional.
  """

  def __init__(self, input_size, hidden_size, num_layers, rnn_type='elman', output_state='last', verbose=False):
    super().__init__()
    if verbose:
      print('Initializing RNNEncoder ')
    types = {'elman': nn.RNN, 'lstm': nn.LSTM, 'gru': nn.GRU}
    states = {
        'last': lambda x: x[:,-1],
        'avg': lambda x: torch.mean(x, dim=1)}

    # Only a failed lookup is turned into a ValueError; a bare `except` would
    # also hide unrelated errors and interrupts.
    try:
      self.output_state_fn = states[output_state]
    except (KeyError, TypeError):
      valid_states = list(states.keys())
      raise ValueError(f"wrong type '{output_state}', must be in {valid_states}") from None

    try:
      rec_module = types[rnn_type]
    except (KeyError, TypeError):
      valid_types = list(types.keys())
      raise ValueError(f"wrong type '{rnn_type}', must be in {valid_types}") from None
    self.rec_module = rec_module(input_size=input_size, hidden_size=hidden_size,
                                 bidirectional=True, batch_first=True,
                                 num_layers=num_layers)

  def forward(self, x):
    # The recurrent module also returns its final hidden state(s); only the
    # per-step outputs are used here.
    output, _ = self.rec_module(x)
    return self.output_state_fn(output)

class BagOfVectorsEncoder(torch.nn.Module):
  """Parameter-free sentence encoder: averages its input over dimension 1."""

  def forward(self, x):
    # TODO: check that dim=1 is the right axis. It is the sequence axis if x
    # is (batch, seq, emb), which is presumably what FactChecker passes.
    return x.mean(dim=1)


In [20]:
class FactChecker(torch.nn.Module):

  def __init__(self, embedding_matrix, encoder, merger, rnn_type=None, rnn_output=None, rec_size=1, hid_size=50):
    """
      A network performing Neural Language Inference (Fact Checking).
      Params:
        embedding_matrix: the embedding matrix for word embedding
        encoder: [rnn, mlp, bag], the encoder to compute the sentence embedding
          ('mlp' is not implemented yet)
        merger: [concatenation, sum, mean], the multi-input merging strategy
        RNNEncoder params, only relevant if encoder==rnn:
          rnn_type: [elman, lstm, gru], the RNN architecure used in the encoder;
            None selects the RNNEncoder default
          rnn_output: [last, avg], the function to compute the sentence encoding from the RNN hidden states;
            None selects the RNNEncoder default
          rec_size: int, the number of layers in the rnn
          hid_size: int, the hidden size of the rnn (also used as hidden size of the classifier)
      Raises:
        NotImplementedError: if encoder == 'mlp'
        ValueError: if encoder or merger is not a valid option

    """
    super().__init__()
    self.hid_size = hid_size

    # Word embedding
    emb_size = embedding_matrix.shape[1]
    self.emb_layer = nn.Embedding.from_pretrained(torch.as_tensor(embedding_matrix))

    # Sentence embedding; encoding_size is the size of the vector each encoder
    # produces for one sentence.
    if encoder == 'rnn':
      # Forward only the options that were actually given, so that the None
      # defaults fall back to the RNNEncoder defaults instead of being rejected.
      rnn_kwargs = {}
      if rnn_type is not None:
        rnn_kwargs['rnn_type'] = rnn_type
      if rnn_output is not None:
        rnn_kwargs['output_state'] = rnn_output
      self.encoder = RNNEncoder(emb_size, hid_size, rec_size, **rnn_kwargs)
      encoding_size = self.hid_size * 2  # bidirectional RNN
    elif encoder == 'mlp':
      # Fail at construction time rather than on the first forward pass.
      raise NotImplementedError("the 'mlp' encoder is not implemented yet")  # TODO: implement
    elif encoder == 'bag':
      self.encoder = BagOfVectorsEncoder()
      encoding_size = emb_size  # the mean of word vectors keeps their size
    else:
      raise ValueError(f"Wrong encoder '{encoder}', must be in ['rnn', 'mlp', 'bag']")

    # Merging
    merging_strategies = {
        'concatenation': lambda claim, ev: torch.cat((claim, ev), dim=1),
        'sum': lambda claim, ev : claim + ev,
        'mean': lambda claim, ev : (claim + ev) / 2
    }
    try:
      merging_fn = merging_strategies[merger]
    except (KeyError, TypeError):
      valid_strategies = list(merging_strategies.keys())
      raise ValueError(f"wrong type '{merger}', must be in {valid_strategies}") from None
    self.merger = merging_fn

    # Classifier: concatenation doubles the number of features, sum and mean
    # keep the size of a single sentence encoding.
    classifier_in = encoding_size
    if merger == 'concatenation':
      classifier_in *= 2
    self.classifier = get_binary_classifier('classifier', w_in=classifier_in, w_hidden=self.hid_size)

  def forward(self, claim, evidence, debug=False):
    """
      Computes the output of the classifier for a batch of (claim, evidence) pairs.
      Defined as `forward` (not `__call__`) so that calling the model goes
      through nn.Module.__call__ and module hooks keep working.
      Params:
        claim: tensor of word indices of the claims (assumed (batch, seq) - TODO confirm)
        evidence: tensor of word indices of the evidences, same batch size
        debug: if True, prints the tensor shapes after every stage
      Returns the classifier output, one sigmoid value per pair.
    """
    # Word embedding
    claim = self.emb_layer(claim).float()
    evidence = self.emb_layer(evidence).float()
    if debug:
      print("After word embedding")
      print(f"\tclaim.shape: {claim.shape}")
      print(f"\tevidence.shape: {evidence.shape}")
    # Sentence embedding (the same encoder is shared by claim and evidence)
    claim = self.encoder(claim)
    evidence = self.encoder(evidence)
    if debug:
      print("After phrase encoding")
      print(f"\tclaim.shape: {claim.shape}")
      print(f"\tevidence.shape: {evidence.shape}")
    # Merging
    merged_data = self.merger(claim, evidence)
    if debug:
      print("After merging")
      print(f"\tmerged_data.shape: {merged_data.shape}")
    # Classifying
    output = self.classifier(merged_data)

    return output

# Training

In [22]:
def training_step(model, optimizer, loss_fn, data_loader, device):
  """
    Trains the model for one pass over the given dataloader.
    Parameters:
      model: torch.nn.Module called as model(claim, evidence)
      optimizer: torch optimizer updating the model parameters
      loss_fn: criterion called as loss_fn(output, label)
      data_loader: iterable of (claim, evidence, label) batches
      device: where the batches are moved before the forward pass; a falsy
        value ('' or None) leaves the batches where they are
    Returns log dict {'train/loss': list with the loss of every batch}
    Raises RuntimeError if the loss of a batch is not finite (NaN or inf)
  """
  model.train()
  log_dict = {'train/loss': []}

  for (claim, evidence, label) in data_loader:
    if device:
      claim = claim.to(device)
      evidence = evidence.to(device)
      label = label.to(device)

    # forward
    output = model(claim, evidence)
    loss = loss_fn(output, label)
    loss_value = loss.item()

    if not math.isfinite(loss_value):
      # Raise instead of calling exit(1): inside IPython/Jupyter `exit` does
      # not interrupt the loop and shuts the kernel down.
      raise RuntimeError(f"Loss is {loss_value}, stopping training")

    # backward
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    log_dict['train/loss'].append(loss_value)

  return log_dict


def evaluate(model, loss_fn, data_loader, device, metric='accuracy'):
  """
    Evaluate a binary classifier on the given dataloader.
    The model is expected to return one probability per sample (e.g. the
    (batch, 1) sigmoid output of FactChecker); a sample is predicted as
    positive when its probability is >= 0.5.
    Parameters:
      model: torch.nn.Module to evaluate, called as model(claim, evidence)
      loss_fn: torch.nn criterion to use to compute loss, called on the
        flattened outputs and the flattened targets (cast to the output dtype)
      data_loader: torch.utils.data.DataLoader yielding (claim, evidence, label)
      device: torch.device where evaluation is performed; a falsy value
        ('' or None) leaves the batches where they are
      metric: only 'accuracy' is supported for now ('f1' is not implemented)
    Returns log dict {'valid/loss' : mean of the batch losses,
                      'valid/accuracy': fraction of correctly classified samples}
    Raises ValueError if metric is not supported
  """
  if metric != 'accuracy':
    # Fail before doing any work (this used to be raised inside the loop).
    raise ValueError(f'wrong metric {metric}, must be in [accuracy]')
  split = 'valid'

  model.eval()
  batch_losses = []
  n_correct = 0
  n_samples = 0
  with torch.no_grad():
    for claim, evidence, label in data_loader:
      if device:
        claim = claim.to(device)
        evidence = evidence.to(device)
        label = label.to(device)

      # Flatten so that (batch, 1) outputs and (batch,) or (batch, 1) labels line up.
      probs = model(claim, evidence).reshape(-1)
      target = label.reshape(-1).to(probs.dtype)
      batch_losses.append(loss_fn(probs, target).item())

      preds = (probs >= 0.5).to(probs.dtype)
      # Count samples instead of dividing by batch_size: the last batch can be smaller.
      n_correct += (preds == target).sum().item()
      n_samples += target.numel()

  log_dict = {f'{split}/loss': np.mean(batch_losses) if batch_losses else float('nan'),
              f'{split}/{metric}': n_correct / n_samples if n_samples else float('nan')}
  return log_dict


In [23]:
def train(optimizer_name, lr, loss_fn, device, n_epochs, verbose, batch_size, test=False, **model_params):
  """
    Trains a FactChecker on the training split (module-level train_set /
    val_set / EMBEDDING_MATRIX) and tracks the run on Weights & Biases.
    Parameters:
      optimizer_name: 'rmsprop' or 'adam'
      lr: learning rate of the optimizer
      loss_fn: criterion called as loss_fn(output, target); anything that is
        not callable (e.g. '' or None) selects nn.BCELoss(), which fits the
        sigmoid output of the classifier
      device: device for the model and the batches; a falsy value ('' or None)
        leaves everything where it is
      n_epochs: number of passes over the training split
      verbose: if True, prints the model before training
      batch_size: size of the batches of both dataloaders
      test: if True, the per-epoch validation is skipped
      model_params: keyword arguments of FactChecker; when none is given a
        single-layer bidirectional Elman RNN with concatenation is used
    Returns the trained model.
    Raises ValueError on an unknown optimizer or an unknown label.
  """
  cfg_dict = {'epochs': n_epochs, 'batch_size': batch_size, 'optimizer': optimizer_name, 'lr': lr}

  def encode_labels(labels):
    """Turns a label column into an (N, 1) float tensor: SUPPORTS -> 1, REFUTES -> 0."""
    if labels.dtype == object:
      labels = labels.map({'SUPPORTS': 1.0, 'REFUTES': 0.0})
    if labels.isna().any():
      raise ValueError("labels must be either 'SUPPORTS' or 'REFUTES'")
    # (N, 1) so that a batch of labels has the same shape as the model output.
    return torch.tensor(labels.to_numpy(dtype='float32')).unsqueeze(1)

  wandb.login(key=utils.get_wandbkey())
  run = wandb.init(project="assignment-two", entity="nlpetroni", reinit=True, config=cfg_dict)
  try:
    wandb.define_metric("train_step")
    wandb.define_metric("epoch")
    wandb.define_metric('train/loss', step_metric="train_step", summary="min")
    wandb.define_metric("valid/loss", step_metric="epoch", summary="min")
    wandb.define_metric("valid/accuracy", step_metric="epoch", summary="max")

    if len(model_params) == 0:
        model_params = {
            'encoder': 'rnn',
            'merger': 'concatenation',
            'rnn_type': 'elman',
            'rnn_output': 'last',
            'rec_size': 1,
            'hid_size': 50
        }
    # NOTE(review): the training batches are not shuffled - confirm this is intended.
    train_dl = torch.utils.data.DataLoader(
      Dataset(train_set['Claim'], train_set['Evidence'], encode_labels(train_set['Label'])),
      batch_size=batch_size)
    valid_dl = torch.utils.data.DataLoader(
      Dataset(val_set['Claim'], val_set['Evidence'], encode_labels(val_set['Label'])),
      batch_size=batch_size)

    model = FactChecker(EMBEDDING_MATRIX, **model_params)
    if device:
      model.to(device)
    wandb.watch(model, log_graph=True)
    if verbose:
      print(model)

    params = [p for p in model.parameters() if p.requires_grad]
    if optimizer_name == 'rmsprop':
      optimizer = torch.optim.RMSprop(params, lr=lr, alpha=0.99, momentum=0.5, weight_decay=0)
    elif optimizer_name == 'adam':
      optimizer = torch.optim.Adam(params, lr=lr, betas=(0.9, 0.999), weight_decay=0)
    else:
      raise ValueError(f'wrong optim {optimizer_name}, either rmsprop or adam')

    # The model ends with a sigmoid over a single output, hence binary
    # cross-entropy (NLLLoss expects per-class log-probabilities).
    criterion = loss_fn if callable(loss_fn) else nn.BCELoss()
    train_step = 0
    print('STARTING TRAINING')

    for epoch in range(n_epochs):
      log_dict = training_step(model, optimizer, criterion, train_dl, device)
      for batch_loss in log_dict['train/loss']:
        wandb.log({'train_step': train_step, 'epoch': epoch, 'train/loss': batch_loss})
        train_step += 1
      if not test:
        log_dict.update(evaluate(model, criterion, valid_dl, device, metric='accuracy'))
        wandb.log({'epoch': epoch, 'valid/loss': log_dict['valid/loss'], 'valid/accuracy': log_dict['valid/accuracy']})
        if (epoch % 50) == 0:
          print(f'[{epoch:03d}/{n_epochs:03d}] train loss: {np.mean(log_dict["train/loss"]):.3f}, valid loss: {log_dict["valid/loss"]:.3f}, accuracy: {log_dict["valid/accuracy"]:.2f}')
    # TODO: when test is True, evaluate on the test split and log the result
    # (the test split is neither padded nor tokenized yet). The previous
    # argument-less wandb.log() call raised a TypeError.
  finally:
    # Close the W&B run even when training fails.
    run.finish()
  return model


In [None]:
# Smoke run: a single epoch with the default model configuration (no model
# parameters are given).
# NOTE(review): loss_fn='' and device='' look like placeholders (an empty
# string is neither a criterion nor a device) - confirm the intended values.
train(optimizer_name='adam', lr=0.1, loss_fn='', device='', n_epochs=1, verbose=True, batch_size=10)
