<a href="https://colab.research.google.com/github/NLPetroni/assignment_two/blob/main/solution_2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Imports and downloads



In [1]:
import numpy as np
import pandas as pd
import sys
import os

# Move to /content, drop any checkout left by a previous run and clone a fresh
# copy of the project (all shell output is discarded).
%cd /content
!rm -rf assignment_two &> /dev/null
!git clone https://github.com/NLPetroni/assignment_two &> /dev/null
%cd assignment_two
# Put the repository root on the import path; the next cell runs `from src import utils`.
sys.path.append(os.getcwd())



/content
/content/assignment_two


In [2]:
from src import utils
import re
from functools import reduce
import nltk
from nltk.corpus import stopwords
from typing import List, Callable, Dict
import random
import torch
from torch import nn

In [3]:
# Download and extract the data splits (repository helper; presumably into
# ./dataset, since the CSV files are read from there), then load each split of
# claim/evidence pairs into its own DataFrame.
utils.download_data('dataset')
train_set = pd.read_csv("dataset/train_pairs.csv")
val_set = pd.read_csv("dataset/val_pairs.csv")
test_set = pd.read_csv("dataset/test_pairs.csv")

Downloading FEVER data splits...
Download completed!
Extracting dataset...
Extraction completed!


In [4]:
# Column names (identical across splits is assumed) and the number of rows per split.
print(train_set.columns)
print(f"Total rows of the train set: {len(train_set):d}")
print(f"Total rows of the validation set: {len(val_set):d}")
print(f"Total rows of the test set: {len(test_set):d}")

Index(['Unnamed: 0', 'Claim', 'Evidence', 'ID', 'Label'], dtype='object')
Total rows of the train set: 121740
Total rows of the validation set: 7165
Total rows of the test set: 7189


In [5]:
# Label distribution of the training pairs (the recorded output shows the two classes are imbalanced).
train_set['Label'].value_counts()

SUPPORTS    89389
REFUTES     32351
Name: Label, dtype: int64

In [6]:
# Peek at one raw evidence string: a leading id, the sentence itself, then
# trailing tab-separated entries (see the recorded output).
print(train_set.iloc[0]['Evidence'])

2	Hemsworth has also appeared in the science fiction action film Star Trek -LRB- 2009 -RRB- , the thriller adventure A Perfect Getaway -LRB- 2009 -RRB- , the horror comedy The Cabin in the Woods -LRB- 2012 -RRB- , the dark-fantasy action film Snow White and the Huntsman -LRB- 2012 -RRB- , the war film Red Dawn -LRB- 2012 -RRB- , and the biographical sports drama film Rush -LRB- 2013 -RRB- .	Star Trek	Star Trek (film)	A Perfect Getaway	A Perfect Getaway	The Cabin in the Woods	The Cabin in the Woods	Snow White and the Huntsman	Snow White and the Huntsman	Red Dawn	Red Dawn (2012 film)	Rush	Rush (2013 film)


# Dataset pre-processing and conversion

In [7]:
# All patterns are raw strings so that regex escapes such as `\[` or `\|` are
# not (invalid) Python string escapes; `\t` is then resolved by the regex
# engine and still means a tab character.

# Characters treated as separators: every match is replaced by a space.
REPLACE_BY_SPACE_RE = re.compile(r'[/(){}\[\]\|@,;\t-]')
# Anything that is NOT a digit, a lowercase letter, a space, '#', '+' or '_'.
GOOD_SYMBOLS_RE = re.compile(r'[^0-9a-z #+_]')
# Bracket placeholder tokens found in the evidence text (e.g. "-LRB- 2009 -RRB-").
BAD_SYMBOLS_RE = re.compile(r'(-LRB-)|(-RRB-)|(-LSB-)|(-RSB-)')
# Shortest span from a -LSB- marker to the next -RSB- marker, markers included.
# (The identifier keeps its original spelling because other cells reference it.)
INSIDE_SQAURE_BRACKETS_RE = re.compile(r'(-LSB-).*?(-RSB-)')

# English stop words from NLTK; the corpus is downloaded the first time it is
# found to be missing, then read again.
try:
    _english_stopwords = stopwords.words('english')
except LookupError:
    nltk.download('stopwords')
    _english_stopwords = stopwords.words('english')
STOPWORDS = set(_english_stopwords)

def remove_inside_square_brackets(text: str) -> str:
    """
    Deletes every span that runs from a '-LSB-' marker to the next
    '-RSB-' marker, markers included.
    """

    without_brackets = re.sub(INSIDE_SQAURE_BRACKETS_RE, '', text)
    return without_brackets

def remove_bad_symbols(text: str) -> str:
    """
    Deletes the bracket placeholder tokens matched by BAD_SYMBOLS_RE
    (-LRB-, -RRB-, -LSB-, -RSB-) from the text.
    """

    without_placeholders = re.sub(BAD_SYMBOLS_RE, '', text)
    return without_placeholders

def remove_final_tags(text: str) -> str:
    """
    Removes the trailing tab-separated entries that follow the sentence.

    Everything from the first period that is immediately followed by a tab
    up to the end of the text is deleted (for single-line text). Text
    without such a '.<TAB>' sequence is returned unchanged.
    """

    # Raw string: `\.` is a regex escape, not a valid Python string escape.
    return re.sub(r'\.\t.*?$', '', text)

def lower(text: str) -> str:
    """
    Returns a lower-cased copy of the given text.
    Example:
    Input: 'I really like New York city'
    Output: 'i really like new york city'
    """

    lowered = text.lower()
    return lowered

def replace_special_characters(text: str) -> str:
    """
    Substitutes a single space for every separator character matched by
    REPLACE_BY_SPACE_RE (parentheses, brackets, slashes, tabs, ...).
    """

    # Splitting on each one-character match and re-joining with a space is
    # the same as substituting a space for every match.
    pieces = REPLACE_BY_SPACE_RE.split(text)
    return ' '.join(pieces)

def replace_br(text: str) -> str:
    """
    Deletes every literal '</br>' occurrence from the text.
    """

    # Cutting the text at each '</br>' and gluing the pieces back together
    # removes the markers without inserting anything in their place.
    fragments = text.split('</br>')
    return ''.join(fragments)

def filter_out_uncommon_symbols(text: str) -> str:
    """
    Deletes every character matched by GOOD_SYMBOLS_RE, i.e. every
    character outside the allowed set defined by that expression.
    """

    filtered = re.sub(GOOD_SYMBOLS_RE, '', text)
    return filtered

def remove_stopwords(text: str) -> str:
    """
    Drops every whitespace-separated word that is listed in STOPWORDS and
    joins the remaining words with single spaces.
    """

    kept_words = []
    for word in text.split():
        if not word or word in STOPWORDS:
            continue
        kept_words.append(word)
    return ' '.join(kept_words)


def strip_text(text: str) -> str:
    """
    Trims leading and trailing whitespace (newlines included) from text.
    Example:
    Input: '  This assignment is cool\n'
    Output: 'This assignment is cool'
    """

    trimmed = text.lstrip().rstrip()
    return trimmed

def split_text(text: str) -> List:
    """
    Breaks the text into its whitespace-separated tokens.
    """

    tokens = text.split()
    return tokens

# Default sequence of steps applied by text_prepare. The order is significant:
#  - the bracket-marker steps run before `lower` because their patterns match
#    the upper-case markers;
#  - `remove_final_tags` runs before `replace_special_characters` because the
#    latter turns tabs into spaces, and the former needs the '.<TAB>' sequence;
#  - `lower` runs before `filter_out_uncommon_symbols`, which keeps only
#    lowercase letters;
#  - `split_text` comes last and turns the cleaned string into a token list.
PREPROCESSING_PIPELINE = [
                          remove_inside_square_brackets,
                          remove_bad_symbols,
                          lower,
                          remove_final_tags,
                          replace_special_characters,
                          filter_out_uncommon_symbols,
                          remove_stopwords,
                          strip_text,
                          split_text
                          ]

# Anchor method

def text_prepare(text: str,
                 filter_methods: List[Callable] = None) -> List[str]:
    """
    Applies a list of pre-processing functions in sequence (reduce).
    Note that the order is important here!

    Args:
        text: the raw text to clean.
        filter_methods: functions applied left to right, each receiving the
            previous one's result. Defaults to PREPROCESSING_PIPELINE.

    Returns: the result of the last function. With the default pipeline this
        is a list of tokens (the pipeline ends with split_text), not a string.
    """

    filter_methods = filter_methods if filter_methods is not None else PREPROCESSING_PIPELINE

    return reduce(lambda txt, f: f(txt), filter_methods, text)


# Each evidence string starts with an id, which ends up as the first token
# after pre-processing; it is dropped with the slice [1:].
# NOTE: this overwrites the raw text columns with token lists, so the cell can
# only run once per dataset load (text_prepare expects strings).
for _split in (train_set, val_set, test_set):
    _split['Evidence'] = _split['Evidence'].apply(lambda txt: text_prepare(txt)[1:])
    _split['Claim'] = _split['Claim'].apply(text_prepare)

In [8]:
# Every token occurring in the training evidences and claims (duplicates kept).
voc_evidence = [item for sublist in train_set['Evidence'] for item in sublist]
voc_claim = [item for sublist in train_set['Claim'] for item in sublist]
# Unique training tokens. Sorted so that a token's id (its position in this
# list) is the same on every run: iterating a set of strings can give a
# different order in each Python process.
vocabulary = sorted(set(voc_evidence).union(voc_claim))

# Token -> id table used by tokenize(). It remembers which `vocabulary` object
# (and which length) it was built from, so it is rebuilt if the global is
# reassigned or grows.
_TOKEN_ID_CACHE = {'source': None, 'size': -1, 'ids': {}}

def tokenize(input: List) -> torch.Tensor:
  """
  Converts a list of tokens into a tensor of vocabulary ids.

  Args:
      input: the tokens to convert; each must be present in `vocabulary`.

  Returns: a 1-D int64 tensor with the position of each token in
      `vocabulary` (first occurrence), also for an empty input.

  Raises:
      ValueError: if a token is not in `vocabulary`.
  """
  cache = _TOKEN_ID_CACHE
  # A dict lookup replaces `vocabulary.index`, which scans the whole list for
  # every single token. An in-place edit of `vocabulary` that keeps its
  # length is not detected.
  if cache['source'] is not vocabulary or cache['size'] != len(vocabulary):
    ids = {}
    for position, token in enumerate(vocabulary):
      # setdefault keeps the first position, like list.index does
      ids.setdefault(token, position)
    cache['source'], cache['size'], cache['ids'] = vocabulary, len(vocabulary), ids

  ids = cache['ids']
  try:
    result = [ids[token] for token in input]
  except KeyError as missing:
    raise ValueError(f'{missing.args[0]!r} is not in vocabulary') from None
  # Explicit dtype: without it an empty list would give a float tensor.
  return torch.tensor(result, dtype=torch.long)

def detokenize(input: torch.Tensor) -> List:
  """
  Maps a tensor of vocabulary ids back to the corresponding tokens.
  """
  return [vocabulary[token_id] for token_id in input.tolist()]

## GloVe

In [20]:
import pickle

# Load the vocabulary and embedding matrix from the cache when both files
# exist, otherwise build them from GloVe and store them. The cache is written
# to the same paths it is read from, so that a later run finds it.
if (os.path.exists("res/vocabulary.pkl") and os.path.exists("res/embedding_matrix.npy")):
  # NOTE: unpickling can execute arbitrary code; only load a cache file that
  # comes from this project.
  with open('res/vocabulary.pkl', 'rb') as f:
    VOCABULARY = pickle.load(f)
  EMBEDDING_MATRIX = np.load("res/embedding_matrix.npy")

else:
  TRAIN_VOC = set(vocabulary)
  voc_evidence = [item for sublist in val_set['Evidence'] for item in sublist]
  voc_claim = [item for sublist in val_set['Claim'] for item in sublist]
  VAL_VOC = set(voc_evidence + voc_claim)

  # utils.get_glove / utils.add_oov are repository helpers; presumably they
  # load the GloVe vectors and extend vocabulary and matrix with the tokens
  # missing from them -- confirm in src/utils.
  inputs = train_set['Evidence'].tolist() + train_set['Claim'].tolist()
  glove_voc, embedding_matrix = utils.get_glove(number_token=False)
  vocabulary, embedding_matrix = utils.add_oov(glove_voc, TRAIN_VOC, embedding_matrix, inputs)
  inputs = val_set['Evidence'].tolist() + val_set['Claim'].tolist()
  vocabulary, embedding_matrix = utils.add_oov(vocabulary, VAL_VOC, embedding_matrix, inputs)

  # Expose the results under the same names as the cached branch; the model
  # is later built from EMBEDDING_MATRIX.
  VOCABULARY, EMBEDDING_MATRIX = vocabulary, embedding_matrix

  os.makedirs("res", exist_ok=True)
  with open("res/vocabulary.pkl", "wb") as file:
    pickle.dump(vocabulary, file)
  np.save("res/embedding_matrix.npy", embedding_matrix)

# Model definition

In [23]:
def get_classifier(name: str,
                   fc1_in: int,
                   hidden: int,
                   fc2_out: int) -> nn.Sequential:
    """Gets a sequential container made of a fully connected layer, a ReLU
       and a second fully connected layer.

    Args:
        name: the name prefix given to each layer in the container.
        fc1_in: the number of input features of the first layer.
        hidden: the number of features between the two layers.
        fc2_out: the number of output features of the second layer.

    Returns: the created sequential, with layers named `{name}_fc1`,
        `{name}_ReLU` and `{name}_fc2`.
    """
    container = nn.Sequential()
    container.add_module(f'{name}_fc1', nn.Linear(in_features=fc1_in, out_features=hidden))
    container.add_module(f'{name}_ReLU', nn.ReLU(inplace=True))
    container.add_module(f'{name}_fc2', nn.Linear(in_features=hidden, out_features=fc2_out))
    return container

In [9]:
class RNNEncoder(torch.nn.Module):
  """
    Bidirectional recurrent sentence encoder followed by a two-class classifier.

    Token ids are embedded with a fixed pre-trained matrix, run through the
    recurrent module, reduced to one vector per sequence and classified.
  """

  def __init__(self, embedding_matrix, type, rec_size=1, units=None, hid_size=50, state='last'):
    """
      Params:
        embedding_matrix: 2-D array of pre-trained vectors, one row per token id
        type: recurrent cell to use, one of [elman, lstm, gru]
        rec_size: number of stacked recurrent layers
        units: currently unused, kept for interface compatibility
        hid_size: hidden size of each direction of the recurrent module
        state: how a sequence is summarised, one of [avg, last]
      Raises:
        ValueError: if `type` or `state` is not one of the listed values
    """
    super().__init__()
    if state not in ('last', 'avg'):
      raise ValueError(f'wrong state {state}, either last or avg')
    self.state = state
    self.hid_size = hid_size

    emb_size = embedding_matrix.shape[1]
    # Converted to float32 once here, instead of casting the looked-up vectors on every call.
    self.emb_layer = nn.Embedding.from_pretrained(torch.as_tensor(embedding_matrix, dtype=torch.float))

    if type == 'elman':
      rec_module = nn.RNN
    elif type == 'lstm':
      rec_module = nn.LSTM
    elif type == 'gru':
      rec_module = nn.GRU
    else:
      raise ValueError(f'wrong type {type}, either elman, lstm or gru')
    self.rec_modules = rec_module(input_size=emb_size, hidden_size=hid_size, bidirectional=True, batch_first=False, num_layers=rec_size)

    # The recurrent module is bidirectional, so its output has 2 * hid_size features.
    self.classifier = get_classifier('classifier', fc1_in=2*self.hid_size, hidden=self.hid_size, fc2_out=2)

  def forward(self, x):
    """
      Params:
        x: integer tensor of token ids, time-major (seq_len, batch) since the
           recurrent module is built with batch_first=False
      Returns: log-probabilities over the two classes, shape (batch, 2),
        the input format expected by nn.NLLLoss
    """
    vecs = self.emb_layer(x)
    output, _ = self.rec_modules(vecs)
    if self.state == 'last':
      # NOTE(review): at the last time step the backward direction has only
      # seen one token; consider using the final hidden states instead.
      sentence_emb = output[-1]
    else:
      sentence_emb = torch.mean(output, dim=0)
    logits = self.classifier(sentence_emb)
    return torch.log_softmax(logits, dim=-1)


    

# Training

In [10]:
def training_step(model, optimizer, loss_fn, data_loader, device):
  """
  Runs one training epoch: a forward/backward pass and an optimizer update
  for every batch produced by the data loader.

  Args:
      model: the module to train; it is switched to training mode.
      optimizer: the optimizer updating the model parameters.
      loss_fn: callable computing the loss from (output, target).
      data_loader: iterable yielding (input, target) tensor pairs.
      device: the device the batches are moved to.

  Returns: the mean loss over the batches (0.0 if there were none).

  Raises:
      RuntimeError: if a batch produces a NaN or infinite loss.
  """
  import math  # local import: the notebook's import cells do not provide it

  model.train()
  total_loss = 0.0
  n_batches = 0

  for (input, target) in data_loader:
    #forward
    input = input.to(device)
    target = target.to(device)
    output = model(input)
    loss = loss_fn(output, target)
    loss_value = loss.item()

    # Raise instead of calling exit(): exiting would take down the whole
    # notebook kernel together with every variable in it.
    if not math.isfinite(loss_value):
      raise RuntimeError(f"Loss is {loss_value}, stopping training")

    #backward
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    total_loss += loss_value
    n_batches += 1

  return total_loss / n_batches if n_batches else 0.0


In [11]:
def train(optimizer_name, lr, loss_fn, data_loader, device, n_epochs, verbose=False, batch_size=None):
  """
  Builds an LSTM RNNEncoder on top of EMBEDDING_MATRIX and trains it.

  Args:
      optimizer_name: either 'rmsprop' or 'adam'.
      lr: the learning rate.
      loss_fn: callable computing the loss from (output, target); when None,
          nn.NLLLoss() is used (the model outputs log-probabilities).
      data_loader: iterable yielding (input, target) batches, or a
          torch Dataset, which is then wrapped in a DataLoader.
      device: the device to train on.
      n_epochs: the number of passes over the training data.
      verbose: whether to print the model structure before training.
      batch_size: batch size used only when `data_loader` is a Dataset
          (defaults to 1 in that case).

  Returns: the trained model.

  Raises:
      ValueError: if `optimizer_name` is not 'rmsprop' or 'adam'.
  """
  # TODO: re-enable Weights & Biases logging once wandb is imported and
  # utils.get_wandbkey() exists. Planned setup:
  #   wandb.login(key=utils.get_wandbkey())
  #   run = wandb.init(project="assignment-two", entity="nlpetroni", reinit=True, config=cfg_dict)
  #   wandb.define_metric("train_step")
  #   wandb.define_metric("epoch")
  #   wandb.define_metric('train/loss', step_metric="train_step", summary="min")
  #   wandb.define_metric("valid/loss", step_metric="epoch", summary="min")
  #   wandb.define_metric("valid/accuracy", step_metric="epoch", summary="max")
  #   wandb.watch(model, log_graph=True)
  # TODO: evaluate on the validation split after each epoch.

  if optimizer_name not in ('rmsprop', 'adam'):
    raise ValueError(f'wrong optim {optimizer_name}, either rmsprop or adam')

  # A raw Dataset still needs batching; anything else is used as it is.
  if isinstance(data_loader, torch.utils.data.Dataset):
    data_loader = torch.utils.data.DataLoader(data_loader, batch_size=batch_size if batch_size is not None else 1)

  model = RNNEncoder(EMBEDDING_MATRIX, type='lstm', state='last').to(device)
  if verbose:
    print(model)

  if optimizer_name == 'rmsprop':
    optimizer = torch.optim.RMSprop(model.parameters(), lr=lr, alpha=0.99, momentum=0.5, weight_decay=0)
  else:
    optimizer = torch.optim.Adam(model.parameters(), lr=lr, betas=(0.9, 0.999), weight_decay=0)

  if loss_fn is None:
    loss_fn = nn.NLLLoss()

  print('STARTING TRAINING')

  for epoch in range(n_epochs):
    training_step(model, optimizer, loss_fn, data_loader, device)

  return model

    

In [12]:
# NOTE(review): stale scratch cell. The recorded output (a GloVe download
# message followed by an AttributeError) does not correspond to train(), and
# train() in this notebook does not return a two-item tuple. Presumably this
# was once a call to utils.get_glove -- delete or rewrite before sharing.
v, m = train(0, 0, 0, 0, 0, 0)
print(list(m.keys())[0])

downloading and unzipping glove... completed


AttributeError: ignored

In [13]:
# NOTE(review): depends on `m` left over from the scratch cell above; the
# recorded output is a vector of floats, presumably the last embedding row.
print(m[-1])

[ 0.03331625 -0.308181    0.63416249 -0.5216718   0.98621252 -0.04922992
  0.48023     0.1507365  -0.089796    0.05830625  0.66453999 -0.0792175
  0.20350499  0.00856375 -0.03349375 -0.03475501  0.44441099 -0.00996525
 -0.0481375   0.20436675  0.72433324 -0.60414249  0.23826051  0.8045575
  0.08928501  0.10531501  0.3549425  -0.20389625  0.6420625   0.33227601
 -0.17211001  0.1980715  -0.056469    0.48801751  0.08070225  0.26807705
 -0.06970499  0.48458    -0.35779    -0.50152176 -0.01007025  0.14516001
  0.14906275  0.64062253  0.32606751 -0.17768801 -0.187397   -0.7296275
  0.24034    -0.62037999 -0.14349001 -0.13984125 -0.05807375  0.63350751
 -0.40306251 -2.93017501 -0.31627426  0.00958225  1.10951     0.70182
 -0.0930165   0.58280248 -0.68472751 -0.20408176  0.93593749 -0.10864325
 -0.03932074  0.25197174  0.2498875  -0.34059025  0.02416751  0.66234
 -0.4861475   0.21644074 -0.09540675  0.64083499 -0.25014575 -0.085182
 -0.52128751 -0.53219751  0.21471     0.21045225 -0.158838    

In [None]:
# Exploratory check: token lists of all training evidences followed by all
# training claims; prints the first one. Note that it reassigns `inputs`.
inputs = train_set[:]['Evidence'].tolist() + train_set[:]['Claim'].tolist()
print(inputs[0])

In [None]:
# NOTE(review): the GloVe cell unpacks this same call into two values, so here
# `glove_voc` presumably holds the whole (vocabulary, matrix) pair, which is
# why the next cell indexes it with [0].
glove_voc = utils.get_glove(number_token=False)

In [None]:
# Exploratory check: first key of the first element returned by get_glove
# (presumably the first word of the GloVe vocabulary).
list(glove_voc[0].keys())[0]