In [1]:
# %pip install pandas nltk numpy torch torchvision

In [2]:
import pandas as pd
import json
from nltk.tokenize import word_tokenize
import numpy as np
import re
import random

# import nltk
# nltk.download('punkt')

# 1. Preprocess the data

In [4]:
# Read evidence
with open('../data/evidence.json', 'r') as f:
    evidence = json.load(f)
eviden = pd.DataFrame.from_dict(evidence, orient='index', columns=['evidence'])
ev_txt = eviden['evidence'].values
max_len = max([len(j.split()) for i,j in evidence.items()])

# Read train claims
with open('../data/train-claims.json', 'r') as f:
    df_train = pd.DataFrame(json.load(f)).transpose()

# Read dev claims
with open('../data/dev-claims.json', 'r') as f:
    df_dev = pd.DataFrame(json.load(f)).transpose()

In [11]:
print(f'evidence len: {len(ev_txt)} * dev len {df_train.shape[0]} = {len(ev_txt)*df_train.shape[0]}')

evidence len: 1208827 * dev len 1228 = 1484439556


In [4]:
# Helper function to prepare datasets
def prepare_df(df):
    # Support labels
    df_1 = df.explode("evidences")
    df_1['evidences_text'] = [evidence[item] for item in df_1['evidences']]
    df_1['label'] = 1
    # Refuse labels
    df_2 = df_1[['claim_text']].copy()
    df_2['evidences_text'] = [random.choice(ev_txt) for i in range(df_2.shape[0])]
    df_2['label'] = 0
    df = pd.concat([df_1[['claim_text' , 'evidences_text', 'label']], df_2]).sample(frac=1)
    return df

# Select columns to work on and retrieve tokenized and preprocesed vectors 
def feature_selection(df):
    # Prepare df
    df_ = prepare_df(df)
    # Set words to lower and tokenize
    tok_evidence = [word_tokenize(i.lower()) for i in df_['evidences_text']]
    tok_claim = [word_tokenize(i.lower()) for i in df_['claim_text']]
    # Drop unknown characters (This may be modified depending model performance)
    tok_evidence = [' '.join([w for w in seq if re.match('^[\w\d]+$', w)]) for seq in tok_evidence]
    tok_claim = [' '.join([w for w in seq if re.match('^[\w\d]+$', w)]) for seq in tok_claim]
    # Class label
    label = df_['label']
    return tok_claim, tok_evidence, label

In [5]:
# Get data features
claim_train, evid_train, y_train = feature_selection(df_train)
claim_dev, evid_dev, y_dev = feature_selection(df_dev)

In [6]:
# Tokenizer class ( Can be improved)
class token:
    def __init__(self):
        self.word2index = {"[PAD]": 0, "[CLS]": 1, "[SEP]": 2, "[MASK]": 3}
        self.index2word = {0: "[PAD]", 1: "[CLS]", 2: "[SEP]", 3: "[MASK]"}
        self.n_words = 4  # Count CLS and SEP

    def addSentence(self, sentence):
        for word in sentence.split():
            self.addWord(word)

    def addWord(self, word):
        if word not in self.word2index:
            self.word2index[word] = self.n_words
            self.index2word[self.n_words] = word
            self.n_words += 1

# tokenizer function
def tok(corpus):
    [tokenizer.addSentence(i) for i in corpus]
    
# Add tokens to idx dict
tokenizer = token()

# Create dicts
tok(claim_train)
tok(evid_train)
tok(claim_dev)
tok(evid_dev)

# 2. Model the data

In [7]:
import torch
from torch.utils.data import Dataset, DataLoader
from torch import nn
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

### Load the data to tensor batches

<center><h3> 2.1 Bert embeding</h3></center>

<center><img src=../Images/BERT_emb.png alt="drawing" width="500"></center>
<center><img src=../Images/BERT_emb_example.png alt="drawing" width="500"></center>

In [19]:
# Dataset class
class Dataset(Dataset):
    def __init__(self, texts, tokenizer, seq_len=max_len):
        self.text = texts
        self.tokenizer = tokenizer
        self.seq_len = seq_len
    def __len__(self):
        return len(self.text)
    def __getitem__(self, idx):
        # Step 1: get text tokens
        sent = [self.tokenizer.word2index[i] for i in self.text[idx].split()]
        
        # Step 2: replace random words in sentence with mask / random words
        sent_mask, labels = self.masking(sent)

        # Step 3: Adding CLS and SEP tokens to the start and end of sentence
        # Adding PAD token for labels
        sent = [self.tokenizer.word2index['[CLS]']] + sent_mask + [self.tokenizer.word2index['[SEP]']]
        labels = [self.tokenizer.word2index['[PAD]']] + labels + [self.tokenizer.word2index['[PAD]']]

        # Step 4: Add PAD tokens to make the sentence same length as seq_len
        padding = [self.tokenizer.word2index['[PAD]'] for empty in range(self.seq_len - len(sent))]
        sent.extend(padding)
        labels.extend(padding)
        return np.array(sent), np.array(labels)
    
    #------------------------------------------------------------------------------------------
    # Function to mask/randomize tokens
    def masking(self, tokens, to_replace = 0.15):
        # tokens = input.split()
        output = []
        label = []
        for token in tokens:
            prob = random.random()
            # 15% of the tokens would be replaced
            if prob <= to_replace:
                # 10% chance change token to current token
                if prob < to_replace*.1:
                    output.append(token)
                # 10% chance change token to random
                elif prob < to_replace*.1*2:
                    output.append(random.choice(list(self.tokenizer.word2index.values())))
                # 10% chance change token to random
                else:
                    output.append(self.tokenizer.word2index["[MASK]"])
                label.append(token)
            else:
                output.append(token)
                label.append(0)
        return output, label

In [20]:
# Define collate (pre_process) function
def collate_batch(batch):
    texts, labels = zip(*batch)
    texts = torch.from_numpy(np.array(texts)).to(device)
    labels = torch.from_numpy(np.array(labels)).to(device)
    return texts, labels

# Instanciate DataLoader
bs = 32

# ______________________________Traing data______________________________
# Datasets
tr_ev_ds = Dataset(evid_train, tokenizer)
tr_cl_ds = Dataset(claim_train, tokenizer)

# Dataloaders
tr_ev_dl = DataLoader(tr_ev_ds, batch_size=bs, collate_fn=collate_batch)
tr_cl_dl = DataLoader(tr_cl_ds, batch_size=bs, collate_fn=collate_batch)
tr_y_dl = DataLoader(y_train, batch_size=bs)

# ______________________________Test data______________________________
# Datasets
dv_ev_ds = Dataset(evid_dev, tokenizer)
dv_cl_ds = Dataset(claim_dev, tokenizer)

# Dataloaders
dv_ev_dl = DataLoader(dv_ev_ds, batch_size=bs, collate_fn=collate_batch)
dv_cl_dl = DataLoader(dv_cl_ds, batch_size=bs, collate_fn=collate_batch)
dv_y_dl = DataLoader(y_dev, batch_size=bs)

<center><h3> 2.1 Positional encoding to embed the data</h3></center>

<center><img src=../Images/pos_encoder.png alt="drawing" width="300"></center>

<center>Details on:</center>
<center><a href="https://machinelearningmastery.com/a-gentle-introduction-to-positional-encoding-in-transformer-models-part-1/"><ph>A Gentle Introduction to Positional Encoding in Transformer Models</ph></a></center>

In [21]:
import math
# Positional embeding function
class positionalEmbeding(nn.Module):
    def __init__(self, embedding_dim, drop = 0.2, max_len = max_len):
        # Inputs:
        # embedding_dim: Length of input embeding
        # max_len: Max number of tokens in an input sentence
        # Return: Positional Embeding Matrix
        super(positionalEmbeding, self).__init__()
        self.dropout = nn.Dropout(p=drop)                                                                           # Dropout layer
        
        # Positional embeding matrix 
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)                                         # Positional increasing vector [max_len, 1]
        div_term = torch.exp(torch.arange(0, embedding_dim, 2).float() * (-math.log(10000.0) / embedding_dim))      # Division term for the sin/cos functions
        pe = torch.zeros(max_len, embedding_dim).float()                                                            # Matrix of 0's [max_len, embedding_dim]
        pe[:, 0::2] = torch.sin(position * div_term)                                                                # 0::2 means starting with index 0, step = 2
        pe[:, 1::2] = torch.cos(position * div_term)                                                                # 1::2 means starting with index 1, step = 2
        pe = pe.unsqueeze(0)                                                                                        # Resize pos encoder [1, max_len, embedding_dim]
        self.register_buffer('pe', pe)                                                                              # Adds pos encoder to the model state_dict

    def forward(self, x):
        # Input:
        # x: Embeding matrix [batch_size, text_length, embedding_dim]
        x = x + self.pe.requires_grad_(False)                      # Sum the position embeding
        return self.dropout(x)                                     # Apply dropout

<center><h3> 2.2 Multihead attention</h3></center>
<center><img src=../Images/attention.png alt="drawing" width="600"></center>

<center>Details on:</center>
<center><a href="https://towardsdatascience.com/build-your-own-transformer-from-scratch-using-pytorch-84c850470dcb"><ph>Build your own Transformer from scratch using Pytorch</ph></a></center>

In [22]:
class MultiHeadAttention(nn.Module):
    def __init__(self, embedding_dim, num_heads, dropout=0.1):
        super(MultiHeadAttention, self).__init__()

        assert embedding_dim % num_heads == 0, "in_size must be divisible by num_heads"

        self.embedding_dim = embedding_dim                      # Embeding input size
        self.num_heads = num_heads                              # Num heads of multihead attention model
        self.head_dim = embedding_dim // num_heads              # Embedding parameters for each head
        
        # Instanciate weights
        self.W_q = nn.Linear(embedding_dim, embedding_dim)      # Query weights
        self.W_k = nn.Linear(embedding_dim, embedding_dim)      # Key weights
        self.W_v = nn.Linear(embedding_dim, embedding_dim)      # Values weights
        self.linear = nn.Linear(embedding_dim, embedding_dim)
        self.dropout = nn.Dropout(dropout)

    # scaled_dot_product_attention
    def dot_prd_attn(self, Q, K, V, mask):
        attn_scores = torch.matmul(Q, K.transpose(-2, -1)) / math.sqrt(self.head_dim)   # MatMult (Q*K)

        # Fill 0 mask with super small number so it wont affect the softmax weight
        attn_scores = attn_scores.masked_fill(mask == 0, -1e9)     

        # softmax to put attention weight for all non-pad tokens
        attn_probs = self.dropout(torch.softmax(attn_scores, dim=-1))                   # Softmax
        context = torch.matmul(attn_probs, V)                                           # MatMult (Probs*V)
        return context
    
    # Function to split attention heads
    def split_heads(self, x):
        batch_size, seq_length, embedding_dim = x.size()
        return x.view(batch_size, self.num_heads, seq_length, self.head_dim)
    # Function to join attention heads
    def combine_heads(self, x):
        batch_size, num_heads, seq_length, head_dim = x.size()
        return x.view(batch_size, seq_length, self.embedding_dim)
    
    def forward(self, x, mask):
        # Weights linear pass (Random inicialization) + Split heads
        Q = self.split_heads(self.W_q(x))
        K = self.split_heads(self.W_k(x))
        V = self.split_heads(self.W_v(x))
        # Multihead attention
        attn = self.dot_prd_attn(Q, K, V, mask)                 # scaled_dot_product_attention
        attn = self.combine_heads(attn)                         # Concat heads
        attn = self.linear(attn)                                # Linear pass
        return attn

<center><h3> 2.3 Encoder model (Passage Ranking)</h3></center>
<center>Source papers:</center>
<center><a href="https://arxiv.org/pdf/1706.03762"><ph>Attention Is All You Need</ph></a></center>
<center><a href="https://arxiv.org/pdf/1706.03762"><ph>Text and Code Embeddings by Contrastive Pre-Training</ph></a></center>

<center>Encoder:</center>
<center><img src=../Images/encoder.png alt="drawing" width="300"></center>

In [23]:
# Encoder class based 
class EncoderLayer(nn.Module):
    def __init__(self, 
                vocab_size,                            # Size of vocabulary
                embedding_dim,                         # Embedding dimension
                n_head,                                # Number of heads  in the multihead attention model
                hidden_dim = 300,                      # Hiden dims for the feed forward pass
                dropout = 0.5):
        
        super(EncoderLayer, self).__init__()
        self.embedding_dim = embedding_dim

        self.multihead = MultiHeadAttention(embedding_dim, n_head)              # Multihead attention layer
        self.normalization = nn.LayerNorm(embedding_dim)                        # Normalization layer
        self.dropout = nn.Dropout(dropout)
        self.linear = nn.Linear(embedding_dim, 1)                               # Output layer

        # Feed forward pass
        self.feed_forward = nn.Sequential().to(device)
        self.feed_forward.add_module('fc1', nn.Linear(embedding_dim, hidden_dim))
        self.feed_forward.add_module('relu', nn.GELU())
        self.feed_forward.add_module('fc2', nn.Linear(hidden_dim, embedding_dim))

    def forward(self, embeding, mask):
        attn = self.dropout(self.multihead(embeding, mask))                      # Multihead attention
        normal = self.normalization(embeding + attn)                             # Add & Normalize pass
        forward = self.dropout(self.feed_forward(normal))                       # Feed Forward pass
        encoded = self.normalization(normal + forward)                          # Add & Normalize pass #2
        return encoded

<center><h3> 2.4 BERT model</h3></center>
<center><img src=../Images/BERT_enc.png alt="drawing" width="400"></center>

In [24]:
# Bert model
class BERT(nn.Module):
    # Encoder is a stack of N encoder layers. 
    def __init__(self, vocab_size, d_model, num_layers, n_head, dropout):
        super(BERT, self).__init__()

        self.d_model = d_model
        self.n_layers = num_layers
        self.heads = n_head

        # paper noted they used 4 * hidden_size for ff_network_hidden_size
        self.feed_forward_hidden = d_model * 4

        # embedding for BERT, sum of positional and token embeddings (No sentence since it is a SBERT)
        self.encoder = nn.Embedding(vocab_size, d_model, padding_idx=0)   # Embeding layer
        self.pos_encoder = positionalEmbeding(d_model, dropout)           # Positional embeding

        # multi-layers transformer blocks, deep network
        self.encoder_blocks = torch.nn.ModuleList(
            [EncoderLayer(vocab_size = vocab_size, embedding_dim = d_model, n_head = n_head, hidden_dim = 500, dropout = 0.5)\
                .to(device) for _ in range(num_layers)])
        

    def forward(self, text, mask):
        mask = (text > 0).unsqueeze(1).repeat(1, text.size(1), 1).unsqueeze(1)  # Redim mask [batch_size, 1, 1, max_len]
        encoder = self.encoder(text) * math.sqrt(self.d_model)                  # Text embeding imput
        pos_enc = self.pos_encoder(encoder)                                     # Positional embeding + Text embeding
        # running over multiple transformer blocks
        for layer in self.encoder_blocks:
            output = layer(pos_enc, mask)
        return output

<center><h3> 2.5 SBERT model</h3></center>

<center><img src=../Images/SBERT.png alt="drawing" width="400"></center>

In [42]:
# Parameters
vocab_size = len(tokenizer.word2index)+1
d_model = 300
n_head = 1
dropout = 0.1
hidden_dim = 2048
num_layers = 3
# Instanciate model
model = BERT(vocab_size, d_model, num_layers, n_head, dropout).to(device)

In [43]:
# Loss fn
loss_fn = nn.CrossEntropyLoss().to(device)
# Optimizer
optimizer = torch.optim.Adam(model.parameters())    # lr=2e−5

In [44]:
# Train SBERT model
from tqdm import tqdm

def train_model():
    # Cosine similarity function
    cos = nn.CosineSimilarity(dim=1, eps=1e-6)

    train_loss = 0

    # Iterate dataloader
    for t1, t2, y in tqdm(zip(tr_ev_dl, tr_cl_dl, tr_y_dl)):
        # Set parameters
        sent_a, m1 = t1
        sent_b, m2 = t2
        y = y.float().to(device)

        model.train()
        
        # Reset gradient
        optimizer.zero_grad()

        # Encoder layer
        enc_a = model(sent_a, m1)
        enc_b = model(sent_b, m2)

        # Pooling layer mean
        u = torch.mean(enc_a, 1) 
        v = torch.mean(enc_b, 1) 

        # Similarity metric
        similarity = cos(u, v)

        # Loss
        loss = loss_fn(similarity, y)
        acc = torch.sum((similarity>=0.5).float() == y)
        total = y.size()[0]

        # Metrics
        train_loss += loss.item()
        loss.backward()             # Backpropagation
        optimizer.step()            # Update parameters

    # Print results
    d_acc = (acc)/(total)
    loss = train_loss/len(tr_y_dl)

    tqdm.write(
        f'Train Accuracy: {d_acc:.3f}\
        Train Loss: {loss:.3f}',
    )

In [45]:
def test_model():
    cos = nn.CosineSimilarity(dim=1, eps=1e-6)
    test_loss = 0
    # Iterate dataloader
    for t1, t2, y in tqdm(zip(dv_ev_dl, dv_cl_dl, dv_y_dl)):
        # Set parameters
        sent_a, m1 = t1
        sent_b, m2 = t2
        y = y.float().to(device)
    
        model.eval()
    
        # Encoder layer
        enc_a = model(sent_a, m1)
        enc_b = model(sent_b, m2)
    
        # Pooling layer mean
        u = torch.mean(enc_a, 1) 
        v = torch.mean(enc_b, 1) 
    
        # Similarity metric
        similarity = cos(u, v)
    
        # Loss
        loss = loss_fn((similarity>=0.5).float(), y)
        acc = torch.sum((similarity>=0.5).float() == y)
        total = y.size()[0]
    
        # Metrics
        test_loss += loss.item()
    
    # Print results
    d_acc = (acc)/(total)
    loss = test_loss/len(dv_y_dl)
    
    tqdm.write(
        f'Test Accuracy: {d_acc:.3f}\
        Test Loss: {loss:.3f}',
    )

In [46]:
from tqdm import tqdm
# from sklearn.metrics import f1_score
from tqdm import tqdm, tqdm_notebook # show progress bar

# Epochs
epochs = 4
print("Training SBERT model!")

for epoch in range(epochs):
    print('epoch: %d'% (epoch))
    train_model()
    test_model()

Training SBERT model!
epoch: 0


258it [00:21, 11.93it/s]


Train Accuracy: 0.800        Train Loss: 53.759


31it [00:01, 17.07it/s]


Test Accuracy: 0.818        Test Loss: 52.885
epoch: 1


258it [00:24, 10.51it/s]


Train Accuracy: 0.800        Train Loss: 51.747


31it [00:01, 16.84it/s]


Test Accuracy: 0.909        Test Loss: 51.949
epoch: 2


258it [00:24, 10.42it/s]


Train Accuracy: 0.900        Train Loss: 50.833


31it [00:01, 16.59it/s]


Test Accuracy: 0.955        Test Loss: 51.999
epoch: 3


258it [00:24, 10.47it/s]


Train Accuracy: 0.900        Train Loss: 50.290


31it [00:01, 16.81it/s]

Test Accuracy: 0.955        Test Loss: 52.043





<center><h3> 3 Predict on test data</h3></center>

In [52]:
# Read test claims
with open('../data/test-claims-unlabelled.json', 'r') as f:
    df_test = pd.DataFrame(json.load(f)).transpose()

# Prepare
claim_test = [word_tokenize(i.lower()) for i in df_test['claim_text']]
claim_test = [' '.join([w for w in seq if re.match('^[\w\d]+$', w)]) for seq in claim_test]

# Add to dict
tok(claim_test)

# Load Dataloader
ts_cl_ds = Dataset(claim_test, tokenizer)
ts_cl_dl = DataLoader(ts_cl_ds, batch_size=bs, collate_fn=collate_batch)

In [None]:
'''
The model is approaching wrong the dev data, it is necessary to treat the dev set as we are going to treat the test set, that is:
a. Compare the model claims to all the evidence claims and retrieve the ones that are the most similar

To do so, it is necessary to:
1. Preprocess the evidence texts and save the processed ev.
2. Train the model evaluating the performance over the whole dev-ev relation:
    evidence len (1.208.827) * dev len (154) = 186.159.358 tuples to compare
3. If the baseline trained model does not perform properly, enlarge the model including all the evidence data
    evidence len (1.208.827) * train len (1.227) = 1.484.439.556 tuples to compare
This would give the closest possible results without any changes on architecture parameters
'''