In [1]:
import pandas as pd
import json
import copy
import nltk
from nltk.tokenize import word_tokenize
import numpy as np
import re

# 1. Preprocess the data

In [297]:
# Read evidence
with open('../data/evidence.json', 'r') as f:
    evidence = json.load(f)
eviden = pd.DataFrame.from_dict(evidence, orient='index', columns=['evidence'])
ev_txt = eviden['evidence'].values
max_len = max([len(j.split()) for i,j in evidence.items()])

# Read train claims
with open('../data/train-claims.json', 'r') as f:
    df_train = pd.DataFrame(json.load(f)).transpose()
df_train = df_train.explode("evidences")
df_train['evidences_text'] = [evidence[item] for item in df_train['evidences']]

# Read dev claims
with open('../data/dev-claims.json', 'r') as f:
    df_dev = pd.DataFrame(json.load(f)).transpose()
df_dev['split'] = 'dev'

In [298]:
# Set words to lower and tokenize
tok_ev = [word_tokenize(i.lower()) for i in df_train['evidences_text']]
tok_cl = [word_tokenize(i.lower()) for i in df_train['claim_text']]
# Drop unknown characters (This may be modified depending model performance)
tok_ev = [' '.join([w for w in seq if re.match('^[\w\d]+$', w)]) for seq in tok_ev]
tok_cl = [' '.join([w for w in seq if re.match('^[\w\d]+$', w)]) for seq in tok_cl]
# Join claims and evidences
# input = [" ".join(["[CLS]"] + i.split() + ["[SEP]"] + j.split()) for i, j in zip(tok_cl, tok_ev)]
input = [[i, j]for i, j in zip(tok_cl, tok_ev)]

In [301]:
# Tokenizer class ( Can be improved)
class tokenizer:
    def __init__(self):
        self.word2index = {"[PAD]": 0, "[CLS]": 1, "[SEP]": 2, "[MASK]": 3}
        self.index2word = {0: "[PAD]", 1: "[CLS]", 2: "[SEP]", 3: "[MASK]"}
        self.n_words = 4  # Count CLS and SEP

    def addSentence(self, sentence):
        for word in sentence.split(' '):
            self.addWord(word)

    def addWord(self, word):
        if word not in self.word2index:
            self.word2index[word] = self.n_words
            self.index2word[self.n_words] = word
            self.n_words += 1

# Words to idx
tokenizer = tokenizer()
for i, j in zip(tok_cl, tok_ev):
    tokenizer.addSentence(i)
    tokenizer.addSentence(j)

In [306]:
[tokenizer.word2index[i] for i in input[0][0].split()]

[4,
 5,
 6,
 7,
 8,
 9,
 10,
 11,
 12,
 6,
 13,
 14,
 15,
 12,
 16,
 17,
 18,
 19,
 20,
 21,
 22,
 23,
 24,
 25]

# 2. Model the data

In [300]:
import torch
from torch.utils.data import Dataset, DataLoader
from torch import nn
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

### Load the data to tensor batches

<center><h3> 2.1 Bert embeding</h3></center>

<center><img src=../Images/BERT_emb.png alt="drawing" width="500"></center>
<center><img src=../Images/BERT_emb_example.png alt="drawing" width="500"></center>

In [74]:
import random
# Dataset class
class Dataset(Dataset):
    def __init__(self, texts, tokenizer, seq_len=max_len):
        self.text = texts
        self.tokenizer = tokenizer
    def __len__(self):
        return len(self.text)
    def __getitem__(self, idx):
        # Step 1: get random sentence pair, either negative or positive
        sent1, sent2, label = self.get_sent()
        sent1 = [tokenizer.word2index[i] for i in sent1.split()]
        sent2 = [tokenizer.word2index[i] for i in sent2.split()]
        
        # Step 2: replace random words in sentence with mask / random words
        sent1_mask, sent1_label = self.masking(sent1)
        sent2_mask, sent2_label = self.masking(sent2)

        # Step 3: Adding CLS and SEP tokens to the start and end of sentences
        # Adding PAD token for labels
        sent1 = [self.tokenizer.word2index['[CLS]']] + sent1_mask + [self.tokenizer.word2index['[SEP]']]
        sent2 = sent2_mask + [self.tokenizer.word2index['[SEP]']]
        sent1_label = [self.tokenizer.word2index['[PAD]']] + sent1_label + [self.tokenizer.word2index['[PAD]']]
        sent2_label = sent2_label + [self.tokenizer.word2index['[PAD]']]

        # Step 4: combine sentence 1 and 2 as one input
        # adding PAD tokens to make the sentence same length as seq_len
        
        return texts, labels
    
    #------------------------------------------------------------------------------------------
    # Return tuple fo 2 sentences plus relation
    def get_sent(self):
        sentence1, sentence2 = self.text[0], self.text[1]
        # randomly return pair of sentences
        if random.random() > 0.5:
            return sentence1, sentence2, 1                              # 1: Relation
        else:
            return sentence1, ev_txt[random.randrange(len(ev_txt))], 0  # O: No relation
    #------------------------------------------------------------------------------------------
    # Function to mask/randomize tokens
    def masking(self, tokens, to_replace = 0.15):
        # tokens = input.split()
        output = []
        label = []
        for token in tokens:
            prob = random.random()
            # 15% of the tokens would be replaced
            if prob <= to_replace:
                # 10% chance change token to current token
                if prob < to_replace*.1:
                    output.append(token)
                # 10% chance change token to random
                elif prob < to_replace*.1*2:
                    output.append(random.choice(list(self.tokenizer.word2index.values())))
                # 10% chance change token to random
                else:
                    output.append(self.tokenizer.word2index["[MASK]"])
                label.append(token)
            else:
                output.append(token)
                label.append(0)
        return output, label

In [None]:
# Define collate (pre_process) function
def collate_batch(batch):
    texts, labels = zip(*batch)
    texts = nn.utils.rnn.pad_sequence(texts, batch_first=True).to(device)
    return texts, labels

# Instanciate DataLoader
bs = 32
tr_ds = Dataset(device)
# dv_ds = Dataset(num_quer, range(len(num_quer)))

tr_dl = DataLoader(tr_ds, batch_size=bs, collate_fn=collate_batch)
# dv_dl = DataLoader(dv_ds, batch_size=bs)

<center><h3> 2.1.1 Positional encoding to embed the data</h3></center>

<center><img src=../Images/pos_encoder.png alt="drawing" width="300"></center>

<center>Details on:</center>
<center><a href="https://machinelearningmastery.com/a-gentle-introduction-to-positional-encoding-in-transformer-models-part-1/"><ph>A Gentle Introduction to Positional Encoding in Transformer Models</ph></a></center>

In [69]:
import math
max_len_ = max([len(i) for i in num_evid]) # Maximum number of tokens in a sentence
# Positional embeding function
class positionalEmbeding(nn.Module):
    def __init__(self, embedding_dim, drop = 0.2, max_len = max_len_):
        # Inputs:
        # embedding_dim: Length of input embeding
        # max_len: Max number of tokens in an input sentence
        # Return: Positional Embeding Matrix
        super(positionalEmbeding, self).__init__()
        self.dropout = nn.Dropout(p=drop)                                                                           # Dropout layer
        
        # Positional embeding matrix 
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)                                         # Positional increasing vector [max_len, 1]
        div_term = torch.exp(torch.arange(0, embedding_dim, 2).float() * (-math.log(10000.0) / embedding_dim))      # Division term for the sin/cos functions
        pe = torch.zeros(max_len, embedding_dim)                                                                    # Matrix of 0's [max_len, embedding_dim]
        pe[:, 0::2] = torch.sin(position * div_term)                                                                # 0::2 means starting with index 0, step = 2
        pe[:, 1::2] = torch.cos(position * div_term)                                                                # 1::2 means starting with index 1, step = 2
        pe = pe.unsqueeze(0).transpose(0, 1)                                                                        # Resize pos encoder [max_len, 1, embedding_dim]
        self.register_buffer('pe', pe)                                                                              # Adds pos encoder to the model state_dict

    def forward(self, x):
        # Input:
        # x: Embeding matrix [batch_size, text_length, embedding_dim]
        x = x + self.pe[:x.size(0), :x.size(1)]      # Sum the position embeding
        return self.dropout(x)              # Apply dropout

<center><h3> 2.2 Multihead attention</h3></center>
<center><img src=../Images/attention.png alt="drawing" width="600"></center>

<center>Details on:</center>
<center><a href="https://towardsdatascience.com/build-your-own-transformer-from-scratch-using-pytorch-84c850470dcb"><ph>Build your own Transformer from scratch using Pytorch</ph></a></center>

In [70]:
class MultiHeadAttention(nn.Module):
    def __init__(self, embedding_dim, num_heads):
        super(MultiHeadAttention, self).__init__()

        assert embedding_dim % num_heads == 0, "in_size must be divisible by num_heads"

        self.embedding_dim = embedding_dim                      # Embeding input size
        self.num_heads = num_heads                              # Num heads of multihead attention model
        self.head_dim = embedding_dim // num_heads              # Embedding parameters for each head
        
        # Instanciate weights
        self.W_q = nn.Linear(embedding_dim, embedding_dim)      # Query weights
        self.W_k = nn.Linear(embedding_dim, embedding_dim)      # Key weights
        self.W_v = nn.Linear(embedding_dim, embedding_dim)      # Values weights
        self.linear = nn.Linear(embedding_dim, embedding_dim)

    # scaled_dot_product_attention
    def dot_prd_attn(self, Q, K, V, mask=None):
        attn_scores = torch.matmul(Q, K.transpose(-2, -1)) / math.sqrt(self.head_dim)   # MatMult (Q*K)
        if mask is not None: attn_scores = attn_scores.masked_fill(mask == 0, -1e9)     # Masking (Optional)
        attn_probs = torch.softmax(attn_scores, dim=-1)                                 # Softmax
        output = torch.matmul(attn_probs, V)                                            # MatMult (Probs*V)
        return output
    
    # Function to split attention heads
    def split_heads(self, x):
        batch_size, seq_length, embedding_dim = x.size()
        return x.view(batch_size, self.num_heads, seq_length, self.head_dim)
    # Function to join attention heads
    def combine_heads(self, x):
        batch_size, num_heads, seq_length, head_dim = x.size()
        return x.view(batch_size, seq_length, self.embedding_dim)
    
    def forward(self, x, mask = None):
        # Weights linear pass (Random inicialization) + Split heads
        Q = self.split_heads(self.W_q(x))
        K = self.split_heads(self.W_k(x))
        V = self.split_heads(self.W_v(x))
        # Multihead attention
        attn = self.dot_prd_attn(Q, K, V, mask)                 # scaled_dot_product_attention
        attn = self.combine_heads(attn)                         # Concat heads
        attn = self.linear(attn)                                # Linear pass
        return attn

<center><h3> 2.5 Transformer model (Passage Ranking)</h3></center>
<center>Source papers:</center>
<center><a href="https://arxiv.org/pdf/1706.03762"><ph>Attention Is All You Need</ph></a></center>
<center><a href="https://arxiv.org/pdf/1706.03762"><ph>Text and Code Embeddings by Contrastive Pre-Training</ph></a></center>

<center>Encoder:</center>
<center><img src=../Images/encoder.png alt="drawing" width="300"></center>

In [71]:
# Encoder class based 
class EncoderLayer(nn.Module):
    def __init__(self, 
                vocab_size,                            # Size of vocabulary
                embedding_dim,                         # Embedding dimension
                n_head,                                # Number of heads  in the multihead attention model
                hidden_dim,                            # Hiden dims for the feed forward pass
                dropout = 0.5):
        
        super(EncoderLayer, self).__init__()
        self.embedding_dim = embedding_dim

        self.encoder = nn.Embedding(vocab_size, embedding_dim)                  # Embeding layer
        self.pos_encoder = positionalEmbeding(embedding_dim, dropout)           # Positional embeding
        self.multihead = MultiHeadAttention(embedding_dim, n_head)              # Multihead attention layer
        self.normalization = nn.LayerNorm(embedding_dim)                        # Normalization layer
        self.dropout = nn.Dropout(dropout)
        self.linear = nn.Linear(embedding_dim, 1)                               # Output layer

        # Feed forward pass
        self.feed_forward = nn.Sequential()
        self.feed_forward.add_module('fc1', nn.Linear(embedding_dim, hidden_dim))
        self.feed_forward.add_module('relu', nn.ReLU())
        self.feed_forward.add_module('fc2', nn.Linear(hidden_dim, embedding_dim))

    def forward(self, text):
        encoder = self.encoder(text) * math.sqrt(self.embedding_dim)            # Encode imput text [batch_size, text_length, embedding_dim]
        pos_enc = self.pos_encoder(encoder)                                     # Reurn pos encoder [batch_size, text_length, embedding_dim]
        attn = self.multihead(pos_enc)                                          # Multihead encoder
        normal = self.normalization(text.unsqueeze(2) + self.dropout(attn))     # Add & Normalize pass #1  UNSQUEEZE
        forward = self.feed_forward(normal)                                     # Feed Forward pass
        encoded = self.normalization(normal + self.dropout(forward))            # Add & Normalize pass #2
        lin_vec = self.linear(encoded)
        return lin_vec

In [72]:
class Encoder(nn.Module):
    # Encoder is a stack of N encoder layers. 
    def __init__(self, encoder_layer, num_layers):
        super().__init__()

        self.layers = nn.ModuleList([copy.deepcopy(encoder_layer) for i in range(num_layers)])
        self.num_layers = num_layers

    def forward(self, text, mask = None, src_key_padding_mask = None):
        output = text
        for layer in self.layers:
            output = layer(output)
        return output

In [73]:
vocab_size = max([max(i) for i in num_evid]) + max(num_quer)
embedding_dim = 300
n_head = 2
dropout = 0.5
hidden_dim = 2048
num_layers = 2
encoder_layer = EncoderLayer(vocab_size, embedding_dim, n_head, hidden_dim, dropout).to(device)
encoder = Encoder(encoder_layer, num_layers)

In [81]:
from tqdm import tqdm
from numpy.linalg import norm
cos_sim = []
y = encoder_layer(next(iter(dv_dl))[0].unsqueeze(1)).reshape(-1).detach().numpy()
for x, _ in tqdm(tr_dl):
    enc = encoder_layer(x)
    for line in enc:
        X = line.reshape(-1).detach().numpy()
        cos_sim.append(np.dot(X[:len(y)], y[:len(X)])/(norm(X)*norm(y)))

100%|██████████| 469/469 [00:08<00:00, 57.90it/s]


In [82]:
df = pd.DataFrame(cos_sim, columns=['similarity'])
df['Evidence'] = ev[:15000]
df.sort_values('similarity', ascending=False, inplace=True)
df

Unnamed: 0,similarity,Evidence
8048,0.567922,within countries as differing political moveme...
13066,0.566776,or sitamun was a princess of the early eightee...
6668,0.535906,geese are waterfowl belonging to the tribe ans...
7129,0.517724,the galaxy y duos is a mobile phone from
11642,0.494271,jacques schotte september was a belgian psychi...
...,...,...
381,-0.541321,casley never played for torquay league team in...
8748,-0.550421,while the characters and instances in the movi...
11473,-0.575943,momofuku is a cookbook by american chef david ...
11450,-0.587998,ross mccloud 1819 august was a california pion...


In [83]:
df_train['claim_text'].values[0]

'Not only is there no scientific evidence that CO2 is a pollutant, higher CO2 concentrations actually help ecosystems support more plant and animal life.'

In [84]:
df[df.index == 12171]

Unnamed: 0,similarity,Evidence
12171,-0.201789,higher carbon dioxide concentrations will favo...


In [90]:
' '.join(dv[0])

'Not only is there no scientific evidence that CO2 is a pollutant, higher CO2 concentrations actually help ecosystems support more plant and animal life.'

In [None]:
df_train['evidences'][0]

  df_train['evidences'][0]


['evidence-442946', 'evidence-1194317', 'evidence-12171']

In [None]:
max(list(sequence.word2index.values()))

4933

In [None]:
# from tqdm import tqdm
# def reject(df, eviden):
#     reject = []
#     for row in tqdm(df.index):
#         ev = df.loc[row, 'evidences']
#         rej = []
#         samp = True
#         while samp:
#             sp = eviden.sample(len(ev))['evidence'].values
#             for val in sp:
#                 if val not in ev:
#                     rej.append(sp)
#                     samp = False
#         reject.append(rej[0])
#     return reject

# df_train_ref = df_train[['claim_text','claim_label']].copy()
# df_train_ref['evidences'] = reject(df_train, eviden)
# df_train_ref['split'] = 'train'
# df_train_ref['label'] = 'disengagement'

# df_train['evidences'] = [[evidence[ev] for ev in val] for val in df_train['evidences']]

# df = pd.concat([df_train, df_train_ref])[['claim_text', 'evidences', 'label', 'split']]
# df = df.explode('evidences')
# df