In [59]:
import pandas as pd
import json
from nltk.tokenize import word_tokenize
import numpy as np
import re
import random

# 1. Preprocess the data

In [60]:
# Read evidence
with open('../data/evidence.json', 'r') as f:
    evidence = json.load(f)
eviden = pd.DataFrame.from_dict(evidence, orient='index', columns=['evidence'])
ev_txt = eviden['evidence'].values
max_len = max([len(j.split()) for i,j in evidence.items()])

# Read train claims
with open('../data/train-claims.json', 'r') as f:
    df_train = pd.DataFrame(json.load(f)).transpose()
df_train = df_train.explode("evidences")
df_train['evidences_text'] = [evidence[item] for item in df_train['evidences']]
df_train['label'] = 1
df_train_ = df_train[['claim_text']].copy()
df_train_['evidences_text'] = [random.choice(ev_txt) for i in range(df_train_.shape[0])]
df_train_['label'] = 0
df_train = pd.concat([df_train[['claim_text' , 'evidences_text', 'label']], df_train_]).sample(frac=1)

# Read dev claims
with open('../data/dev-claims.json', 'r') as f:
    df_dev = pd.DataFrame(json.load(f)).transpose()
df_dev['split'] = 'dev'

In [61]:
# Set words to lower and tokenize
tok_ev = [word_tokenize(i.lower()) for i in df_train['evidences_text']]
tok_cl = [word_tokenize(i.lower()) for i in df_train['claim_text']]
# Drop unknown characters (This may be modified depending model performance)
tok_ev = [' '.join([w for w in seq if re.match('^[\w\d]+$', w)]) for seq in tok_ev]
tok_cl = [' '.join([w for w in seq if re.match('^[\w\d]+$', w)]) for seq in tok_cl]
# Class label
y = df_train['label']

In [62]:
# Tokenizer class ( Can be improved)
class tokenizer:
    def __init__(self):
        self.word2index = {"[PAD]": 0, "[CLS]": 1, "[SEP]": 2, "[MASK]": 3}
        self.index2word = {0: "[PAD]", 1: "[CLS]", 2: "[SEP]", 3: "[MASK]"}
        self.n_words = 4  # Count CLS and SEP

    def addSentence(self, sentence):
        for word in sentence.split(' '):
            self.addWord(word)

    def addWord(self, word):
        if word not in self.word2index:
            self.word2index[word] = self.n_words
            self.index2word[self.n_words] = word
            self.n_words += 1

# Add tokens to idx dict
tokenizer = tokenizer()
for i, j in zip(tok_cl, tok_ev):
    tokenizer.addSentence(i)
    tokenizer.addSentence(j)

# 2. Model the data

In [63]:
import torch
from torch.utils.data import Dataset, DataLoader
from torch import nn
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

### Load the data to tensor batches

<center><h3> 2.1 Bert embeding</h3></center>

<center><img src=../Images/BERT_emb.png alt="drawing" width="500"></center>
<center><img src=../Images/BERT_emb_example.png alt="drawing" width="500"></center>

In [64]:
# Dataset class
class Dataset(Dataset):
    def __init__(self, texts, tokenizer, seq_len=max_len):
        self.text = texts
        self.tokenizer = tokenizer
        self.seq_len = seq_len
    def __len__(self):
        return len(self.text)
    def __getitem__(self, idx):
        # Step 1: get text tokens
        sent = [self.tokenizer.word2index[i] for i in self.text[idx].split()]
        
        # Step 2: replace random words in sentence with mask / random words
        sent_mask, labels = self.masking(sent)

        # Step 3: Adding CLS and SEP tokens to the start and end of sentence
        # Adding PAD token for labels
        sent = [self.tokenizer.word2index['[CLS]']] + sent_mask + [self.tokenizer.word2index['[SEP]']]
        labels = [self.tokenizer.word2index['[PAD]']] + labels + [self.tokenizer.word2index['[PAD]']]

        # Step 4: Add PAD tokens to make the sentence same length as seq_len
        padding = [self.tokenizer.word2index['[PAD]'] for empty in range(self.seq_len - len(sent))]
        sent.extend(padding)
        labels.extend(padding)
        return np.array(sent), np.array(labels)
    
    #------------------------------------------------------------------------------------------
    # Function to mask/randomize tokens
    def masking(self, tokens, to_replace = 0.15):
        # tokens = input.split()
        output = []
        label = []
        for token in tokens:
            prob = random.random()
            # 15% of the tokens would be replaced
            if prob <= to_replace:
                # 10% chance change token to current token
                if prob < to_replace*.1:
                    output.append(token)
                # 10% chance change token to random
                elif prob < to_replace*.1*2:
                    output.append(random.choice(list(self.tokenizer.word2index.values())))
                # 10% chance change token to random
                else:
                    output.append(self.tokenizer.word2index["[MASK]"])
                label.append(token)
            else:
                output.append(token)
                label.append(0)
        return output, label

In [66]:
# Define collate (pre_process) function
def collate_batch(batch):
    texts, labels = zip(*batch)
    texts = torch.from_numpy(np.array(texts)).to(device)
    labels = torch.from_numpy(np.array(labels)).to(device)
    return texts, labels

# Instanciate DataLoader
bs = 32

# Datasets
tr_ev_ds = Dataset(tok_ev, tokenizer)
tr_cl_ds = Dataset(tok_cl, tokenizer)

# Dataloaders
tr_ev_dl = DataLoader(tr_ev_ds, batch_size=bs, collate_fn=collate_batch)
tr_cl_dl = DataLoader(tr_cl_ds, batch_size=bs, collate_fn=collate_batch)
y_dl = DataLoader(y, batch_size=bs)

<center><h3> 2.1 Positional encoding to embed the data</h3></center>

<center><img src=../Images/pos_encoder.png alt="drawing" width="300"></center>

<center>Details on:</center>
<center><a href="https://machinelearningmastery.com/a-gentle-introduction-to-positional-encoding-in-transformer-models-part-1/"><ph>A Gentle Introduction to Positional Encoding in Transformer Models</ph></a></center>

In [67]:
import math
# Positional embeding function
class positionalEmbeding(nn.Module):
    def __init__(self, embedding_dim, drop = 0.2, max_len = max_len):
        # Inputs:
        # embedding_dim: Length of input embeding
        # max_len: Max number of tokens in an input sentence
        # Return: Positional Embeding Matrix
        super(positionalEmbeding, self).__init__()
        self.dropout = nn.Dropout(p=drop)                                                                           # Dropout layer
        
        # Positional embeding matrix 
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)                                         # Positional increasing vector [max_len, 1]
        div_term = torch.exp(torch.arange(0, embedding_dim, 2).float() * (-math.log(10000.0) / embedding_dim))      # Division term for the sin/cos functions
        pe = torch.zeros(max_len, embedding_dim).float()                                                            # Matrix of 0's [max_len, embedding_dim]
        pe[:, 0::2] = torch.sin(position * div_term)                                                                # 0::2 means starting with index 0, step = 2
        pe[:, 1::2] = torch.cos(position * div_term)                                                                # 1::2 means starting with index 1, step = 2
        pe = pe.unsqueeze(0)                                                                                        # Resize pos encoder [1, max_len, embedding_dim]
        self.register_buffer('pe', pe)                                                                              # Adds pos encoder to the model state_dict

    def forward(self, x):
        # Input:
        # x: Embeding matrix [batch_size, text_length, embedding_dim]
        x = x + self.pe.requires_grad_(False)                      # Sum the position embeding
        return self.dropout(x)                                     # Apply dropout

<center><h3> 2.2 Multihead attention</h3></center>
<center><img src=../Images/attention.png alt="drawing" width="600"></center>

<center>Details on:</center>
<center><a href="https://towardsdatascience.com/build-your-own-transformer-from-scratch-using-pytorch-84c850470dcb"><ph>Build your own Transformer from scratch using Pytorch</ph></a></center>

In [68]:
class MultiHeadAttention(nn.Module):
    def __init__(self, embedding_dim, num_heads, dropout=0.1):
        super(MultiHeadAttention, self).__init__()

        assert embedding_dim % num_heads == 0, "in_size must be divisible by num_heads"

        self.embedding_dim = embedding_dim                      # Embeding input size
        self.num_heads = num_heads                              # Num heads of multihead attention model
        self.head_dim = embedding_dim // num_heads              # Embedding parameters for each head
        
        # Instanciate weights
        self.W_q = nn.Linear(embedding_dim, embedding_dim)      # Query weights
        self.W_k = nn.Linear(embedding_dim, embedding_dim)      # Key weights
        self.W_v = nn.Linear(embedding_dim, embedding_dim)      # Values weights
        self.linear = nn.Linear(embedding_dim, embedding_dim)
        self.dropout = nn.Dropout(dropout)

    # scaled_dot_product_attention
    def dot_prd_attn(self, Q, K, V, mask):
        attn_scores = torch.matmul(Q, K.transpose(-2, -1)) / math.sqrt(self.head_dim)   # MatMult (Q*K)

        # Fill 0 mask with super small number so it wont affect the softmax weight
        attn_scores = attn_scores.masked_fill(mask == 0, -1e9)     

        # softmax to put attention weight for all non-pad tokens
        attn_probs = self.dropout(torch.softmax(attn_scores, dim=-1))                   # Softmax
        context = torch.matmul(attn_probs, V)                                           # MatMult (Probs*V)
        return context
    
    # Function to split attention heads
    def split_heads(self, x):
        batch_size, seq_length, embedding_dim = x.size()
        return x.view(batch_size, self.num_heads, seq_length, self.head_dim)
    # Function to join attention heads
    def combine_heads(self, x):
        batch_size, num_heads, seq_length, head_dim = x.size()
        return x.view(batch_size, seq_length, self.embedding_dim)
    
    def forward(self, x, mask):
        # Weights linear pass (Random inicialization) + Split heads
        Q = self.split_heads(self.W_q(x))
        K = self.split_heads(self.W_k(x))
        V = self.split_heads(self.W_v(x))
        # Multihead attention
        attn = self.dot_prd_attn(Q, K, V, mask)                 # scaled_dot_product_attention
        attn = self.combine_heads(attn)                         # Concat heads
        attn = self.linear(attn)                                # Linear pass
        return attn

<center><h3> 2.3 Encoder model (Passage Ranking)</h3></center>
<center>Source papers:</center>
<center><a href="https://arxiv.org/pdf/1706.03762"><ph>Attention Is All You Need</ph></a></center>
<center><a href="https://arxiv.org/pdf/1706.03762"><ph>Text and Code Embeddings by Contrastive Pre-Training</ph></a></center>

<center>Encoder:</center>
<center><img src=../Images/encoder.png alt="drawing" width="300"></center>

In [69]:
# Encoder class based 
class EncoderLayer(nn.Module):
    def __init__(self, 
                vocab_size,                            # Size of vocabulary
                embedding_dim,                         # Embedding dimension
                n_head,                                # Number of heads  in the multihead attention model
                hidden_dim = 300,                      # Hiden dims for the feed forward pass
                dropout = 0.5):
        
        super(EncoderLayer, self).__init__()
        self.embedding_dim = embedding_dim

        self.multihead = MultiHeadAttention(embedding_dim, n_head)              # Multihead attention layer
        self.normalization = nn.LayerNorm(embedding_dim)                        # Normalization layer
        self.dropout = nn.Dropout(dropout)
        self.linear = nn.Linear(embedding_dim, 1)                               # Output layer

        # Feed forward pass
        self.feed_forward = nn.Sequential().to(device)
        self.feed_forward.add_module('fc1', nn.Linear(embedding_dim, hidden_dim))
        self.feed_forward.add_module('relu', nn.GELU())
        self.feed_forward.add_module('fc2', nn.Linear(hidden_dim, embedding_dim))

    def forward(self, embeding, mask):
        attn = self.dropout(self.multihead(embeding, mask))                      # Multihead attention
        normal = self.normalization(embeding + attn)                             # Add & Normalize pass
        forward = self.dropout(self.feed_forward(normal))                       # Feed Forward pass
        encoded = self.normalization(normal + forward)                          # Add & Normalize pass #2
        return encoded

<center><h3> 2.4 BERT model</h3></center>
<center><img src=../Images/BERT_enc.png alt="drawing" width="400"></center>

In [70]:
# Bert model
class BERT(nn.Module):
    # Encoder is a stack of N encoder layers. 
    def __init__(self, vocab_size, d_model, num_layers, n_head, dropout):
        super(BERT, self).__init__()

        self.d_model = d_model
        self.n_layers = num_layers
        self.heads = n_head

        # paper noted they used 4 * hidden_size for ff_network_hidden_size
        self.feed_forward_hidden = d_model * 4

        # embedding for BERT, sum of positional and token embeddings (No sentence since it is a SBERT)
        self.encoder = nn.Embedding(vocab_size, d_model, padding_idx=0)   # Embeding layer
        self.pos_encoder = positionalEmbeding(d_model, dropout)           # Positional embeding

        # multi-layers transformer blocks, deep network
        self.encoder_blocks = torch.nn.ModuleList(
            [EncoderLayer(vocab_size = vocab_size, embedding_dim = d_model, n_head = n_head, hidden_dim = 500, dropout = 0.5)\
                .to(device) for _ in range(num_layers)])
        

    def forward(self, text, mask):
        mask = (text > 0).unsqueeze(1).repeat(1, text.size(1), 1).unsqueeze(1)  # Redim mask [batch_size, 1, 1, max_len]
        encoder = self.encoder(text) * math.sqrt(self.d_model)                  # Text embeding imput
        pos_enc = self.pos_encoder(encoder)                                     # Positional embeding + Text embeding
        # running over multiple transformer blocks
        for layer in self.encoder_blocks:
            output = layer(pos_enc, mask)
        return output

<center><h3> 2.4 SBERT model</h3></center>

<center><img src=../Images/SBERT.png alt="drawing" width="400"></center>

In [71]:
# Parameters
vocab_size = len(tokenizer.word2index)+1
d_model = 300
n_head = 1
dropout = 0.1
hidden_dim = 2048
num_layers = 3
# Instanciate model
model = BERT(vocab_size, d_model, num_layers, n_head, dropout)

In [72]:
# Loss fn
loss_fn = nn.CrossEntropyLoss().to(device)
# Optimizer
optimizer = torch.optim.Adam(model.parameters())    # lr=2e−5

In [76]:
# Train SBERT model
from tqdm import tqdm

def train_model():
    # Cosine similarity function
    cos = nn.CosineSimilarity(dim=1, eps=1e-6)

    train_loss = 0

    # Iterate dataloader
    for t1, t2, y in tqdm(zip(tr_ev_dl, tr_cl_dl, y_dl)):
        # Set parameters
        sent_a, m1 = t1
        sent_b, m2 = t2
        y = y.float()

        model.train()
        
        # Reset gradient
        optimizer.zero_grad()

        # Encoder layer
        enc_a = model(sent_a, m1)
        enc_b = model(sent_b, m2)

        # Pooling layer mean
        u = torch.mean(enc_a, 1) 
        v = torch.mean(enc_b, 1) 

        # Similarity metric
        similarity = cos(u, v)

        # Loss
        loss = loss_fn(similarity, y)
        acc = torch.sum((similarity>=0.5).float() == y)
        total = y.size()[0]

        # Metrics
        train_loss += loss.item()
        loss.backward()             # Backpropagation
        optimizer.step()            # Update parameters

    # Print results
    d_acc = (acc)/(total)
    loss = train_loss/len(y_dl)

    tqdm.write(
        f'Domain_Acc: {d_acc:.3f}\
        Loss: {loss:.3f}',
    )

258it [05:36,  1.30s/it]

Domain_Acc: 0.750    Loss: 53.429





In [113]:
a.shape

torch.Size([32, 479, 300])

  3%|▎         | 4/129 [00:02<01:24,  1.48it/s]


KeyboardInterrupt: 

In [None]:
from tqdm import tqdm
from numpy.linalg import norm
cos_sim = []
y = encoder_layer(next(iter(dv_dl))[0].unsqueeze(1)).reshape(-1).detach().numpy()
for x, _ in tqdm(tr_dl):
    enc = encoder_layer(x)
    for line in enc:
        X = line.reshape(-1).detach().numpy()
        cos_sim.append(np.dot(X[:len(y)], y[:len(X)])/(norm(X)*norm(y)))

NameError: name 'dv_dl' is not defined

In [None]:
df = pd.DataFrame(cos_sim, columns=['similarity'])
df['Evidence'] = ev[:15000]
df.sort_values('similarity', ascending=False, inplace=True)
df

In [None]:
df_train['claim_text'].values[0]

In [None]:
df[df.index == 12171]

In [None]:
' '.join(dv[0])

In [None]:
df_train['evidences'][0]

In [None]:
max(list(sequence.word2index.values()))

In [None]:
# from tqdm import tqdm
# def reject(df, eviden):
#     reject = []
#     for row in tqdm(df.index):
#         ev = df.loc[row, 'evidences']
#         rej = []
#         samp = True
#         while samp:
#             sp = eviden.sample(len(ev))['evidence'].values
#             for val in sp:
#                 if val not in ev:
#                     rej.append(sp)
#                     samp = False
#         reject.append(rej[0])
#     return reject

# df_train_ref = df_train[['claim_text','claim_label']].copy()
# df_train_ref['evidences'] = reject(df_train, eviden)
# df_train_ref['split'] = 'train'
# df_train_ref['label'] = 'disengagement'

# df_train['evidences'] = [[evidence[ev] for ev in val] for val in df_train['evidences']]

# df = pd.concat([df_train, df_train_ref])[['claim_text', 'evidences', 'label', 'split']]
# df = df.explode('evidences')
# df