In [1]:
import pandas as pd
import json

# 1. Preprocess the data

In [2]:
# Read train claims
# NOTE: the encoding is given explicitly so the files are decoded the same way on
# every platform instead of depending on the locale's default encoding.
with open('../data/train-claims.json', 'r', encoding='utf-8') as f:
    # The JSON maps claim id -> claim record; transpose so each claim is a row
    df_train = pd.DataFrame(json.load(f)).transpose()

# Read dev claims
with open('../data/dev-claims.json', 'r', encoding='utf-8') as f:
    df_dev = pd.DataFrame(json.load(f)).transpose()

# Read evidence (kept as the raw object returned by json.load; indexed by evidence id below)
with open('../data/evidence.json', 'r', encoding='utf-8') as f:
    evidence = json.load(f)

In [3]:
# Show the first three columns of the first training claim
for col in range(3):
    print(df_train.iloc[0, col])
print()
# Show the first three evidence passages referenced by that claim (third column holds the ids)
for k in range(3):
    ev_id = df_train.iloc[0, 2][k]
    print(f'{ev_id}: {evidence[ev_id]}')

Not only is there no scientific evidence that CO2 is a pollutant, higher CO2 concentrations actually help ecosystems support more plant and animal life.
DISPUTED
['evidence-442946', 'evidence-1194317', 'evidence-12171']

evidence-442946: At very high concentrations (100 times atmospheric concentration, or greater), carbon dioxide can be toxic to animal life, so raising the concentration to 10,000 ppm (1%) or higher for several hours will eliminate pests such as whiteflies and spider mites in a greenhouse.
evidence-1194317: Plants can grow as much as 50 percent faster in concentrations of 1,000 ppm CO 2 when compared with ambient conditions, though this assumes no change in climate and no limitation on other nutrients.
evidence-12171: Higher carbon dioxide concentrations will favourably affect plant growth and demand for water.


In [4]:
import nltk
from nltk import tokenize
import numpy as np

In [5]:
# Function to transform text to numbers and inverse
def txt_encod(evidence: list, to_id=False, to_txt=False, vocabulary=None):
    """Convert a token sequence to vocabulary ids, or ids back to tokens.

    Parameters:
        evidence: Sequence of tokens (when ``to_id``) or of integer ids (when ``to_txt``).
        to_id: If True, map every token to its position in the vocabulary.
        to_txt: If True, map every id back to the token stored at that position.
        vocabulary: Optional list of tokens to use. Defaults to the notebook-level
            ``vocab`` list, looked up at call time.

    Returns:
        A list with one id (``to_id``) or one token (``to_txt``) per input element.

    Raises:
        ValueError: If both or neither of ``to_id`` / ``to_txt`` are set, or if a
            token is not present in the vocabulary.
    """
    # Exactly one direction must be requested; fail loudly instead of printing
    # and returning a placeholder that would silently corrupt downstream arrays.
    if to_id and to_txt:
        raise ValueError('Error: You have to pass only one true parameter')
    if not (to_id or to_txt):
        raise ValueError('Error: You have to pass to_id or to_txt parameter')

    if vocabulary is None:
        vocabulary = vocab

    # Transform to text
    if to_txt:
        return [vocabulary[token] for token in evidence]

    # Transform to id: build the lookup table once (O(|vocabulary|)) instead of
    # scanning the list with .index() for every token.
    token_to_id = {}
    for idx, token in enumerate(vocabulary):
        token_to_id.setdefault(token, idx)      # first occurrence wins, like list.index
    try:
        return [token_to_id[token] for token in evidence]
    except KeyError as err:
        # Keep the exception type list.index() raised for unknown tokens
        raise ValueError(f'{err.args[0]!r} is not in list') from None

In [6]:
# Evidence claims
N_EVIDENCE = 1000       # Number of evidence passages used to build the vocabulary
PAD_TOKEN = '<pad>'     # Reserved token for id 0, the value pad_sequence fills with by default
ev = list(evidence.values())
# Add BOS and EOS
evidence_texts = ['[' + text + ']' for text in ev[:N_EVIDENCE]]
# Tokenize text
evidence_texts = [tokenize.word_tokenize(text) for text in evidence_texts]
# Get Vocabulary list
# Sorted so token ids are identical across kernel restarts (set iteration order is not),
# and with the padding token at index 0 so padded positions never alias a real token.
unique_tokens = {tok for text in evidence_texts for tok in text}
vocab = [PAD_TOKEN] + sorted(unique_tokens - {PAD_TOKEN})
# Texts to numeric
num_txts = [np.array(txt_encod(text, True)) for text in evidence_texts]

# 2. Model the data

In [7]:
import torch
from torch.utils.data import Dataset, DataLoader
from torch import nn
# Run on the GPU when CUDA is available, otherwise fall back to the CPU
device_name = 'cuda' if torch.cuda.is_available() else 'cpu'
device = torch.device(device_name)

### Load the data to tensor batches

In [101]:
# Dataset class
class Dataset(torch.utils.data.Dataset):
    """Map-style dataset pairing each encoded text with its label.

    The base class is referenced through its fully qualified name because this
    class reuses the name ``Dataset``: with ``class Dataset(Dataset)`` re-running
    the cell would make the class inherit from its own previous definition.

    Parameters:
        texts: Indexable collection of token-id sequences.
        labels: Indexable collection with one label per text.

    Raises:
        ValueError: If ``texts`` and ``labels`` have different lengths.
    """
    def __init__(self, texts, labels):
        if len(texts) != len(labels):
            raise ValueError(
                f'texts and labels must have the same length, got {len(texts)} and {len(labels)}'
            )
        self.text = texts
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        # Returns (token ids as a tensor, label reshaped to [n, 1])
        texts = torch.tensor(self.text[idx])
        labels = torch.tensor(self.labels[idx]).reshape(-1,1)
        return texts, labels
    
# Define collate (pre_process) function
def collate_batch(batch):
    """Collate a list of (text, label) samples into one padded batch.

    The variable-length text tensors are padded to the longest one in the batch
    (batch dimension first) and moved to ``device``. The labels are returned
    as the tuple of per-sample label tensors, untouched.
    """
    token_seqs, labels = zip(*batch)
    padded = nn.utils.rnn.pad_sequence(token_seqs, batch_first=True)
    return padded.to(device), labels

# Instantiate the training dataset and its DataLoader
bs = 32     # Batch size
# Each text is labelled with its own position in num_txts
tr_ds = Dataset(num_txts, range(len(num_txts)))
tr_dl = DataLoader(dataset=tr_ds, batch_size=bs, collate_fn=collate_batch)

<center><h3> 2.1 Positional encoding to embed the data</h3></center>

<center><img src=../Images/pos_encoder.png alt="drawing" width="300"></center>

<center>Details on:</center>
<center><a href="https://machinelearningmastery.com/a-gentle-introduction-to-positional-encoding-in-transformer-models-part-1/"><ph>A Gentle Introduction to Positional Encoding in Transformer Models</ph></a></center>

In [135]:
import math
# Maximum number of tokens in a sentence
max_len_ = max(len(seq) for seq in num_txts)
# Positional embedding function
class positionalEmbeding(nn.Module):
    """Sinusoidal positional encoding for batch-first inputs.

    Builds a fixed (non-trainable) table of sin/cos values, one row per position,
    and adds the first ``text_length`` rows to every sequence in the batch before
    applying dropout.

    Parameters:
        embedding_dim: Length of the input embedding (odd values are supported).
        drop: Dropout probability applied after adding the positions.
        max_len: Maximum number of tokens in an input sentence. When omitted, the
            notebook-level ``max_len_`` is read at instantiation time.
    """
    def __init__(self, embedding_dim, drop = 0.2, max_len = None):
        super(positionalEmbeding, self).__init__()
        if max_len is None:
            # Resolved here rather than in the signature so the value is not
            # frozen at the moment the class definition cell was executed.
            max_len = max_len_
        self.dropout = nn.Dropout(p=drop)                                       # Dropout layer

        # Positional embedding matrix
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)     # Increasing positions [max_len, 1]
        div_term = torch.exp(                                                   # Division term for the sin/cos functions
            torch.arange(0, embedding_dim, 2).float() * (-math.log(10000.0) / embedding_dim)
        )
        pe = torch.zeros(max_len, embedding_dim)                                # Matrix of 0's [max_len, embedding_dim]
        pe[:, 0::2] = torch.sin(position * div_term)                            # Even feature indices
        # With an odd embedding_dim there is one fewer odd index than even index
        pe[:, 1::2] = torch.cos(position * div_term[: embedding_dim // 2])      # Odd feature indices
        pe = pe.unsqueeze(1)                                                    # [max_len, 1, embedding_dim]
        self.register_buffer('pe', pe)                                          # Adds pos encoder to the model state_dict

    def forward(self, x):
        """Add positional information to ``x`` and apply dropout.

        Parameters:
            x: Embedding tensor [batch_size, text_length, embedding_dim].

        Returns:
            Tensor with the same shape as ``x``.

        Raises:
            ValueError: If ``text_length`` exceeds the ``max_len`` of the table.
        """
        seq_len = x.size(1)
        if seq_len > self.pe.size(0):
            raise ValueError(
                f'Sequence length {seq_len} exceeds the positional table size {self.pe.size(0)}'
            )
        # Slice along the *sequence* axis and move it to dim 1 -> [1, text_length, embedding_dim],
        # so the same positions are broadcast to every sample in the batch.
        x = x + self.pe[:seq_len].transpose(0, 1)
        return self.dropout(x)

<center><h3> 2.2 Multihead attention</h3></center>
<center><img src=../Images/attention.png alt="drawing" width="600"></center>

<center>Details on:</center>
<center><a href="https://towardsdatascience.com/build-your-own-transformer-from-scratch-using-pytorch-84c850470dcb"><ph>Build your own Transformer from scratch using Pytorch</ph></a></center>

In [178]:
class MultiHeadAttention(nn.Module):
    """Multi-head scaled dot-product self-attention for batch-first inputs.

    Parameters:
        embedding_dim: Size of the input (and output) embedding.
        num_heads: Number of attention heads; must divide ``embedding_dim``.
    """
    def __init__(self, embedding_dim, num_heads):
        super(MultiHeadAttention, self).__init__()

        assert embedding_dim % num_heads == 0, "in_size must be divisible by num_heads"

        self.embedding_dim = embedding_dim                      # Embedding input size
        self.num_heads = num_heads                              # Number of attention heads
        self.head_dim = embedding_dim // num_heads              # Embedding features handled by each head

        # Instantiate weights
        self.W_q = nn.Linear(embedding_dim, embedding_dim)      # Query weights
        self.W_k = nn.Linear(embedding_dim, embedding_dim)      # Key weights
        self.W_v = nn.Linear(embedding_dim, embedding_dim)      # Values weights
        self.linear = nn.Linear(embedding_dim, embedding_dim)   # Output projection

    # scaled_dot_product_attention
    def dot_prd_attn(self, Q, K, V, mask=None):
        """Return softmax(Q K^T / sqrt(head_dim)) V.

        ``mask`` (optional) must broadcast against the score tensor
        [batch, heads, seq, seq]; positions where it equals 0 are excluded.
        """
        attn_scores = torch.matmul(Q, K.transpose(-2, -1)) / math.sqrt(self.head_dim)   # MatMult (Q*K)
        if mask is not None:
            attn_scores = attn_scores.masked_fill(mask == 0, -1e9)                      # Masking (Optional)
        attn_probs = torch.softmax(attn_scores, dim=-1)                                 # Softmax
        return torch.matmul(attn_probs, V)                                              # MatMult (Probs*V)

    # Function to split attention heads
    def split_heads(self, x):
        """[batch, seq, embedding_dim] -> [batch, num_heads, seq, head_dim]."""
        batch_size, seq_length, _ = x.size()
        # Split the feature axis into (heads, head_dim) first, THEN move the head axis
        # forward. A direct view to [batch, heads, seq, head_dim] would reinterpret
        # the memory and mix features of different tokens.
        return x.reshape(batch_size, seq_length, self.num_heads, self.head_dim).transpose(1, 2)

    # Function to join attention heads
    def combine_heads(self, x):
        """[batch, num_heads, seq, head_dim] -> [batch, seq, embedding_dim] (inverse of split_heads)."""
        batch_size, _, seq_length, _ = x.size()
        # Bring the head axis back next to head_dim before flattening
        return x.transpose(1, 2).contiguous().view(batch_size, seq_length, self.embedding_dim)

    def forward(self, x, mask = None):
        """Self-attend over ``x`` [batch, seq, embedding_dim] and return a tensor of the same shape."""
        # Linear projections + split heads
        Q = self.split_heads(self.W_q(x))
        K = self.split_heads(self.W_k(x))
        V = self.split_heads(self.W_v(x))
        # Multihead attention
        attn = self.dot_prd_attn(Q, K, V, mask)                 # scaled_dot_product_attention
        attn = self.combine_heads(attn)                         # Concat heads
        attn = self.linear(attn)                                # Linear pass
        return attn

<center><h3> 2.5 Transformer model (Passage Ranking)</h3></center>
<center>Source papers:</center>
<center><a href="https://arxiv.org/pdf/1706.03762"><ph>Attention Is All You Need</ph></a></center>
<center><a href="https://arxiv.org/pdf/2201.10005"><ph>Text and Code Embeddings by Contrastive Pre-Training</ph></a></center>

<center>Encoder:</center>
<center><img src=../Images/encoder.png alt="drawing" width="300"></center>

In [209]:
# Encoder class: embedding + positional encoding + one self-attention / feed-forward block
class Encoder(nn.Module):
    """Single-layer Transformer-style encoder over token ids.

    Pipeline: token embedding (scaled by sqrt(embedding_dim)) -> positional
    embedding -> multi-head self-attention -> add & normalize -> feed forward
    -> add & normalize.

    Parameters:
        vocab_size: Size of vocabulary.
        embedding_dim: Embedding dimension.
        n_head: Number of heads in the multihead attention model.
        hidden_dim: Hidden dimension of the feed forward pass.
        dropout: Dropout probability used by the positional embedding and both residual branches.
    """
    def __init__(self, 
                vocab_size,                            # Size of vocabulary
                embedding_dim,                         # Embedding dimension
                n_head,                                # Number of heads in the multihead attention model
                hidden_dim,                            # Hidden dims for the feed forward pass
                dropout = 0.5):
        
        super(Encoder, self).__init__()
        self.embedding_dim = embedding_dim

        self.encoder = nn.Embedding(vocab_size, embedding_dim)                  # Embedding layer
        self.pos_encoder = positionalEmbeding(embedding_dim, dropout)           # Positional embedding
        self.multihead = MultiHeadAttention(embedding_dim, n_head)              # Multihead attention layer
        # NOTE(review): the same LayerNorm instance (one set of weights) is used for both
        # add & normalize steps; kept as-is so the module's parameters are unchanged.
        self.normalization = nn.LayerNorm(embedding_dim)                        # Normalization layer
        self.dropout = nn.Dropout(dropout)

        # Feed forward pass
        self.feed_forward = nn.Sequential()
        self.feed_forward.add_module('fc1', nn.Linear(embedding_dim, hidden_dim))
        self.feed_forward.add_module('relu', nn.ReLU())
        self.feed_forward.add_module('fc2', nn.Linear(hidden_dim, embedding_dim))

    def forward(self, text):
        """Encode a batch of token ids.

        Parameters:
            text: Integer tensor of token ids [batch_size, text_length].

        Returns:
            Float tensor [batch_size, text_length, embedding_dim].
        """
        encoder = self.encoder(text) * math.sqrt(self.embedding_dim)            # Embed input text [batch_size, text_length, embedding_dim]
        pos_enc = self.pos_encoder(encoder)                                     # Add positions    [batch_size, text_length, embedding_dim]
        attn = self.multihead(pos_enc)                                          # Multihead self-attention
        # Residual connection: add the attention *input* (the embedded text), not the
        # raw integer token ids, to the attention output.
        normal = self.normalization(pos_enc + self.dropout(attn))               # Add & Normalize pass #1
        forward = self.feed_forward(normal)                                     # Feed Forward pass
        encoded = self.normalization(normal + self.dropout(forward))            # Add & Normalize pass #2
        return encoded

In [210]:
# Encoder hyper-parameters
vocab_size = len(vocab)     # One embedding row per vocabulary entry
embedding_dim = 300
n_head = 2
dropout = 0.5
hidden_dim = 2048
# Build the encoder and move it to the selected device
model = Encoder(
    vocab_size=vocab_size,
    embedding_dim=embedding_dim,
    n_head=n_head,
    hidden_dim=hidden_dim,
    dropout=dropout,
).to(device)

In [211]:
# Encode the padded texts of the first training batch and check the output shape
first_batch_texts = next(iter(tr_dl))[0]
enc = model(first_batch_texts)
enc.size()

torch.Size([32, 44, 300])

In [212]:
# Display the tensor currently bound to `enc`
enc

tensor([[[-3.9594e-01, -2.4603e-01,  7.7858e-02,  ..., -1.0842e+00,
           1.8080e+00,  2.2548e-01],
         [ 2.8391e-01,  9.3860e-01, -8.9123e-02,  ..., -8.7096e-01,
           4.7969e-02, -7.5383e-01],
         [ 7.4855e-02, -5.8413e-01, -1.3787e+00,  ..., -6.4854e-01,
          -5.2938e-01, -2.9025e-02],
         ...,
         [-2.6756e+00,  8.2335e-01,  1.9251e+00,  ..., -6.6846e-01,
           2.4282e+00, -9.7128e-01],
         [-2.9915e+00,  1.0594e+00,  5.9534e-01,  ...,  2.8972e+00,
           1.3887e-01,  6.4214e-02],
         [-1.5426e+00,  1.6817e+00,  1.9284e+00,  ..., -4.2212e-01,
          -5.3812e-02, -8.2272e-01]],

        [[ 7.6182e-01, -4.7167e-01,  1.1669e+00,  ..., -1.3793e+00,
          -6.3860e-02, -3.8378e-01],
         [ 7.1473e-01, -1.2070e+00,  2.3097e-01,  ..., -6.4373e-02,
          -6.4373e-02,  4.0369e-01],
         [-2.7365e-01, -1.7996e+00, -3.7576e-02,  ..., -1.1863e+00,
          -1.3729e+00,  1.3109e+00],
         ...,
         [-1.1603e-01,  5