In [2]:
import pandas as pd
import json

# 1. Preprocess the data

In [39]:
# Load the claim sets and the evidence corpus.
# The encoding is pinned to UTF-8 so the files decode identically on every
# platform instead of depending on the locale's default codec.
def _read_json(path):
    """Return the parsed content of the UTF-8 encoded JSON file at `path`."""
    with open(path, 'r', encoding='utf-8') as f:
        return json.load(f)

# Read train claims (the outer JSON keys become the rows after the transpose)
df_train = pd.DataFrame(_read_json('../data/train-claims.json')).transpose()

# Read dev claims (loaded the same way as the train claims)
df_dev = pd.DataFrame(_read_json('../data/dev-claims.json')).transpose()

# Read evidence (kept as the parsed JSON object, looked up by evidence id)
evidence = _read_json('../data/evidence.json')

In [40]:
# Inspect the first training claim: print its first three columns, then the
# text of every evidence passage listed in the third column.
for col in range(3):
    print(df_train.iloc[0, col])
print()
# Loop over the cited ids rather than hard-coding three lookups, so the cell
# also works for a claim that cites fewer or more than three passages.
for ev_id in df_train.iloc[0, 2]:
    print(f'{ev_id}: {evidence[ev_id]}')

Not only is there no scientific evidence that CO2 is a pollutant, higher CO2 concentrations actually help ecosystems support more plant and animal life.
DISPUTED
['evidence-442946', 'evidence-1194317', 'evidence-12171']

evidence-442946: At very high concentrations (100 times atmospheric concentration, or greater), carbon dioxide can be toxic to animal life, so raising the concentration to 10,000 ppm (1%) or higher for several hours will eliminate pests such as whiteflies and spider mites in a greenhouse.
evidence-1194317: Plants can grow as much as 50 percent faster in concentrations of 1,000 ppm CO 2 when compared with ambient conditions, though this assumes no change in climate and no limitation on other nutrients.
evidence-12171: Higher carbon dioxide concentrations will favourably affect plant growth and demand for water.


In [72]:
import nltk
from nltk import tokenize
import numpy as np

In [86]:
# Function to transform text to numbers and inverse
def txt_encod(evidence: list, to_id: bool = False, to_txt: bool = False, vocabulary=None):
    """Convert a token sequence to vocabulary ids, or ids back to tokens.

    Parameters
    ----------
    evidence : sequence
        Tokens to encode (with ``to_id``) or integer ids to decode (with ``to_txt``).
    to_id : bool
        Map every token to its position in the vocabulary.
    to_txt : bool
        Map every id to the token stored at that position.
    vocabulary : list, optional
        Ordered token list. When omitted, the notebook-level ``vocab`` is used.

    Returns
    -------
    list
        The encoded ids or decoded tokens. When both flags are set, an error is
        printed and 0 is returned; when neither is set, an error is printed and
        None is returned.

    Raises
    ------
    ValueError
        With ``to_id``, when a token is not part of the vocabulary.
    """
    # Exactly one direction must be selected.
    if to_id and to_txt:
        print('Error: You have to pass only one true parameter')
        return 0
    if not (to_id or to_txt):
        print('Error: You have to pass to_id or to_txt parameter')
        return None

    # Resolve the global only once it is actually needed.
    if vocabulary is None:
        vocabulary = vocab

    # Transform to text
    if to_txt:
        return [vocabulary[idx] for idx in evidence]

    # Transform to id: build the lookup once instead of scanning the list for
    # every token. Filling it in reverse order means that, should a token
    # appear twice, its first position wins (same result as list.index).
    token_to_id = dict(zip(reversed(vocabulary), range(len(vocabulary) - 1, -1, -1)))
    ids = []
    for token in evidence:
        if token not in token_to_id:
            # Same exception type list.index raises for an unknown element.
            raise ValueError(f'{token!r} is not in list')
        ids.append(token_to_id[token])
    return ids

In [126]:
# Evidence passages, in corpus order (only the texts are needed here)
ev = list(evidence.values())
# Number of passages used to build the vocabulary and the encoded texts
N_EVIDENCE = 1000
# Add BOS ('[') and EOS (']') markers, then tokenize each passage
evidence_texts = [tokenize.word_tokenize('[' + text + ']') for text in ev[:N_EVIDENCE]]
# Get vocabulary list. It is sorted so that every token keeps the same id on
# every run: the iteration order of a set of strings is not stable between
# Python sessions, which made the ids irreproducible.
vocab = sorted({tok for text in evidence_texts for tok in text})
# Texts to numeric
num_txts = [np.array(txt_encod(text, to_id=True)) for text in evidence_texts]

# 2. Model the data

In [53]:
import torch
from torch.utils.data import Dataset, DataLoader
from torch import nn
# Run on the GPU when CUDA is available, otherwise fall back to the CPU
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

### Load the data into tensor batches

In [260]:
# Dataset class
class Dataset(torch.utils.data.Dataset):
    """Map-style dataset pairing numeric texts with their labels.

    The base class is referenced by its full path on purpose: this class reuses
    the name ``Dataset``, so inheriting from the bare name would make a re-run
    of this cell subclass the previous definition of itself instead of the
    torch base class.
    """

    def __init__(self, texts, labels):
        """Store the inputs.

        texts: indexable collection of numeric sequences.
        labels: indexable collection of integer labels; its length is the
            dataset size.
        """
        self.text = texts
        self.labels = labels

    def __len__(self):
        """Return the number of samples (length of the labels collection)."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return sample `idx` as a ``(texts, labels)`` pair of tensors.

        ``texts`` is a float32 tensor; ``labels`` is an int64 tensor reshaped
        to a single column (``[-1, 1]``).
        """
        # NOTE(review): the values are cast to float32; if they are token ids
        # meant for an nn.Embedding lookup, an integer dtype is required —
        # confirm against the consumer.
        texts = torch.tensor(self.text[idx], dtype=torch.float32)
        labels = torch.tensor(self.labels[idx], dtype=torch.int64).reshape(-1, 1)
        return texts, labels
    
# Collate function: turns a list of (texts, labels) samples into one batch
def collate_batch(batch):
    """Pad the texts of a batch to a common length and move them to `device`.

    batch: iterable of ``(texts, labels)`` pairs.
    Returns ``(padded_texts, labels)``: the texts are padded batch-first and
    placed on `device`; the labels are returned as the tuple produced by
    unzipping the batch.
    """
    # NOTE(review): labels are neither stacked nor moved to `device` here —
    # confirm that the training loop expects a tuple of per-sample tensors.
    seqs, targets = zip(*batch)
    padded = nn.utils.rnn.pad_sequence(seqs, batch_first=True)
    return padded.to(device), targets

# Instantiate the training dataset and its DataLoader
bs = 32                                                          # Batch size
tr_ds = Dataset(texts=num_txts, labels=range(len(num_txts)))    # Each label is the position of its text
tr_dl = DataLoader(dataset=tr_ds, batch_size=bs, collate_fn=collate_batch)

<center><h3> 2.1 Positional encoding to embed the data</h3></center>

<center><img src="../Images/pos_encoder.png" alt="Positional encoder diagram" width="300"></center>

<center>Details on:</center>
<center><a href="https://machinelearningmastery.com/a-gentle-introduction-to-positional-encoding-in-transformer-models-part-1/"><ph>A Gentle Introduction to Positional Encoding in Transformer Models</ph></a></center>

In [306]:
import math
# Positional embedding module
class positionalEmbeding(nn.Module):
    """Add fixed sinusoidal positional encodings to a sequence, then apply dropout.

    The encoding table is stored in the buffer ``pe`` with shape
    ``[max_len, 1, model_dim]``: even feature columns hold sines and odd
    feature columns hold cosines of the scaled position.
    """

    def __init__(self, model_dim, max_len = 5000,  drop = 0.2):
        """Build the encoding table.

        Inputs:
        model_dim: Size of the feature (last) dimension of the embeddings
        max_len: Max number of tokens in an input sentence
        drop: Dropout probability applied after the encoding is added
        """
        super().__init__()
        self.dropout = nn.Dropout(p=drop)                                                                   # Dropout layer

        # Positional embedding matrix
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)                                 # Increasing position vector [max_len, 1]
        div_term = torch.exp(torch.arange(0, model_dim, 2).float() * (-math.log(10000.0) / model_dim))      # Division term for the sin/cos functions
        pe = torch.zeros(max_len, model_dim)                                                                # Matrix of 0's [max_len, model_dim]
        pe[:, 0::2] = torch.sin(position * div_term)                                                        # Even columns (start 0, step 2)
        # With an odd model_dim there is one odd column fewer than even
        # columns, so div_term is trimmed to the number of odd columns.
        pe[:, 1::2] = torch.cos(position * div_term[:model_dim // 2])                                       # Odd columns (start 1, step 2)
        pe = pe.unsqueeze(0).transpose(0, 1)                                                                # [max_len, 1, model_dim]
        self.register_buffer('pe', pe)                                                                      # Adds pos encoder to the model state_dict

    def forward(self, x):
        """Return ``dropout(x + pe[:seq_len])``.

        Input:
        x: Embedded sequence of shape [seq_len, batch, model_dim]; the table is
           sliced along dimension 0 and broadcast over dimension 1.

        Raises:
        ValueError: if the sequence is longer than the encoding table.
        """
        # NOTE(review): the notebook's collate function pads batch-first;
        # confirm inputs are transposed to sequence-first before this module.
        seq_len = x.size(0)
        if seq_len > self.pe.size(0):
            raise ValueError(
                f'Sequence length {seq_len} exceeds max_len {self.pe.size(0)}'
            )
        x = x + self.pe[:seq_len, :]    # Add the positional encoding
        return self.dropout(x)          # Apply dropout

<center><h3> 2.2 Transformer model (Passage Ranking)</h3></center>
<center>Source papers:</center>
<center><a href="https://arxiv.org/pdf/1706.03762"><ph>Attention Is All You Need</ph></a></center>
<center><a href="https://arxiv.org/pdf/2201.10005"><ph>Text and Code Embeddings by Contrastive Pre-Training</ph></a></center>

In [None]:
class passageRanking(nn.Module):
    """Passage-ranking model; at this stage it only holds one Transformer encoder layer."""

    def __init__(self, d_model=44, nhead=44):
        """Create the encoder layer.

        d_model: Feature size expected by the encoder layer
        nhead: Number of attention heads (d_model must be divisible by it)
        """
        # nn.Module.__init__ has to run before any sub-module is assigned;
        # without it the attribute assignment below raises an error.
        super().__init__()
        self.positionalEncoding = nn.TransformerEncoderLayer(d_model=d_model, nhead=nhead)

