# Baseline Model

For the baseline we have build a simple transformer that just has as an input context+question and tries to predict an answer

In [1]:
import pandas as pd
import numpy as np
import json

Helper class to covert json to dataframe for easier batch processing

In [2]:
class Squad:
    def __init__(self, input_location):
        self.location = input_location
        file = open(input_location)
        json_file = json.load(file)
        # Save version and data
        self.version = json_file['version']
        self.data = json_file['data']
        
        df_builder = [] # We will store every row of dataframe here
        for sample in self.data:
            title = sample['title'] # Get title
            paragraphs = sample['paragraphs']
            
            for paragraph in paragraphs:
                context = paragraph['context'] # Get context, e.g. a paragraph
                questions = paragraph['qas']
                
                for question in questions:
                    q_id = question['id'] # Question id
                    q_content = question['question'] # Question itself
                    answers = question['answers'] # Possible answers
                    is_impossible = question['is_impossible'] # If it is possible to answer
                    
                    # Build a row of dataframe
                    qas = {
                        'id':q_id,
                        'wiki_title':title,
                        'context':context,
                        'content':q_content,
                        'is_impossible':is_impossible
                    }
                    if is_impossible:
                        qas['answer'] = ""
                        qas['answer_start'] = -1
                    else:
                        answer = answers[0]
                        qas['answer'] = answer['text']
                        qas['answer_start'] = answer['answer_start']
                    df_builder.append(qas) 
        self.df = pd.DataFrame(df_builder)

In [3]:
train_sq = Squad('./data/train-v2.0.json')
test_sq = Squad('./data/dev-v2.0.json')
train_df  = train_sq.df
test_df  = test_sq.df

In [4]:
import torch
from torch import nn
from torch.utils.tensorboard import SummaryWriter
writer = SummaryWriter()

In [9]:
from torchtext import *
from torchtext.data import *

# Taken from here for easier work with dataframe and torchtext
# https://gist.github.com/notnami/3c4d636f2b79e206b26acfe349f2657a
class DataFrameExampleSet:
    def __init__(self, df, fields):
        self._df = df
        self._fields = fields
        self._fields_dict = {field_name: (field_name, field)
                             for field_name, field in fields.items()
                             if field is not None}

    def __iter__(self):
        for item in tqdm(self._df.itertuples(), total=len(self)):
            example = Example.fromdict(item._asdict(), fields=self._fields_dict)
            yield example

    def __len__(self):
        return len(self._df)

    def shuffle(self, random_state=None):
        self._df = self._df.sample(frac=1.0, random_state=random_state)


class DataFrameDataset(Dataset):
    def __init__(self, df, fields, filter_pred=None):
        examples = DataFrameExampleSet(df, fields)
        super().__init__(examples, fields, filter_pred=filter_pred)


class DataFrameBucketIterator(BucketIterator):
    def data(self):
        if isinstance(self.dataset.examples, DataFrameExampleSet):
            if self.shuffle:
                self.dataset.examples.shuffle()
            dataset = self.dataset
        else:
            dataset = super().data()
        return dataset

In [10]:
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

In [11]:
# Concatenate content and context
train_df['content_q'] = train_df.context +' '+ train_df.content
test_df['content_q'] = test_df.context +' '+ test_df.content

In [12]:
import torchtext
from typing import *
from torchtext.data import *
from tqdm.notebook import tqdm
from torchtext.data.utils import get_tokenizer
import dill
load = False
TRG_LEN=15

if load:
    with open("model/CONTEXT_Q.Field","rb") as f:
        CONTEXT_Q=dill.load(f)
    with open("model/TRG.Field","rb") as f:
        TRG=dill.load(f)
else:
    # Init Fields 
    
    # Here will be context and question
    CONTEXT_Q = torchtext.data.Field(tokenize = get_tokenizer("basic_english"),
                          init_token = '<sos>',
                          eos_token = '<eos>',
                          lower = False,
                          batch_first = False)
    # here the target 
    TRG = torchtext.data.Field(tokenize = get_tokenizer("basic_english"), 
                         init_token = '<sos>',
                         eos_token = '<eos>',
                         lower = False,
                         batch_first = False,
                         fix_length=TRG_LEN)
    
# Will store id to later check correctness
ID = torchtext.data.Field(is_target=True, sequential=False)

In [13]:
train_dataset = DataFrameDataset(train_df, fields={'content_q':CONTEXT_Q,'answer':TRG, 'id':ID})
test_dataset = DataFrameDataset(test_df, fields={'content_q':CONTEXT_Q,'answer':TRG, 'id':ID})

In [14]:
if load:
    pass
else:
    # Build vocabulary from our data, target will have the same vocab as context + questions
    CONTEXT_Q.build_vocab(train_dataset, min_freq=100)
    TRG.build_vocab([''], min_freq=10)
    TRG.vocab = CONTEXT_Q.vocab
    
    
    with open("model/CONTEXT_Q.Field","wb+")as f:
        dill.dump(CONTEXT_Q,f)
    with open("model/TRG.Field","wb+")as f:
        dill.dump(TRG,f)
ID.build_vocab(list(train_df.id)+ list(test_df.id))

HBox(children=(FloatProgress(value=0.0, max=130319.0), HTML(value='')))




In [15]:
batch_size = 64
# Create iterators
train_iterator, test_iterator = DataFrameBucketIterator.splits((train_dataset, test_dataset), 
                                    batch_size = batch_size,
                                    device = device)

In [44]:
from torch import nn


class Encoder(nn.Module):
    def __init__(self, src_dim, embedding_dim, hidden_dim, lstm_layers, dropout=0.5):
        super(Encoder, self).__init__()
        self.embedding = nn.Embedding(src_dim, embedding_dim)
        self.drop = nn.Dropout(dropout)

        self.lstm = nn.LSTM(embedding_dim, hidden_dim,
                            lstm_layers, dropout=dropout)

        self.src_dim = src_dim
        self.emb_dim = embedding_dim
        self.hid_dim = hidden_dim
        self.lstm_layers = lstm_layers

    def forward(self, inp):
        # Get's input of size [seq_len, batch_size]
        # Then embed it and run LSTM (pytorches lstm cell input in RNN from sequence in dim=0)
        
        embed = self.drop(self.embedding(inp)) # [seq_len, batch_size, embedding_dim]
        
        # Runs lstm layer and the following outputs:
        # output - outputs fromm all ltsm cells [seq_len, batch_size, hidden_dim]
        # hidden, cell - state of the ltsm cell both [lstm_num_layers, batch_size, hidden_dim]
        output, (hidden, cell) = self.lstm(embed)
        return output, hidden, cell

In [45]:
class Decoder(nn.Module):
    def __init__(self, tgt_dim, embedding_dim, hidden_dim, lstm_layers, tgt_len, dropout=0.5):
        super(Decoder, self).__init__()

        self.dropout = nn.Dropout(dropout)
        self.embedding = nn.Embedding(tgt_dim, embedding_dim)
        
        if lstm_layers==1:
            dropout=0
        self.lstm = nn.LSTM(embedding_dim, hidden_dim,
                            lstm_layers, dropout=dropout)
        self.out = nn.Linear(hidden_dim, tgt_dim)
        self.softmax = nn.LogSoftmax(dim=1)

        self.tgt_dim = tgt_dim
        self.emb_dim = embedding_dim
        self.hid_dim = hidden_dim
        self.lstm_layers = lstm_layers
        self.tgt_len = tgt_len

    def forward(self, inp, hidden, cell):
        # Receives
        # inp - [seq_len, batch_size] (seq_len will be 1 in our example as we pass to decoder in iterations)
        # hidden, cell - state from previous decoder/encoder runs [num_lstm_layer, hidden, dim]
        # outputs - outputs from the encoder [seq_len, batch_size, hidden_dim]
        
        embed = self.dropout(self.embedding(inp)) # [1, batch_size, embedding_dim]
        
        # Run lstm
        # returns output - output of the lstm cell [1, batch_size, hidden_dim] 
        # hidden, cell - state of the lstm cell both [lstm_num_layers, batch_size, hidden_dim]
        output, (hidden, cell) = self.lstm(embed, (hidden, cell))
        output = self.softmax(self.out(output.squeeze(0))) # [batch_size, tgt_vocab]
        return output, hidden, cell

In [55]:
class Seq2Seq(nn.Module):
    """Seq2Seq module that include Encoder and Decoder and hides logic behind their interaction"""
    def __init__(self, tgt_vocab_dim, src_vocab_dim, embedding_dim, hidden_dim,
                 lstm_layers, tgt_max_len, device='cpu', dropout=0.5):
        super(Seq2Seq, self).__init__()

        self.encoder = Encoder(
            src_vocab_dim, embedding_dim, hidden_dim, lstm_layers,dropout=dropout)
        self.decoder = Decoder(tgt_vocab_dim, embedding_dim, hidden_dim,
                                   lstm_layers, tgt_max_len, dropout=dropout)

    def forward(self, src, tgt=None, teacher_forcing=0.5):
        # Inputs: 
        # src and tgt - source and target encoded sequence both [seq_len, batch_size]
        # target maybe optional (if we are not doing teacher forcing or training)
        
        # Encoder first
        enc_outs, hidden, cell = self.encoder(src)
        # output - outputs fromm all ltsm cells [seq_len, batch_size, hidden_dim]
        # hidden - last output, cell - state of the ltsm cell both [lstm_num_layers, batch_size, hidden_dim]
        
        batch_size = src.shape[1]
        
        # We will store results from decoder here
        # [seq_len-1, batch_size, tgt_dim]; -1 as we don't output <sos>
        outputs = None

        # Init batch size of <sos> tokens [1, batch_size]
        inp = torch.LongTensor([2]*batch_size).unsqueeze(0).to(device)
        
        # As we need to feed LSTM's own output to itself we need to do it in loop
        # We will go over the decoder length (excluding <eos>) and generate translation
        for i in range(1, self.decoder.tgt_len):
            # If we use translation we also pass encoder outputs
            output, hidden, cell = self.decoder(inp, hidden, cell)
            
            # Store output
            if outputs is None:
                outputs = output.unsqueeze(0) #[batch_size, tgt_vocab] -> # [1, batch_size, tgt_vocab]
            else:
                outputs = torch.cat((outputs, output.unsqueeze(0)), dim=0) # [i, batch_size, tgt_dim]

            # Take ids with highest probs (remember they are logsoftmaxed) as the next input
            inp = output.argmax(1) # [batch_size, tgt_vocab] -> [batch_size]

            if random.random() < teacher_forcing: # Try teaching forcing
                inp = tgt[i, :] # If succeeds - use target word as an input [batch_size]
            inp = inp.unsqueeze(0) #[batch_size] -> [1, batch_size]

        return outputs #[seq_len, batch_size, tgt_dim]

In [56]:
context_q_vocab = len(CONTEXT_Q.vocab)
target_vocab = len(TRG.vocab)
emb_size = 256
hidden_dim = 512
dropout=0.2

model = Seq2Seq(context_q_vocab, target_vocab, emb_size, hidden_dim,1,TRG_LEN).to(device)

In [57]:
def count_trainable_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)


print(f'Model has a total of {count_trainable_parameters(model):,} of trainable parameters')

Model has a total of 15,263,270 of trainable parameters


In [58]:
optimizer = torch.optim.Adam(model.parameters(),lr=2e-4)
loss_func = nn.CrossEntropyLoss(ignore_index=1)

In [62]:
def train(model, iterator, optimizer, loss_func):
    """
    Runs training loop for whole dataset in iterator
    
    model - model to be trained
    iterator - data loader from which we take source and target
    optimizer - our optimizer
    loss_func - function which will compute loss
    return average loss
    """
    model.train() # Switch to train
    epoch_loss = [] # We will calculate cumulative loss
    
    for i, batch in enumerate(iterator):
        optimizer.zero_grad()
        
        context = batch.content_q
        tgt = batch.answer
        
        output = model(context, tgt)
        tgt = tgt[1:].reshape(-1)
        output = output.view(-1, output.shape[-1]) 
        
        loss = loss_func(output, tgt)
        writer.add_scalar(f'Loss/train Epoch {epoch}', loss, i)
        
        epoch_loss.append(loss.item())
        
        loss.backward()
        optimizer.step()
    return epoch_loss

In [65]:
def evaluate(model, iterator, loss_func):
    """
    Runs an evaluation loop and returns average loss
    
    model - model to be evaluated
    iterator - data loader with validation set
    loss_func - function which will compute loss
    returns average loss
    """
    model.eval() # Switch to eval
    epoch_loss = 0 # We will calculate cumulative loss
    
    with torch.no_grad():
        to_return = []
        
        for i, batch in enumerate(iterator):
            optimizer.zero_grad()

            context = batch.content_q
            tgt = batch.answer
            
            output = model(context, tgt, teacher_forcing=0)
            
            softmaxed = nn.functional.softmax(output, dim=2)
            # store ids of batch and result
            to_return.append((softmaxed.topk(1)[1].squeeze().cpu().detach().numpy(), batch.id)) 
            
            tgt = tgt[1:].reshape(-1)
            output = output.view(-1, output.shape[-1]) 
            
            loss = loss_func(output, tgt)

            epoch_loss += loss.item()

            optimizer.step()
    return epoch_loss / len(iterator), to_return

## Training

In [66]:
import torch.nn.functional as F
import numpy.random as random 

best_loss = float('inf')
epochs = 5
for epoch in range(epochs):
    train_loss = train(model, train_iterator, optimizer, loss_func)
    
    eval_loss, preds = evaluate(model, test_iterator, loss_func)
    
    # save "best" model
    if best_loss > eval_loss:
        best_loss = eval_loss
        torch.save(model.state_dict(), 'baseline.model')
    print(f"Epoch {epoch}. Train loss: {np.mean(train_loss)}. Eval loss: {eval_loss}")

HBox(children=(FloatProgress(value=0.0, max=130319.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=11873.0), HTML(value='')))


Epoch 0. Train loss: 4.917997114086011. Eval loss: 4.573740874567339


HBox(children=(FloatProgress(value=0.0, max=130319.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=11873.0), HTML(value='')))


Epoch 1. Train loss: 4.854379529278829. Eval loss: 4.55540370684798


HBox(children=(FloatProgress(value=0.0, max=130319.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=11873.0), HTML(value='')))


Epoch 2. Train loss: 4.806496285549495. Eval loss: 4.586937814630488


HBox(children=(FloatProgress(value=0.0, max=130319.0), HTML(value='')))

KeyboardInterrupt: 

In [32]:
eval_loss, preds = evaluate(model, test_iterator, loss_func)


HBox(children=(FloatProgress(value=0.0, max=11873.0), HTML(value='')))




## Evaluation

In [67]:
# flatten output
predictions = []
labels = []
for i in preds:
    for seq, tgt in zip(i[0].T, i[1]):
        if isinstance(seq, np.ndarray):
            predictions.append(seq)
        else:
            predictions.append(np.array(seq))
        labels.append(tgt.T)

In [68]:
def to_string(predictions):
    # Converting to string our predictions
    to_return = []
    for pred in predictions:
        result = []
        for word in pred:
            if word == 3: # If eos token - break
                break
            if word != 2: # If sos token - skip
                result.append(TRG.vocab.itos[word])
        to_return.append(' '.join(result))
    return to_return

In [69]:
def check_correctness(predictions, labels, df):
    # Transform our predictions and check their correctness
    tokenizer = get_tokenizer("basic_english")
    answers = {}
    for i, row in df.iterrows():
        answers[row.id] = ' '.join(tokenizer(row.answer))
    
    correct = {}
    my_preds = {}
    for pred, tgt in zip(predictions, labels):
        correct[ID.vocab.itos[tgt]] = answers[ID.vocab.itos[tgt]] == pred
        my_preds[ID.vocab.itos[tgt]] = pred
    correct_preds = answers
    return correct, correct_preds, my_preds

In [70]:
predictions_str = to_string(predictions)
correct, correct_preds, my_preds = check_correctness(predictions_str, labels, test_df)

In [71]:
# For the more representetive results we have taken script that squad owner's have written to check predictions

dataset = test_sq.data
preds = my_preds
na_probs = {k: 0.0 for k in preds}

qid_to_has_ans = make_qid_to_has_ans(dataset) 
has_ans_qids = [k for k, v in qid_to_has_ans.items() if v]
no_ans_qids = [k for k, v in qid_to_has_ans.items() if not v]
exact_raw, f1_raw = get_raw_scores(dataset, preds)
exact_thresh = apply_no_ans_threshold(exact_raw, na_probs, qid_to_has_ans,
                                      1.0)
f1_thresh = apply_no_ans_threshold(f1_raw, na_probs, qid_to_has_ans,
                                   1.0)
out_eval = make_eval_dict(exact_thresh, f1_thresh)
if has_ans_qids:
    has_ans_eval = make_eval_dict(exact_thresh, f1_thresh, qid_list=has_ans_qids)
    merge_eval(out_eval, has_ans_eval, 'HasAns')
if no_ans_qids:
    no_ans_eval = make_eval_dict(exact_thresh, f1_thresh, qid_list=no_ans_qids)
    merge_eval(out_eval, no_ans_eval, 'NoAns')
print(json.dumps(out_eval, indent=2))

{
  "exact": 50.07159100480081,
  "f1": 50.07159100480081,
  "total": 11873,
  "HasAns_exact": 0.0,
  "HasAns_f1": 0.0,
  "HasAns_total": 5928,
  "NoAns_exact": 100.0,
  "NoAns_f1": 100.0,
  "NoAns_total": 5945
}


In [72]:
my_preds

{'56de148dcffd8e1900b4b5bc': '',
 '5ad3ea79604f3c001a3ff6eb': '',
 '5ad3ea79604f3c001a3ff6ea': '',
 '5ad3ea79604f3c001a3ff6e9': '',
 '56de11154396321400ee25aa': '',
 '5ad3e96b604f3c001a3ff68c': '',
 '5ad3e96b604f3c001a3ff68b': '',
 '5ad3e96b604f3c001a3ff68a': '',
 '5ad3e96b604f3c001a3ff689': '',
 '56de10b44396321400ee2595': '',
 '56de10b44396321400ee2594': '',
 '56de10b44396321400ee2593': '',
 '5ad3de8b604f3c001a3ff46a': '',
 '5ad3de8b604f3c001a3ff469': '',
 '5ad3de8b604f3c001a3ff468': '',
 '5ad3de8b604f3c001a3ff467': '',
 '56de0ffd4396321400ee258f': '',
 '56de0ffd4396321400ee258e': '',
 '56de0ffd4396321400ee258d': '',
 '5ad3dbc6604f3c001a3ff3ec': '',
 '5ad3dbc6604f3c001a3ff3eb': '',
 '5ad3dbc6604f3c001a3ff3ea': '',
 '5ad3dbc6604f3c001a3ff3e9': '',
 '56de0f6a4396321400ee257f': '',
 '5ad3c626604f3c001a3ff013': '',
 '5ad3c626604f3c001a3ff012': '',
 '5ad3c626604f3c001a3ff011': '',
 '56dde2fa66d3e219004dad9b': '',
 '5ad3af11604f3c001a3fec65': '',
 '5ad3af11604f3c001a3fec64': '',
 '5ad3af11

## Some samples of the results

In [73]:
for i in range(5):
    choice = np.random.choice(list(my_preds))
    row = test_df[test_df.id == choice].iloc[0]
    print("Context: ", str(row.context))
    print()
    print("Question: ", str(row.content))
    print()
    if row.is_impossible:
        print("Impossible to answer")
    else:
        print("Answer: ", row.answer)
    print()
    if my_preds[choice]:
        print("Predicted answer: ", my_preds[choice])
    else:
        print("Predicted impossbile to answer")
    print("\n//////////////////// \n")

Context:  Immediately after Decision Time a "Members Debate" is held, which lasts for 45 minutes. Members Business is a debate on a motion proposed by an MSP who is not a Scottish minister. Such motions are on issues which may be of interest to a particular area such as a member's own constituency, an upcoming or past event or any other item which would otherwise not be accorded official parliamentary time. As well as the proposer, other members normally contribute to the debate. The relevant minister, whose department the debate and motion relate to "winds up" the debate by speaking after all other participants.

Question:  What debate is a debate on a motion proposed by an MSP who is not a Scottish minister?

Impossible to answer

Predicted impossbile to answer

//////////////////// 

Context:  It is likely that a multicomponent, adaptive immune system arose with the first vertebrates, as invertebrates do not generate lymphocytes or an antibody-based humoral response. Many species, h