# Baseline Model

For the baseline we have build a simple transformer that just has as an input context+question and tries to predict an answer

In [1]:
import pandas as pd
import numpy as np
import json

Helper class to covert json to dataframe for easier batch processing

In [2]:
class Squad:
    def __init__(self, input_location):
        self.location = input_location
        file = open(input_location)
        json_file = json.load(file)
        # Save version and data
        self.version = json_file['version']
        self.data = json_file['data']
        
        df_builder = [] # We will store every row of dataframe here
        for sample in self.data:
            title = sample['title'] # Get title
            paragraphs = sample['paragraphs']
            
            for paragraph in paragraphs:
                context = paragraph['context'] # Get context, e.g. a paragraph
                questions = paragraph['qas']
                
                for question in questions:
                    q_id = question['id'] # Question id
                    q_content = question['question'] # Question itself
                    answers = question['answers'] # Possible answers
                    is_impossible = question['is_impossible'] # If it is possible to answer
                    
                    # Build a row of dataframe
                    qas = {
                        'id':q_id,
                        'wiki_title':title,
                        'context':context,
                        'content':q_content,
                        'is_impossible':is_impossible
                    }
                    if is_impossible:
                        qas['answer'] = ""
                        qas['answer_start'] = -1
                    else:
                        answer = answers[0]
                        qas['answer'] = answer['text']
                        qas['answer_start'] = answer['answer_start']
                    df_builder.append(qas) 
        self.df = pd.DataFrame(df_builder)

In [3]:
train_sq = Squad('./data/train-v2.0.json')
test_sq = Squad('./data/dev-v2.0.json')
train_df  = train_sq.df
test_df  = test_sq.df

In [4]:
import torch
from torch import nn
from torch.utils.tensorboard import SummaryWriter
writer = SummaryWriter()

In [5]:
class Encoder(nn.Module):
    def __init__(self, input_dim, emb_dim, enc_hid_dim, dec_hid_dim, dropout):
        super().__init__()
        
        self.embedding = nn.Embedding(input_dim, emb_dim)
        
        self.rnn = nn.GRU(emb_dim, enc_hid_dim, bidirectional = True)
        
        self.fc = nn.Linear(enc_hid_dim * 2, dec_hid_dim)
        
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, src):
        
        #src = [src len, batch size]
        
        embedded = self.dropout(self.embedding(src))
        
        #embedded = [src len, batch size, emb dim]
        
        outputs, hidden = self.rnn(embedded)
                
        #outputs = [src len, batch size, hid dim * num directions]
        #hidden = [n layers * num directions, batch size, hid dim]
        
        #hidden is stacked [forward_1, backward_1, forward_2, backward_2, ...]
        #outputs are always from the last layer
        
        #hidden [-2, :, : ] is the last of the forwards RNN 
        #hidden [-1, :, : ] is the last of the backwards RNN
        
        #initial decoder hidden is final hidden state of the forwards and backwards 
        #  encoder RNNs fed through a linear layer
        hidden = torch.tanh(self.fc(torch.cat((hidden[-2,:,:], hidden[-1,:,:]), dim = 1)))
        
        #outputs = [src len, batch size, enc hid dim * 2]
        #hidden = [batch size, dec hid dim]
        
        return outputs, hidden



In [6]:
class Attention(nn.Module):
    def __init__(self, enc_hid_dim, dec_hid_dim):
        super().__init__()
        
        self.attn = nn.Linear((enc_hid_dim * 2) + dec_hid_dim, dec_hid_dim)
        self.v = nn.Linear(dec_hid_dim, 1, bias = False)
        
    def forward(self, hidden, encoder_outputs):
        
        #hidden = [batch size, dec hid dim]
        #encoder_outputs = [src len, batch size, enc hid dim * 2]
        
        batch_size = encoder_outputs.shape[1]
        src_len = encoder_outputs.shape[0]
        
        #repeat decoder hidden state src_len times
        hidden = hidden.unsqueeze(1).repeat(1, src_len, 1)
        
        encoder_outputs = encoder_outputs.permute(1, 0, 2)
        
        #hidden = [batch size, src len, dec hid dim]
        #encoder_outputs = [batch size, src len, enc hid dim * 2]
        
        energy = torch.tanh(self.attn(torch.cat((hidden, encoder_outputs), dim = 2))) 
        
        #energy = [batch size, src len, dec hid dim]

        attention = self.v(energy).squeeze(2)
        
        #attention= [batch size, src len]
        
        return F.softmax(attention, dim=1)

In [7]:


class Decoder(nn.Module):
    def __init__(self, output_dim, emb_dim, enc_hid_dim, dec_hid_dim, dropout, attention):
        super().__init__()

        self.output_dim = output_dim
        self.attention = attention
        
        self.embedding = nn.Embedding(output_dim, emb_dim)
        
        self.rnn = nn.GRU((enc_hid_dim * 2) + emb_dim, dec_hid_dim)
        
        self.fc_out = nn.Linear((enc_hid_dim * 2) + dec_hid_dim + emb_dim, output_dim)
        
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, input, hidden, encoder_outputs):
             
        #input = [batch size]
        #hidden = [batch size, dec hid dim]
        #encoder_outputs = [src len, batch size, enc hid dim * 2]
        
        input = input.unsqueeze(0)
        
        #input = [1, batch size]
        
        embedded = self.dropout(self.embedding(input))
        
        #embedded = [1, batch size, emb dim]
        
        a = self.attention(hidden, encoder_outputs)
                
        #a = [batch size, src len]
        
        a = a.unsqueeze(1)
        
        #a = [batch size, 1, src len]
        
        encoder_outputs = encoder_outputs.permute(1, 0, 2)
        
        #encoder_outputs = [batch size, src len, enc hid dim * 2]
        
        weighted = torch.bmm(a, encoder_outputs)
        
        #weighted = [batch size, 1, enc hid dim * 2]
        
        weighted = weighted.permute(1, 0, 2)
        
        #weighted = [1, batch size, enc hid dim * 2]
        
        rnn_input = torch.cat((embedded, weighted), dim = 2)
        
        #rnn_input = [1, batch size, (enc hid dim * 2) + emb dim]
            
        output, hidden = self.rnn(rnn_input, hidden.unsqueeze(0))
        
        #output = [seq len, batch size, dec hid dim * n directions]
        #hidden = [n layers * n directions, batch size, dec hid dim]
        
        #seq len, n layers and n directions will always be 1 in this decoder, therefore:
        #output = [1, batch size, dec hid dim]
        #hidden = [1, batch size, dec hid dim]
        #this also means that output == hidden
        assert (output == hidden).all()
        
        embedded = embedded.squeeze(0)
        output = output.squeeze(0)
        weighted = weighted.squeeze(0)
        
        prediction = self.fc_out(torch.cat((output, weighted, embedded), dim = 1))
        
        #prediction = [batch size, output dim]
        
        return prediction, hidden.squeeze(0)



In [8]:
class Seq2Seq(nn.Module):
    def __init__(self, encoder, decoder, device):
        super().__init__()
        
        self.encoder = encoder
        self.decoder = decoder
        self.device = device
        
    def forward(self, src, trg, teacher_forcing_ratio = 0.5):
        
        #src = [src len, batch size]
        #trg = [trg len, batch size]
        #teacher_forcing_ratio is probability to use teacher forcing
        #e.g. if teacher_forcing_ratio is 0.75 we use teacher forcing 75% of the time
        
        batch_size = src.shape[1]
        trg_len = trg.shape[0]
        trg_vocab_size = self.decoder.output_dim
        
        #tensor to store decoder outputs
        outputs = torch.zeros(trg_len, batch_size, trg_vocab_size).to(self.device)
        
        #encoder_outputs is all hidden states of the input sequence, back and forwards
        #hidden is the final forward and backward hidden states, passed through a linear layer
        encoder_outputs, hidden = self.encoder(src)
                
        #first input to the decoder is the <sos> tokens
        input = trg[0,:]
        
        for t in range(1, trg_len):
            
            #insert input token embedding, previous hidden state and all encoder hidden states
            #receive output tensor (predictions) and new hidden state
            output, hidden = self.decoder(input, hidden, encoder_outputs)
            
            #place predictions in a tensor holding predictions for each token
            outputs[t] = output
            
            #decide if we are going to use teacher forcing or not
            teacher_force = random.random() < teacher_forcing_ratio
            
            #get the highest predicted token from our predictions
            top1 = output.argmax(1) 
            
            #if teacher forcing, use actual next token as next input
            #if not, use predicted token
            input = trg[t] if teacher_force else top1

        return outputs

In [9]:
from torchtext import *
from torchtext.data import *

# Taken from here for easier work with dataframe and torchtext
# https://gist.github.com/notnami/3c4d636f2b79e206b26acfe349f2657a
class DataFrameExampleSet:
    def __init__(self, df, fields):
        self._df = df
        self._fields = fields
        self._fields_dict = {field_name: (field_name, field)
                             for field_name, field in fields.items()
                             if field is not None}

    def __iter__(self):
        for item in tqdm(self._df.itertuples(), total=len(self)):
            example = Example.fromdict(item._asdict(), fields=self._fields_dict)
            yield example

    def __len__(self):
        return len(self._df)

    def shuffle(self, random_state=None):
        self._df = self._df.sample(frac=1.0, random_state=random_state)


class DataFrameDataset(Dataset):
    def __init__(self, df, fields, filter_pred=None):
        examples = DataFrameExampleSet(df, fields)
        super().__init__(examples, fields, filter_pred=filter_pred)


class DataFrameBucketIterator(BucketIterator):
    def data(self):
        if isinstance(self.dataset.examples, DataFrameExampleSet):
            if self.shuffle:
                self.dataset.examples.shuffle()
            dataset = self.dataset
        else:
            dataset = super().data()
        return dataset

In [10]:
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

In [11]:
# Concatenate content and context
train_df['content_q'] = train_df.context +' '+ train_df.content
test_df['content_q'] = test_df.context +' '+ test_df.content

In [12]:
import torchtext
from typing import *
from torchtext.data import *
from tqdm.notebook import tqdm
from torchtext.data.utils import get_tokenizer
import dill
load = False
TRG_LEN=15

if load:
    with open("model/CONTEXT_Q.Field","rb") as f:
        CONTEXT_Q=dill.load(f)
    with open("model/TRG.Field","rb") as f:
        TRG=dill.load(f)
else:
    # Init Fields 
    
    # Here will be context and question
    CONTEXT_Q = torchtext.data.Field(tokenize = get_tokenizer("basic_english"),
                          init_token = '<sos>',
                          eos_token = '<eos>',
                          lower = False,
                          batch_first = False)
    # here the target 
    TRG = torchtext.data.Field(tokenize = get_tokenizer("basic_english"), 
                         init_token = '<sos>',
                         eos_token = '<eos>',
                         lower = False,
                         batch_first = False,
                         fix_length=TRG_LEN)
    
# Will store id to later check correctness
ID = torchtext.data.Field(is_target=True, sequential=False)

In [13]:
train_dataset = DataFrameDataset(train_df, fields={'content_q':CONTEXT_Q,'answer':TRG, 'id':ID})
test_dataset = DataFrameDataset(test_df, fields={'content_q':CONTEXT_Q,'answer':TRG, 'id':ID})

In [14]:
if load:
    pass
else:
    # Build vocabulary from our data, target will have the same vocab as context + questions
    CONTEXT_Q.build_vocab(train_dataset, min_freq=100)
    TRG.build_vocab([''], min_freq=10)
    TRG.vocab = CONTEXT_Q.vocab
    
    
    with open("model/CONTEXT_Q.Field","wb+")as f:
        dill.dump(CONTEXT_Q,f)
    with open("model/TRG.Field","wb+")as f:
        dill.dump(TRG,f)
ID.build_vocab(list(train_df.id)+ list(test_df.id))

HBox(children=(FloatProgress(value=0.0, max=130319.0), HTML(value='')))




In [15]:
batch_size = 64
# Create iterators
train_iterator, test_iterator = DataFrameBucketIterator.splits((train_dataset, test_dataset), 
                                    batch_size = batch_size,
                                    device = device)

In [16]:
context_q_vocab = len(CONTEXT_Q.vocab)
target_vocab = len(TRG.vocab)
emb_size = 256
hidden_dim = 512
dropout=0.2

INPUT_DIM = len(CONTEXT_Q.vocab)
OUTPUT_DIM = len(TRG.vocab)
ENC_EMB_DIM = 256
DEC_EMB_DIM = 256
ENC_HID_DIM = 512
DEC_HID_DIM = 512
ENC_DROPOUT = 0.5
DEC_DROPOUT = 0.5

attn = Attention(ENC_HID_DIM, DEC_HID_DIM)
enc = Encoder(INPUT_DIM, ENC_EMB_DIM, ENC_HID_DIM, DEC_HID_DIM, ENC_DROPOUT)
dec = Decoder(OUTPUT_DIM, DEC_EMB_DIM, ENC_HID_DIM, DEC_HID_DIM, DEC_DROPOUT, attn)

model = Seq2Seq(enc, dec, device).to(device)

model.to(device)

Seq2Seq(
  (encoder): Encoder(
    (embedding): Embedding(11814, 256)
    (rnn): GRU(256, 512, bidirectional=True)
    (fc): Linear(in_features=1024, out_features=512, bias=True)
    (dropout): Dropout(p=0.5, inplace=False)
  )
  (decoder): Decoder(
    (attention): Attention(
      (attn): Linear(in_features=1536, out_features=512, bias=True)
      (v): Linear(in_features=512, out_features=1, bias=False)
    )
    (embedding): Embedding(11814, 256)
    (rnn): GRU(1280, 512)
    (fc_out): Linear(in_features=1792, out_features=11814, bias=True)
    (dropout): Dropout(p=0.5, inplace=False)
  )
)

In [17]:
def count_trainable_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)


print(f'Model has a total of {count_trainable_parameters(model):,} of trainable parameters')

Model has a total of 33,664,550 of trainable parameters


In [18]:
optimizer = torch.optim.Adam(model.parameters(),lr=2e-4)
loss_func = nn.CrossEntropyLoss(ignore_index=1)

In [19]:
def train(model, iterator, optimizer, loss_func):
    """
    Runs training loop for whole dataset in iterator
    
    model - model to be trained
    iterator - data loader from which we take source and target
    optimizer - our optimizer
    loss_func - function which will compute loss
    return average loss
    """
    model.train() # Switch to train
    epoch_loss = [] # We will calculate cumulative loss
    
    for i, batch in enumerate(iterator):
        optimizer.zero_grad()
        
        context = batch.content_q
        tgt = batch.answer
        
        output = model(context, tgt)
        tgt = tgt[1:].reshape(-1)
        output = output[1:].view(-1, output.shape[-1]) 
        
        loss = loss_func(output, tgt)
        writer.add_scalar(f'Loss/train', loss, i)
        
        epoch_loss.append(loss.item())
        
        loss.backward()
        optimizer.step()
    return epoch_loss

In [20]:
def evaluate(model, iterator, loss_func):
    """
    Runs an evaluation loop and returns average loss
    
    model - model to be evaluated
    iterator - data loader with validation set
    loss_func - function which will compute loss
    returns average loss
    """
    model.eval() # Switch to eval
    epoch_loss = 0 # We will calculate cumulative loss
    
    with torch.no_grad():
        to_return = []
        
        for i, batch in enumerate(iterator):
            optimizer.zero_grad()

            context = batch.content_q
            tgt = batch.answer
            
            output = model(context, tgt, teacher_forcing_ratio=0)
            
            softmaxed = nn.functional.softmax(output, dim=2)
            # store ids of batch and result
            to_return.append((softmaxed.topk(1)[1].squeeze().cpu().detach().numpy(), batch.id)) 
            
            tgt = tgt[1:].reshape(-1)
            output = output[1:].view(-1, output.shape[-1]) 
            
            loss = loss_func(output, tgt)

            epoch_loss += loss.item()

            optimizer.step()
    return epoch_loss / len(iterator), to_return

## Training

In [21]:
import torch.nn.functional as F
import numpy.random as random 

best_loss = float('inf')
epochs = 4
for epoch in range(epochs):
    train_loss = train(model, train_iterator, optimizer, loss_func)
    
    eval_loss, preds = evaluate(model, test_iterator, loss_func)
    
    # save "best" model
    if best_loss > eval_loss:
        best_loss = eval_loss
        torch.save(model.state_dict(), 'baseline.model')
    print(f"Epoch {epoch}. Train loss: {np.mean(train_loss)}. Eval loss: {eval_loss}")

HBox(children=(FloatProgress(value=0.0, max=130319.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=11873.0), HTML(value='')))


Epoch 0. Train loss: 5.075189847482672. Eval loss: 4.545719545374634


HBox(children=(FloatProgress(value=0.0, max=130319.0), HTML(value='')))

KeyboardInterrupt: 

## Evaluation

In [22]:
_, preds = evaluate(model, test_iterator, loss_func)

HBox(children=(FloatProgress(value=0.0, max=11873.0), HTML(value='')))

KeyboardInterrupt: 

In [23]:
# flatten output
predictions = []
labels = []
for i in preds:
    for seq, tgt in zip(i[0].T, i[1]):
        if isinstance(seq, np.ndarray):
            predictions.append(seq)
        else:
            predictions.append(np.array(seq))
        labels.append(tgt.T)

In [24]:
def to_string(predictions):
    # Converting to string our predictions
    to_return = []
    for pred in predictions:
        result = []
        for word in pred:
            if word == 3: # If eos token - break
                break
            if word != 2: # If sos token - skip
                result.append(TRG.vocab.itos[word])
        to_return.append(' '.join(result))
    return to_return

In [25]:
def check_correctness(predictions, labels, df):
    # Transform our predictions and check their correctness
    tokenizer = get_tokenizer("basic_english")
    answers = {}
    for i, row in df.iterrows():
        answers[row.id] = ' '.join(tokenizer(row.answer))
    
    correct = {}
    my_preds = {}
    for pred, tgt in zip(predictions, labels):
        correct[ID.vocab.itos[tgt]] = answers[ID.vocab.itos[tgt]] == pred
        my_preds[ID.vocab.itos[tgt]] = pred
    correct_preds = answers
    return correct, correct_preds, my_preds

In [26]:
predictions_str = to_string(predictions)
correct, correct_preds, my_preds = check_correctness(predictions_str, labels, test_df)

In [27]:
from evaluate import *

In [28]:
# For the more representetive results we have taken script that squad owner's have written to check predictions

dataset = test_sq.data
preds = my_preds
na_probs = {k: 0.0 for k in preds}

qid_to_has_ans = make_qid_to_has_ans(dataset) 
has_ans_qids = [k for k, v in qid_to_has_ans.items() if v]
no_ans_qids = [k for k, v in qid_to_has_ans.items() if not v]
exact_raw, f1_raw = get_raw_scores(dataset, preds)
exact_thresh = apply_no_ans_threshold(exact_raw, na_probs, qid_to_has_ans,
                                      1.0)
f1_thresh = apply_no_ans_threshold(f1_raw, na_probs, qid_to_has_ans,
                                   1.0)
out_eval = make_eval_dict(exact_thresh, f1_thresh)
if has_ans_qids:
    has_ans_eval = make_eval_dict(exact_thresh, f1_thresh, qid_list=has_ans_qids)
    merge_eval(out_eval, has_ans_eval, 'HasAns')
if no_ans_qids:
    no_ans_eval = make_eval_dict(exact_thresh, f1_thresh, qid_list=no_ans_qids)
    merge_eval(out_eval, no_ans_eval, 'NoAns')
print(json.dumps(out_eval, indent=2))

{
  "exact": 0.0,
  "f1": 0.0,
  "total": 11873,
  "HasAns_exact": 0.0,
  "HasAns_f1": 0.0,
  "HasAns_total": 5928,
  "NoAns_exact": 0.0,
  "NoAns_f1": 0.0,
  "NoAns_total": 5945
}


In [29]:
my_preds

{'56de148dcffd8e1900b4b5bc': '<unk> <unk>',
 '5ad3ea79604f3c001a3ff6eb': '<unk> <unk>',
 '5ad3ea79604f3c001a3ff6ea': '<unk> <unk>',
 '5ad3ea79604f3c001a3ff6e9': '<unk> <unk>',
 '56de11154396321400ee25aa': '<unk> <unk>',
 '5ad3e96b604f3c001a3ff68c': '<unk> <unk> <unk>',
 '5ad3e96b604f3c001a3ff68b': '<unk> <unk>',
 '5ad3e96b604f3c001a3ff68a': '<unk> <unk> <unk>',
 '5ad3e96b604f3c001a3ff689': '<unk> <unk> <unk>',
 '56de10b44396321400ee2595': '<unk> <unk> <unk>',
 '56de10b44396321400ee2594': '<unk> <unk> <unk>',
 '56de10b44396321400ee2593': '<unk> <unk> <unk>',
 '5ad3de8b604f3c001a3ff46a': '<unk> <unk>',
 '5ad3de8b604f3c001a3ff469': '<unk> <unk> <unk>',
 '5ad3de8b604f3c001a3ff468': '<unk> <unk>',
 '5ad3de8b604f3c001a3ff467': '<unk> <unk>',
 '56de0ffd4396321400ee258f': '<unk> <unk> <unk>',
 '56de0ffd4396321400ee258e': '<unk> <unk>',
 '56de0ffd4396321400ee258d': '<unk> <unk>',
 '5ad3dbc6604f3c001a3ff3ec': '<unk>',
 '5ad3dbc6604f3c001a3ff3eb': '<unk>',
 '5ad3dbc6604f3c001a3ff3ea': '<unk>',
 '

## Some samples of the results

In [None]:
for i in range(5):
    choice = np.random.choice(list(my_preds))
    row = test_df[test_df.id == choice].iloc[0]
    print("Context: ", str(row.context))
    print()
    print("Question: ", str(row.content))
    print()
    if row.is_impossible:
        print("Impossible to answer")
    else:
        print("Answer: ", row.answer)
    print()
    if my_preds[choice]:
        print("Predicted answer: ", my_preds[choice])
    else:
        print("Predicted impossbile to answer")
    print("\n//////////////////// \n")