# Baseline Model

For the baseline we have build a simple transformer that just has as an input context+question and tries to predict an answer

In [1]:
import pandas as pd
import numpy as np
import json

Helper class to covert json to dataframe for easier batch processing

In [2]:
class Squad:
    def __init__(self, input_location):
        self.location = input_location
        file = open(input_location)
        json_file = json.load(file)
        # Save version and data
        self.version = json_file['version']
        self.data = json_file['data']
        
        df_builder = [] # We will store every row of dataframe here
        for sample in self.data:
            title = sample['title'] # Get title
            paragraphs = sample['paragraphs']
            
            for paragraph in paragraphs:
                context = paragraph['context'] # Get context, e.g. a paragraph
                questions = paragraph['qas']
                
                for question in questions:
                    q_id = question['id'] # Question id
                    q_content = question['question'] # Question itself
                    answers = question['answers'] # Possible answers
                    is_impossible = question['is_impossible'] # If it is possible to answer
                    
                    # Build a row of dataframe
                    qas = {
                        'id':q_id,
                        'wiki_title':title,
                        'context':context,
                        'content':q_content,
                        'is_impossible':is_impossible
                    }
                    if is_impossible:
                        qas['answer'] = ""
                        qas['answer_start'] = -1
                    else:
                        answer = answers[0]
                        qas['answer'] = answer['text']
                        qas['answer_start'] = answer['answer_start']
                    df_builder.append(qas) 
        self.df = pd.DataFrame(df_builder)

In [3]:
train_sq = Squad('./data/train-v2.0.json')
test_sq = Squad('./data/dev-v2.0.json')
train_df  = train_sq.df
test_df  = test_sq.df

In [4]:
import torch
from torch import nn

In [5]:
class BaselineModel(nn.Module):
    """A very simplistic model to predict answers"""
    def __init__(self, context_vocab, tgt_vocab, emb_size, num_head, num_encoder_layers=2,
                 num_decoder_layers=2, dim_feedforward=756, dropout=0.1, pad_token=1):
        super(BaselineModel, self).__init__()
        
        self.pad_token = pad_token
        self.context_emb = nn.Embedding(context_vocab, emb_size) # We embed context + question here
        
        self.tgt_emb = nn.Embedding(tgt_vocab, emb_size) # We embed target here
        
        # Transformer in which we input the context and target
#         self.transformer = nn.Transformer(emb_size,nhead=num_head,num_encoder_layers=num_encoder_layers,
#                                          num_decoder_layers=num_decoder_layers,dim_feedforward=dim_feedforward,
#                                          dropout=dropout)
        
#         encoder_layer = TransformerEncoderLayer(d_model, nhead, dim_feedforward, dropout, activation)
#         encoder_norm = LayerNorm(d_model)
#         self.encoder = TransformerEncoder(encoder_layer, num_encoder_layers, encoder_norm)


#         decoder_layer = TransformerDecoderLayer(d_model, nhead, dim_feedforward, dropout, activation)
#         decoder_norm = LayerNorm(d_model)
#         self.decoder = TransformerDecoder(decoder_layer, num_decoder_layers, decoder_norm)
        
        # Linear layer to predict the output
        self.fc_out = nn.Linear(emb_size, tgt_vocab)
        self._init_params()
        self.tgt_vocab = tgt_vocab
    
    def forward(self, context, tgt):
        
        # Embed context and target
        context_embedded = self.context_emb(context)
        tgt_embedded = self.tgt_emb(tgt)
        
        src_key_padding_mask = context == self.pad_token
        tgt_key_padding_mask = tgt == self.pad_token
        
        # Transform context and target
        answ = self.transformer(context_embedded, tgt_embedded, src_key_padding_mask=src_key_padding_mask.T,
                                tgt_key_padding_mask=tgt_key_padding_mask.T)
        
        output = torch.zeros((answ.shape[0], answ.shape[1], self.tgt_vocab))
        for i in answ:
            output = model.fc_out(answ)
        
        return output
    
    def _init_params(self):
        for p in self.parameters():
            if p.dim() > 1:
                torch.nn.init.xavier_uniform_(p)

In [6]:
from torchtext import *
from torchtext.data import *

# Taken from here for easier work with dataframe and torchtext
# https://gist.github.com/notnami/3c4d636f2b79e206b26acfe349f2657a
class DataFrameExampleSet:
    def __init__(self, df, fields):
        self._df = df
        self._fields = fields
        self._fields_dict = {field_name: (field_name, field)
                             for field_name, field in fields.items()
                             if field is not None}

    def __iter__(self):
        for item in tqdm(self._df.itertuples(), total=len(self)):
            example = Example.fromdict(item._asdict(), fields=self._fields_dict)
            yield example

    def __len__(self):
        return len(self._df)

    def shuffle(self, random_state=None):
        self._df = self._df.sample(frac=1.0, random_state=random_state)


class DataFrameDataset(Dataset):
    def __init__(self, df, fields, filter_pred=None):
        examples = DataFrameExampleSet(df, fields)
        super().__init__(examples, fields, filter_pred=filter_pred)


class DataFrameBucketIterator(BucketIterator):
    def data(self):
        if isinstance(self.dataset.examples, DataFrameExampleSet):
            if self.shuffle:
                self.dataset.examples.shuffle()
            dataset = self.dataset
        else:
            dataset = super().data()
        return dataset

In [7]:
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

In [8]:
# Concatenate content and context
train_df['content_q'] = train_df.context +' '+ train_df.content
test_df['content_q'] = test_df.context +' '+ test_df.content

In [9]:
import torchtext
from typing import *
from torchtext.data import *
from tqdm.notebook import tqdm
from torchtext.data.utils import get_tokenizer
import dill
load= False
if load:
    with open("model/CONTEXT_Q.Field","rb") as f:
        CONTEXT=dill.load(f)
    with open("model/TRG.Field","rb") as f:
        TRG=dill.load(f)
else:
    # Init Fields 
    
    # Here will be context and question
    CONTEXT_Q = torchtext.data.Field(tokenize = get_tokenizer("toktok"),
                          init_token = '<sos>',
                          eos_token = '<eos>',
                          lower = False,
                          batch_first = False)
    # here the target 
    TRG = torchtext.data.Field(tokenize = get_tokenizer("toktok"), 
                         init_token = '<sos>',
                         eos_token = '<eos>',
                         lower = False,
                         batch_first = False)
    
# Will store id to later check correctness
ID = torchtext.data.Field(is_target=True, sequential=False)

In [10]:
train_dataset = DataFrameDataset(train_df, fields={'content_q':CONTEXT_Q,'answer':TRG, 'id':ID})
test_dataset = DataFrameDataset(test_df, fields={'content_q':CONTEXT_Q,'answer':TRG, 'id':ID})


In [11]:
if load:
    pass
else:
    # Build vocabulary from our data, target will have the same vocab as context + questions
    CONTEXT_Q.build_vocab(train_dataset, min_freq=50)
    TRG.build_vocab([''], min_freq=50)
    TRG.vocab = CONTEXT_Q.vocab
    
    
    with open("model/CONTEXT_Q.Field","wb+")as f:
        dill.dump(CONTEXT_Q,f)
    with open("model/TRG.Field","wb+")as f:
        dill.dump(TRG,f)
ID.build_vocab(list(train_df.id)+ list(test_df.id))

HBox(children=(FloatProgress(value=0.0, max=130319.0), HTML(value='')))




In [12]:
batch_size= 64
# Create iterators
train_iterator, test_iterator = DataFrameBucketIterator.splits((train_dataset, test_dataset), 
                                    batch_size = batch_size,
                                    device = device)

In [13]:
context_q_vocab = len(CONTEXT_Q.vocab)
target_vocab = len(TRG.vocab)
emb_size = 512
dim_feedforward = 512
num_head = 2

# Init model
model = BaselineModel(context_q_vocab, target_vocab, emb_size, num_head, dim_feedforward=dim_feedforward)
model.to(device)

BaselineModel(
  (context_emb): Embedding(21931, 512)
  (tgt_emb): Embedding(21931, 512)
  (transformer): Transformer(
    (encoder): TransformerEncoder(
      (layers): ModuleList(
        (0): TransformerEncoderLayer(
          (self_attn): MultiheadAttention(
            (out_proj): Linear(in_features=512, out_features=512, bias=True)
          )
          (linear1): Linear(in_features=512, out_features=512, bias=True)
          (dropout): Dropout(p=0.1, inplace=False)
          (linear2): Linear(in_features=512, out_features=512, bias=True)
          (norm1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
          (norm2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
          (dropout1): Dropout(p=0.1, inplace=False)
          (dropout2): Dropout(p=0.1, inplace=False)
        )
        (1): TransformerEncoderLayer(
          (self_attn): MultiheadAttention(
            (out_proj): Linear(in_features=512, out_features=512, bias=True)
          )
          (linear1)

In [14]:
def count_trainable_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)


print(f'Model has a total of {count_trainable_parameters(model):,} of trainable parameters')

Model has a total of 42,125,227 of trainable parameters


In [15]:
optimizer = torch.optim.Adam(model.parameters(),lr=2e-4)
loss_func = nn.CrossEntropyLoss(ignore_index=1)

In [16]:
def train(model, iterator, optimizer, loss_func):
    """
    Runs training loop for whole dataset in iterator
    
    model - model to be trained
    iterator - data loader from which we take source and target
    optimizer - our optimizer
    loss_func - function which will compute loss
    return average loss
    """
    model.train() # Switch to train
    epoch_loss = [] # We will calculate cumulative loss
    
    for i, batch in enumerate(iterator):
        optimizer.zero_grad()
        
        context = batch.content_q
        tgt = batch.answer
        
        output = model(context, tgt)
        tgt = tgt.reshape(-1)
        output = output.view(-1, output.shape[-1]) 

        loss = loss_func(output, tgt)
        
        epoch_loss.append(loss.item())
        
        loss.backward()
        optimizer.step()
    return epoch_loss

In [17]:
def evaluate(model, iterator, loss_func):
    """
    Runs an evaluation loop and returns average loss
    
    model - model to be evaluated
    iterator - data loader with validation set
    loss_func - function which will compute loss
    returns average loss
    """
    model.eval() # Switch to eval
    epoch_loss = 0 # We will calculate cumulative loss
    
    with torch.no_grad():
        to_return = []
        
        for i, batch in enumerate(iterator):
            optimizer.zero_grad()

            context = batch.content_q
            tgt = batch.answer
            
            output = model(context, tgt)
            
            softmaxed = nn.functional.softmax(output, dim=2)
            # store ids of batch and result
            to_return.append((softmaxed.topk(1)[1].squeeze().cpu().detach().numpy(), batch.id)) 
            
            tgt = tgt.reshape(-1)
            output = output.view(-1, output.shape[-1]) 
            
            loss = loss_func(output, tgt)

            epoch_loss += loss.item()

            optimizer.step()
    return epoch_loss / len(iterator), to_return

## Training

In [18]:
best_loss = float('inf')
epochs = 4
for epoch in range(epochs):
    train_loss = train(model, train_iterator, optimizer, loss_func)
    
    eval_loss, preds = evaluate(model, test_iterator, loss_func)
    
    # save "best" model
    if best_loss > eval_loss:
        best_loss = eval_loss
        torch.save(model.state_dict(), 'baseline.model')
    print(f"Epoch {epoch}. Train loss: {np.mean(train_loss)}. Eval loss: {eval_loss}")

HBox(children=(FloatProgress(value=0.0, max=130319.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=11873.0), HTML(value='')))


Epoch 0. Train loss: 0.9920720200533953. Eval loss: 0.1669959529473256


HBox(children=(FloatProgress(value=0.0, max=130319.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=11873.0), HTML(value='')))


Epoch 1. Train loss: 0.07054533362387468. Eval loss: 0.052245992279651585


HBox(children=(FloatProgress(value=0.0, max=130319.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=11873.0), HTML(value='')))


Epoch 2. Train loss: 0.010890705061757664. Eval loss: 0.0357376340998403


HBox(children=(FloatProgress(value=0.0, max=130319.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=11873.0), HTML(value='')))


Epoch 3. Train loss: 0.0006800501947731825. Eval loss: 0.03633174848578996


## Evaluation

In [19]:
_, preds = evaluate(model, test_iterator, loss_func)

HBox(children=(FloatProgress(value=0.0, max=11873.0), HTML(value='')))




In [20]:
# flatten output
predictions = []
labels = []
for i in preds:
    for seq, tgt in zip(i[0].T, i[1]):
        predictions.append(seq)
        labels.append(tgt.T)

In [21]:
def to_string(predictions):
    # Converting to string our predictions
    to_return = []
    for pred in predictions:
        result = []
        for word in pred:
            if word == 3: # If eos token - break
                break
            if word != 2: # If sos token - skip
                result.append(TRG.vocab.itos[word])
        to_return.append(' '.join(result))
    return to_return

In [29]:
def check_correctness(predictions, labels, df):
    # Transform our predictions and check their correctness
    tokenizer = get_tokenizer("toktok")
    answers = {}
    for i, row in df.iterrows():
        answers[row.id] = ' '.join(tokenizer(row.answer))
    
    correct = {}
    my_preds = {}
    for pred, tgt in zip(predictions, labels):
        correct[ID.vocab.itos[tgt]] = answers[ID.vocab.itos[tgt]] == pred
        my_preds[ID.vocab.itos[tgt]] = pred
        
    correct_preds = answers
    return correct, correct_preds, my_preds

In [31]:
predictions = to_string(predictions)
correct, correct_preds, my_preds = check_correctness(predictions, labels, test_df)

In [24]:
from evaluate import *

In [25]:
# For the more representetive results we have taken script that squad owner's have written to check predictions

dataset = test_sq.data
preds = my_preds
na_probs = {k: 0.0 for k in preds}

qid_to_has_ans = make_qid_to_has_ans(dataset) 
has_ans_qids = [k for k, v in qid_to_has_ans.items() if v]
no_ans_qids = [k for k, v in qid_to_has_ans.items() if not v]
exact_raw, f1_raw = get_raw_scores(dataset, preds)
exact_thresh = apply_no_ans_threshold(exact_raw, na_probs, qid_to_has_ans,
                                      1.0)
f1_thresh = apply_no_ans_threshold(f1_raw, na_probs, qid_to_has_ans,
                                   1.0)
out_eval = make_eval_dict(exact_thresh, f1_thresh)
if has_ans_qids:
    has_ans_eval = make_eval_dict(exact_thresh, f1_thresh, qid_list=has_ans_qids)
    merge_eval(out_eval, has_ans_eval, 'HasAns')
if no_ans_qids:
    no_ans_eval = make_eval_dict(exact_thresh, f1_thresh, qid_list=no_ans_qids)
    merge_eval(out_eval, no_ans_eval, 'NoAns')
print(json.dumps(out_eval, indent=2))

{
  "exact": 77.98366040596311,
  "f1": 86.79409211178795,
  "total": 11873,
  "HasAns_exact": 55.904183535762485,
  "HasAns_f1": 73.55031303023961,
  "HasAns_total": 5928,
  "NoAns_exact": 100.0,
  "NoAns_f1": 100.0,
  "NoAns_total": 5945
}


## Some samples of the results

In [28]:
for i in range(5):
    choice = np.random.choice(list(my_preds))
    row = test_df[test_df.id == choice].iloc[0]
    print("Context: ", str(row.context))
    print()
    print("Question: ", str(row.content))
    print()
    if row.is_impossible:
        print("Impossible to answer")
    else:
        print("Answer: ", row.answer)
    print()
    if my_preds[choice]:
        print("Predicted answer: ", my_preds[choice])
    else:
        print("Predicted impossbile to answer")
    print("\n//////////////////// \n")

Context:  The French population numbered about 75,000 and was heavily concentrated along the St. Lawrence River valley, with some also in Acadia (present-day New Brunswick and parts of Nova Scotia, including Île Royale (present-day Cape Breton Island)). Fewer lived in New Orleans, Biloxi, Mississippi, Mobile, Alabama and small settlements in the Illinois Country, hugging the east side of the Mississippi River and its tributaries. French fur traders and trappers traveled throughout the St. Lawrence and Mississippi watersheds, did business with local tribes, and often married Indian women. Traders married daughters of chiefs, creating high-ranking unions.

Question:  What was French population in South America?

Impossible to answer

Predicted impossbile to answer

//////////////////// 

Context:  Specialty pharmacies supply high cost injectable, oral, infused, or inhaled medications that are used for chronic and complex disease states such as cancer, hepatitis, and rheumatoid arthritis.