# Baseline Model

For the baseline we have build a simple transformer that just takes into the encoder the questions and the context as the input to the decoder and tries to predict the beginning and the end of the sentence.

In [3]:
import pandas as pd
import numpy as np
import json
import torch
from torch import nn
from torch.utils.tensorboard import SummaryWriter
import spacy
from torchtext import *
from torchtext.data import *
import torchtext
from typing import *
from tqdm.notebook import tqdm
from torchtext.data.utils import get_tokenizer
import torch.nn.functional as F
import numpy.random as random 
from evaluate_answers import *

spacy_en = spacy.load('en_core_web_sm') # or use any other tokenizer model for tokenization
use_glove = True  ### CHANGE this if you don't want to use GloVe embeddings
writer = SummaryWriter()

if use_glove:
    glove_vocab = torchtext.vocab.GloVe(name='840B', dim=300)

In [6]:
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
def tokenizer(text): # create a tokenizer function, you can try using anything else than spacy
    return [tok.text for tok in spacy_en.tokenizer(text)]

In [4]:
# For reproducibility
seed = 42

random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed(seed)
torch.backends.cudnn.deterministic = True

Helper class to covert json to dataframe for easier batch processing

In [9]:
class Squad:
    """Storage for SQuAD dataset"""
    def __init__(self, input_location):
        self.location = input_location # Input location to be read
        file = open(input_location)
        json_file = json.load(file)
        # Save version and data
        self.version = json_file['version']
        self.data = json_file['data']
        
        df_builder = [] # We will store every row of dataframe here
        for sample in self.data:
            title = sample['title'] # Get title
            paragraphs = sample['paragraphs']
            
            for paragraph in paragraphs:
                context = paragraph['context'] # Get context, e.g. a paragraph
                questions = paragraph['qas']
                
                for question in questions:
                    q_id = question['id'] # Question id
                    q_content = question['question'] # Question itself
                    answers = question['answers'] # Possible answers
                    is_impossible = question['is_impossible'] # If it is possible to answer
                    
                    # Build a row of dataframe
                    qas = {
                        'id':q_id,
                        'wiki_title':title,
                        'context':context,
                        'content':q_content,
                        'is_impossible':is_impossible
                    }
                    if is_impossible:
                        qas['answer'] = ""
                        qas['answer_start'] = len(context)-1
                        qas['answer_end'] =len(context)-1
                    else:
                        answer = answers[0]
                        qas['answer'] = answer['text']
                        qas['answer_start'] = answer['answer_start']
                        qas['answer_end'] = answer['answer_start']+len(answer['text'])
                    df_builder.append(qas) 
        self.df = pd.DataFrame(df_builder)

In [10]:
train_sq = Squad('./data/train-v2.0.json') # Load test and train data
test_sq = Squad('./data/dev-v2.0.json')
train_df  = train_sq.dzf
test_df  = test_sq.df

In [5]:
# Taken from here for easier work with dataframe and torchtext
# https://gist.github.com/notnami/3c4d636f2b79e206b26acfe349f2657a
class DataFrameExampleSet:
    def __init__(self, df, fields):
        self._df = df
        self._fields = fields
        self._fields_dict = {field_name: (field_name, field)
                             for field_name, field in fields.items()
                             if field is not None}

    def __iter__(self):
        for item in tqdm(self._df.itertuples(), total=len(self)):
            example = Example.fromdict(item._asdict(), fields=self._fields_dict)
            yield example

    def __len__(self):
        return len(self._df)

    def shuffle(self, random_state=None):
        self._df = self._df.sample(frac=1.0, random_state=random_state)


class DataFrameDataset(Dataset):
    def __init__(self, df, fields, filter_pred=None):
        examples = DataFrameExampleSet(df, fields)
        super().__init__(examples, fields, filter_pred=filter_pred)


class DataFrameBucketIterator(BucketIterator):
    def data(self):
        if isinstance(self.dataset.examples, DataFrameExampleSet):
            if self.shuffle:
                self.dataset.examples.shuffle()
            dataset = self.dataset
        else:
            dataset = super().data()
        return dataset

In [7]:
# Init Fields 
# Here will be context and question
CONTEXT = torchtext.data.Field(tokenize = tokenizer,
                              lower = False,
                              batch_first = False)
# here the target 
QUESTION = torchtext.data.Field(tokenize = tokenizer, 
                                lower = False,
                                batch_first = False)

START = torchtext.data.Field(sequential=False, is_target=True, use_vocab=False)
END = torchtext.data.Field(sequential=False, is_target=True, use_vocab=False)
# Will store id to later check correctness
ID = torchtext.data.Field(is_target=True, sequential=False)

In [8]:
# Create trochtext dataset
train_dataset = DataFrameDataset(train_df, fields={'context':CONTEXT,'content':QUESTION, 'id':ID,
                                                   'answer_start':START, 'answer_end':END})
test_dataset = DataFrameDataset(test_df, fields={'context':CONTEXT,'content':QUESTION, 'id':ID,
                                                'answer_start':START, 'answer_end':END})

In [9]:
# Build vocabulary from our data
# If use glove we will also have vectors for our representations

if use_glove:
    CONTEXT.build_vocab(train_dataset, vectors='glove.840B.300d')
    CONTEXT.vocab.load_vectors('glove.840B.300d')
else:
    CONTEXT.build_vocab(train_dataset, min_freq=100)
QUESTION.build_vocab([''])
QUESTION.vocab = CONTEXT.vocab

ID.build_vocab(list(train_df.id)+ list(test_df.id))

HBox(children=(FloatProgress(value=0.0, max=130319.0), HTML(value='')))




In [10]:
batch_size = 64
# Create iterators for test and train
train_iterator, test_iterator = DataFrameBucketIterator.splits((train_dataset, test_dataset), 
                                    batch_size = batch_size,
                                    device = 'cpu')

In [11]:
class BaselineModel(nn.Module):
    """Baseline model"""
    def __init__(self, context_vocab, hidden_size, dropout=0.2):
        super(BaselineModel, self).__init__()
        
        # If we are using glove that create pretrained embedding layer
        if use_glove:
            self.context_emb = nn.Embedding.from_pretrained(torch.FloatTensor(CONTEXT.vocab.vectors), freeze=True)
            self.question_emb = nn.Embedding.from_pretrained(torch.FloatTensor(CONTEXT.vocab.vectors), freeze=True)
            emb_dim = CONTEXT.vocab.vectors.shape[1]
        else:
            # If not create normal layer
            emb_dim = 256
            self.context_emb = nn.Embedding(context_vocab, emb_dim)
            self.question_emb = nn.Embedding(context_vocab, emb_dim)
        
        self.dropout_1 = nn.Dropout(dropout)
        self.dropout_2 = nn.Dropout(dropout)
        
        # Simple transformer with 4 heads and 2 decoder encoder layers
        self.rnn = nn.Transformer(d_model=emb_dim, nhead=4, num_encoder_layers=2,
                                        num_decoder_layers=2, dim_feedforward=1024,
                                        dropout=0.1, activation='relu')
                
        self.fc_out = nn.Linear(emb_dim, 2)
    
    def forward(self, context, question, start_positions=None, end_positions=None):
        context_embedded = self.dropout_1(self.context_emb(context)) # [context_seq_len, batch_size, emb_size]
        question_embedded = self.dropout_2(self.question_emb(question)) # [question_seq_len, batch_size, emb_size]
        
        output = self.rnn(question_embedded, context_embedded, src_key_padding_mask=question.T==1,
                          tgt_key_padding_mask=context.T==1) # [context_seq_len, batch_size, emb_size]
        
        output_context = output.permute(1,0,2) # [batch_size, context_seq_len, emb_size]
        logits = self.fc_out(output_context) # [batch_size, context_seq_len, 2]
        start_logits, end_logits = logits.split(1, dim=-1) # split into start and end logits
        start_logits = start_logits.squeeze(-1).squeeze(1)
        end_logits = end_logits.squeeze(-1).squeeze(1)
        
        if not start_positions is None and not end_positions is None:
            # sometimes the start/end positions are outside our model inputs, we ignore these terms
            ignored_index = start_logits.size(1)
            start_positions.clamp_(0, ignored_index)
            end_positions.clamp_(0, ignored_index)

            # compute loss of start and end (if given)
            loss_fct = nn.CrossEntropyLoss(ignore_index=ignored_index)
            start_loss = loss_fct(start_logits, start_positions)
            end_loss = loss_fct(end_logits, end_positions)
            total_loss = (start_loss + end_loss)/2
        return total_loss, start_logits, end_logits

In [12]:
context_vocab = len(CONTEXT.vocab)
hidden_size=512

model = BaselineModel(context_vocab, hidden_size).to(device)

In [13]:
# Xavier initialization
def init_weights(m):
    if not isinstance(m, nn.Embedding):
        for name, param in m.named_parameters():
            if param.data.dim() > 1:
                nn.init.xavier_uniform_(param.data)

model.apply(init_weights)

BaselineModel(
  (context_emb): Embedding(100028, 300)
  (question_emb): Embedding(100028, 300)
  (dropout_1): Dropout(p=0.2, inplace=False)
  (dropout_2): Dropout(p=0.2, inplace=False)
  (rnn): Transformer(
    (encoder): TransformerEncoder(
      (layers): ModuleList(
        (0): TransformerEncoderLayer(
          (self_attn): MultiheadAttention(
            (out_proj): Linear(in_features=300, out_features=300, bias=True)
          )
          (linear1): Linear(in_features=300, out_features=1024, bias=True)
          (dropout): Dropout(p=0.1, inplace=False)
          (linear2): Linear(in_features=1024, out_features=300, bias=True)
          (norm1): LayerNorm((300,), eps=1e-05, elementwise_affine=True)
          (norm2): LayerNorm((300,), eps=1e-05, elementwise_affine=True)
          (dropout1): Dropout(p=0.1, inplace=False)
          (dropout2): Dropout(p=0.1, inplace=False)
        )
        (1): TransformerEncoderLayer(
          (self_attn): MultiheadAttention(
            (out_

In [14]:
def count_trainable_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)


print(f'Model has a total of {count_trainable_parameters(model):,} of trainable parameters')

Model has a total of 4,637,898 of trainable parameters


In [15]:
optimizer = torch.optim.Adam(model.parameters(),lr=2e-4)

In [16]:
def train(model, iterator, optimizer):
    """
    Runs training loop for whole dataset in iterator
    
    model - model to be trained
    iterator - data loader from which we take source and target
    optimizer - our optimizer
    return average loss
    """
    model.train() # Switch to train
    epoch_loss = [] # We will calculate cumulative loss
    
    for i, batch in enumerate(iterator):
        optimizer.zero_grad()
        
        context = batch.context.to(device)
        content = batch.content.to(device)
        start_positions = batch.answer_start.to(device)
        end_positions = batch.answer_end.to(device)
        
        total_loss, start_logits, end_logits = model(context, content, start_positions, end_positions)
    
        writer.add_scalar(f'Loss/train Epoch {epoch}', total_loss, i)
        
        epoch_loss.append(total_loss.item())
        
        total_loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1)
        optimizer.step()
    return epoch_loss

In [17]:
def evaluate(model, iterator):
    """
    Runs an evaluation loop and returns average loss
    
    model - model to be evaluated
    iterator - data loader with validation set
    returns average loss
    """
    model.eval() # Switch to eval
    epoch_loss = 0 # We will calculate cumulative loss
    
    with torch.no_grad():
        to_return = []
        
        for i, batch in enumerate(iterator):
            optimizer.zero_grad()

            context = batch.context.to(device)
            content = batch.content.to(device)
            start_positions = batch.answer_start.to(device)
            end_positions = batch.answer_end.to(device)

            total_loss, start_logits, end_logits = model(context, content, start_positions, end_positions)
            
            start_pred = start_logits.softmax(dim=1).topk(1, dim=1)[1].squeeze().cpu().detach().numpy()
            end_pred = end_logits.softmax(dim=1).topk(1, dim=1)[1].squeeze().cpu().detach().numpy()
            
            to_return.append((start_pred, end_pred, batch.id))
            epoch_loss += total_loss.item()

            optimizer.step()
    return epoch_loss / len(iterator), to_return

Code for evaluation

In [18]:
def get_preds(predictions, labels, df):
    # Transform our predictions from start-end to the text
    my_preds = {}

    for pred, tgt in zip(predictions, labels):
        start, end = pred
        tg_id = ID.vocab.itos[tgt]
        res = df[df.id == tg_id].context.values[0][start:end]
        my_preds[ID.vocab.itos[tgt]] = res
    return my_preds

In [19]:
def compute_results(preds, squad):
    # flatten output
    predictions = []
    labels = []
    for i in preds:
        for seq1,seq2, tgt in zip(i[0], i[1], i[2]):
            predictions.append((seq1,seq2))
            labels.append(tgt)
    my_preds = get_preds(predictions, labels, test_df) # Get predictions
    
    # For the more representetive results we have taken script that squad owner's have written to check predictions
    dataset = squad.data
    preds = my_preds
    na_probs = {k: 0.0 for k in preds}

    qid_to_has_ans = make_qid_to_has_ans(dataset) 
    has_ans_qids = [k for k, v in qid_to_has_ans.items() if v]
    no_ans_qids = [k for k, v in qid_to_has_ans.items() if not v]
    exact_raw, f1_raw = get_raw_scores(dataset, preds)
    exact_thresh = apply_no_ans_threshold(exact_raw, na_probs, qid_to_has_ans,
                                          1.0)
    f1_thresh = apply_no_ans_threshold(f1_raw, na_probs, qid_to_has_ans,
                                       1.0)
    out_eval = make_eval_dict(exact_thresh, f1_thresh)
    if has_ans_qids:
        has_ans_eval = make_eval_dict(exact_thresh, f1_thresh, qid_list=has_ans_qids)
        merge_eval(out_eval, has_ans_eval, 'HasAns')
    if no_ans_qids:
        no_ans_eval = make_eval_dict(exact_thresh, f1_thresh, qid_list=no_ans_qids)
        merge_eval(out_eval, no_ans_eval, 'NoAns')
    print(json.dumps(out_eval, indent=2))
    
    return my_preds

## Training

In [20]:
best_loss = float('inf')
epochs = 1
for epoch in range(epochs):
    train_loss = train(model, train_iterator, optimizer)
    
    eval_loss, model_preds = evaluate(model, test_iterator)
    
    compute_results(model_preds, test_sq)
    # save "best" model
    if best_loss > eval_loss:
        best_loss = eval_loss
        torch.save(model.state_dict(), 'baseline.model')
    print(f"Epoch {epoch}. Train loss: {np.mean(train_loss)}. Eval loss: {eval_loss}")

HBox(children=(FloatProgress(value=0.0, max=130319.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=11873.0), HTML(value='')))


{
  "exact": 43.37572643813695,
  "f1": 43.8093533605187,
  "total": 11873,
  "HasAns_exact": 0.0,
  "HasAns_f1": 0.8684973767608978,
  "HasAns_total": 5928,
  "NoAns_exact": 86.62741799831791,
  "NoAns_f1": 86.62741799831791,
  "NoAns_total": 5945
}
Epoch 0. Train loss: 5.823935319584963. Eval loss: 5.379476803605274


## Evaluation

Get results and show them

In [21]:
res = compute_results(model_preds, test_sq)

{
  "exact": 43.37572643813695,
  "f1": 43.8093533605187,
  "total": 11873,
  "HasAns_exact": 0.0,
  "HasAns_f1": 0.8684973767608978,
  "HasAns_total": 5928,
  "NoAns_exact": 86.62741799831791,
  "NoAns_f1": 86.62741799831791,
  "NoAns_total": 5945
}


## Some samples of the results

In [22]:
for i in range(5):
    choice = np.random.choice(list(res))
    row = test_df[test_df.id == choice].iloc[0]
    print("Context: ", str(row.context))
    print()
    print("Question: ", str(row.content))
    print()
    if row.is_impossible:
        print("Impossible to answer")
    else:
        print("Answer: ", row.answer)
    print()
    if res[choice]:
        print("Predicted answer: ", res[choice])
    else:
        print("Predicted impossbile to answer")
    print("\n//////////////////// \n")

Context:  Harvard has been highly ranked by many university rankings. In particular, it has consistently topped the Academic Ranking of World Universities (ARWU) since 2003, and the THE World Reputation Rankings since 2011, when the first time such league tables were published. When the QS and Times were published in partnership as the THE-QS World University Rankings during 2004-2009, Harvard had also been regarded the first in every year. The University's undergraduate program has been continuously among the top two in the U.S. News & World Report. In 2014, Harvard topped the University Ranking by Academic Performance (URAP). It was ranked 8th on the 2013-2014 PayScale College Salary Report and 14th on the 2013 PayScale College Education Value Rankings. From a poll done by The Princeton Review, Harvard is the second most commonly named "dream college", both for students and parents in 2013, and was the first nominated by parents in 2009. In 2011, the Mines ParisTech : Professional Ra