In [1]:
import pandas as pd
import numpy as np
import json

In [2]:
class Squad:
    def __init__(self, input_location):
        self.location = input_location
        file = open(input_location)
        json_file = json.load(file)
        self.version = json_file['version']
        self.data = json_file['data']
        
        df_builder = []
        for sample in self.data:
            title = sample['title']
            paragraphs = sample['paragraphs']
            
            for paragraph in paragraphs:
                context = paragraph['context']
                questions = paragraph['qas']
                
                for question in questions:
                    q_id = question['id']
                    q_content = question['question']
                    answers = question['answers']
                    is_impossible = question['is_impossible']
                    qas = {
                        'id':q_id,
                        'wiki_title':title,
                        'context':context,
                        'content':q_content,
                        'is_impossible':is_impossible
                    }
                    if is_impossible:
                        qas['answer'] = ""
                        qas['answer_start'] = -1
                    else:
                        answer = answers[0]
                        qas['answer'] = answer['text']
                        qas['answer_start'] = answer['answer_start']
                    df_builder.append(qas)
        self.df = pd.DataFrame(df_builder)

In [3]:
train_df  = Squad('./data/train-v2.0.json').df
test_df  = Squad('./data/dev-v2.0.json').df

In [4]:
import torch
from torch import nn

In [5]:
class BaselineModel(nn.Module):
    def __init__(self, context_vocab, tgt_vocab, emb_size, num_head, num_encoder_layers=2,
                 num_decoder_layers=2, dim_feedforward=756, dropout=0.1, pad_token=1):
        super(BaselineModel, self).__init__()
        
        self.pad_token = pad_token
        self.context_emb = nn.Embedding(context_vocab, emb_size)
        
        self.tgt_emb = nn.Embedding(tgt_vocab, emb_size)
        
        self.transformer = nn.Transformer(emb_size,nhead=num_head,num_encoder_layers=num_encoder_layers,
                                         num_decoder_layers=num_decoder_layers,dim_feedforward=dim_feedforward,
                                         dropout=dropout)
        self.fc_out = nn.Linear(emb_size, tgt_vocab)
        self._init_params()
        self.tgt_vocab = tgt_vocab
    
    def forward(self, context, tgt):
        context_embedded = self.context_emb(context)
        tgt_embedded = self.tgt_emb(tgt)
        
        src_key_padding_mask = context == self.pad_token
        tgt_key_padding_mask = tgt == self.pad_token
        
        answ = self.transformer(context_embedded, tgt_embedded, src_key_padding_mask=src_key_padding_mask.T,
                                tgt_key_padding_mask=tgt_key_padding_mask.T)
        
        output = torch.zeros((answ.shape[0], answ.shape[1], self.tgt_vocab))
        for i in answ:
            output = model.fc_out(answ)
        
        return output
    
    def _init_params(self):
        for p in self.parameters():
            if p.dim() > 1:
                torch.nn.init.xavier_uniform_(p)

In [6]:
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

In [7]:
train_df['content_q'] = train_df.context +' '+ train_df.content
test_df['content_q'] = test_df.context +' '+ test_df.content

In [8]:
import torchtext
from typing import *
from torchtext.data import *
from tqdm.notebook import tqdm
from torchtext.data.utils import get_tokenizer
import dill
load= False
if load:
    with open("model/CONTEXT_Q.Field","rb") as f:
        CONTEXT=dill.load(f)
    with open("model/TRG.Field","rb") as f:
        TRG=dill.load(f)
else:
    CONTEXT_Q = torchtext.data.Field(tokenize = get_tokenizer("basic_english"),
                          init_token = '<sos>',
                          eos_token = '<eos>',
                          lower = True,
                          batch_first = False)
    
    TRG = torchtext.data.Field(tokenize = get_tokenizer("basic_english"), 
                         init_token = '<sos>',
                         eos_token = '<eos>',
                         lower = True,
                         batch_first = False) 

In [9]:
from torchtext import *
class DataFrameExampleSet:
    def __init__(self, df, fields):
        self._df = df
        self._fields = fields
        self._fields_dict = {field_name: (field_name, field)
                             for field_name, field in fields.items()
                             if field is not None}

    def __iter__(self):
        for item in tqdm(self._df.itertuples(), total=len(self)):
            example = Example.fromdict(item._asdict(), fields=self._fields_dict)
            yield example

    def __len__(self):
        return len(self._df)

    def shuffle(self, random_state=None):
        self._df = self._df.sample(frac=1.0, random_state=random_state)


class DataFrameDataset(Dataset):
    def __init__(self, df, fields, filter_pred=None):
        examples = DataFrameExampleSet(df, fields)
        super().__init__(examples, fields, filter_pred=filter_pred)


class DataFrameBucketIterator(BucketIterator):
    def data(self):
        if isinstance(self.dataset.examples, DataFrameExampleSet):
            if self.shuffle:
                self.dataset.examples.shuffle()
            dataset = self.dataset
        else:
            dataset = super().data()
        return dataset

In [10]:
train_dataset = DataFrameDataset(train_df, fields={'content_q':CONTEXT_Q,'answer':TRG})
test_dataset = DataFrameDataset(test_df, fields={'content_q':CONTEXT_Q,'answer':TRG})


In [11]:
if load:
    pass
else:
    CONTEXT_Q.build_vocab(train_dataset, min_freq=100)
    TRG.build_vocab([''], min_freq=100)
    TRG.vocab = CONTEXT_Q.vocab
    
    with open("model/CONTEXT_Q.Field","wb+")as f:
        dill.dump(CONTEXT_Q,f)
    with open("model/TRG.Field","wb+")as f:
        dill.dump(TRG,f)

HBox(children=(FloatProgress(value=0.0, max=130319.0), HTML(value='')))




In [12]:
batch_size= 64
context_q_vocab = len(CONTEXT_Q.vocab)
target_vocab = len(TRG.vocab)
emb_size = 512
dim_feedforward = 512
num_head = 2

model = BaselineModel(context_q_vocab, target_vocab, emb_size, num_head, dim_feedforward=dim_feedforward)
model.to(device)

BaselineModel(
  (context_emb): Embedding(11814, 512)
  (tgt_emb): Embedding(11814, 512)
  (transformer): Transformer(
    (encoder): TransformerEncoder(
      (layers): ModuleList(
        (0): TransformerEncoderLayer(
          (self_attn): MultiheadAttention(
            (out_proj): Linear(in_features=512, out_features=512, bias=True)
          )
          (linear1): Linear(in_features=512, out_features=512, bias=True)
          (dropout): Dropout(p=0.1, inplace=False)
          (linear2): Linear(in_features=512, out_features=512, bias=True)
          (norm1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
          (norm2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
          (dropout1): Dropout(p=0.1, inplace=False)
          (dropout2): Dropout(p=0.1, inplace=False)
        )
        (1): TransformerEncoderLayer(
          (self_attn): MultiheadAttention(
            (out_proj): Linear(in_features=512, out_features=512, bias=True)
          )
          (linear1)

In [13]:
def count_trainable_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)


print(f'Model has a total of {count_trainable_parameters(model):,} of trainable parameters')

Model has a total of 26,575,398 of trainable parameters


In [14]:
train_iterator, test_iterator = DataFrameBucketIterator.splits((train_dataset, test_dataset), 
                                    batch_size = batch_size,
                                    device = device)

In [15]:
optimizer = torch.optim.Adam(model.parameters(),lr=2e-4)
loss_func = nn.CrossEntropyLoss(ignore_index=1)

In [16]:
def train(model, iterator, optimizer, loss_func):
    """
    Runs training loop for whole dataset in iterator
    
    model - model to be trained
    iterator - data loader from which we take source and target
    optimizer - our optimizer
    loss_func - function which will compute loss
    return average loss
    """
    model.train() # Switch to train
    epoch_loss = [] # We will calculate cumulative loss
    
    for i, batch in enumerate(iterator):
        optimizer.zero_grad()
        
        context = batch.content_q
        tgt = batch.answer
        
        output = model(context, tgt)
        tgt = tgt.reshape(-1)
        output = output.view(-1, output.shape[-1]) 

        loss = loss_func(output, tgt)
        
        epoch_loss.append(loss.item())
        
        loss.backward()
        optimizer.step()
    return epoch_loss

In [17]:
import matplotlib.pyplot as plt
def update_line(hl, x,y):
    hl.set_xdata(np.append(hl.get_xdata(), x))
    hl.set_ydata(np.append(hl.get_ydata(), y))
    plt.draw()

In [18]:
def evaluate(model, iterator, loss_func):
    """
    Runs an evaluation loop and returns average loss
    
    model - model to be evaluated
    iterator - data loader with validation set
    loss_func - function which will compute loss
    returns average loss
    """
    model.eval() # Switch to eval
    epoch_loss = 0 # We will calculate cumulative loss
    
    with torch.no_grad():
        to_return = []
        
        for i, batch in enumerate(iterator):
            optimizer.zero_grad()

            context = batch.content_q
            tgt = batch.answer
            
            output = model(context, tgt)
            
            softmaxed = nn.functional.softmax(output, dim=2)
            to_return.append(softmaxed.topk(1)[1].squeeze().cpu().detach().numpy())
            tgt = tgt.reshape(-1)
            output = output.view(-1, output.shape[-1]) 
            
            loss = loss_func(output, tgt)

            epoch_loss += loss.item()

            optimizer.step()
    return epoch_loss / len(iterator), to_return

In [19]:
best_loss = float('inf')
epochs = 4
for epoch in range(epochs):
    train_loss = train(model, train_iterator, optimizer, loss_func)
    
    eval_loss, preds = evaluate(model, test_iterator, loss_func)
    
    # save "best" model
    if best_loss > eval_loss:
        best_loss = eval_loss
        torch.save(model.state_dict(), 'baseline.model')
    print(f"Epoch {epoch}. Train loss: {np.mean(train_loss)}. Eval loss: {eval_loss}")

HBox(children=(FloatProgress(value=0.0, max=130319.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=11873.0), HTML(value='')))


Epoch 0. Train loss: 0.6893772773145642. Eval loss: 0.06276180865167971


HBox(children=(FloatProgress(value=0.0, max=130319.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=11873.0), HTML(value='')))


Epoch 1. Train loss: 0.025575447855066925. Eval loss: 0.017461498555857487


HBox(children=(FloatProgress(value=0.0, max=130319.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=11873.0), HTML(value='')))


Epoch 2. Train loss: 0.003849184537743857. Eval loss: 0.011721179549822018


HBox(children=(FloatProgress(value=0.0, max=130319.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=11873.0), HTML(value='')))


Epoch 3. Train loss: 0.00037737141440325725. Eval loss: 0.011264411689847303


HBox(children=(FloatProgress(value=0.0, max=130319.0), HTML(value='')))

KeyboardInterrupt: 

In [None]:
first = preds[0].T
first.shape

In [None]:
for i in first:
    TRG.vocab.stoi(''i)

In [None]:
[' '.join([TRG.vocab.itos[j] for j in i]) for i in first]

In [None]:
i.content_q.T

In [None]:
output = model(i.content_q, i.answer)
            
softmaxed = nn.functional.softmax(output, dim=2)
softmaxed.topk(1)[1].squeeze().cpu().detach().numpy()