In [None]:
pip install -q transformers

In [None]:
import os, requests


DATA_DIR = "PATH-YOU-WANT-TO-SAVE"
url = 'https://rajpurkar.github.io/SQuAD-explorer/dataset/'

# makedirs(exist_ok=True) also creates missing parent directories and is safe on re-runs
os.makedirs(DATA_DIR, exist_ok=True)

# download each split that is not on disk yet (checked per file, so an existing but
# incomplete directory is filled in instead of being skipped entirely)
for filename in ['train-v2.0.json', 'dev-v2.0.json']:
    target_path = os.path.join(DATA_DIR, filename)
    if os.path.exists(target_path):
        print(f"{filename} already present, skipping.")
        continue

    # stream the body so the whole file is never held in memory at once
    res = requests.get(f'{url}{filename}', stream=True, timeout=60)
    # fail loudly on 4xx/5xx instead of saving an error page as if it were the dataset
    res.raise_for_status()

    # write to a temporary name first; an interrupted download then never looks complete
    partial_path = target_path + ".part"
    with open(partial_path, 'wb') as f:
        for chunk in res.iter_content(chunk_size=1 << 20):
            f.write(chunk)
    os.replace(partial_path, target_path)

    print(f"{filename} downloaded.")

In [None]:
# Directory the SQuAD 2.0 JSON files are read from by the cells below. This assignment
# overrides the DATA_DIR set in the download cell; if you ran that cell, change this
# value to the directory you downloaded the files into.
DATA_DIR = "../input/squad-20"

In [None]:
import os, json


def read_squad_json(filename: str) -> tuple:
    """
    Load one SQuAD 2.0 JSON file from DATA_DIR and flatten it into parallel lists.

    Args:
        filename: name of the JSON file inside DATA_DIR (e.g. the train or dev split).

    Returns:
        A tuple ``(contexts, questions, answers)`` of three equally long lists with one
        entry per answer: the paragraph text, the question text and the answer dict as
        stored in the file. Questions without any answer entry contribute nothing.
    """
    with open(os.path.join(DATA_DIR, filename), "rb") as handle:
        squad_dict = json.load(handle)

    contexts, questions, answers = [], [], []

    for article in squad_dict['data']:
        for paragraph in article['paragraphs']:
            context = paragraph['context']
            for qa in paragraph['qas']:
                question = qa['question']
                # entries that carry 'plausible_answers' are read from that key,
                # all other entries from 'answers'
                source_key = 'answers' if 'plausible_answers' not in qa else 'plausible_answers'
                found = qa[source_key]
                # repeat context/question once per answer so the three lists stay aligned
                contexts.extend([context] * len(found))
                questions.extend([question] * len(found))
                answers.extend(found)

    return contexts, questions, answers


# Read the train and dev splits; each call returns three parallel lists with one
# entry per answer (context, question, answer dict).
train_contexts, train_questions, train_answers = read_squad_json('train-v2.0.json')
valid_contexts, valid_questions, valid_answers = read_squad_json('dev-v2.0.json')

In [None]:
# Print some instances of the training set. To see a different batch of instances, change the seed.

from pprint import pprint

import random
# fixed seed so the same five samples are shown on every run
random.seed(0)

# five distinct positions drawn from the training set
indices = random.sample(range(len(train_contexts)), 5)
for index in indices:
    # question followed by the context header, each ending in a blank line
    print(f'Q:  {train_questions[index]}\n', "Context:\n", sep="\n")
    pprint(train_contexts[index])
    # raw answer dict, then a horizontal rule between samples
    print(f"\nAnswer:[{train_answers[index]}]\n", "-" * 100, sep="\n")

In [None]:
def apply_end_index(answers: list, contexts: list) -> list:
    '''
    Return copies of the answers with an exclusive 'answer_end' character offset added.

    The dataset already stores the character start index of every answer
    ('answer_start'); the end index is derived as start + len(text). When the stored
    start is off by one or two characters (the answer text is not found at the stored
    offset but is found one or two characters earlier), both offsets are shifted so
    that context[answer_start:answer_end] equals the answer text.

    Args:
        answers: list of dicts with at least the keys 'text' and 'answer_start'.
        contexts: list of context strings, aligned one-to-one with `answers`.

    Returns:
        A new list of new dicts; the dicts passed in are left unmodified.

    Raises:
        ValueError: if `answers` and `contexts` differ in length.
    '''
    if len(answers) != len(contexts):
        raise ValueError(
            f"answers and contexts must be aligned, got {len(answers)} and {len(contexts)} items"
        )

    _answers = []
    for answer, context in zip(answers, contexts):
        # copy the dict itself: a shallow list copy would still mutate the caller's dicts
        answer = dict(answer)
        # this is the answer which is extracted from context
        answer_bound = answer['text']
        # start character position of the answer as stored in the dataset
        start_idx = answer['answer_start']
        end_idx = start_idx + len(answer_bound)

        # realign only when an exact match exists one or two characters to the left
        if context[start_idx:end_idx] != answer_bound:
            for shift in (1, 2):
                shifted_start = start_idx - shift
                if shifted_start >= 0 and context[shifted_start:end_idx - shift] == answer_bound:
                    start_idx, end_idx = shifted_start, end_idx - shift
                    break

        answer['answer_start'] = start_idx
        answer['answer_end'] = end_idx
        _answers.append(answer)
    return _answers



# Add the 'answer_end' character offset to the answers of both splits.
train_answers = apply_end_index(train_answers, train_contexts)
valid_answers = apply_end_index(valid_answers, valid_contexts)

In [None]:
from transformers import AutoTokenizer

# Tokenizer matching the 'distilbert-base-uncased' checkpoint; use_fast=True requests the
# fast implementation. encode_data below calls char_to_token on the encodings it returns.
tokenizer = AutoTokenizer.from_pretrained('distilbert-base-uncased', use_fast=True)

In [None]:
def encode_data(contexts: list, questions: list, answers: list) -> dict:
    """
    Tokenize (context, question) pairs and attach token-level answer labels.

    Args:
        contexts: context strings (passed to the tokenizer as the first sequence).
        questions: question strings, aligned with `contexts`.
        answers: dicts with character offsets 'answer_start' (inclusive) and
            'answer_end' (exclusive) into the matching context.

    Returns:
        The tokenizer's encodings, updated with two lists 'start_positions' and
        'end_positions': the token index of the first and of the last answer token.
        When the answer start has no token (e.g. it was truncated away) both labels
        are set to ``tokenizer.model_max_length``.
    """
    encodings = tokenizer(contexts, questions, truncation=True, padding=True, return_tensors="pt")

    # add start and end positions to encodings
    start_positions, end_positions = list(), list()
    # label used for answers that cannot be located in the encoded context
    missing = tokenizer.model_max_length

    for index, answer in enumerate(answers):
        char_start, char_end = answer['answer_start'], answer['answer_end']
        start_value = encodings.char_to_token(index, char_start)

        # if start position is None, the answer passage has been truncated;
        # there is then no meaningful end token either
        if start_value is None:
            start_positions.append(missing)
            end_positions.append(missing)
            continue

        # 'answer_end' is exclusive, so the last answer character sits at char_end - 1.
        # Walk backwards from there (skipping characters without a token, e.g. spaces
        # or truncated text) but never further than the start of the answer.
        end_value = None
        for char_index in range(char_end - 1, char_start - 1, -1):
            end_value = encodings.char_to_token(index, char_index)
            if end_value is not None:
                break
        if end_value is None:
            # only reachable for an empty answer span; collapse it onto the start token
            end_value = start_value

        start_positions.append(start_value)
        end_positions.append(end_value)

    encodings.update({
        'start_positions': start_positions, 'end_positions': end_positions
    })

    return encodings



# Tokenize both splits and attach the start/end token labels.
train_encodings = encode_data(train_contexts, train_questions, train_answers)
valid_encodings = encode_data(valid_contexts, valid_questions, valid_answers)

# Last expression of the cell: shows which fields the encodings contain.
train_encodings.keys()

In [None]:
# The raw lists have been encoded above; remove the notebook-level names
# (presumably to let the memory be reclaimed).
del train_contexts, train_questions, train_answers
del valid_contexts, valid_questions, valid_answers

In [None]:
import torch


class SquadDataset(torch.utils.data.Dataset):
    """Map-style dataset over a dict-like collection of per-sample encodings.

    `encodings` maps field names to indexable containers (tensors or lists) that all
    share the same first dimension; it must contain the key 'input_ids', which
    determines the dataset length.
    """

    def __init__(self, encodings: dict) -> None:
        self.encodings = encodings

    def __getitem__(self, index: int) -> dict:
        """Return ``{field: tensor}`` for the sample at `index`."""
        item = {}
        for key, val in self.encodings.items():
            value = val[index]
            if isinstance(value, torch.Tensor):
                # copy an existing tensor the recommended way; torch.tensor(tensor)
                # emits a copy-construct UserWarning for every item
                item[key] = value.clone().detach()
            else:
                # plain Python values (e.g. the label lists) are converted as before
                item[key] = torch.tensor(value)
        return item

    def __len__(self):
        """Number of samples, taken from the 'input_ids' field."""
        return len(self.encodings['input_ids'])


# Wrap the encoded train and dev splits as torch datasets.
train_ds = SquadDataset(train_encodings)
valid_ds = SquadDataset(valid_encodings)

In [None]:
# Remove the notebook-level names only; the datasets above keep their own
# references to the encodings.
del train_encodings, valid_encodings

In [None]:
#Fine-tune the QuestionAnswering Transformer Model
from transformers import AutoModelForQuestionAnswering

# use the GPU when torch can see one, otherwise fall back to the CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# pretrained DistilBERT checkpoint loaded into a question-answering model class
model = AutoModelForQuestionAnswering.from_pretrained('distilbert-base-uncased')

# move the model to the selected device and switch it to training mode
model.to(device).train()

In [None]:
"""
This cell is adapted from `https://github.com/michaelrzhang/lookahead/blob/master/lookahead_pytorch.py`, which is the
source code for the paper `Lookahead Optimizer: k steps forward, 1 step back` (https://arxiv.org/abs/1907.08610).
"""


from collections import defaultdict

import torch
from torch.optim.optimizer import Optimizer


class Lookahead(Optimizer):
    r"""PyTorch implementation of the lookahead wrapper.
    Lookahead Optimizer: https://arxiv.org/abs/1907.08610

    Wraps an inner optimizer. Every ``la_steps`` calls to :meth:`step` the parameters
    are interpolated between their current ("fast") values and the values cached at
    the previous synchronisation ("slow" weights), and the result is cached again.

    Note: ``Optimizer.__init__`` is intentionally not called; ``param_groups`` is
    forwarded to the inner optimizer and ``state`` holds only the lookahead caches.
    """

    def __init__(self, optimizer, la_steps=5, la_alpha=0.8, pullback_momentum="none"):
        """optimizer: inner optimizer
        la_steps (int): number of lookahead steps
        la_alpha (float): linear interpolation factor. 1.0 recovers the inner optimizer.
        pullback_momentum (str): change to inner optimizer momentum on interpolation update;
            one of "reset", "pullback" or "none" (case-insensitive). "reset" and "pullback"
            read/write the inner optimizer's per-parameter "momentum_buffer" state entry.
        """
        self.optimizer = optimizer
        self._la_step = 0  # counter for inner optimizer
        self.la_alpha = la_alpha
        self._total_la_steps = la_steps
        pullback_momentum = pullback_momentum.lower()
        assert pullback_momentum in ["reset", "pullback", "none"]
        self.pullback_momentum = pullback_momentum

        self.state = defaultdict(dict)

        # Cache the current optimizer parameters
        for group in optimizer.param_groups:
            for p in group['params']:
                param_state = self.state[p]
                param_state['cached_params'] = torch.zeros_like(p.data)
                param_state['cached_params'].copy_(p.data)
                if self.pullback_momentum == "pullback":
                    param_state['cached_mom'] = torch.zeros_like(p.data)

    def __getstate__(self):
        return {
            'state': self.state,
            'optimizer': self.optimizer,
            'la_alpha': self.la_alpha,
            '_la_step': self._la_step,
            '_total_la_steps': self._total_la_steps,
            'pullback_momentum': self.pullback_momentum
        }

    def zero_grad(self, *args, **kwargs):
        """Clear gradients via the inner optimizer; extra arguments are forwarded to it."""
        self.optimizer.zero_grad(*args, **kwargs)

    def get_la_step(self):
        """Number of inner steps taken since the last synchronisation."""
        return self._la_step

    def state_dict(self):
        """State dict of the inner optimizer (the lookahead caches are not included)."""
        return self.optimizer.state_dict()

    def load_state_dict(self, state_dict):
        """Load `state_dict` into the inner optimizer."""
        self.optimizer.load_state_dict(state_dict)

    def _backup_and_load_cache(self):
        """Useful for performing evaluation on the slow weights (which typically generalize better)
        """
        for group in self.optimizer.param_groups:
            for p in group['params']:
                param_state = self.state[p]
                param_state['backup_params'] = torch.zeros_like(p.data)
                param_state['backup_params'].copy_(p.data)
                p.data.copy_(param_state['cached_params'])

    def _clear_and_load_backup(self):
        """Undo `_backup_and_load_cache`: restore the backed-up weights and drop the backup."""
        for group in self.optimizer.param_groups:
            for p in group['params']:
                param_state = self.state[p]
                p.data.copy_(param_state['backup_params'])
                del param_state['backup_params']

    @property
    def param_groups(self):
        return self.optimizer.param_groups

    def step(self, closure=None):
        """Performs a single Lookahead optimization step.
        Arguments:
            closure (callable, optional): A closure that reevaluates the model
                and returns the loss.
        Returns:
            Whatever the inner optimizer's ``step`` returns.
        """
        loss = self.optimizer.step(closure)
        self._la_step += 1

        if self._la_step >= self._total_la_steps:
            self._la_step = 0
            # Lookahead and cache the current optimizer parameters
            for group in self.optimizer.param_groups:
                for p in group['params']:
                    param_state = self.state[p]
                    p.data.mul_(self.la_alpha).add_(param_state['cached_params'], alpha=1.0 - self.la_alpha)  # crucial line
                    param_state['cached_params'].copy_(p.data)
                    if self.pullback_momentum == "pullback":
                        internal_momentum = self.optimizer.state[p]["momentum_buffer"]
                        # keyword `alpha` form, consistent with the parameter update above
                        # (the positional add_(Number, Tensor) overload is deprecated)
                        internal_momentum.mul_(self.la_alpha).add_(
                            param_state["cached_mom"], alpha=1.0 - self.la_alpha)
                        self.optimizer.state[p]["momentum_buffer"] = internal_momentum
                        # store a snapshot: keeping a reference to the live buffer would let
                        # the inner optimizer's in-place updates overwrite the cached value
                        param_state["cached_mom"].copy_(internal_momentum)
                    elif self.pullback_momentum == "reset":
                        self.optimizer.state[p]["momentum_buffer"] = torch.zeros_like(p.data)

        return loss

In [None]:
# transformers' own AdamW is deprecated in favour of the PyTorch implementation
from torch.optim import AdamW

# initialize the AdamW optimizer. eps and weight_decay are set explicitly to the
# defaults of the previously used transformers.AdamW so the update rule is unchanged;
# note that weight_decay=0.0 means no weight decay is applied -- raise it to regularize.
base  = AdamW(model.parameters(), lr=1e-4, eps=1e-6, weight_decay=0.0)
# wrap it so the weights are synchronised with the slow weights every few steps
optim = Lookahead(base)

In [None]:
from torch.utils.data import DataLoader
from tqdm import tqdm

import warnings
warnings.simplefilter("ignore")


# shuffled mini-batches of 16 training samples
train_loader = DataLoader(train_ds, batch_size=16, shuffle=True)


for epoch in range(3):
    # make sure the model is in training mode at the start of every epoch
    model.train()

    # tqdm wraps the loader to render a progress bar
    loop = tqdm(train_loader, leave=True)
    for batch in loop:
        # drop gradients accumulated by the previous step
        optim.zero_grad()

        # move the four tensors the model consumes onto the training device
        input_ids, attention_mask, start_positions, end_positions = (
            batch[name].to(device)
            for name in ('input_ids', 'attention_mask', 'start_positions', 'end_positions')
        )

        # forward pass; passing the label positions makes the model return the loss first
        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            start_positions=start_positions,
            end_positions=end_positions,
        )
        loss = outputs[0]

        # backpropagate, then let the optimizer update the parameters
        loss.backward()
        optim.step()

        # show the epoch number and the current batch loss on the progress bar
        loop.set_description(f'Epoch {epoch}')
        loop.set_postfix(loss=loss.item())

In [None]:
MODEL_DIR = "./model"

# exist_ok=True makes the cell safe to re-run and avoids the check-then-create race
os.makedirs(MODEL_DIR, exist_ok=True)


# persist the tokenizer and the fine-tuned weights side by side
tokenizer.save_pretrained(MODEL_DIR)
model.save_pretrained(MODEL_DIR)

In [None]:
from transformers import AutoTokenizer, AutoModelForQuestionAnswering

# Same directory the save cell above writes the tokenizer and model to.
MODEL_DIR = "./model"

# Load tokenizer and model from disk; this rebinds the notebook-level names
# `tokenizer` and `model` used by the cells below.
tokenizer = AutoTokenizer.from_pretrained(MODEL_DIR, use_fast=True)
model = AutoModelForQuestionAnswering.from_pretrained(MODEL_DIR)

In [None]:
from torch.utils.data import DataLoader


# switch model out of training mode
model.eval()
model = model.to(device)

# initialize validation set data loader
val_loader = DataLoader(valid_ds, batch_size=16)

# per-batch start/end position accuracies (two entries per batch)
acc = list()
# exact match needs BOTH positions right for a sample, counted over all samples so a
# smaller final batch is not over-weighted
exact_matches, total_samples = 0, 0

# loop through batches
for batch in val_loader:
    # we don't need to calculate gradients as we're not training
    with torch.no_grad():
        # pull batched items from loader
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)

        # we will use true positions for accuracy calc
        start_true = batch['start_positions'].to(device)
        end_true = batch['end_positions'].to(device)

        # make predictions
        outputs = model(input_ids, attention_mask=attention_mask)
        # pull prediction tensors out and argmax to get predicted tokens
        start_pred = torch.argmax(outputs['start_logits'], dim=1)
        end_pred = torch.argmax(outputs['end_logits'], dim=1)

        start_hit = start_pred == start_true
        end_hit = end_pred == end_true

        # calculate accuracy for both and append to accuracy list
        acc.append((start_hit.sum()/len(start_pred)).item())
        acc.append((end_hit.sum()/len(end_pred)).item())

        # a sample is an exact match only when start AND end are both correct
        exact_matches += (start_hit & end_hit).sum().item()
        total_samples += len(start_pred)

# report both metrics; guard against an empty validation set
if total_samples:
    print(f"Average start/end position accuracy: {sum(acc)/len(acc)}")
    print(f"Score of the model based on EM: {exact_matches/total_samples}")
else:
    print("Validation set is empty; no score computed.")

In [None]:
def answer_to_questions(context: str, questions: list) -> list:
    '''
    Return a list of answers to a list of questions based on context.

    Each question is paired with the same context, the model's highest-scoring start
    and end tokens are selected, and the tokens between them (inclusive) are decoded
    back to a string. The context and every question/answer pair are also printed.

    Args:
        context: passage the answers are extracted from.
        questions: list of question strings; an empty list yields an empty result.

    Returns:
        One answer string per question, in the same order. An answer is empty when the
        predicted end token precedes the predicted start token.
    '''
    # nothing to ask: skip tokenization and the model call entirely
    if not questions:
        return []

    # encode the inputs
    encodings = tokenizer([context]*len(questions), questions, truncation=True, padding=True, return_tensors="pt")
    encodings = encodings.to(device)
    # make predictions; inference only, so no autograd graph is needed
    with torch.no_grad():
        outputs = model(**encodings)
    # pull prediction tensors out and argmax to get predicted tokens
    start_pred = torch.argmax(outputs['start_logits'], dim=1)
    end_pred = torch.argmax(outputs['end_logits'], dim=1)

    answers = list()
    for index, (start_idx, end_idx) in enumerate(zip(start_pred, end_pred)):
        # end_idx is inclusive, hence the +1 in the slice
        tokens = tokenizer.convert_ids_to_tokens(encodings['input_ids'][index][start_idx:end_idx+1])
        answers.append( tokenizer.convert_tokens_to_string(tokens) )

    # print the results
    print("Context:")
    pprint(context)
    print()
    for question, answer in zip(questions, answers):
        print(f"Q:  {question}")
        print(f"A:  {answer}")
        print("-"*60)

    return answers

In [None]:
# Demo: ask two questions about a short passage on the Olympic Games.
context = "The modern Olympic Games or Olympics (French: Jeux olympiques)[1][2] are leading international sporting events featuring summer and winter sports competitions in which thousands of athletes from around the world participate in a variety of competitions. The Olympic Games are considered the world's foremost sports competition with more than 200 nations participating.[3] The Olympic Games are normally held every four years, alternating between the Summer and Winter Olympics every two years in the four-year period."
questions = [
    "How often do the Olympic games hold?",
    "How many nations do participate in each Olympic?"
]

# the function prints the question/answer pairs itself, so the returned list is discarded
_ = answer_to_questions(context, questions)

In [None]:
# Demo: ask a single question about a short passage on the Vikings.
context = "Vikings is the modern name given to seafaring people primarily from Scandinavia (present-day Denmark, Norway and Sweden), who from the late 8th to the late 11th centuries raided, pirated, traded and settled throughout parts of Europe. They also voyaged as far as the Mediterranean, North Africa, the Middle East, and North America. In some of the countries they raided and settled in, this period is popularly known as the Viking Age, and the term \"Viking\" also commonly includes the inhabitants of the Scandinavian homelands as a collective whole. The Vikings had a profound impact on the Early medieval history of Scandinavia, the British Isles, France, Estonia, and Kievan Rus'."
questions = [
    "When vikings started raided?",
]

# the function prints the question/answer pairs itself, so the returned list is discarded
_ = answer_to_questions(context, questions)