In [1]:
!pip install datasets transformers pytorch-lightning

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [2]:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals

import torch
from torch.jit import script, trace
import torch.nn as nn
from torch import optim
import torch.nn.functional as F
import csv
import random
import re
import os
import unicodedata
import codecs
from io import open
import itertools
import math


USE_CUDA = torch.cuda.is_available()
device = torch.device("cuda" if USE_CUDA else "cpu")

In [3]:
from datasets import load_dataset
from transformers import BertTokenizerFast
import pandas as pd
from tqdm import tqdm
import torch
from torch import nn
import pytorch_lightning as pl
from pytorch_lightning.callbacks.early_stopping import EarlyStopping
from torch.nn import functional as F

# Create datasets

In [4]:
squadv2 = load_dataset("squad_v2")

Reusing dataset squad_v2 (/root/.cache/huggingface/datasets/squad_v2/squad_v2/2.0.0/09187c73c1b837c95d9a249cd97c2c3f1cebada06efe667b4427714b27639b1d)


  0%|          | 0/2 [00:00<?, ?it/s]

In [5]:
squadv2 # train and validation sets (we will treat validation set as test set)

DatasetDict({
    train: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers'],
        num_rows: 130319
    })
    validation: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers'],
        num_rows: 11873
    })
})

In [6]:
from transformers.utils.dummy_pt_objects import LayoutLMv2ForQuestionAnswering

def create_dataset(squad_data, split=None):
    print("FIRST PASS")
    contexts = set()
    for row in tqdm(squad_data):
        contexts.add(row["context"])
    
    if split:
        contexts = tuple(contexts)
        n_valid = int(split*len(contexts))
        splits = [contexts[:n_valid], contexts[n_valid:]]
    else:
        splits = [tuple(contexts)]    

    full_data = {s: {
        'question': [],
        'context': [],
        'orig_answer': [],
        'answer_begin': [],
        'answer_end': [],
    } for s in splits}

    print("SECOND PASS")
    for row in tqdm(squad_data):
        # Let's ignore all impossible answers for now

        answers_start, answers_text = row['answers']["answer_start"], row["answers"]["text"]
        answers_full = list(set(list(zip(answers_start, answers_text))))
        for start_idx, answer_text in answers_full:
            text = row['context']
            end_idx = start_idx + len(answer_text)

            for key, data in full_data.items():
                if text in key:
                    data['question'].append(row['question'])
                    data['context'].append(text)
                    data['orig_answer'].append(answer_text)
                    data['answer_begin'].append(start_idx)

                    # Deal with the problem of 1 or 2 more characters 
                    if text[start_idx:end_idx] == answer_text:
                        data['answer_end'].append(end_idx)
                    else:
                        raise RuntimeError("There are only 1 or 2 character shifts in the dataset so this error should never happen")
    
    if len(splits) == 1:
        return full_data[splits[0]]
    return full_data[splits[1]], full_data[splits[0]]

In [7]:
train, valid = create_dataset(squadv2["train"], 0.1)
test = create_dataset(squadv2["validation"])

FIRST PASS


100%|██████████| 130319/130319 [00:14<00:00, 8923.46it/s]


SECOND PASS


100%|██████████| 130319/130319 [00:48<00:00, 2695.35it/s]


FIRST PASS


100%|██████████| 11873/11873 [00:01<00:00, 8357.90it/s]


SECOND PASS


100%|██████████| 11873/11873 [00:01<00:00, 6980.47it/s]


In [8]:
train_df = pd.DataFrame(train)
train_df

Unnamed: 0,question,context,orig_answer,answer_begin,answer_end
0,When did Beyonce start becoming popular?,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,in the late 1990s,269,286
1,What areas did Beyonce compete in when she was...,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,singing and dancing,207,226
2,When did Beyonce leave Destiny's Child and bec...,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,2003,526,530
3,In what city and state did Beyonce grow up?,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,"Houston, Texas",166,180
4,In which decade did Beyonce become famous?,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,late 1990s,276,286
...,...,...,...,...,...
78116,In what US state did Kathmandu first establish...,"Kathmandu Metropolitan City (KMC), in order to...",Oregon,229,235
78117,What was Yangon previously known as?,"Kathmandu Metropolitan City (KMC), in order to...",Rangoon,414,421
78118,With what Belorussian city does Kathmandu have...,"Kathmandu Metropolitan City (KMC), in order to...",Minsk,476,481
78119,In what year did Kathmandu create its initial ...,"Kathmandu Metropolitan City (KMC), in order to...",1975,199,203


In [9]:
valid_df = pd.DataFrame(valid)
valid_df

Unnamed: 0,question,context,orig_answer,answer_begin,answer_end
0,What mental health issue did Beyonce go through?,LeToya Luckett and Roberson became unhappy wit...,depression,169,179
1,What event occured after she was publicly crit...,LeToya Luckett and Roberson became unhappy wit...,boyfriend left her,320,338
2,Who supported Beyonce through her depression?,LeToya Luckett and Roberson became unhappy wit...,her mother,714,724
3,What event caused Beyonce's depression?,LeToya Luckett and Roberson became unhappy wit...,split with Luckett and Rober,194,222
4,How long was Beyonce depressed?,LeToya Luckett and Roberson became unhappy wit...,a couple of years,396,413
...,...,...,...,...,...
8695,Contemporary Kathmandu art combines traditiona...,"Kathmandu is a center for art in Nepal, displa...",modern,292,298
8696,How many types are Nepali works of art typical...,"Kathmandu is a center for art in Nepal, displa...",two,411,414
8697,What do the Tibetans call traditional idealist...,"Kathmandu is a center for art in Nepal, displa...",Thangkas,519,527
8698,What UK charity works on behalf of Kathmandu art?,"Kathmandu is a center for art in Nepal, displa...",Kathmandu Contemporary Art Centre,846,879


In [10]:
test_df = pd.DataFrame(test)
test_df

Unnamed: 0,question,context,orig_answer,answer_begin,answer_end
0,In what country is Normandy located?,The Normans (Norman: Nourmands; French: Norman...,France,159,165
1,When were the Normans in Normandy?,The Normans (Norman: Nourmands; French: Norman...,10th and 11th centuries,94,117
2,When were the Normans in Normandy?,The Normans (Norman: Nourmands; French: Norman...,in the 10th and 11th centuries,87,117
3,From which countries did the Norse originate?,The Normans (Norman: Nourmands; French: Norman...,"Denmark, Iceland and Norway",256,283
4,Who was the Norse leader?,The Normans (Norman: Nourmands; French: Norman...,Rollo,308,313
...,...,...,...,...,...
10383,What is a very seldom used unit of mass in the...,"The pound-force has a metric counterpart, less...",slug,274,278
10384,What is a very seldom used unit of mass in the...,"The pound-force has a metric counterpart, less...",the metric slug,263,278
10385,What is a very seldom used unit of mass in the...,"The pound-force has a metric counterpart, less...",metric slug,267,278
10386,What seldom used term of a unit of force equal...,"The pound-force has a metric counterpart, less...",kip,712,715


In [11]:
tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased', do_lower_case=True, clean_text=True)
# tokenize
train_tokenizer = tokenizer(train['context'], train['question'],
                  truncation=True, padding='max_length',
                  max_length=512, return_tensors='pt')
valid_tokenizer = tokenizer(valid['context'], valid['question'],
                  truncation=True, padding='max_length',
                  max_length=512, return_tensors='pt')
test_tokenizer = tokenizer(test['context'], test['question'],
                  truncation=True, padding='max_length',
                  max_length=512, return_tensors='pt')

In [12]:
def add_token_positions(encodings, data):
  start_positions = []
  end_positions = []

  count = 0
  for i in range(len(data['context'])):
    start_positions.append(encodings.char_to_token(i, data['answer_begin'][i]))
    end_positions.append(encodings.char_to_token(i, data['answer_end'][i]))

    # if start position is None, the answer passage has been truncated
    if start_positions[-1] is None:
      start_positions[-1] = tokenizer.model_max_length
      
    # if end position is None, the 'char_to_token' function points to the space after the correct token, so add - 1
    if end_positions[-1] is None:
      end_positions[-1] = encodings.char_to_token(i, data['answer_end'][i] - 1)
      # if end position is still None the answer passage has been truncated
      if end_positions[-1] is None:
        count += 1
        end_positions[-1] = tokenizer.model_max_length

  # Update the data in dictionary
  encodings.update({ 'start_positions': torch.tensor(start_positions), 'end_positions': torch.tensor(end_positions) })

In [13]:
add_token_positions(train_tokenizer, train)
add_token_positions(valid_tokenizer, valid)
add_token_positions(test_tokenizer, test)

In [14]:
def clean_tokenizer(tokenizer):
    del tokenizer['token_type_ids']
    del tokenizer['attention_mask']

In [15]:
clean_tokenizer(train_tokenizer)
clean_tokenizer(valid_tokenizer)
clean_tokenizer(test_tokenizer)

In [16]:
train_tokenizer

{'input_ids': tensor([[  101, 20773, 21025,  ...,     0,     0,     0],
        [  101, 20773, 21025,  ...,     0,     0,     0],
        [  101, 20773, 21025,  ...,     0,     0,     0],
        ...,
        [  101, 28045,  4956,  ...,     0,     0,     0],
        [  101, 28045,  4956,  ...,     0,     0,     0],
        [  101, 28045,  4956,  ...,     0,     0,     0]]), 'start_positions': tensor([ 67,  55, 128,  ...,  97,  36,   1]), 'end_positions': tensor([ 70,  57, 129,  ...,  97,  36,   3])}

In [17]:
train_tokenizer.input_ids.shape

torch.Size([78121, 512])

In [48]:
class SquadDataset(torch.utils.data.Dataset):
    def __init__(self, encodings):
        self.encodings = encodings

    def __getitem__(self, idx):
        return {
            'input_ids': torch.tensor(self.encodings['input_ids'], dtype=torch.float32)[idx].unsqueeze(dim=1),
            'start_positions': self.encodings['start_positions'][idx],
            'end_positions': self.encodings['end_positions'][idx],
        }

    def __len__(self):
        return len(self.encodings.input_ids)

In [198]:
BATCH_SIZE=64

train_dataset = SquadDataset(train_tokenizer)
val_dataset = SquadDataset(valid_tokenizer)
test_dataset = SquadDataset(test_tokenizer)

train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
val_loader = torch.utils.data.DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False)
test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)

# Model definition

In [199]:
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

In [200]:
class QABiLSTM(pl.LightningModule):
    def __init__(self, input_dim, hidden_dim, output_dim, lstm_layers, lstm_dropout, fc_dropout):
        super().__init__()
        # LAYER 1: BiLSTM
        self.lstm = nn.LSTM(
            input_size=input_dim,
            hidden_size=hidden_dim,
            num_layers=lstm_layers,
            bidirectional=True,
            dropout=lstm_dropout if lstm_layers > 1 else 0,
            batch_first=True
        )

        # LAYER 2: Fully-connected
        self.fc_dropout = nn.Dropout(fc_dropout)
        self.fc = nn.Linear(hidden_dim * 2, output_dim)  # times 2 for bidirectional

        self.init_weights()

    def forward(self, x):
        # lstm_out = [batch size, sentence length, hidden dim * 2]
        lstm_out, (hn, cn) = self.lstm(x)
        # logits = [batch size, sentence length, output dim]
        logits = self.fc(self.fc_dropout(lstm_out))

        (start, end) = logits.split(1, dim=-1)
        start = start.squeeze(-1).contiguous()
        end = end.squeeze(-1).contiguous()

        return start, end

    def init_weights(self):
        for name, param in self.named_parameters():
            nn.init.normal_(param.data, mean=0, std=0.1)

    # def init_embeddings(self, word_pad_idx):
    #     # initialize embedding for padding as zero
    #     self.embedding.weight.data[word_pad_idx] = torch.zeros(self.embedding_dim)

    def count_parameters(self):
        return sum(p.numel() for p in self.parameters() if p.requires_grad)

    def configure_optimizers(self):
        optimizer = torch.optim.Adam(self.parameters(), lr=0.01)
        return {
            "optimizer": optimizer,
            "lr_scheduler": {
                "scheduler": torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer=optimizer, patience=2),
                "monitor": "val_loss",
                "frequency": 2
            },
        }

    def step(self, batch):
        x = batch['input_ids'].to(device)
        y_start_idx = batch['start_positions'].to(device)
        y_end_idx = batch['end_positions'].to(device)
        y_start, y_end = self(x)
        return y_start, y_end, y_start_idx, y_end_idx

    @staticmethod
    def compute_loss(y_start, y_end, y_start_idx, y_end_idx):
        loss1 = F.cross_entropy(y_start, y_start_idx)
        loss2 = F.cross_entropy(y_end, y_end_idx)
        return (loss1 + loss2) / 2


    def training_step(self, train_batch, batch_idx):
        torch.cuda.empty_cache()
        y_start, y_end, y_start_idx, y_end_idx = self.step(train_batch)
        loss = self.compute_loss(y_start, y_end, y_start_idx, y_end_idx)

        self.log('train_loss', loss)
        return loss
    
    def validation_step(self, val_batch, batch_idx):
        y_start, y_end, y_start_idx, y_end_idx = self.step(val_batch)
        loss = self.compute_loss(y_start, y_end, y_start_idx, y_end_idx)
        
        self.log('val_loss', loss)

    def test_step(self, test_batch, batch_idx):
        y_start, y_end, y_start_idx, y_end_idx = self.step(test_batch)
        loss = self.compute_loss(y_start, y_end, y_start_idx, y_end_idx)
        
        self.log('test_loss', loss)

In [201]:
model = QABiLSTM(
    input_dim=1,
    hidden_dim=256,
    output_dim=2,
    lstm_layers=2,
    lstm_dropout=0.1,
    fc_dropout=0.25,
)

In [202]:
model.count_parameters()

2108418

# Training

In [None]:
# training
checkpoint_callback = pl.callbacks.ModelCheckpoint(monitor="val_loss")
trainer = pl.Trainer(max_epochs=100, gpus=1, precision=16, log_every_n_steps=10, callbacks=[EarlyStopping(monitor="val_loss"), checkpoint_callback])
trainer.fit(model, train_loader, val_loader)

Using 16bit native Automatic Mixed Precision (AMP)
GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
Missing logger folder: /content/lightning_logs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name       | Type    | Params
---------------------------------------
0 | lstm       | LSTM    | 2.1 M 
1 | fc_dropout | Dropout | 0     
2 | fc         | Linear  | 1.0 K 
---------------------------------------
2.1 M     Trainable params
0         Non-trainable params
2.1 M     Total params
4.217     Total estimated model params size (MB)


Sanity Checking: 0it [00:00, ?it/s]

  import sys


Training: 0it [00:00, ?it/s]

In [None]:
trainer.test(model, dataloaders=test_loader)