In [None]:
!pip install datasets transformers pytorch-lightning

In [None]:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals

import torch
from torch.jit import script, trace
import torch.nn as nn
from torch import optim
import torch.nn.functional as F
import csv
import random
import re
import os
import unicodedata
import codecs
from io import open
import itertools
import math


# Resolve the compute device once: the CUDA GPU when one is usable, else the CPU.
USE_CUDA = torch.cuda.is_available()
if USE_CUDA:
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

In [None]:
from datasets import load_dataset
from transformers import BertTokenizerFast
import pandas as pd
from tqdm import tqdm
import torch
from torch import nn
import pytorch_lightning as pl
from pytorch_lightning.callbacks.early_stopping import EarlyStopping
from torch.nn import functional as F

# Create datasets

In [None]:
# Load the SQuAD v2 question-answering dataset through the `datasets` loader.
squadv2 = load_dataset(path="squad_v2")

In [None]:
# Show the loaded splits. Only the train and validation splits are used in this
# notebook; the validation split is treated as the test set.
squadv2

In [None]:
from transformers.utils.dummy_pt_objects import LayoutLMv2ForQuestionAnswering

def create_dataset(squad_data, split=None):
    """Flatten SQuAD-style rows into parallel columns, optionally split by context.

    Every distinct ``(answer_start, text)`` pair of a row becomes one example.
    Rows without answers (the "impossible" questions of SQuAD v2) therefore
    contribute nothing. When ``split`` is given, whole contexts are assigned to
    one side or the other, so a passage never appears in both splits.

    Args:
        squad_data: Re-iterable collection of rows (it is traversed twice). Each
            row is a mapping with ``"context"``, ``"question"`` and
            ``"answers"`` (itself holding ``"answer_start"`` and ``"text"``
            lists of equal length).
        split: Fraction of the unique contexts to hold out for validation.
            ``None`` (or any falsy value) disables splitting.

    Returns:
        Without ``split``: one dict with the list-valued keys ``question``,
        ``context``, ``orig_answer``, ``answer_begin`` and ``answer_end``
        (``answer_end`` is an exclusive character offset).
        With ``split``: a ``(train, valid)`` tuple of two such dicts, where
        ``valid`` holds the first ``int(split * n_contexts)`` contexts in
        first-seen order.

    Raises:
        RuntimeError: If an answer text does not match the context at its
            recorded start offset.
    """
    print("FIRST PASS")
    # A dict keeps first-seen order, so the split is reproducible from run to
    # run (iteration order of a set of strings changes between processes).
    seen_contexts = {}
    for row in tqdm(squad_data):
        seen_contexts.setdefault(row["context"], None)
    contexts = tuple(seen_contexts)

    if split:
        n_valid = int(split * len(contexts))
        splits = [contexts[:n_valid], contexts[n_valid:]]
    else:
        splits = [contexts]

    full_data = [
        {
            'question': [],
            'context': [],
            'orig_answer': [],
            'answer_begin': [],
            'answer_end': [],
        }
        for _ in splits
    ]
    # Constant-time lookup of the split a context belongs to, instead of
    # scanning every split tuple for every row.
    split_of_context = {
        context: split_idx
        for split_idx, group in enumerate(splits)
        for context in group
    }

    print("SECOND PASS")
    for row in tqdm(squad_data):
        text = row['context']
        data = full_data[split_of_context[text]]
        answers = row['answers']

        # Drop duplicate annotations while keeping their original order.
        unique_answers = dict.fromkeys(zip(answers["answer_start"], answers["text"]))
        for start_idx, answer_text in unique_answers:
            end_idx = start_idx + len(answer_text)

            # Validate before appending so a failure cannot leave the columns
            # with unequal lengths.
            if text[start_idx:end_idx] != answer_text:
                raise RuntimeError("There are only 1 or 2 character shifts in the dataset so this error should never happen")

            data['question'].append(row['question'])
            data['context'].append(text)
            data['orig_answer'].append(answer_text)
            data['answer_begin'].append(start_idx)
            data['answer_end'].append(end_idx)

    if len(splits) == 1:
        return full_data[0]
    # splits[1] is the larger remainder (train); splits[0] is the held-out part.
    return full_data[1], full_data[0]

In [None]:
# Hold out a fraction of the training contexts for validation. The official
# validation split is kept whole and serves as the test set.
VALID_FRACTION = 0.1
train, valid = create_dataset(squadv2["train"], split=VALID_FRACTION)
test = create_dataset(squadv2["validation"], split=None)

In [None]:
# Tabular view of the flattened training examples for a quick visual check.
train_df = pd.DataFrame.from_dict(train)
train_df

In [None]:
# Tabular view of the flattened validation examples.
valid_df = pd.DataFrame.from_dict(valid)
valid_df

In [None]:
# Tabular view of the flattened test examples.
test_df = pd.DataFrame.from_dict(test)
test_df

In [None]:
# Fast tokenizer loaded from the uncased BERT checkpoint.
tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased', do_lower_case=True, clean_text=True)

# All splits are encoded identically: (context, question) pairs, truncated or
# padded to exactly 512 tokens and returned as PyTorch tensors.
_ENCODE_KWARGS = dict(
    truncation=True,
    padding='max_length',
    max_length=512,
    return_tensors='pt',
)

# NOTE: despite their names, these variables hold the *encodings* produced by
# the tokenizer, not tokenizer objects.
train_tokenizer = tokenizer(train['context'], train['question'], **_ENCODE_KWARGS)
valid_tokenizer = tokenizer(valid['context'], valid['question'], **_ENCODE_KWARGS)
test_tokenizer = tokenizer(test['context'], test['question'], **_ENCODE_KWARGS)

In [None]:
def add_token_positions(encodings, data):
    """Attach token-level answer-span labels to ``encodings`` in place.

    For every example the character span ``[answer_begin, answer_end)`` is
    mapped to the index of its first and last token. The indices returned by
    ``char_to_token`` already address positions in the full ``input_ids`` row
    (special tokens included), so they are stored unchanged; shifting them
    would make the labels point at the wrong tokens.

    Args:
        encodings: Tokenizer output offering ``char_to_token(batch_index,
            char_index)`` and a dict-style ``update``. The context must be the
            first sequence of each encoded pair.
        data: Mapping with parallel lists ``context``, ``answer_begin`` and
            ``answer_end`` (exclusive end offset), aligned with ``encodings``.

    Side effects:
        Adds the integer tensors ``start_positions`` and ``end_positions``
        (created on the module-level ``device``) to ``encodings``. Answers
        with no token inside their span, typically because truncation removed
        them, are labeled ``tokenizer.model_max_length - 1`` at both ends.
    """
    # Sentinel label for answers that did not survive truncation.
    truncated_label = tokenizer.model_max_length - 1

    def _first_mapped_token(example_idx, char_positions):
        """Return the token index of the first character that maps to a token."""
        for char_pos in char_positions:
            token_idx = encodings.char_to_token(example_idx, char_pos)
            if token_idx is not None:
                return token_idx
        return None

    start_positions = []
    end_positions = []
    for i in range(len(data['context'])):
        begin_char = data['answer_begin'][i]
        end_char = data['answer_end'][i]  # exclusive

        # Characters such as whitespace map to no token, so scan inwards from
        # each edge of the answer until a character with a token is found.
        start_token = _first_mapped_token(i, range(begin_char, end_char))
        end_token = _first_mapped_token(i, range(end_char - 1, begin_char - 1, -1))

        start_positions.append(truncated_label if start_token is None else start_token)
        end_positions.append(truncated_label if end_token is None else end_token)

    # Update the data in dictionary
    encodings.update({
        'start_positions': torch.tensor(start_positions, device=device),
        'end_positions': torch.tensor(end_positions, device=device),
    })

In [None]:
# Label every split; each call writes the span labels into the encodings in place.
for _split_encodings, _split_examples in (
    (train_tokenizer, train),
    (valid_tokenizer, valid),
    (test_tokenizer, test),
):
    add_token_positions(_split_encodings, _split_examples)

In [None]:
def clean_tokenizer(tokenizer):
    """Remove the fields the model does not consume from a set of encodings.

    Drops ``token_type_ids`` and ``attention_mask`` in place. Keys that are
    already absent are skipped, so the function (and the notebook cell calling
    it) can be re-run without raising ``KeyError``.

    Args:
        tokenizer: Mutable mapping of encodings. Despite the parameter name,
            this is tokenizer *output*, not a tokenizer object.
    """
    for unused_key in ('token_type_ids', 'attention_mask'):
        tokenizer.pop(unused_key, None)

In [None]:
# Strip the unused fields from the encodings of every split.
for _split_encodings in (train_tokenizer, valid_tokenizer, test_tokenizer):
    clean_tokenizer(_split_encodings)

In [None]:
# Inspect the training encodings at this point in the notebook.
train_tokenizer

In [None]:
# Shape of the encoded training inputs; the second dimension should be 512
# given the max_length padding used when tokenizing.
train_tokenizer.input_ids.shape

In [None]:
class SquadDataset(torch.utils.data.Dataset):
    """Map-style dataset over tokenized examples and their span labels.

    Args:
        encodings: Object supporting both ``encodings['key']`` and
            ``encodings.input_ids`` access, holding ``input_ids``,
            ``start_positions`` and ``end_positions`` with one row per example.
    """

    def __init__(self, encodings):
        self.encodings = encodings

    def __getitem__(self, idx):
        """Return one example as a dict of tensors.

        ``input_ids`` is cast to float32, placed on the module-level ``device``
        and given a trailing feature axis, i.e. each token id becomes a single
        float feature per position.
        """
        # Select the row first and convert only that row. Converting the whole
        # input_ids matrix on every access would copy the entire dataset to the
        # device once per sample.
        token_ids = self.encodings['input_ids'][idx]
        return {
            'input_ids': torch.as_tensor(token_ids, dtype=torch.float32, device=device).unsqueeze(dim=1),
            'start_positions': self.encodings['start_positions'][idx],
            'end_positions': self.encodings['end_positions'][idx],
        }

    def __len__(self):
        """Number of encoded examples."""
        return len(self.encodings.input_ids)

In [None]:
BATCH_SIZE = 64

# Wrap the encodings of each split in a dataset.
train_dataset, val_dataset, test_dataset = (
    SquadDataset(split_encodings)
    for split_encodings in (train_tokenizer, valid_tokenizer, test_tokenizer)
)

# Only the training loader shuffles; evaluation order stays fixed.
train_loader, val_loader, test_loader = (
    torch.utils.data.DataLoader(dataset, batch_size=BATCH_SIZE, shuffle=do_shuffle)
    for dataset, do_shuffle in (
        (train_dataset, True),
        (val_dataset, False),
        (test_dataset, False),
    )
)

# Model definition

In [None]:
# Re-resolve the compute device: the first CUDA GPU when available, else the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda:0')
else:
    device = torch.device('cpu')

In [None]:
class QABiLSTM(pl.LightningModule):
    """BiLSTM extractive-QA model scoring every position as a span start and end.

    A (possibly stacked) bidirectional LSTM is followed by dropout and a linear
    layer producing two values per position: a start logit and an end logit.
    The loss is the mean of the start and end cross-entropies, with the target
    token positions acting as class indices over the sequence axis.

    Args:
        input_dim: Number of features per time step.
        hidden_dim: LSTM hidden size per direction.
        output_dim: Outputs per position; must be 2 (start and end logits).
        lstm_layers: Number of stacked LSTM layers.
        lstm_dropout: Dropout between LSTM layers; unused when ``lstm_layers`` is 1.
        fc_dropout: Dropout applied to the LSTM output before the linear layer.

    Raises:
        ValueError: If ``output_dim`` is not 2.
    """

    def __init__(self, input_dim, hidden_dim, output_dim, lstm_layers, lstm_dropout, fc_dropout):
        super().__init__()
        # forward() unpacks the last axis into exactly (start, end); fail here
        # with a clear message instead of an unpacking error at the first batch.
        if output_dim != 2:
            raise ValueError(f"output_dim must be 2 (start and end logits), got {output_dim}")

        # LAYER 1: BiLSTM
        self.lstm = nn.LSTM(
            input_size=input_dim,
            hidden_size=hidden_dim,
            num_layers=lstm_layers,
            bidirectional=True,
            # nn.LSTM only applies dropout between layers, so disable it for a single layer
            dropout=lstm_dropout if lstm_layers > 1 else 0,
            batch_first=True
        )

        # LAYER 2: Fully-connected
        self.fc_dropout = nn.Dropout(fc_dropout)
        self.fc = nn.Linear(hidden_dim * 2, output_dim)  # times 2 for bidirectional

        self.init_weights()

    def forward(self, x):
        """Compute start and end logits.

        Args:
            x: Float tensor of shape [batch size, sentence length, input dim].

        Returns:
            Tuple ``(start, end)`` of tensors, each [batch size, sentence length].
        """
        # lstm_out = [batch size, sentence length, hidden dim * 2]
        lstm_out, _ = self.lstm(x)
        # logits = [batch size, sentence length, output dim]
        logits = self.fc(self.fc_dropout(lstm_out))

        start, end = logits.split(1, dim=-1)
        start = start.squeeze(-1).contiguous()
        end = end.squeeze(-1).contiguous()

        return start, end

    def init_weights(self):
        """Initialise every parameter (weights and biases) from N(0, 0.1^2)."""
        for param in self.parameters():
            nn.init.normal_(param, mean=0, std=0.1)

    def count_parameters(self):
        """Return the number of trainable scalar parameters."""
        return sum(p.numel() for p in self.parameters() if p.requires_grad)

    def configure_optimizers(self):
        """Adam optimiser with a plateau scheduler driven by ``val_loss``."""
        optimizer = torch.optim.Adam(self.parameters(), lr=0.01)
        return {
            "optimizer": optimizer,
            "lr_scheduler": {
                "scheduler": torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer=optimizer, patience=2),
                "monitor": "val_loss",
                "frequency": 2
            },
        }

    def step(self, batch):
        """Run the model on a batch and return predictions alongside targets.

        Returns:
            ``(start_logits, end_logits, start_targets, end_targets)``.
        """
        x = batch['input_ids']
        y_start_idx = batch['start_positions']
        y_end_idx = batch['end_positions']
        y_start, y_end = self(x)
        return y_start, y_end, y_start_idx, y_end_idx

    @staticmethod
    def compute_loss(y_start, y_end, y_start_idx, y_end_idx):
        """Mean of the start-position and end-position cross-entropy losses."""
        loss1 = F.cross_entropy(y_start, y_start_idx)
        loss2 = F.cross_entropy(y_end, y_end_idx)
        return (loss1 + loss2) / 2

    def training_step(self, train_batch, batch_idx):
        """Compute, log and return the training loss for one batch."""
        # No torch.cuda.empty_cache() here: calling it every step forces a
        # device synchronisation and hands cached blocks back only to
        # re-allocate them on the next batch, without making more memory
        # available to PyTorch.
        loss = self.compute_loss(*self.step(train_batch))

        self.log('train_loss', loss)
        return loss

    def validation_step(self, val_batch, batch_idx):
        """Log the validation loss (monitored by the scheduler and callbacks)."""
        loss = self.compute_loss(*self.step(val_batch))

        self.log('val_loss', loss)

    def test_step(self, test_batch, batch_idx):
        """Log the test loss for one batch."""
        loss = self.compute_loss(*self.step(test_batch))

        self.log('test_loss', loss)

In [None]:
# Hyper-parameters of the span predictor. input_dim is 1 because the dataset
# feeds one float feature per token; lstm_dropout has no effect while
# lstm_layers is 1.
MODEL_CONFIG = dict(
    input_dim=1,
    hidden_dim=64,
    output_dim=2,
    lstm_layers=1,
    lstm_dropout=0.1,
    fc_dropout=0.25,
)
model = QABiLSTM(**MODEL_CONFIG)

In [None]:
# Number of trainable parameters in the model.
model.count_parameters()

# Training

In [None]:
# training
# Checkpoint to resume from. When the file is missing (e.g. a fresh
# environment) training starts from scratch instead of crashing.
RESUME_CKPT = '../input/modell/epoch5-step6200.ckpt'

# `gpus=` is not accepted by recent Lightning releases; select the hardware
# with accelerator/devices and use 16-bit precision only on a GPU.
_on_gpu = torch.cuda.is_available()

checkpoint_callback = pl.callbacks.ModelCheckpoint(monitor="val_loss", every_n_train_steps=100)
trainer = pl.Trainer(
    max_epochs=100,
    accelerator="gpu" if _on_gpu else "cpu",
    devices=1,
    precision=16 if _on_gpu else 32,
    log_every_n_steps=10,
    callbacks=[EarlyStopping(monitor="val_loss"), checkpoint_callback],
)

_resume_from = RESUME_CKPT if os.path.isfile(RESUME_CKPT) else None
if _resume_from is None:
    print(f"Checkpoint {RESUME_CKPT!r} not found; training from scratch.")
trainer.fit(model, train_loader, val_loader, ckpt_path=_resume_from)

In [None]:
# Persist only the learned weights (the state dict) of the trained model.
torch.save(obj=model.state_dict(), f='./final_BiLSTM.pt')

# Testing

In [None]:
# Evaluate on the test loader; the model's test_step logs `test_loss`.
trainer.test(model=model, dataloaders=test_loader)