# ALBERT + BiLSTM Encoder + ALBERT-SQuAD-OUT

Implementaion of ALBERT is taken from Hugging Face library

In [1]:
import os
import torch
import time
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler
import json
import pandas as pd
from squad import Squad
import torch
from torch import nn

from transformers import (
    AlbertConfig,
    AlbertModel,
    AlbertTokenizer,
    squad_convert_examples_to_features
)

from transformers.data.processors.squad import SquadResult, SquadV2Processor, SquadExample
from transformers.data.metrics.squad_metrics import compute_predictions_logits, get_final_text
from evaluate_answers import *

In [2]:
output_dir = ""
do_lower_case = True

# Tokenizer for ALBERT's input format
tokenizer_class = AlbertTokenizer
tokenizer = tokenizer_class.from_pretrained(
    "albert-base-v2", do_lower_case=True)

In [3]:
# Creating train and test dataframes
train_sq = Squad("./data/train-v2.0.json")
test_sq = Squad("./data/dev-v2.0.json")
train_df = train_sq.get_dataframe()
test_df = test_sq.get_dataframe()

In [4]:
def create_train_dataset(train_df, tokenizer):
    """
    Create dataset from DataFrame
    
    returns: 
        dataset - pytorch dataset of training data features
    """
    examples = []
    for i, question in enumerate(train_df['content']):
        example = SquadExample(
            qas_id=str(i),
            question_text=question,
            context_text=train_df['context'][i],
            answer_text=train_df['answer'][i],
            start_position_character=train_df['answer_start'][i],
            title="Train",
            is_impossible=False,
            answers=None,
        )
        examples.append(example)
    
    features, dataset = squad_convert_examples_to_features(
        examples=examples,
        tokenizer=tokenizer,
        max_seq_length=384,
        doc_stride=128,
        max_query_length=64,
        is_training=True,
        return_dataset="pt",
        threads=32,
    )
    
    
    return dataset, features, examples

In [5]:
dataset, _, _ = create_train_dataset(train_df, tokenizer)


train_sampler = SequentialSampler(dataset)
train_dataloader = DataLoader(dataset, sampler=train_sampler, batch_size=32)

convert squad examples to features: 100%|██████████| 130319/130319 [01:54<00:00, 1140.35it/s]
add example index and unique id: 100%|██████████| 130319/130319 [00:00<00:00, 579110.50it/s]


In [6]:
class AlbertBiLSTM4QA(nn.Module):
    def __init__(self, freeze_albert = True):
        super(AlbertBiLSTM4QA, self).__init__()
        
        # create model's config
        config_class, model_class = (AlbertConfig, AlbertModel)
        config = config_class.from_pretrained("albert-base-v2")
        config.output_hidden_states=True
        self.backbone = model_class.from_pretrained("albert-base-v2", config=config)
        
        # freeze ALBERT layers if freeze_albert is True
        if freeze_albert:
            for param in self.backbone.embeddings.parameters():
                param.requires_grad = False

            for param in self.backbone.encoder.parameters():
                param.requires_grad = False

            for param in self.backbone.pooler.parameters():
                param.requires_grad = False

            for param in self.backbone.pooler_activation.parameters():
                param.requires_grad = False
        
        self.encoder = nn.LSTM(768, 384, num_layers=3, dropout=0.2, bidirectional=True)
        
        self.QA = nn.Sequential(
            nn.Linear(768,2)
        )
        
    
    def forward(self, batch, device='cpu'):
        # inference through ALBERT
        self.backbone.eval()
        batch = tuple(t.to(device) for t in batch)
        with torch.no_grad():
            inputs = {
                "input_ids": batch[0],
                "attention_mask": batch[1],
                "token_type_ids": batch[2],
            }

            features, _, _ = self.backbone(**inputs)
        
        # permute features dimensions to satisfy BiLSTM's input (seq_inp, batch_size, hidden_state)
        features = features.permute(1,0,2)
        x, _ = self.encoder(features)
        # permute back
        x = x.permute(1,0,2)
        logits = self.QA(x)
        
        # get start and end logits also calculate loss
        start_logits, end_logits = logits.split(1, dim=-1)
        start_logits = start_logits.squeeze(-1).squeeze(1)
        end_logits = end_logits.squeeze(-1).squeeze(1)
        
        start_positions = batch[3]
        end_positions = batch[4]
        
        
        if start_positions is not None and end_positions is not None:
            # If we are on multi-GPU, split add a dimension
            if len(start_positions.size()) > 1:
                start_positions = start_positions.squeeze(-1)
            if len(end_positions.size()) > 1:
                end_positions = end_positions.squeeze(-1)
            # sometimes the start/end positions are outside our model inputs, we ignore these terms
            ignored_index = start_logits.size(1)
            start_positions.clamp_(0, ignored_index)
            end_positions.clamp_(0, ignored_index)

            loss_fct = nn.CrossEntropyLoss(ignore_index=ignored_index)
            start_loss = loss_fct(start_logits, start_positions)
            end_loss = loss_fct(end_logits, end_positions)
            total_loss = (start_loss + end_loss)/2
        return total_loss, start_logits, end_logits

In [7]:
import torch.optim as optim

device = torch.device("cuda:1" if torch.cuda.is_available() else "cpu")
modelA = AlbertBiLSTM4QA(freeze_albert=True).to(device)

criterion = nn.CrossEntropyLoss()
optimizer = optim.AdamW(modelA.parameters(), lr=3e-5)

In [8]:
def train(model, epochs, optimizer):
    model.zero_grad()
    f = open("logs.txt", "w")
    for epoch in range(epochs):
        loss = 0
        for idx, batch in enumerate(train_dataloader):
            model_loss ,start_logits, end_logits = model(batch,device=device)
            loss += model_loss.item()          
            model_loss.backward()
            optimizer.step()
            model.zero_grad()
            
            if idx % 100 == 0:
                start_pred = torch.argmax(start_logits, dim=1).cpu()
                end_pred = torch.argmax(end_logits, dim=1).cpu()
                pair_accuracy = ((start_pred==batch[3])*(end_pred==batch[4])).sum().float() / len(batch[3])
                start_accuracy = (start_pred==batch[3]).sum().float() / len(batch[3])
                end_accuracy = (end_pred==batch[4]).sum().float() / len(batch[4])
                string = f"[{idx+1}/{len(train_dataloader)}]Epoch: {epoch+1}/{epochs} Loss: {model_loss.item()} Pair Accuracy: {pair_accuracy} Start Accuracy: {start_accuracy} End Accuracy: {end_accuracy}"
                print(string)
                f.write(string)
                torch.save(model.state_dict(), "AlbertBiLSTM.pth")
    f.close()
    
    return model

In [9]:
train(modelA, 2, optimizer)

[1/4132]Epoch: 1/2 Loss: 5.955636501312256 Pair Accuracy: 0.0 Start Accuracy: 0.0 End Accuracy: 0.0
[101/4132]Epoch: 1/2 Loss: 4.423079490661621 Pair Accuracy: 0.0625 Start Accuracy: 0.21875 End Accuracy: 0.09375
[201/4132]Epoch: 1/2 Loss: 2.959162712097168 Pair Accuracy: 0.125 Start Accuracy: 0.25 End Accuracy: 0.28125
[301/4132]Epoch: 1/2 Loss: 3.1466174125671387 Pair Accuracy: 0.34375 Start Accuracy: 0.34375 End Accuracy: 0.5
[401/4132]Epoch: 1/2 Loss: 3.51151967048645 Pair Accuracy: 0.09375 Start Accuracy: 0.15625 End Accuracy: 0.34375
[501/4132]Epoch: 1/2 Loss: 3.3394079208374023 Pair Accuracy: 0.0 Start Accuracy: 0.1875 End Accuracy: 0.0
[601/4132]Epoch: 1/2 Loss: 2.8850038051605225 Pair Accuracy: 0.0625 Start Accuracy: 0.25 End Accuracy: 0.21875
[701/4132]Epoch: 1/2 Loss: 1.8648595809936523 Pair Accuracy: 0.28125 Start Accuracy: 0.46875 End Accuracy: 0.59375
[801/4132]Epoch: 1/2 Loss: 1.7171783447265625 Pair Accuracy: 0.3125 Start Accuracy: 0.5 End Accuracy: 0.5
[901/4132]Epoch:

[3101/4132]Epoch: 2/2 Loss: 1.6774897575378418 Pair Accuracy: 0.28125 Start Accuracy: 0.46875 End Accuracy: 0.4375
[3201/4132]Epoch: 2/2 Loss: 1.8956737518310547 Pair Accuracy: 0.3125 Start Accuracy: 0.40625 End Accuracy: 0.46875
[3301/4132]Epoch: 2/2 Loss: 2.358614444732666 Pair Accuracy: 0.375 Start Accuracy: 0.53125 End Accuracy: 0.4375
[3401/4132]Epoch: 2/2 Loss: 2.3760995864868164 Pair Accuracy: 0.21875 Start Accuracy: 0.375 End Accuracy: 0.375
[3501/4132]Epoch: 2/2 Loss: 2.8074779510498047 Pair Accuracy: 0.1875 Start Accuracy: 0.3125 End Accuracy: 0.34375
[3601/4132]Epoch: 2/2 Loss: 1.4766547679901123 Pair Accuracy: 0.375 Start Accuracy: 0.5625 End Accuracy: 0.5625
[3701/4132]Epoch: 2/2 Loss: 1.2715520858764648 Pair Accuracy: 0.5 Start Accuracy: 0.625 End Accuracy: 0.59375
[3801/4132]Epoch: 2/2 Loss: 1.5756105184555054 Pair Accuracy: 0.34375 Start Accuracy: 0.59375 End Accuracy: 0.5
[3901/4132]Epoch: 2/2 Loss: 1.393688440322876 Pair Accuracy: 0.375 Start Accuracy: 0.5 End Accurac

AlbertBiLSTM4QA(
  (backbone): AlbertModel(
    (embeddings): AlbertEmbeddings(
      (word_embeddings): Embedding(30000, 128, padding_idx=0)
      (position_embeddings): Embedding(512, 128)
      (token_type_embeddings): Embedding(2, 128)
      (LayerNorm): LayerNorm((128,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0, inplace=False)
    )
    (encoder): AlbertTransformer(
      (embedding_hidden_mapping_in): Linear(in_features=128, out_features=768, bias=True)
      (albert_layer_groups): ModuleList(
        (0): AlbertLayerGroup(
          (albert_layers): ModuleList(
            (0): AlbertLayer(
              (full_layer_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
              (attention): AlbertAttention(
                (query): Linear(in_features=768, out_features=768, bias=True)
                (key): Linear(in_features=768, out_features=768, bias=True)
                (value): Linear(in_features=768, out_features=768, bias=True)
  

## Evaluation

Load saved model

In [18]:
modelA.load_state_dict(torch.load("AlbertBiLSTM.pth"))

<All keys matched successfully>

In [19]:
test_dataset, test_features, test_examples = create_train_dataset(test_df, tokenizer)

test_sampler = SequentialSampler(test_dataset)
test_dataloader = DataLoader(test_dataset, sampler=test_sampler, batch_size=32)

convert squad examples to features: 100%|██████████| 11873/11873 [00:18<00:00, 655.36it/s]
add example index and unique id: 100%|██████████| 11873/11873 [00:00<00:00, 507123.00it/s]


In [20]:
from tqdm.notebook import tqdm

def predict(model):
    model.eval()
    with torch.no_grad():
        to_return = []
        for idx, batch in enumerate(tqdm(test_dataloader)):
            _ ,start_logits, end_logits = model(batch,device=device)      
            
            # Predict start and end
            start_pred = torch.argmax(start_logits, dim=1).cpu()
            end_pred = torch.argmax(end_logits, dim=1).cpu()
            
            for start, end in zip(start_pred, end_pred):
                to_return.append((start.item(), end.item()))
    return to_return

In [21]:
def evaluate_preds(results):
    to_return = []
    for res, feat, example in zip(results, test_features, test_examples):
        if res[0] == 0 and res[1] == 0:
            to_return.append('')
        else:
            #  Clamp to min max start and end
            tok_tokens = feat.tokens[res[0] : (res[1] + 1)]
            if res[0] < min(feat.token_to_orig_map):
                start = min(feat.token_to_orig_map)
            elif res[0] > max(feat.token_to_orig_map):
                start = max(feat.token_to_orig_map)
            else:
                start = res[0]
                
            if res[1] < min(feat.token_to_orig_map):
                end = min(feat.token_to_orig_map)
            elif res[1] > max(feat.token_to_orig_map):
                end = max(feat.token_to_orig_map)
            else:
                end = res[1]
            
            # Convert to predicted text from albert tokenizer tokens
            orig_doc_start = feat.token_to_orig_map[start]
            orig_doc_end = feat.token_to_orig_map[end]
            orig_tokens = example.doc_tokens[orig_doc_start : (orig_doc_end + 1)]
            tok_text = tokenizer.convert_tokens_to_string(tok_tokens)

            tok_text = tok_text.strip()
            tok_text = " ".join(tok_text.split())
            orig_text = " ".join(orig_tokens)
            final_text = get_final_text(tok_text, orig_text, True, True)

            to_return.append(final_text)
    
    answers = {}
    for text,row in zip(to_return, test_df.loc):
        answers[row.id] = text
    return answers

In [22]:
my_preds = predict(modelA)

HBox(children=(FloatProgress(value=0.0, max=384.0), HTML(value='')))




In [23]:
res = evaluate_preds(my_preds)

In [24]:
# For the more representetive results we have taken script that squad owner's have written to check predictions
dataset = test_sq.data
preds = res
na_probs = {k: 0.0 for k in preds}

qid_to_has_ans = make_qid_to_has_ans(dataset) 
has_ans_qids = [k for k, v in qid_to_has_ans.items() if v]
no_ans_qids = [k for k, v in qid_to_has_ans.items() if not v]
exact_raw, f1_raw = get_raw_scores(dataset, preds)
exact_thresh = apply_no_ans_threshold(exact_raw, na_probs, qid_to_has_ans,
                                      1.0)
f1_thresh = apply_no_ans_threshold(f1_raw, na_probs, qid_to_has_ans,
                                   1.0)
out_eval = make_eval_dict(exact_thresh, f1_thresh)
if has_ans_qids:
    has_ans_eval = make_eval_dict(exact_thresh, f1_thresh, qid_list=has_ans_qids)
    merge_eval(out_eval, has_ans_eval, 'HasAns')
if no_ans_qids:
    no_ans_eval = make_eval_dict(exact_thresh, f1_thresh, qid_list=no_ans_qids)
    merge_eval(out_eval, no_ans_eval, 'NoAns')
print(json.dumps(out_eval, indent=2))

{
  "exact": 17.05550408489851,
  "f1": 18.155467079522154,
  "total": 11873,
  "HasAns_exact": 0.47233468286099867,
  "HasAns_f1": 2.675415086903988,
  "HasAns_total": 5928,
  "NoAns_exact": 33.59125315391085,
  "NoAns_f1": 33.59125315391085,
  "NoAns_total": 5945
}


In [25]:
for i in range(5):
    choice = np.random.choice(list(res))
    row = test_df[test_df.id == choice].iloc[0]
    print("Context: ", str(row.context))
    print()
    print("Question: ", str(row.content))
    print()
    if row.is_impossible:
        print("Impossible to answer")
    else:
        print("Answer: ", row.answer)
    print()
    if res[choice]:
        print("Predicted answer: ", res[choice])
    else:
        print("Predicted impossbile to answer")
    print("\n//////////////////// \n")

Context:  The Scotland Act 1998, which was passed by the Parliament of the United Kingdom and given royal assent by Queen Elizabeth II on 19 November 1998, governs the functions and role of the Scottish Parliament and delimits its legislative competence. The Scotland Act 2012 extends the devolved competencies. For the purposes of parliamentary sovereignty, the Parliament of the United Kingdom at Westminster continues to constitute the supreme legislature of Scotland. However, under the terms of the Scotland Act, Westminster agreed to devolve some of its responsibilities over Scottish domestic policy to the Scottish Parliament. Such "devolved matters" include education, health, agriculture and justice. The Scotland Act enabled the Scottish Parliament to pass primary legislation on these issues. A degree of domestic authority, and all foreign policy, remain with the UK Parliament in Westminster. The Scottish Parliament has the power to pass laws and has limited tax-varying capability. An