# Simple Albert

Implementaion of ALBERT is taken from Hugging Face library

In [1]:
import os
import torch
import time
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler
import json
import pandas as pd
from squad import Squad

from transformers import (
    AlbertConfig,
    AlbertModel,
    AlbertTokenizer,
    squad_convert_examples_to_features
)

from transformers.data.processors.squad import SquadResult, SquadV2Processor, SquadExample
from transformers.data.metrics.squad_metrics import compute_predictions_logits, get_final_text
from evaluate_answers import *

In [2]:
output_dir = ""
do_lower_case = True

def to_list(tensor):
    return tensor.detach().cpu().tolist() 

# Tokenizer for ALBERT's input format
tokenizer_class = AlbertTokenizer
tokenizer = tokenizer_class.from_pretrained(
    "albert-base-v2", do_lower_case=True)

Train test val split

In [3]:
# Creating train and test dataframes
train_sq = Squad("./data/train-v2.0.json")
test_sq = Squad("./data/dev-v2.0.json")
train_df = train_sq.get_dataframe()
test_df = test_sq.get_dataframe()

In [4]:
def create_train_dataset(train_df, tokenizer):
    """
    Create dataset from DataFrame
    
    returns: 
        dataset - pytorch dataset of training data features
    """
    examples = []
    for i, question in enumerate(train_df['content']):
        example = SquadExample(
            qas_id=str(i),
            question_text=question,
            context_text=train_df['context'][i],
            answer_text=train_df['answer'][i],
            start_position_character=train_df['answer_start'][i],
            title="Train",
            is_impossible=False,
            answers=None,
        )
        examples.append(example)
    
    features, dataset = squad_convert_examples_to_features(
        examples=examples,
        tokenizer=tokenizer,
        max_seq_length=384,
        doc_stride=128,
        max_query_length=64,
        is_training=True,
        return_dataset="pt",
        threads=32,
    )
    
    
    return dataset, features, examples

In [5]:
dataset, _, _ = create_train_dataset(train_df, tokenizer)

train_sampler = SequentialSampler(dataset)
train_dataloader = DataLoader(dataset, sampler=train_sampler, batch_size=32)

convert squad examples to features: 100%|██████████| 130319/130319 [01:55<00:00, 1128.32it/s]
add example index and unique id: 100%|██████████| 130319/130319 [00:00<00:00, 606754.82it/s]


In [6]:
import torch
from torch import nn

## Model description

In [10]:
class Transformer4QA(nn.Module):
    def __init__(self, freeze_albert = True):
        super(Transformer4QA, self).__init__()
        # create model's config
        config_class, model_class = (AlbertConfig, AlbertModel)
        config = config_class.from_pretrained("albert-base-v2")
        config.output_hidden_states=True
        self.backbone = model_class.from_pretrained("albert-base-v2", config=config)
        
        # freeze ALBERT layers if freeze_albert is True
        if freeze_albert:
            for param in self.backbone.embeddings.parameters():
                param.requires_grad = False

            for param in self.backbone.encoder.parameters():
                param.requires_grad = False

            for param in self.backbone.pooler.parameters():
                param.requires_grad = False

            for param in self.backbone.pooler_activation.parameters():
                param.requires_grad = False
        
        self.QA = nn.Sequential(
            nn.Linear(768,2)
        )
        
    
    def forward(self, batch, device='cpu'):
        # inference through ALBERT
        self.backbone.eval()
        batch = tuple(t.to(device) for t in batch)
        with torch.no_grad():
            inputs = {
                "input_ids": batch[0],
                "attention_mask": batch[1],
                "token_type_ids": batch[2],
            }

            features, _, _ = self.backbone(**inputs)
        
        logits = self.QA(features)
        
        # get start and end logits also calculate loss
        start_logits, end_logits = logits.split(1, dim=-1)
        start_logits = start_logits.squeeze(-1).squeeze(1)
        end_logits = end_logits.squeeze(-1).squeeze(1)
        
        start_positions = batch[3]
        end_positions = batch[4]
        
        
        if start_positions is not None and end_positions is not None:
            # If we are on multi-GPU, split add a dimension
            if len(start_positions.size()) > 1:
                start_positions = start_positions.squeeze(-1)
            if len(end_positions.size()) > 1:
                end_positions = end_positions.squeeze(-1)
            # sometimes the start/end positions are outside our model inputs, we ignore these terms
            ignored_index = start_logits.size(1)
            start_positions.clamp_(0, ignored_index)
            end_positions.clamp_(0, ignored_index)

            loss_fct = nn.CrossEntropyLoss(ignore_index=ignored_index)
            start_loss = loss_fct(start_logits, start_positions)
            end_loss = loss_fct(end_logits, end_positions)
            total_loss = (start_loss + end_loss)/2
        return total_loss, start_logits, end_logits

## Training

In [11]:
import torch.optim as optim

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
modelA = Transformer4QA(freeze_albert=True).to(device)

criterion = nn.CrossEntropyLoss()
optimizer = optim.AdamW(modelA.parameters())

In [12]:
def train(model, epochs, optimizer):
    model.zero_grad()
    f = open("logs.txt", "w")
    for epoch in range(epochs):
        loss = 0
        for idx, batch in enumerate(train_dataloader):
            model_loss ,start_logits, end_logits = model(batch,device=device)
            loss += model_loss.item()          
            
            model_loss.backward()
            optimizer.step()
            model.zero_grad()
            
            if idx % 100 == 0:
                start_pred = torch.argmax(start_logits, dim=1).cpu()
                end_pred = torch.argmax(end_logits, dim=1).cpu()
                pair_accuracy = ((start_pred==batch[3])*(end_pred==batch[4])).sum().float() / len(batch[3])
                start_accuracy = (start_pred==batch[3]).sum().float() / len(batch[3])
                end_accuracy = (end_pred==batch[4]).sum().float() / len(batch[4])
                string = f"[{idx+1}/{len(train_dataloader)}]Epoch: {epoch+1}/{epochs} Loss: {model_loss.item()} Pair Accuracy: {pair_accuracy} Start Accuracy: {start_accuracy} End Accuracy: {end_accuracy}"
                print(string)
                f.write(string)
                torch.save(model.state_dict(), "albert.pth")
    f.close()
    
    return model

In [13]:
train(modelA, 2, optimizer)

[1/4132]Epoch: 1/2 Loss: 6.05572509765625 Pair Accuracy: 0.0 Start Accuracy: 0.0 End Accuracy: 0.0
[101/4132]Epoch: 1/2 Loss: 3.359996795654297 Pair Accuracy: 0.0625 Start Accuracy: 0.25 End Accuracy: 0.15625
[201/4132]Epoch: 1/2 Loss: 2.316701889038086 Pair Accuracy: 0.09375 Start Accuracy: 0.40625 End Accuracy: 0.3125
[301/4132]Epoch: 1/2 Loss: 3.0504703521728516 Pair Accuracy: 0.34375 Start Accuracy: 0.34375 End Accuracy: 0.5
[401/4132]Epoch: 1/2 Loss: 3.8048295974731445 Pair Accuracy: 0.09375 Start Accuracy: 0.15625 End Accuracy: 0.34375
[501/4132]Epoch: 1/2 Loss: 3.7506325244903564 Pair Accuracy: 0.03125 Start Accuracy: 0.09375 End Accuracy: 0.0625
[601/4132]Epoch: 1/2 Loss: 3.5632386207580566 Pair Accuracy: 0.0625 Start Accuracy: 0.0625 End Accuracy: 0.28125
[701/4132]Epoch: 1/2 Loss: 2.8157577514648438 Pair Accuracy: 0.03125 Start Accuracy: 0.3125 End Accuracy: 0.15625
[801/4132]Epoch: 1/2 Loss: 2.004009485244751 Pair Accuracy: 0.3125 Start Accuracy: 0.5625 End Accuracy: 0.46875

[3201/4132]Epoch: 2/2 Loss: 3.0476303100585938 Pair Accuracy: 0.15625 Start Accuracy: 0.1875 End Accuracy: 0.3125
[3301/4132]Epoch: 2/2 Loss: 2.7580151557922363 Pair Accuracy: 0.3125 Start Accuracy: 0.375 End Accuracy: 0.59375
[3401/4132]Epoch: 2/2 Loss: 3.816824436187744 Pair Accuracy: 0.0625 Start Accuracy: 0.09375 End Accuracy: 0.25
[3501/4132]Epoch: 2/2 Loss: 3.6575026512145996 Pair Accuracy: 0.0 Start Accuracy: 0.0625 End Accuracy: 0.15625
[3601/4132]Epoch: 2/2 Loss: 2.5974173545837402 Pair Accuracy: 0.125 Start Accuracy: 0.25 End Accuracy: 0.21875
[3701/4132]Epoch: 2/2 Loss: 2.0401740074157715 Pair Accuracy: 0.40625 Start Accuracy: 0.59375 End Accuracy: 0.46875
[3801/4132]Epoch: 2/2 Loss: 2.9091100692749023 Pair Accuracy: 0.125 Start Accuracy: 0.34375 End Accuracy: 0.1875
[3901/4132]Epoch: 2/2 Loss: 2.093696355819702 Pair Accuracy: 0.15625 Start Accuracy: 0.5 End Accuracy: 0.3125
[4001/4132]Epoch: 2/2 Loss: 3.4725441932678223 Pair Accuracy: 0.03125 Start Accuracy: 0.125 End Accur

Transformer4QA(
  (backbone): AlbertModel(
    (embeddings): AlbertEmbeddings(
      (word_embeddings): Embedding(30000, 128, padding_idx=0)
      (position_embeddings): Embedding(512, 128)
      (token_type_embeddings): Embedding(2, 128)
      (LayerNorm): LayerNorm((128,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0, inplace=False)
    )
    (encoder): AlbertTransformer(
      (embedding_hidden_mapping_in): Linear(in_features=128, out_features=768, bias=True)
      (albert_layer_groups): ModuleList(
        (0): AlbertLayerGroup(
          (albert_layers): ModuleList(
            (0): AlbertLayer(
              (full_layer_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
              (attention): AlbertAttention(
                (query): Linear(in_features=768, out_features=768, bias=True)
                (key): Linear(in_features=768, out_features=768, bias=True)
                (value): Linear(in_features=768, out_features=768, bias=True)
   

In [14]:
modelA.load_state_dict(torch.load("albert.pth"))

<All keys matched successfully>

## Evaluation

In [15]:
test_dataset, test_features, test_examples = create_train_dataset(test_df, tokenizer)

test_sampler = SequentialSampler(test_dataset)
test_dataloader = DataLoader(test_dataset, sampler=test_sampler, batch_size=32)

convert squad examples to features: 100%|██████████| 11873/11873 [00:17<00:00, 671.21it/s]
add example index and unique id: 100%|██████████| 11873/11873 [00:00<00:00, 625104.77it/s]


In [16]:
from tqdm.notebook import tqdm
def predict(model):
    model.eval()
    with torch.no_grad():
        to_return = []
        for idx, batch in enumerate(tqdm(test_dataloader)):
            _ ,start_logits, end_logits = model(batch,device=device)      
            
            start_pred = torch.argmax(start_logits, dim=1).cpu()
            end_pred = torch.argmax(end_logits, dim=1).cpu()
            
            for start, end in zip(start_pred, end_pred):
                to_return.append((start.item(), end.item()))
    return to_return

In [17]:
def evaluate_preds(results):
    to_return = []
    for res, feat, example in zip(results, test_features, test_examples):
        if res[0] == 0 and res[1] == 0:
            to_return.append('')
        else:
            tok_tokens = feat.tokens[res[0] : (res[1] + 1)]
            if res[0] < min(feat.token_to_orig_map):
                start = min(feat.token_to_orig_map)
            elif res[0] > max(feat.token_to_orig_map):
                start = max(feat.token_to_orig_map)
            else:
                start = res[0]
                
            if res[1] < min(feat.token_to_orig_map):
                end = min(feat.token_to_orig_map)
            elif res[1] > max(feat.token_to_orig_map):
                end = max(feat.token_to_orig_map)
            else:
                end = res[1]
            
            orig_doc_start = feat.token_to_orig_map[start]
            orig_doc_end = feat.token_to_orig_map[end]
            orig_tokens = example.doc_tokens[orig_doc_start : (orig_doc_end + 1)]
            tok_text = tokenizer.convert_tokens_to_string(tok_tokens)

            tok_text = tok_text.strip()
            tok_text = " ".join(tok_text.split())
            orig_text = " ".join(orig_tokens)
            final_text = get_final_text(tok_text, orig_text, True, True)

            to_return.append(final_text)
    
    answers = {}
    for text,row in zip(to_return, test_df.loc):
        answers[row.id] = text
    return answers

In [18]:
my_preds = predict(modelA)

HBox(children=(FloatProgress(value=0.0, max=384.0), HTML(value='')))




In [19]:
res = evaluate_preds(my_preds)

In [20]:
# For the more representetive results we have taken script that squad owner's have written to check predictions
dataset = test_sq.data
preds = res
na_probs = {k: 0.0 for k in preds}

qid_to_has_ans = make_qid_to_has_ans(dataset) 
has_ans_qids = [k for k, v in qid_to_has_ans.items() if v]
no_ans_qids = [k for k, v in qid_to_has_ans.items() if not v]
exact_raw, f1_raw = get_raw_scores(dataset, preds)
exact_thresh = apply_no_ans_threshold(exact_raw, na_probs, qid_to_has_ans,
                                      1.0)
f1_thresh = apply_no_ans_threshold(f1_raw, na_probs, qid_to_has_ans,
                                   1.0)
out_eval = make_eval_dict(exact_thresh, f1_thresh)
if has_ans_qids:
    has_ans_eval = make_eval_dict(exact_thresh, f1_thresh, qid_list=has_ans_qids)
    merge_eval(out_eval, has_ans_eval, 'HasAns')
if no_ans_qids:
    no_ans_eval = make_eval_dict(exact_thresh, f1_thresh, qid_list=no_ans_qids)
    merge_eval(out_eval, no_ans_eval, 'NoAns')
print(json.dumps(out_eval, indent=2))

{
  "exact": 5.558830961003959,
  "f1": 8.6014493757091,
  "total": 11873,
  "HasAns_exact": 0.18556005398110662,
  "HasAns_f1": 6.279522341058474,
  "HasAns_total": 5928,
  "NoAns_exact": 10.916736753574432,
  "NoAns_f1": 10.916736753574432,
  "NoAns_total": 5945
}


In [21]:
for i in range(5):
    choice = np.random.choice(list(res))
    row = test_df[test_df.id == choice].iloc[0]
    print("Context: ", str(row.context))
    print()
    print("Question: ", str(row.content))
    print()
    if row.is_impossible:
        print("Impossible to answer")
    else:
        print("Answer: ", row.answer)
    print()
    if res[choice]:
        print("Predicted answer: ", res[choice])
    else:
        print("Predicted impossbile to answer")
    print("\n//////////////////// \n")

Context:  Ctenophora (/tᵻˈnɒfərə/; singular ctenophore, /ˈtɛnəfɔːr/ or /ˈtiːnəfɔːr/; from the Greek κτείς kteis 'comb' and φέρω pherō 'carry'; commonly known as comb jellies) is a phylum of animals that live in marine waters worldwide. Their most distinctive feature is the ‘combs’ – groups of cilia which they use for swimming – they are the largest animals that swim by means of cilia. Adults of various species range from a few millimeters to 1.5 m (4 ft 11 in) in size. Like cnidarians, their bodies consist of a mass of jelly, with one layer of cells on the outside and another lining the internal cavity. In ctenophores, these layers are two cells deep, while those in cnidarians are only one cell deep. Some authors combined ctenophores and cnidarians in one phylum, Coelenterata, as both groups rely on water flow through the body cavity for both digestion and respiration. Increasing awareness of the differences persuaded more recent authors to classify them as separate phyla.

Question:  