# ALBERT + Autoencoder

Implementaion of ALBERT is taken from Hugging Face library

In [1]:
import os
import torch
import time
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler
import json
import pandas as pd
from squad import Squad

from transformers import (
    AlbertConfig,
    AlbertModel,
    AlbertTokenizer,
    squad_convert_examples_to_features
)

from transformers.data.processors.squad import SquadResult, SquadV2Processor, SquadExample
from transformers.data.metrics.squad_metrics import compute_predictions_logits, get_final_text
from evaluate_answers import *

In [25]:
output_dir = ""
do_lower_case = True

def to_list(tensor):
    return tensor.detach().cpu().tolist() 

# Tokenizer for ALBERT's input format
tokenizer_class = AlbertTokenizer
tokenizer = tokenizer_class.from_pretrained(
    "albert-base-v2", do_lower_case=True)

Train test val split

In [9]:
# Creating train and test dataframes
train_sq = Squad("../data/train-v2.0.json")
test_sq = Squad("../data/dev-v2.0.json")
train_df = train_sq.get_dataframe()
test_df = test_sq.get_dataframe()

In [53]:
def create_train_dataset(train_df, tokenizer):
    """
    Create dataset from DataFrame
    
    returns: 
        dataset - pytorch dataset of training data features
    """
    examples = []
    for i, question in enumerate(train_df['content']):
        example = SquadExample(
            qas_id=str(i),
            question_text=question,
            context_text=train_df['context'][i],
            answer_text=train_df['answer'][i],
            start_position_character=train_df['answer_start'][i],
            title="Train",
            is_impossible=False,
            answers=None,
        )
        examples.append(example)
    
    features, dataset = squad_convert_examples_to_features(
        examples=examples,
        tokenizer=tokenizer,
        max_seq_length=384,
        doc_stride=128,
        max_query_length=64,
        is_training=True,
        return_dataset="pt",
        threads=32,
    )
    
    
    return dataset, features, examples

In [31]:
dataset, _, _ = create_train_dataset(train_df, tokenizer)

train_sampler = SequentialSampler(dataset)
train_dataloader = DataLoader(dataset, sampler=train_sampler, batch_size=32)

convert squad examples to features: 100%|██████████| 130319/130319 [00:49<00:00, 2630.26it/s]
add example index and unique id: 100%|██████████| 130319/130319 [00:00<00:00, 419727.35it/s]


In [6]:
import torch
from torch import nn

In [7]:
class TransformerAE4QA(nn.Module):
    def __init__(self, freeze_albert = True):
        super(TransformerAE4QA, self).__init__()
        # create model's config
        config_class, model_class = (AlbertConfig, AlbertModel)
        config = config_class.from_pretrained("albert-base-v2")
        config.output_hidden_states=True
        self.backbone = model_class.from_pretrained("albert-base-v2", config=config)
        
        # freeze ALBERT layers if freeze_albert is True
        if freeze_albert:
            for param in self.backbone.embeddings.parameters():
                param.requires_grad = False

            for param in self.backbone.encoder.parameters():
                param.requires_grad = False

            for param in self.backbone.pooler.parameters():
                param.requires_grad = False

            for param in self.backbone.pooler_activation.parameters():
                param.requires_grad = False
        
        self.encoder = nn.Sequential(
            # [384x768x1]
            nn.Conv2d(1,32,kernel_size = (8,8),padding = 2,bias = True),
            nn.LeakyReLU(0.1),
            nn.MaxPool2d(kernel_size = (2,2)),
            # [190*382*64]
            nn.Conv2d(32,32,kernel_size = (5,5),padding = 1,bias = True),
            nn.LeakyReLU(0.1),
            nn.MaxPool2d(kernel_size = (2,2))
        )
        
        self.decoder = nn.Sequential(
            nn.ConvTranspose2d(32,32,kernel_size = (6,6),stride = 2,padding = 1),
            nn.LeakyReLU(0.1),
            nn.ConvTranspose2d(32,1,kernel_size = (8,8), stride = 2, padding = 1),
            nn.Tanh()
        )
        
        self.QA = nn.Sequential(
            nn.Linear(768,2)
        )
        
    
    def forward(self, batch, device='cpu'):
        # inference through ALBERT
        self.backbone.eval()
        batch = tuple(t.to(device) for t in batch)
        with torch.no_grad():
            inputs = {
                "input_ids": batch[0],
                "attention_mask": batch[1],
                "token_type_ids": batch[2],
            }

            features, _, _ = self.backbone(**inputs)
        
        # add specific "channel" dimension (need for Convolution Layer)
        features = features.unsqueeze(1)
        x = self.encoder(features)
        x = self.decoder(x)
        logits = self.QA(x)
        
        # get start and end logits also calculate loss
        start_logits, end_logits = logits.split(1, dim=-1)
        start_logits = start_logits.squeeze(-1).squeeze(1)
        end_logits = end_logits.squeeze(-1).squeeze(1)
        
        start_positions = batch[3]
        end_positions = batch[4]
        
        
        if start_positions is not None and end_positions is not None:
            # If we are on multi-GPU, split add a dimension
            if len(start_positions.size()) > 1:
                start_positions = start_positions.squeeze(-1)
            if len(end_positions.size()) > 1:
                end_positions = end_positions.squeeze(-1)
            # sometimes the start/end positions are outside our model inputs, we ignore these terms
            ignored_index = start_logits.size(1)
            start_positions.clamp_(0, ignored_index)
            end_positions.clamp_(0, ignored_index)

            loss_fct = nn.CrossEntropyLoss(ignore_index=ignored_index)
            start_loss = loss_fct(start_logits, start_positions)
            end_loss = loss_fct(end_logits, end_positions)
            total_loss = (start_loss + end_loss)/2
        return total_loss, start_logits, end_logits

In [26]:
import torch.optim as optim

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
modelA = TransformerAE4QA(freeze_albert=True).to(device)

criterion = nn.CrossEntropyLoss()
optimizer = optim.AdamW(modelA.parameters())

In [27]:
def train(model, epochs, optimizer):
    model.zero_grad()
    f = open("logs.txt", "w")
    for epoch in range(epochs):
        loss = 0
        for idx, batch in enumerate(train_dataloader):
            model_loss ,start_logits, end_logits = model(batch,device=device)
            loss += model_loss.item()          
            
            model_loss.backward()
            optimizer.step()
            model.zero_grad()
            
            if idx % 100 == 0:
                start_pred = torch.argmax(start_logits, dim=1).cpu()
                end_pred = torch.argmax(end_logits, dim=1).cpu()
                pair_accuracy = ((start_pred==batch[3])*(end_pred==batch[4])).sum().float() / len(batch[3])
                start_accuracy = (start_pred==batch[3]).sum().float() / len(batch[3])
                end_accuracy = (end_pred==batch[4]).sum().float() / len(batch[4])
                string = f"[{idx+1}/{len(train_dataloader)}]Epoch: {epoch+1}/{epochs} Loss: {model_loss.item()} Pair Accuracy: {pair_accuracy} Start Accuracy: {start_accuracy} End Accuracy: {end_accuracy}"
                print(string)
                f.write(string)
                torch.save(model.state_dict(), "model2freezed.pth")
    f.close()
    
    return model

In [None]:
train(modelA, 2, optimizer)

In [255]:
modelA.load_state_dict(torch.load("model.pth"))

<All keys matched successfully>

In [123]:
test_dataset, test_features, test_examples = create_train_dataset(test_df, tokenizer)

test_sampler = SequentialSampler(test_dataset)
test_dataloader = DataLoader(test_dataset, sampler=test_sampler, batch_size=32)

convert squad examples to features: 100%|██████████| 11873/11873 [00:04<00:00, 2804.02it/s]
add example index and unique id: 100%|██████████| 11873/11873 [00:00<00:00, 501187.29it/s]


In [250]:
from tqdm.notebook import tqdm
def predict(model):
    model.eval()
    with torch.no_grad():
        to_return = []
        for idx, batch in enumerate(tqdm(test_dataloader)):
            _ ,start_logits, end_logits = model(batch,device=device)      
            
            start_pred = torch.argmax(start_logits, dim=1).cpu()
            end_pred = torch.argmax(end_logits, dim=1).cpu()
            
            for start, end in zip(start_pred, end_pred):
                to_return.append((start.item(), end.item()))
    return to_return

In [241]:
def evaluate_preds(results):
    to_return = []
    for res, feat, example in zip(results, test_features, test_examples):
        if res[0] == 0 and res[1] == 0:
            to_return.append('')
        else:
            tok_tokens = feat.tokens[res[0] : (res[1] + 1)]
            if res[0] < min(feat.token_to_orig_map):
                start = min(feat.token_to_orig_map)
            elif res[0] > max(feat.token_to_orig_map):
                start = max(feat.token_to_orig_map)
            else:
                start = res[0]
                
            if res[1] < min(feat.token_to_orig_map):
                end = min(feat.token_to_orig_map)
            elif res[1] > max(feat.token_to_orig_map):
                end = max(feat.token_to_orig_map)
            else:
                end = res[1]
            
            orig_doc_start = feat.token_to_orig_map[start]
            orig_doc_end = feat.token_to_orig_map[end]
            orig_tokens = example.doc_tokens[orig_doc_start : (orig_doc_end + 1)]
            tok_text = tokenizer.convert_tokens_to_string(tok_tokens)

            tok_text = tok_text.strip()
            tok_text = " ".join(tok_text.split())
            orig_text = " ".join(orig_tokens)
            final_text = get_final_text(tok_text, orig_text, True, True)

            to_return.append(final_text)
    
    answers = {}
    for text,row in zip(to_return, test_df.loc):
        answers[row.id] = text
    return answers

In [256]:
my_preds = predict(modelA)

HBox(children=(FloatProgress(value=0.0, max=384.0), HTML(value='')))




In [257]:
res = evaluate_preds(my_preds)

In [258]:
# For the more representetive results we have taken script that squad owner's have written to check predictions
dataset = test_sq.data
preds = res
na_probs = {k: 0.0 for k in preds}

qid_to_has_ans = make_qid_to_has_ans(dataset) 
has_ans_qids = [k for k, v in qid_to_has_ans.items() if v]
no_ans_qids = [k for k, v in qid_to_has_ans.items() if not v]
exact_raw, f1_raw = get_raw_scores(dataset, preds)
exact_thresh = apply_no_ans_threshold(exact_raw, na_probs, qid_to_has_ans,
                                      1.0)
f1_thresh = apply_no_ans_threshold(f1_raw, na_probs, qid_to_has_ans,
                                   1.0)
out_eval = make_eval_dict(exact_thresh, f1_thresh)
if has_ans_qids:
    has_ans_eval = make_eval_dict(exact_thresh, f1_thresh, qid_list=has_ans_qids)
    merge_eval(out_eval, has_ans_eval, 'HasAns')
if no_ans_qids:
    no_ans_eval = make_eval_dict(exact_thresh, f1_thresh, qid_list=no_ans_qids)
    merge_eval(out_eval, no_ans_eval, 'NoAns')
print(json.dumps(out_eval, indent=2))

{
  "exact": 20.390802661500885,
  "f1": 21.2241808311376,
  "total": 11873,
  "HasAns_exact": 0.20242914979757085,
  "HasAns_f1": 1.8715754062241696,
  "HasAns_total": 5928,
  "NoAns_exact": 40.52144659377628,
  "NoAns_f1": 40.52144659377628,
  "NoAns_total": 5945
}


In [259]:
for i in range(5):
    choice = np.random.choice(list(res))
    row = test_df[test_df.id == choice].iloc[0]
    print("Context: ", str(row.context))
    print()
    print("Question: ", str(row.content))
    print()
    if row.is_impossible:
        print("Impossible to answer")
    else:
        print("Answer: ", row.answer)
    print()
    if res[choice]:
        print("Predicted answer: ", res[choice])
    else:
        print("Predicted impossbile to answer")
    print("\n//////////////////// \n")

Context:  Many questions regarding prime numbers remain open, such as Goldbach's conjecture (that every even integer greater than 2 can be expressed as the sum of two primes), and the twin prime conjecture (that there are infinitely many pairs of primes whose difference is 2). Such questions spurred the development of various branches of number theory, focusing on analytic or algebraic aspects of numbers. Primes are used in several routines in information technology, such as public-key cryptography, which makes use of properties such as the difficulty of factoring large numbers into their prime factors. Prime numbers give rise to various generalizations in other mathematical domains, mainly algebra, such as prime elements and prime ideals.

Question:  What is the application of prime numbers used in information technology which utilizes the fact that factoring very large prime numbers is expressed in the sum of two primes?

Impossible to answer

Predicted answer:  the

////////////////