In [1]:
%matplotlib inline
import json
import os
import pandas as pd
import numpy as np
import pickle as pkl
import matplotlib.pyplot as plt
import seaborn as sns
import torch
from joblib import Parallel, delayed
from tqdm import tqdm
from torch import nn, optim
from torch.utils.data import DataLoader, Dataset
from torch.utils.data.dataset import random_split
from transformers import BertTokenizer, AdamW, BertModel, BertPreTrainedModel, BertForQuestionAnswering
from transformers import AlbertModel, AlbertPreTrainedModel, AlbertForQuestionAnswering, AlbertTokenizer
from transformers import get_linear_schedule_with_warmup, squad_convert_examples_to_features
from transformers.data.processors import SquadV1Processor, SquadExample, SquadFeatures

# Locations of the SQuAD v1.1 files and of the pickle cache, relative to the working directory.
squadv_data_path = "./squadv1.1/"  # directory passed to SquadV1Processor.get_train_examples
train_data_path = "./squadv1.1/train-v1.1.json"  # raw training JSON read with json.load
dev_data_path = "./squadv1.1/dev-v1.1.json"  # raw dev JSON read with json.load
pickle_path = "./pickles/"  # prefix that save2pk/readPk concatenate file names onto

def save2pk(filename, obj):
    """Pickle ``obj`` to ``pickle_path + filename + ".pk"``.

    Args:
        filename: Base name of the target file, without the ``.pk`` suffix
            (the suffix is appended here).
        obj: Any picklable object.

    Raises:
        OSError: If the cache directory cannot be created or the file cannot
            be written.
    """
    # exist_ok=True only tolerates "directory already exists"; any other failure
    # (e.g. permissions) now surfaces here instead of being silently swallowed.
    os.makedirs(pickle_path, exist_ok=True)
    with open(pickle_path + filename + ".pk", "wb") as pk:
        pkl.dump(obj, pk)

def readPk(filename):
    """Load and return an object pickled under ``pickle_path``.

    Args:
        filename: File name relative to ``pickle_path``. It is tried exactly as
            given first; if no such file exists but ``filename + ".pk"`` does,
            that file is used, so the base names passed to ``save2pk`` work too.

    Returns:
        The unpickled object.

    Raises:
        OSError: If the file cannot be opened.
    """
    path = pickle_path + filename
    # save2pk appends ".pk" to the name it is given; accept that bare name as well.
    if not os.path.exists(path) and os.path.exists(path + ".pk"):
        path += ".pk"
    # NOTE: pickle.load can execute arbitrary code; only read files this notebook wrote.
    with open(path, "rb") as pk:
        return pkl.load(pk)
        

In [2]:
# Question-answering model used by the inference loop further down; alternative BERT
# checkpoints are kept commented out for quick switching.
# model = BertForQuestionAnswering.from_pretrained('bert-large-uncased-whole-word-masking-finetuned-squad')
# NOTE(review): 'albert-base-v2' is presumably the plain pretrained checkpoint, so the QA head
# would start untrained — TODO confirm. That would explain the mostly empty or wrong spans
# printed by the inference loop.
model = AlbertForQuestionAnswering.from_pretrained('albert-base-v2')
# model = BertForQuestionAnswering.from_pretrained('bert-base-uncased')

In [3]:
# tokenizer = BertTokenizer.from_pretrained('bert-large-uncased-whole-word-masking-finetuned-squad')
# Load the tokenizer from the same checkpoint name as the model ('albert-base-v2') so the
# two can never drift apart; the notebook previously paired a 'large' tokenizer with a 'base' model.
tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2')

In [4]:
# Read the SQuAD v1.1 training examples from the data directory.
# (presumably the processor picks train-v1.1.json inside it by default — TODO confirm)
processor = SquadV1Processor()
examples = processor.get_train_examples(squadv_data_path)

100%|██████████| 442/442 [00:26<00:00, 16.97it/s]


In [5]:
# Windowing parameters handed to squad_convert_examples_to_features below.
max_seq_length = 512
max_query_length = 128
# NOTE(review): doc_stride looks unusually small next to max_seq_length=512; presumably a
# much larger stride was intended — TODO confirm before re-running the conversion.
doc_stride = 3
# Not referenced by any other cell visible in this notebook.
parallel_size_per_worker = 10000

# Convert the examples into model inputs. Later cells index the result as features[1] and
# read five fields per row from it, so with return_dataset='pt' the return value is used as
# a pair rather than a flat list of features.
features = squad_convert_examples_to_features(
    examples=examples,
    tokenizer=tokenizer,
    max_seq_length=max_seq_length,
    doc_stride=doc_stride,
    max_query_length=max_query_length,
    is_training=True,
    return_dataset='pt',
    threads=6
)

convert squad examples to features: 100%|██████████| 87599/87599 [04:31<00:00, 322.75it/s]
add example index and unique id: 100%|██████████| 87599/87599 [00:00<00:00, 1051258.18it/s]


In [6]:
# Cache the conversion result; save2pk writes it to pickle_path + "features" + ".pk".
save2pk("features", features)

In [206]:
# Split the rows of features[1] into five per-field lists (row positions 0..4, in that
# order). Each list is built by its own pass over features[1].
# NOTE(review): the *_logits names hold whatever sits at row positions 3 and 4; presumably
# these are the gold start/end positions rather than model logits — TODO confirm.
(
    pt_input_ids,
    pt_attention_mask,
    pt_token_type_ids,
    pt_start_logits,
    pt_end_logits,
) = ([row[field] for row in features[1]] for field in range(5))

In [208]:
# Bundle the per-field lists, plus the number of rows in features[1], into the single
# dict that QADataset consumes.
pt_dict = dict(
    size=len(features[1]),
    input_ids=pt_input_ids,
    token_type_ids=pt_token_type_ids,
    attention_mask=pt_attention_mask,
    start_logits=pt_start_logits,
    end_logits=pt_end_logits,
)

In [4]:
# Load the raw SQuAD JSON files. The encoding is given explicitly so that reading does not
# depend on the platform's default text encoding (the contexts contain non-ASCII text).
# NOTE: a later cell rebinds `train_data` to a Dataset, so this cell must be re-run before
# calling data2df(train_data) again.
with open(train_data_path, 'r', encoding='utf-8') as f:
    train_data = json.load(f)

with open(dev_data_path, 'r', encoding='utf-8') as f:
    dev_data = json.load(f)

In [5]:
def data2df(data):
    """Flatten a SQuAD-style dict into a DataFrame with one row per question.

    Fields are read by key rather than by position in ``dict.values()``, so the
    result no longer depends on the key order of the JSON objects.

    Args:
        data: Parsed SQuAD JSON. ``data['data']`` is a list of articles, each with
            ``'title'`` and ``'paragraphs'``; each paragraph has ``'context'`` and
            ``'qas'``; each qa has ``'answers'`` (a list of dicts with
            ``'answer_start'`` and ``'text'``), ``'question'`` and ``'id'``.

    Returns:
        A tuple ``(full_df, df)``: ``full_df`` has the columns ``title, context,
        answer_start, answer, question, q_id``; ``df`` is the column selection
        ``q_id, context, question, answer_start, answer`` of it.
    """
    samples = []
    for article in data['data']:
        for paragraph in article['paragraphs']:
            for qa in paragraph['qas']:
                answers = qa['answers']
                if answers:
                    # As before, a question with several reference answers keeps
                    # only the last one listed.
                    chosen = answers[-1]
                    answer_start, answer_text = chosen['answer_start'], chosen['text']
                else:
                    # Previously this case reused the preceding question's row (or
                    # failed on the very first question); emit missing values instead.
                    answer_start, answer_text = None, None
                samples.append((
                    article['title'],
                    paragraph['context'],
                    answer_start,
                    answer_text,
                    qa['question'],
                    qa['id'],
                ))
    full_df = pd.DataFrame(
        samples,
        columns=['title', 'context', 'answer_start', 'answer', 'question', 'q_id'],
    )
    return full_df, full_df[['q_id', 'context', 'question', 'answer_start', 'answer']]

In [6]:
# Flatten the training JSON: full_df keeps all six columns, df is the five-column selection
# that data2df returns second. `train_data` must still be the parsed JSON dict here (a later
# cell rebinds the name to a Dataset).
full_df, df = data2df(train_data)

In [7]:
# Peek at the first three flattened rows.
df.head(3)

Unnamed: 0,q_id,context,question,answer_start,answer
0,5733be284776f41900661182,"Architecturally, the school has a Catholic cha...",To whom did the Virgin Mary allegedly appear i...,515,Saint Bernadette Soubirous
1,5733be284776f4190066117f,"Architecturally, the school has a Catholic cha...",What is in front of the Notre Dame Main Building?,188,a copper statue of Christ
2,5733be284776f41900661180,"Architecturally, the school has a Catholic cha...",The Basilica of the Sacred heart at Notre Dame...,279,the Main Building


In [8]:
# Tokenizer smoke test on the first row of df: encode the pair with the context as the
# first segment and the question as the second, then map the ids back to token strings.
test_context, test_quest = df.loc[0, ['context', 'question']]

indexed_tokens = tokenizer.encode(test_context, test_quest)
tokens = tokenizer.convert_ids_to_tokens(indexed_tokens)

In [10]:
class ContextQuestionDataset(Dataset):
    """Sequence-style dataset of ``(context, question)`` pairs with optional answers.

    Args:
        data: Indexable, sized collection whose items have the question at
            position 1 (``buildDataset`` passes ``(context, question)`` tuples).
        answer: Optional indexable collection aligned with ``data``;
            ``answer[k]`` is the answer for ``data[k]``.
    """

    def __init__(self, data, answer=None):
        super().__init__()
        self._data = data
        self._answer = answer
        if answer is not None:
            # Lookup is keyed by the question text alone, so if the same question
            # text occurs more than once the later entry wins.
            self._q2answer = {pair[1]: answer[idx] for idx, pair in enumerate(data)}

    def __getitem__(self, i):
        """Return the item stored at position ``i``."""
        return self._data[i]

    def __len__(self):
        """Return the number of stored items."""
        return len(self._data)

    def __iter__(self):
        """Iterate over the stored items in order."""
        for x in self._data:
            yield x

    def get_answer(self, question):
        """Return the answer registered for ``question``.

        Raises:
            ValueError: If the dataset was built without answers.
            KeyError: If ``question`` is not among the stored questions.
        """
        if self._answer is None:
            # Raising a bare string is itself a TypeError in Python 3; raise a real
            # exception that carries the intended message.
            raise ValueError("No answer given.")
        return self._q2answer[question]

def buildDataset(df, answer=None):
    """Wrap the ``context`` and ``question`` columns of ``df`` in a ContextQuestionDataset.

    Args:
        df: DataFrame with ``'context'`` and ``'question'`` columns.
        answer: Optional answers aligned with the rows of ``df``; passed through
            unchanged.

    Returns:
        A ContextQuestionDataset holding one ``(context, question)`` tuple per row.
    """
    pairs = [
        (context, question)
        for context, question in df[['context', 'question']].values
    ]
    return ContextQuestionDataset(pairs, answer)



def tokenize(batch, answer):
    """Encode one batch with the notebook-level ``tokenizer``.

    Args:
        batch: Batch of inputs handed directly to ``tokenizer.batch_encode_plus``.
        answer: Answers belonging to ``batch``; returned unchanged.

    Returns:
        A tuple ``(encoded, answer)`` where ``encoded`` is the tokenizer output,
        requested as padded PyTorch tensors.
    """
    encoded = tokenizer.batch_encode_plus(
        batch,
        return_tensors="pt",
        pad_to_max_length=True,
    )
    return encoded, answer

def buildBatch(dataset, batch_size=None):
    """Split ``dataset`` into consecutive batches and tokenize them in parallel.

    Args:
        dataset: Sized, indexable dataset whose items have the question at
            position 1 and which provides ``get_answer(question)``.
        batch_size: Number of items per batch. Defaults to the notebook-level
            ``BATCH_SIZE``, looked up when the function is called.

    Returns:
        A list with one ``(encoded_batch, answers)`` tuple per batch, as returned
        by ``tokenize``. The last batch may be shorter than the others. An empty
        dataset yields an empty list.

    Raises:
        ValueError: If the batch size is not positive.
    """
    if batch_size is None:
        batch_size = BATCH_SIZE
    if batch_size <= 0:
        raise ValueError("batch_size must be positive.")

    total = len(dataset)
    batches = []
    answers_batches = []
    # Stepping through range() yields only non-empty batches, so a dataset whose
    # length is an exact multiple of the batch size no longer gets an empty
    # trailing batch sent to the tokenizer.
    for start in range(0, total, batch_size):
        batch = [dataset[j] for j in range(start, min(start + batch_size, total))]
        batches.append(batch)
        answers_batches.append([dataset.get_answer(item[1]) for item in batch])

    tokens_batches = Parallel(n_jobs=-2, backend="threading", verbose=10)(
        delayed(tokenize)(batch, answers)
        for batch, answers in zip(batches, answers_batches)
    )

    return tokens_batches

In [12]:
# Build a (context, question) dataset with answers from the rows selected by df.loc[:10000].
# NOTE: .loc slices by label and includes the end label, so with a default integer index
# this selects rows 0..10000 inclusive.
# NOTE: this rebinds `train_data`, which earlier held the parsed training JSON.
train_data = buildDataset(df.loc[:10000], df.loc[:10000].answer.values)

# 95% / 5% random split of the dataset.
train_len = int(len(train_data) * 0.95)
train, test = random_split(train_data, [train_len, len(train_data) - train_len])

# NOTE(review): the batches are built from the full `train_data`, not from the `train`
# split above, so `train`/`test` are unused in this cell — confirm that is intended.
# buildBatch also reads BATCH_SIZE, which is only assigned in a later cell of this notebook.
train_batches = buildBatch(train_data)

[Parallel(n_jobs=-2)]: Using backend ThreadingBackend with 7 concurrent workers.
[Parallel(n_jobs=-2)]: Done   4 tasks      | elapsed:    0.7s
[Parallel(n_jobs=-2)]: Done  11 tasks      | elapsed:    1.1s
[Parallel(n_jobs=-2)]: Done  18 tasks      | elapsed:    1.5s
[Parallel(n_jobs=-2)]: Done  27 tasks      | elapsed:    2.0s
[Parallel(n_jobs=-2)]: Done  36 tasks      | elapsed:    2.5s
[Parallel(n_jobs=-2)]: Done  47 tasks      | elapsed:    3.1s
[Parallel(n_jobs=-2)]: Done  58 tasks      | elapsed:    3.6s
[Parallel(n_jobs=-2)]: Done  71 tasks      | elapsed:    4.2s
[Parallel(n_jobs=-2)]: Done  84 tasks      | elapsed:    4.8s
[Parallel(n_jobs=-2)]: Done  99 tasks      | elapsed:    5.6s
[Parallel(n_jobs=-2)]: Done 114 tasks      | elapsed:    6.3s
[Parallel(n_jobs=-2)]: Done 131 tasks      | elapsed:    7.2s
[Parallel(n_jobs=-2)]: Done 157 out of 157 | elapsed:    8.2s finished


In [None]:
# Qualitative check: run the model on each tokenized batch and print the predicted span
# next to the reference answer.
for tokens_batch, answers_batch in train_batches:
    # No gradients are needed for this inspection pass.
    with torch.no_grad():
        # The model output is unpacked as a (start_logits, end_logits) pair.
        start_logits, end_logits = model(input_ids=tokens_batch['input_ids'],
                                         token_type_ids=tokens_batch['token_type_ids'],
                                         attention_mask=tokens_batch['attention_mask'])
        # From here on the two names hold argmax token indices, not logits.
        start_logits = torch.argmax(start_logits, axis=1)
        # +1 turns the predicted end index into an exclusive slice bound.
        end_logits = torch.argmax(end_logits, axis=1)+1        
        # If the end bound is not past the start index the slice is empty and the decoded
        # prediction is blank.
        answer = tokenizer.batch_decode([tokens_batch['input_ids'][i][start_logits[i]:end_logits[i]] for i in range(len(start_logits))])
        for i, ans in enumerate(answer):
            print(ans,"  |  ", answers_batch[i])
        # Waits for user input after every batch; this cell cannot run unattended.
        input()

   |   Saint Bernadette Soubirous
   |   a copper statue of Christ
mary. immediately in front of the main building and facing it, is a copper statue of christ with arms upraised with the legend "venite ad me omnes". next to the main building is the basilica of the sacred   |   the Main Building
   |   a Marian place of prayer and reflection
   |   a golden statue of the Virgin Mary
one-page journal in september 1876, the scholastic magazine is issued twice monthly and claims to be the oldest continuous collegiate publication in the united states. the other magazine, the juggler, is released twice a year and focuses on student literature and artwork. the dome yearbook is published annually. the newspapers have varying publication interests, with the observer published daily and mainly reporting university and other news, and staffed by students from both notre dame and saint mary's college. unlike scholastic and the dome, the observer is an independent publication and does not have a fa

In [211]:
# Show which fields pt_dict carries.
pt_dict.keys()

dict_keys(['size', 'input_ids', 'token_type_ids', 'attention_mask', 'start_logits', 'end_logits'])

In [218]:
class QADataset(Dataset):
    """Dataset view over a dict of parallel per-example sequences.

    Args:
        pt_dict: Mapping with the keys ``"size"``, ``"input_ids"``,
            ``"token_type_ids"``, ``"attention_mask"``, ``"start_logits"`` and
            ``"end_logits"``. ``"size"`` is used as the dataset length; the other
            values are indexed per example.

    Each item is ``(input_ids, token_type_ids, attention_mask, (start, end))``.
    """

    def __init__(self, pt_dict):
        super().__init__()
        self.size = pt_dict["size"]
        self.input_ids = pt_dict["input_ids"]
        self.token_type_ids = pt_dict["token_type_ids"]
        self.attention_mask = pt_dict["attention_mask"]
        # NOTE(review): despite the names, these are returned as labels by
        # get_labels(); presumably they hold gold start/end positions — confirm.
        self.start_logits = pt_dict["start_logits"]
        self.end_logits = pt_dict["end_logits"]

    def __getitem__(self, i):
        """Return the example at position ``i``."""
        return (self.input_ids[i],
                self.token_type_ids[i],
                self.attention_mask[i],
                (self.start_logits[i], self.end_logits[i]))

    def __len__(self):
        """Return the number of examples (``pt_dict["size"]``)."""
        return self.size

    def __iter__(self):
        """Yield every example in order, in the same form as ``__getitem__``."""
        # The loop variable is the index used for the lookup; previously the loop
        # variable and the index name did not match.
        for i in range(self.size):
            yield self[i]

    def get_labels(self):
        """Return the ``(start, end)`` label sequences for the whole dataset."""
        return (self.start_logits, self.end_logits)

In [188]:
class albertForQA_SquADv1(AlbertPreTrainedModel):
    """ALBERT encoder with a linear span head producing start and end logits.

    Args:
        config: Model configuration; ``config.hidden_size`` sets the input width
            of the span head.
    """

    def __init__(self, config):
        super().__init__(config)
        self.albert = AlbertModel(config)
        # Two outputs per token: one start score and one end score.
        self.qa_outputs = nn.Linear(config.hidden_size, 2)
        self.init_weights()

    def forward(self, input_ids,
                attention_mask=None, token_type_ids=None,
                position_ids=None, head_mask=None):
        """Return per-token start and end logits.

        All arguments are forwarded unchanged to the ALBERT encoder.

        Returns:
            A tuple ``(start_logits, end_logits)``; each has the encoder output's
            shape without the last (hidden) dimension.
        """
        outputs = self.albert(input_ids=input_ids,
                              attention_mask=attention_mask,
                              token_type_ids=token_type_ids,
                              position_ids=position_ids,
                              head_mask=head_mask)

        # The encoder returns several outputs; only the first one (the per-token
        # hidden states) can be fed to the linear head.
        sequence_output = outputs[0]
        logits = self.qa_outputs(sequence_output)
        start_logits, end_logits = logits.split(1, dim=-1)

        # Drop the size-1 dimension left by split so the logits can be compared
        # against one target index per example.
        return start_logits.squeeze(-1), end_logits.squeeze(-1)

In [189]:
def lossCrossEntropy(predicts, labels):
    """Compute separate cross-entropy losses for the start and end predictions.

    Args:
        predicts: Pair ``(start_logits, end_logits)``.
        labels: Pair ``(start_labels, end_labels)`` of target indices; targets
            equal to -1 are ignored.

    Returns:
        A tuple ``(start_loss, end_loss)`` of mean-reduced losses.
    """
    start_loss, end_loss = (
        nn.functional.cross_entropy(logits, target, ignore_index=-1)
        for logits, target in zip(predicts, labels)
    )
    return start_loss, end_loss

In [238]:
# Hyper-parameters for fine-tuning.
EPOCHS = 100
TRAIN_RATE = 0.9  # fraction of the examples used for training
TRAIN_SIZE = int(pt_dict["size"] * TRAIN_RATE)
BATCH_SIZE = 64
ACCUMULATE_STRIDE = 5
LR = 1e-5
WARMUP_RATE = 0.1  # fraction of the training steps spent warming up
num_training_steps = int(EPOCHS * TRAIN_SIZE / BATCH_SIZE / ACCUMULATE_STRIDE)
num_warmup_steps = int(num_training_steps * WARMUP_RATE)

# training tools
# The model is created BEFORE the parameter groups are collected, so the optimizer
# updates this model rather than whatever `model` was bound to by an earlier cell.
model = albertForQA_SquADv1.from_pretrained('albert-base-v2')
# model.to(device)

params_to_tune = list(model.named_parameters())
# Parameters whose name contains one of these substrings get no weight decay.
params_no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']

params_with_weight = [
    {'params': [p for n, p in params_to_tune if not any(i in n for i in params_no_decay)], 'weight_decay': 0.01},
    {'params': [p for n, p in params_to_tune if any(i in n for i in params_no_decay)], 'weight_decay': 0.0}]

optimizer = AdamW(params=params_with_weight, lr=LR, correct_bias=False)
scheduler = get_linear_schedule_with_warmup(optimizer=optimizer,
                                            num_warmup_steps=num_warmup_steps,
                                            num_training_steps=num_training_steps)

# Activate training mode
model.zero_grad()
model = model.train()

# Prepare data
train_data = QADataset(pt_dict)
# Unpack both halves of the split in one statement; written across two statements the
# first name was never assigned and the second received the whole list of subsets.
sub_train_, sub_test_ = random_split(train_data,
                                     [TRAIN_SIZE,
                                      train_data.size - TRAIN_SIZE])
train_loader = DataLoader(sub_train_,
                          batch_size=BATCH_SIZE,
                          shuffle=True)

In [237]:
def train_func():
    """Placeholder for the training loop; no implementation has been written yet.

    Raises:
        NotImplementedError: Always.
    """
    # A ``def`` without a body is a SyntaxError, so the stub gets an explicit body
    # that fails loudly if it is ever called.
    raise NotImplementedError("train_func has not been implemented yet.")

64
64
64


In [None]:
def generate_batch(batch):
    """Collate ``(label, text_tensor)`` entries into one flat batch.

    Args:
        batch: Sequence of entries with the label at position 0 and a 1-D text
            tensor at position 1.

    Returns:
        A tuple ``(text, offsets, label)``: ``text`` is all text tensors
        concatenated, ``offsets[k]`` is the index in ``text`` where entry ``k``
        starts, and ``label`` is a tensor of the labels.
    """
    labels = []
    pieces = []
    for entry in batch:
        labels.append(entry[0])
        pieces.append(entry[1])
    label = torch.tensor(labels)

    # A leading 0 followed by every length except the last one; the running sum of
    # these values is the start position of each piece.
    starts = [0]
    starts.extend(len(piece) for piece in pieces)
    offsets = torch.tensor(starts[:-1]).cumsum(dim=0)

    text = torch.cat(pieces)
    return text, offsets, label

In [169]:
# Sanity check of nn.CrossEntropyLoss: six identical rows of nine scores (0.99 for class 0,
# 0.2 for every other class), all with target class 2.
loss = nn.CrossEntropyLoss()
_input = torch.tensor(
    [[0.99] + [0.2] * 8 for _ in range(6)],
    requires_grad=True,
)
target = torch.tensor([2] * 6, dtype=torch.long)
output = loss(_input, target)
print(_input, target, output)

tensor([[0.9900, 0.2000, 0.2000, 0.2000, 0.2000, 0.2000, 0.2000, 0.2000, 0.2000],
        [0.9900, 0.2000, 0.2000, 0.2000, 0.2000, 0.2000, 0.2000, 0.2000, 0.2000],
        [0.9900, 0.2000, 0.2000, 0.2000, 0.2000, 0.2000, 0.2000, 0.2000, 0.2000],
        [0.9900, 0.2000, 0.2000, 0.2000, 0.2000, 0.2000, 0.2000, 0.2000, 0.2000],
        [0.9900, 0.2000, 0.2000, 0.2000, 0.2000, 0.2000, 0.2000, 0.2000, 0.2000],
        [0.9900, 0.2000, 0.2000, 0.2000, 0.2000, 0.2000, 0.2000, 0.2000, 0.2000]],
       requires_grad=True) tensor([2, 2, 2, 2, 2, 2]) tensor(2.3227, grad_fn=<NllLossBackward>)
