In [1]:
import numpy as np
import pandas as pd
import torch.nn as nn

import keras
import os
import re
import torch 
import transformers
import gzip

from collections import Counter, defaultdict
from itertools import islice
from tqdm import trange, tqdm
from torch.optim import Adam
from transformers import BertTokenizer, BertModel, BertForQuestionAnswering
from transformers import AdamW
from transformers import get_cosine_with_hard_restarts_schedule_with_warmup
from transformers import get_linear_schedule_with_warmup

from models.QAModels import *
from models.utils import *
from utils import *

Using TensorFlow backend.


GPU not available, CPU used
GPU not available, CPU used


In [2]:
#device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device = "cpu"

## Load QA data into memory

In [3]:
# load SubjQA_data into memory
subjqa_data_train = get_data(
                             source='/SubjQA/',
                             split='/train',
                             domain='all',
)

subjqa_data_dev = get_data(
                           source='/SubjQA/',
                           split='/dev',
                           domain='all',
)

subjqa_data_test = get_data(
                            source='/SubjQA/',
                            split='/test',
                            domain='all',
)

# convert pd.DataFrames into list of dictionaries (as many dicts as examples)
subjqa_data_train = convert_df_to_dict(
                                       subjqa_data_train,
                                       split='train',
)
subjqa_data_dev = convert_df_to_dict(
                                     subjqa_data_dev,
                                     split='dev',
)
subjqa_data_test = convert_df_to_dict(
                                      subjqa_data_test,
                                      split='test',
)

In [4]:
# load SQuAD_data into memory
squad_data_train = get_data(
                            source='/SQuAD/',
                            split='train',
)

#NOTE: we don't have correct answer spans (i.e., start and end positions) for SQuAD dev set (predictions need to be submitted)
squad_data_test = get_data(
                          source='/SQuAD/',
                          split='dev',
)

## Create train and dev QA examples

In [5]:
# TODO: figure out, whether we should use pretrained weights from 'bert-base-cased' or 'bert-base-uncased' model
bert_tokenizer = BertTokenizer.from_pretrained('bert-base-cased')

# choose pretrained weights
pretrained_weights = 'bert-large-cased-whole-word-masking-finetuned-squad'

# BERT cannot deal with sequences, where T > 512
max_seq_length = 512

# defin mini-batch size
batch_size = 32

# create domain_to_idx and dataset_to_idx mappings
domains = ['books', 'electronics', 'grocery', 'movies', 'restaurants', 'trustyou', 'tripadvisor', 'all', 'wikipedia']
datasets = ['SQuAD', 'SubjQA']

idx_to_domain = dict(enumerate(domains))
domain_to_idx = {domain: idx for idx, domain in enumerate(domains)}

idx_to_dataset = dict(enumerate(datasets))
dataset_to_idx = {dataset: idx for idx, dataset in enumerate(datasets)}

In [6]:
subjqa_examples_train = create_examples(
                                        subjqa_data_train,
                                        source='SubjQA',
                                        is_training=True,
)

subjqa_examples_dev = create_examples(
                                      subjqa_data_dev,
                                      source='SubjQA',
                                      is_training=True,
)

In [7]:
squad_examples_train = create_examples(
                                       squad_data_train,
                                       source='SQuAD',
                                       is_training=True,
)

# create train and dev examples from train set only
squad_examples_train, squad_examples_dev = split_into_train_and_dev(squad_examples_train)

In [8]:
subjqa_features_train = convert_examples_to_features(
                                                    subjqa_examples_train, 
                                                    bert_tokenizer,
                                                    max_seq_length=max_seq_length,
                                                    doc_stride=100,
                                                    max_query_length=50,
                                                    is_training=True,
                                                    domain_to_idx=domain_to_idx,
                                                    dataset_to_idx=dataset_to_idx,
)

subjqa_features_dev = convert_examples_to_features(
                                                    subjqa_examples_dev, 
                                                    bert_tokenizer,
                                                    max_seq_length=max_seq_length,
                                                    doc_stride=100,
                                                    max_query_length=50,
                                                    is_training=True,
                                                    domain_to_idx=domain_to_idx,
                                                    dataset_to_idx=dataset_to_idx,
)

HBox(children=(IntProgress(value=0, max=15246), HTML(value='')))




HBox(children=(IntProgress(value=0, max=1671), HTML(value='')))




In [9]:
squad_features_train = convert_examples_to_features(
                                                    squad_examples_train, 
                                                    bert_tokenizer,
                                                    max_seq_length=max_seq_length,
                                                    doc_stride=100,
                                                    max_query_length=50,
                                                    is_training=True,
                                                    domain_to_idx=domain_to_idx,
                                                    dataset_to_idx=dataset_to_idx,
)

squad_features_dev = convert_examples_to_features(
                                                squad_examples_dev, 
                                                bert_tokenizer,
                                                max_seq_length=max_seq_length,
                                                doc_stride=100,
                                                max_query_length=50,
                                                is_training=True,
                                                domain_to_idx=domain_to_idx,
                                                dataset_to_idx=dataset_to_idx,
)

HBox(children=(IntProgress(value=0, max=15228), HTML(value='')))




HBox(children=(IntProgress(value=0, max=3807), HTML(value='')))




In [19]:
subjqa_tensor_dataset_train = create_tensor_dataset(
                                                   subjqa_features_train,
                                                   evaluate=False,
)

subjqa_tensor_dataset_dev = create_tensor_dataset(
                                                  subjqa_features_dev,
                                                  evaluate=False,
)

In [20]:
squad_tensor_dataset_train = create_tensor_dataset(
                                                   squad_features_train,
                                                   evaluate=False,
)

squad_tensor_dataset_dev = create_tensor_dataset(
                                                 squad_features_dev,
                                                 evaluate=False,
)

## Create train and dev dataloaders

In [10]:
squad_train_dl = create_batches(
                                dataset=squad_tensor_dataset_train,
                                batch_size=batch_size,
                                split='train',
)

squad_dev_dl = create_batches(
                              dataset=squad_tensor_dataset_dev,
                              batch_size=batch_size,
                              split='eval',
)

In [11]:
# initialise QA model
model = BertForQA.from_pretrained(
                                 pretrained_weights,
                                 qa_head_name='RecurrentQAHead',
                                 max_seq_length=max_seq_length,
                                 highway_connection=True,
                                 multitask=False,
)

# set model to device
model.to(device)

BertForQA(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(28996, 1024, padding_idx=0)
      (position_embeddings): Embedding(512, 1024)
      (token_type_embeddings): Embedding(2, 1024)
      (LayerNorm): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=1024, out_features=1024, bias=True)
              (key): Linear(in_features=1024, out_features=1024, bias=True)
              (value): Linear(in_features=1024, out_features=1024, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=1024, out_features=1024, bias=True)
              (LayerNorm): LayerNorm((1024,), eps=1e-12, elementwise_af

In [12]:
args = {
        "n_epochs": 3,
        "lr": 1e-3,
        "warmup_steps": 100,
        "max_grad_norm": 10,
        "squad": True,
}

In [14]:
train(
     model,
     squad_train_dl,
     squad_dev_dl,
     args,
)

------ Pre-trained BERT model is frozen -------


Epoch:   0%|                                                                                     | 0/3 [00:00<?, ?it/s]
Iteration:   0%|                                                                               | 0/955 [00:00<?, ?it/s]

Current loss: 6.2040863037109375



Iteration:   0%|                                                                    | 1/955 [01:01<16:10:08, 61.02s/it]

Current loss: 6.201016902923584



Iteration:   0%|▏                                                                   | 2/955 [02:05<16:24:25, 61.98s/it]

Current loss: 6.137643814086914



Iteration:   0%|▏                                                                   | 3/955 [03:07<16:24:47, 62.07s/it]

Current loss: 5.92195463180542



Iteration:   0%|▎                                                                   | 4/955 [04:12<16:38:15, 62.98s/it]

Current loss: 5.5823211669921875



Iteration:   1%|▎                                                                   | 5/955 [05:17<16:47:56, 63.66s/it]

Current loss: 5.169783592224121



Iteration:   1%|▍                                                                   | 6/955 [06:21<16:47:04, 63.67s/it]

Current loss: 4.727959156036377



Iteration:   1%|▍                                                                   | 7/955 [07:23<16:39:43, 63.27s/it]

Current loss: 4.0111188888549805



Iteration:   1%|▌                                                                   | 8/955 [08:27<16:37:53, 63.22s/it]

Current loss: 3.333834648132324



Iteration:   1%|▋                                                                   | 9/955 [09:30<16:36:14, 63.19s/it]

Current loss: 2.8745691776275635



Iteration:   1%|▋                                                                  | 10/955 [10:32<16:32:07, 62.99s/it]

KeyboardInterrupt: 

In [13]:
def train(
          model,
          train_dl,
          val_dl,
          args,
):
    
    t_total = len(train_dl) * args['n_epochs'] # total number of training steps (i.e., step = iteration)

    if args["squad"]:
        model = freeze_transformer_layers(model)
        print("------ Pre-trained BERT model is frozen -------")
        
    optimizer = AdamW(
                      model.parameters(), 
                      lr=args['lr'], 
                      correct_bias=False,
    )

    scheduler = get_linear_schedule_with_warmup(
                                                optimizer, 
                                                num_warmup_steps=args["warmup_steps"], 
                                                num_training_steps=t_total,
    )
    
    # store loss and accuracy for plotting
    train_loss = []
    train_accs = []
    train_f1s = []
    val_accs = []
    val_f1s = []
    
    
    loss_func = nn.CrossEntropyLoss()


    for _ in trange(args['n_epochs'],  desc="Epoch"):

        ### Training ###

        model.train()

        tr_loss, tr_acc = 0, 0
        nb_tr_examples, nb_tr_steps = 0, 0
        
        total_loss = 0

        for step, batch in enumerate(tqdm(train_dl, desc="Iteration")):

            # add batch to GPU
            batch = tuple(t.to(device) for t in batch)

            # unpack inputs from dataloader            
            b_input_ids, b_attn_masks, b_token_type_ids, b_input_lengths, b_start_pos, b_end_pos, b_cls_indexes, _ = batch
            
            # sort sequences in batch in decreasing order w.r.t. to (original) sequence length
            b_input_ids, b_attn_masks, b_type_ids, b_input_lengths, b_start_pos, b_end_pos = sort_batch(
                                                                                                        b_input_ids,
                                                                                                        b_attn_masks,
                                                                                                        b_token_type_ids,
                                                                                                        b_input_lengths,
                                                                                                        b_start_pos,
                                                                                                        b_end_pos,
            )
            
            # zero-out gradients
            optimizer.zero_grad()
            
            # compute start and end logits respectively
            start_logits, end_logits = model(
                                             input_ids=b_input_ids,
                                             attention_masks=b_attn_masks,
                                             token_type_ids=b_token_type_ids,
                                             input_lengths=b_input_lengths,
            )
            
            # start and end loss must be computed separately
            start_loss = loss_func(start_logits, b_start_pos)
            end_loss = loss_func(end_logits, b_end_pos)
            
            total_loss = (start_loss + end_loss) / 2
            print("Current loss: {}".format(total_loss))

            train_loss.append(total_loss.item())
                
            #start_logits = _to_cpu(start_logits)
            
            #log_probas = F.log_softmax(logits, dim=1).numpy() # or F.softmax(logits).numpy()
            #preds_flat = np.argmax(log_probas, axis=1).flatten()


            # backpropagate error
            #start_loss.backward()
            #end_loss.backward()
            total_loss.backward()
            
            # clip gradients
            torch.nn.utils.clip_grad_norm_(model.parameters(), args["max_grad_norm"])

            # update model parameters and take a step using the computed gradient
            optimizer.step()
            scheduler.step()

            tr_loss += total_loss.item()
            
            #tr_acc += tr_acc_current
            #tr_f1 += tr_f1_current

            nb_tr_examples += b_input_ids.size(0)
            nb_tr_steps += 1

        print("Train loss: {}".format(tr_loss/nb_tr_steps))
        break
        
        #print("Train acc: {}".format(tr_acc/nb_tr_steps))
        #print("Train f1: {}".format(tr_f1/nb_tr_steps))
        
        #train_accs.append(tr_acc/nb_tr_steps)
        #train_f1s.append(tr_f1/nb_tr_steps)

        

        ### Validation ###

        # set models to eval mode to evaluate loss on the validation set
        if not args["squad"]:
            bert_encoder.eval()
            
        qa_head.eval()

        eval_acc, eval_f1 = 0, 0
        nb_eval_steps = 0

        for batch in val_dl:

            # add batch to GPU
            batch = tuple(t.to(device) for t in batch)

            # unpack inputs from dataloader
            b_input_ids, b_input_mask, b_labels = batch

            # telling the model not to compute or store gradients, saving memory and speeding up validation

            with torch.no_grad():
                if model_name == 'DistilBERT':
                    logits = model(b_input_ids, attention_mask=b_input_mask)
                else:
                    logits = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask)

            # move logits and labels to CPU to compute NumPy operations
            logits = logits[0].detach().cpu()
            label_ids = b_labels.to('cpu').numpy()
            
            log_probas = F.log_softmax(logits, dim=1).numpy() # or F.softmax(logits).numpy()
            preds_flat = np.argmax(log_probas, axis=1).flatten()

            tmp_eval_acc = accuracy(preds_flat, label_ids)
            tmp_eval_f1 = f1(preds_flat, label_ids)

            eval_acc += tmp_eval_acc
            eval_f1 += tmp_eval_f1
            nb_eval_steps += 1

        print("Val acc: {}".format(eval_acc/nb_eval_steps))
        print("Val f1: {}".format(eval_f1/nb_eval_steps))
        val_accs.append(eval_acc/nb_eval_steps)
        val_f1s.append(eval_f1/nb_eval_steps)
        
    return train_loss, train_accs, train_f1s, val_accs, val_f1s, model

In [None]:
### SOME DEBUGGING ###

In [38]:
support, question = "Who was Jim Henson?", "Jim Henson was a nice puppet"

In [39]:
input_ids = tokenizer.encode(support, question)

In [27]:
encoded_dict = tokenizer.encode_plus(support, question, max_length=19, pad_to_max_length=True)

In [29]:
len(encoded_dict['input_ids'])

19

In [40]:
tokenizer.decode(input_ids)

'[CLS] who was jim henson? [SEP] jim henson was a nice puppet [SEP]'

In [41]:
token_type_ids = [0 if i <= input_ids.index(102) else 1 for i in range(len(input_ids))]
bert_outputs = bert_encoder(torch.tensor([input_ids]), token_type_ids=torch.tensor([token_type_ids]))
start_scores, end_scores = linear_head(bert_outputs)

all_tokens = tokenizer.convert_ids_to_tokens(input_ids)
answer = ' '.join(all_tokens[torch.argmax(start_scores) : torch.argmax(end_scores)+1])

#assert answer == "a nice puppet"

In [42]:
all_tokens

['[CLS]',
 'who',
 'was',
 'jim',
 'henson',
 '?',
 '[SEP]',
 'jim',
 'henson',
 'was',
 'a',
 'nice',
 'puppet',
 '[SEP]']

In [45]:
end_scores

tensor([ 0.3921,  0.7497,  0.8149,  0.7072,  0.1412,  0.7454,  0.3920,  0.6636,
         0.1638,  0.5663,  0.5980, -0.2040, -0.1426,  0.3924],
       grad_fn=<SqueezeBackward1>)