In [1]:
import numpy as np
import pandas as pd
import torch.nn as nn

import keras
import os
import re
import torch 
import transformers
import gzip

from collections import Counter, defaultdict
from itertools import islice
from tqdm import trange, tqdm
from torch.optim import Adam
from transformers import BertTokenizer, BertModel, BertForQuestionAnswering
from transformers import AdamW
from transformers import get_cosine_with_hard_restarts_schedule_with_warmup
from transformers import get_linear_schedule_with_warmup

from models.QAheads import *
from models.utils import *
from utils import *

Using TensorFlow backend.


GPU is available


In [2]:
#device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device = "cpu"

## Load data into memory

In [None]:
# load SubjQA_data into memory
subjqa_data_train = get_data(source='/SubjQA/', split='/train', domain='all')
subjqa_data_dev = get_data(source='/SubjQA/', split='/dev', domain='all')
subjqa_data_test = get_data(source='/SubjQA/', split='/test', domain='all')

In [3]:
# load SQuAD_data into memory
squad_data_train = get_data(
                            source='/SQuAD/',
                            split='train',
                            )

#squad_data_dev = get_data(
#                          source='/SQuAD/',
#                          split='dev',
#                          )

## Create train and dev QA examples

In [None]:
# TODO: figure out, whether we should use pretrained weights from 'bert-base-cased' or 'bert-base-uncased' model
bert_tokenizer = BertTokenizer.from_pretrained('bert-base-cased')

# BERT cannot deal with sequences, where T > 512
max_seq_length = 512

In [None]:
squad_examples_train = create_examples(
                                       squad_data_train,
                                       is_training=True,
                                       )

# create train and dev examples from train set only
squad_examples_train, squad_examples_dev = split_into_train_and_dev(squad_examples_train)

In [None]:
squad_features_train = convert_examples_to_features(
                                                    squad_examples_train, 
                                                    bert_tokenizer,
                                                    max_seq_length=max_seq_length,
                                                    doc_stride=100,
                                                    max_query_length=50,
                                                    is_training=True,
                                                    )

In [None]:
squad_features_dev = convert_examples_to_features(
                                                squad_examples_dev, 
                                                bert_tokenizer,
                                                max_seq_length=max_seq_length,
                                                doc_stride=100,
                                                max_query_length=50,
                                                is_training=True,
                                                )

In [None]:
squad_tensor_dataset_train = create_tensor_dataset(
                                                   squad_features_train,
                                                   evaluate=False,
                                                   )

squad_tensor_dataset_dev = create_tensor_dataset(
                                                 squad_features_dev,
                                                 evaluate=False,
                                                 )

## Create train and dev dataloaders

In [None]:
squad_train_dl = create_batches(
                                dataset=squad_tensor_dataset_train,
                                batch_size=8,
                                split='train',
                                )

squad_dev_dl = create_batches(
                              dataset=squad_tensor_dataset_dev,
                              batch_size=8,
                              split='eval',
                               )

In [None]:
# bert_encoder = BertModel.from_pretrained('bert-large-uncased-whole-word-masking-finetuned-squad')
bert_encoder = BertModel.from_pretrained('bert-large-cased-whole-word-masking-finetuned-squad')

# initialise QA heads
linear_head = LinearQAHead()
recurrent_head = RecurrentQAHead(max_seq_length=max_seq_length)

# cast models to device
bert_encoder.to(device)
linear_head.to(device)
recurrent_head.to(device)

# set current QA head
qa_head = recurrent_head

In [11]:
args = {
        "n_epochs": 3,
        "lr": 1e-3,
        "warmup_steps": 100,
        "max_grad_norm": 10,
        "squad": True,
        }

In [16]:
train(
     bert_encoder,
     qa_head,
     squad_train_dl,
     squad_dev_dl,
     args,
    )

------ BERT model is set to evaluation mode. -------


Epoch:   0%|                                                                                     | 0/3 [00:00<?, ?it/s]

------ Training of QA head. BERT model is frozen. ------



Iteration:   0%|                                                                              | 0/1909 [00:00<?, ?it/s]

AttributeError: 'EncoderLSTM' object has no attribute 'bidir'

In [13]:
def train(
          bert_encoder, 
          qa_head,
          train_dl,
          val_dl,
          args,
          ):
    
    t_total = len(train_dl) * args['n_epochs'] # total number of training steps (i.e., step = iteration)

    # set pre-trained BERT model to eval mode, if we want to evaluate model's performance on SQuAD
    if args["squad"]:
        bert_encoder.eval()
        print("------ BERT model is set to evaluation mode. -------")
        
    else:        
        bert_optimizer = AdamW(
                          bert_encoder.parameters(), 
                          lr=args['lr'], 
                          correct_bias=False,
                          )
    
        bert_scheduler = get_linear_schedule_with_warmup(
                                                        bert_optimizer, 
                                                        num_warmup_steps=args["warmup_steps"], 
                                                        num_training_steps=t_total,
                                                        )
    
    # store loss and accuracy for plotting
    train_loss = []
    train_accs = []
    train_f1s = []
    val_accs = []
    val_f1s = []
    
    
    loss_func = nn.CrossEntropyLoss()
        
    qa_head_optimizer = Adam(
                            qa_head.parameters(), 
                            lr=args['lr'],
                            )


    for _ in trange(args['n_epochs'],  desc="Epoch"):

        ### Training ###
        
        # set model to train mode (as opposed to eval mode)
        if not args["squad"]:
            print("------ Fine-tuning pre-trained BERT model AND training of QA head. ------")
            bert_encoder.train()
            
        else:
            print("------ Training of QA head. BERT model is frozen. ------")

        # we need to train the QA head for both SQuAD and SubjQA
        qa_head.train()

        tr_loss, tr_acc = 0, 0
        nb_tr_examples, nb_tr_steps = 0, 0
        
        total_loss = 0


        for step, batch in enumerate(tqdm(train_dl, desc="Iteration")):

            # add batch to GPU
            batch = tuple(t.to(device) for t in batch)

            # unpack inputs from dataloader            
            b_input_ids, b_attn_masks, b_token_type_ids, b_input_lengths, b_start_pos, b_end_pos, b_cls_indexes, _ = batch
            
            if args["squad"]:
                # zero-out gradients
                qa_head_optimizer.zero_grad()
            else:
                bert_optimizer.zero_grad()
                bert_scheduler.zero_grad()
                qa_head_optimizer.zero_grad()
            
            bert_reps = bert_encoder(b_input_ids, token_type_ids=b_token_type_ids, attention_mask=b_attn_masks)
            
            if hasattr(qa_head, 'lstm_encoder'):
                start_logits, end_logits = qa_head(bert_reps, b_input_lengths)
            else:
                start_logits, end_logits = qa_head(bert_reps)
            
            # start and end loss need to be computed separately
            
            start_loss = loss_func(start_logits, b_start_pos)
            end_loss = loss_func(end_logits, b_end_pos)
            
            total_loss = (start_loss + end_loss) / 2
            print("Current loss: {}".format(total_loss))

            train_loss.append(total_loss.item())
                
            #start_logits = _to_cpu(start_logits)
            
            #log_probas = F.log_softmax(logits, dim=1).numpy() # or F.softmax(logits).numpy()
            #preds_flat = np.argmax(log_probas, axis=1).flatten()


            # backpropagation
            #start_loss.backward()
            #end_loss.backward()
            total_loss.backward()
            
            if args["squad"]:
                torch.nn.utils.clip_grad_norm_(qa_head.parameters(), args["max_grad_norm"])
                
                # update qa_head parameters and take a step using the computed gradient
                qa_head_optimizer.step()

            else:
                torch.nn.utils.clip_grad_norm_(bert_encoder.parameters(), args["max_grad_norm"])
                torch.nn.utils.clip_grad_norm_(qa_head.parameters(), args["max_grad_norm"])

                # update model parameters and take a step using the computed gradient
                bert_optimizer.step()
                bert_scheduler.step()
                qa_head_optimizer.step()

            tr_loss += total_loss.item()
            
            #tr_acc += tr_acc_current
            #tr_f1 += tr_f1_current

            nb_tr_examples += b_input_ids.size(0)
            nb_tr_steps += 1

        print("Train loss: {}".format(tr_loss/nb_tr_steps))
        break
        
        #print("Train acc: {}".format(tr_acc/nb_tr_steps))
        #print("Train f1: {}".format(tr_f1/nb_tr_steps))
        
        #train_accs.append(tr_acc/nb_tr_steps)
        #train_f1s.append(tr_f1/nb_tr_steps)

        

        ### Validation ###

        # set models to eval mode to evaluate loss on the validation set
        if not args["squad"]:
            bert_encoder.eval()
            
        qa_head.eval()

        eval_acc, eval_f1 = 0, 0
        nb_eval_steps = 0

        for batch in val_dl:

            # add batch to GPU
            batch = tuple(t.to(device) for t in batch)

            # unpack inputs from dataloader
            b_input_ids, b_input_mask, b_labels = batch

            # telling the model not to compute or store gradients, saving memory and speeding up validation

            with torch.no_grad():
                if model_name == 'DistilBERT':
                    logits = model(b_input_ids, attention_mask=b_input_mask)
                else:
                    logits = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask)

            # move logits and labels to CPU to compute NumPy operations
            logits = logits[0].detach().cpu()
            label_ids = b_labels.to('cpu').numpy()
            
            log_probas = F.log_softmax(logits, dim=1).numpy() # or F.softmax(logits).numpy()
            preds_flat = np.argmax(log_probas, axis=1).flatten()

            tmp_eval_acc = accuracy(preds_flat, label_ids)
            tmp_eval_f1 = f1(preds_flat, label_ids)

            eval_acc += tmp_eval_acc
            eval_f1 += tmp_eval_f1
            nb_eval_steps += 1

        print("Val acc: {}".format(eval_acc/nb_eval_steps))
        print("Val f1: {}".format(eval_f1/nb_eval_steps))
        val_accs.append(eval_acc/nb_eval_steps)
        val_f1s.append(eval_f1/nb_eval_steps)
        
    return train_loss, train_accs, train_f1s, val_accs, val_f1s, model

In [38]:
support, question = "Who was Jim Henson?", "Jim Henson was a nice puppet"

In [39]:
input_ids = tokenizer.encode(support, question)

In [27]:
encoded_dict = tokenizer.encode_plus(support, question, max_length=19, pad_to_max_length=True)

In [29]:
len(encoded_dict['input_ids'])

19

In [40]:
tokenizer.decode(input_ids)

'[CLS] who was jim henson? [SEP] jim henson was a nice puppet [SEP]'

In [41]:
token_type_ids = [0 if i <= input_ids.index(102) else 1 for i in range(len(input_ids))]
bert_outputs = bert_encoder(torch.tensor([input_ids]), token_type_ids=torch.tensor([token_type_ids]))
start_scores, end_scores = linear_head(bert_outputs)

all_tokens = tokenizer.convert_ids_to_tokens(input_ids)
answer = ' '.join(all_tokens[torch.argmax(start_scores) : torch.argmax(end_scores)+1])

#assert answer == "a nice puppet"

In [42]:
all_tokens

['[CLS]',
 'who',
 'was',
 'jim',
 'henson',
 '?',
 '[SEP]',
 'jim',
 'henson',
 'was',
 'a',
 'nice',
 'puppet',
 '[SEP]']

In [45]:
end_scores

tensor([ 0.3921,  0.7497,  0.8149,  0.7072,  0.1412,  0.7454,  0.3920,  0.6636,
         0.1638,  0.5663,  0.5980, -0.2040, -0.1426,  0.3924],
       grad_fn=<SqueezeBackward1>)

In [43]:
print(answer)


