In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

import collections
import math
import itertools

%matplotlib inline 

import torch
import torch.nn as nn
from torchtext import data
#from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from tqdm import tqdm # for progress bar

In [2]:
 !pip install transformers
 from transformers import DistilBertTokenizerFast, DistilBertForQuestionAnswering, AdamW

Collecting transformers
  Downloading transformers-4.17.0-py3-none-any.whl (3.8 MB)
[K     |████████████████████████████████| 3.8 MB 3.6 MB/s 
Collecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.4.0-py3-none-any.whl (67 kB)
[K     |████████████████████████████████| 67 kB 4.8 MB/s 
Collecting sacremoses
  Downloading sacremoses-0.0.47-py2.py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 33.2 MB/s 
Collecting tokenizers!=0.11.3,>=0.11.1
  Downloading tokenizers-0.11.6-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.5 MB)
[K     |████████████████████████████████| 6.5 MB 36.2 MB/s 
[?25hCollecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 33.1 MB/s 
Installing collected packages: pyyaml, tokenizers, sacremoses, huggingface-hub, transformers
  Attempting uninstall: pyyaml
    Foun

In [3]:
# if we want to run in GPU, set flag to True, else False for CPU
flag_gpu = True

device = torch.device('cuda' if (flag_gpu == True and torch.cuda.is_available()) else 'cpu')

In [4]:
import json, urllib


path = 'https://rajpurkar.github.io/SQuAD-explorer/dataset/'

fnames = ['train-v2.0.json', 'dev-v2.0.json']
data = list()
for fname in fnames:
    full_path = path + fname
    response = urllib.request.urlopen(full_path)
    dictionary = json.loads(response.read())
    data.append(dictionary['data'])     # we want to keep only the data

#some prints to understand the data 
for i, x in enumerate(data):
    print("Data of :", fnames[i])
    print("Length of data: ", len(x))
    print("Data Keys: ", x[0].keys())
    print("paragraphs Keys: ", x[0]['paragraphs'][0].keys())
    print("question Keys: ", x[0]['paragraphs'][0]['qas'][0].keys())
    print("answers Keys: ", x[0]['paragraphs'][0]['qas'][0]['answers'][0].keys())
    print("example title: ", x[0]['title'],'\n')

Data of : train-v2.0.json
Length of data:  442
Data Keys:  dict_keys(['title', 'paragraphs'])
paragraphs Keys:  dict_keys(['qas', 'context'])
question Keys:  dict_keys(['question', 'id', 'answers', 'is_impossible'])
answers Keys:  dict_keys(['text', 'answer_start'])
example title:  Beyoncé 

Data of : dev-v2.0.json
Length of data:  35
Data Keys:  dict_keys(['title', 'paragraphs'])
paragraphs Keys:  dict_keys(['qas', 'context'])
question Keys:  dict_keys(['question', 'id', 'answers', 'is_impossible'])
answers Keys:  dict_keys(['text', 'answer_start'])
example title:  Normans 



In [5]:
# function to modify data of json file parsed before, into list of dictionaries with all necessary info
# returns list of dictionaries( id, context, question, answer, answer_start )

def modify_data(data):
  data_list = []

  for paragraphs in data:
      for paragraph in paragraphs['paragraphs']:
          context = paragraph['context']
          for qa in paragraph['qas']:
              id = qa['id']
              question = qa['question']
              for ans in qa['answers']:
                  answer = ans['text']
                  ans_start = ans['answer_start']     # answer start is given
                  
                  dictionary = dict()
                  dictionary['id'] = id
                  dictionary['context'] = context
                  dictionary['question'] = question

                  #Sometimes answers are off by a character or two
                  ans_end = ans_start + len(answer)   # answer end is: start + length of answer
                  # If the answer text is off by 1 character
                  if context[ans_start-1:ans_end-1] == answer:
                    ans_start -= 1
                    ans_end -= 1     
                  # If the answer text is off by 2 characters
                  elif context[ans_start-2:ans_end-2] == answer:
                    ans_start -= 2
                    ans_end -= 2  
                  # If the answer text is off by any other difference, then set ans_end to None
                  elif context[ans_start:ans_end] != answer:
                    ans_end = None

                  dictionary['ans_idx'] = [ans_start, ans_end]

                  dictionary['answer'] = answer
                  data_list.append(dictionary)    
  return data_list



# ~~~~~~~  GET MODIFIED DATA  ~~~~~~~~ #

train_data = modify_data(data[0])
dev_data = modify_data(data[1])


In [6]:
###########################################
# Print mean of words in train and dev data
###########################################

def get_data_info(data):
  # get lengths of context+question and the length of answer
  concat = list()
  for x in data:
    concat.append( [ len(x['context'].split()) + len(x['question']) , len(x['answer'].split())])
  # create a dataframe with these lengths
  data_info_df = pd.DataFrame(concat, columns=['len_context_question' , 'len_answer'])
  len_data = data_info_df.shape[0]
  mean_len_cont_q = data_info_df['len_context_question'].mean()
  mean_len_answ = data_info_df['len_answer'].mean()
  print("Mean of words in context+question: ",mean_len_cont_q)
  print("Mean of words in answer: ", mean_len_answ)

get_data_info(train_data)
get_data_info(dev_data)

Mean of words in context+question:  179.40144665461122
Mean of words in answer:  3.1627716796627543
Mean of words in context+question:  190.89321249138015
Mean of words in answer:  3.0646734311890453


In [7]:
# Define tokenizer 

tokenizer = DistilBertTokenizerFast.from_pretrained("distilbert-base-uncased", do_lower_case=True)

# ~~~~~ DEFINE MAX LENGTH OF EACH EMBEDDING ~~~~~ #
max_len = 200
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ #

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/455k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/483 [00:00<?, ?B/s]

In [8]:
# function to find word-index by character-index
def get_word_index(text, char_idx):
  word_idx = 0
  curr_char_idx = 0
  text_list = text.split(' ')
  for word in text_list:
    curr_char_idx += int(len(word)+1)
    if curr_char_idx > char_idx[0]:
      break 
    else:
      word_idx += 1
  start_idx = word_idx

  word_idx = 0
  curr_char_idx = 0
  text_list = text.split(' ')
  end_idx=None
  for word in text_list:
    curr_char_idx += int(len(word)+1)
    if curr_char_idx > char_idx[1]:
      end_idx = word_idx
      break
    else:
      word_idx += 1
  return start_idx, end_idx

# function to find the segment id mask 
def get_segment_ids(special_tokens_mask, input_ids):
  mask = special_tokens_mask
  mask[0,0] = 0
  mask[0,mask.size(dim=1)-1] = 0 
  sep_idx =  torch.argmax(mask, dim=1)    # get separator index
  seg_qa = sep_idx+1      # 1st segment is question ( [CLS] until [SEP])
  seg_ans = input_ids.size(dim=1) - seg_qa    # 2nd segment is answer (middle [SEP] until end)
  seg_ids = [0] * seg_qa + [1]*seg_ans
  return seg_ids

# get token indices of answer by character indices before tokenization
def get_answer_indices(ans_idx, input):
  # get index of the token in the encoded output, with char_to_token
  start_idx = input.char_to_token(ans_idx[0])
  end_idx = input.char_to_token(ans_idx[1])
  # if start_idx is None, then the answer got truncated
  if start_idx is None:
    start_idx = max_len
    end_idx = max_len
    return start_idx, end_idx

  # if end_idx is None, shift left the end index, until find answer
  roffset = 1
  while end_idx is None and ans_idx[1] > roffset:
    end_idx = input.char_to_token(ans_idx[1] - roffset)
    roffset += 1
    
  if end_idx is None:
    end_idx = start_idx
  return start_idx, end_idx


def create_input(data_list):
  qa_list = list()
  max = 0
  for qa in tqdm(data_list):
    input  = tokenizer.encode_plus( qa['context'], qa['question'], add_special_tokens=True ,padding='max_length', max_length = max_len, truncation=True, return_tensors="pt", return_special_tokens_mask=True)    # tokenize with bert
   
   # compute segment ids
    seg_ids = get_segment_ids(input['special_tokens_mask'], input['input_ids'])

    # compute answer index
    start_idx, end_idx = get_answer_indices(qa['ans_idx'], input)

    #qa_list.append([input['input_ids'], input['attention_mask'], seg_ids, ans_idx, idxs,qa['ans_idx'], qa['answer'], tokenizer.convert_ids_to_tokens(input['input_ids'][0])])
    qa_list.append([input['input_ids'], input['attention_mask'], seg_ids, start_idx, end_idx])
  return qa_list


In [9]:
input_train = create_input(train_data)
input_dev = create_input(dev_data)

100%|██████████| 86821/86821 [01:34<00:00, 917.08it/s]
100%|██████████| 20302/20302 [00:23<00:00, 876.59it/s]


In [10]:
# class for custom dataset
class myDataset(torch.utils.data.Dataset):

  def __init__(self, data):
    self.items = data

  def __len__(self):
    return len(self.items)
  
  def __getitem__(self,idx):
    return list(torch.tensor(val) for val in self.items[idx])

In [11]:
# these functions are heavily influenced by the evaluation script on Rajpurkar & Jia et al. '18 paper
def normalize_text(s):
  """Removing articles and punctuation, and standardizing whitespace are all typical text processing steps."""
  import string, re

  def remove_articles(text):
    regex = re.compile(r"\b(a|an|the)\b", re.UNICODE)
    return re.sub(regex, " ", text)

  def white_space_fix(text):
    return " ".join(text.split())

  def remove_punc(text):
    exclude = set(string.punctuation)
    return "".join(ch for ch in text if ch not in exclude)

  def lower(text):
    return text.lower()

  return white_space_fix(remove_articles(remove_punc(lower(s))))

def compute_exact_match(pred, gold):
  return int(normalize_text(pred) == normalize_text(gold))

def compute_f1(pred, gold):
  pred_tokens = normalize_text(pred).split()
  gold_tokens = normalize_text(gold).split()
  
  # if either the prediction or the truth is no-answer then f1 = 1 if they agree, 0 otherwise
  if len(pred_tokens) == 0 or len(gold_tokens) == 0:
    return int(pred_tokens == gold_tokens)
  
  common_tokens = set(pred_tokens) & set(gold_tokens)
  
  # if there are no common tokens then f1 = 0
  if len(common_tokens) == 0:
    return 0
  
  prec = len(common_tokens) / len(pred_tokens)
  rec = len(common_tokens) / len(gold_tokens)
  
  return 2 * (prec * rec) / (prec + rec)

def get_scores(input, ans_start, ans_end, pred_start, pred_end):
  
  f1_scores = []
  EM_scores = []
  for real_start, real_end, pred_start, pred_end in zip(ans_start, ans_end, pred_start, pred_end):
    pred_text = tokenizer.convert_tokens_to_string(tokenizer.convert_ids_to_tokens(input[0][pred_start:pred_end]))
    gold_text = tokenizer.convert_tokens_to_string(tokenizer.convert_ids_to_tokens(input[0][real_start:real_end]))

    f1_scores.append(compute_f1(pred_text, gold_text))
    EM_scores.append(compute_exact_match(pred_text, gold_text))

  return f1_scores, EM_scores

In [16]:

#########################################
######   CLASSIFICATION FUNCTION   ######

def nn_loop(model, optimizer, traindf, valdf, epochs, batch_siz):
  
  train_dataset = myDataset(traindf)
  eval_dataset = myDataset(valdf)

  train = torch.utils.data.DataLoader(train_dataset, batch_size=batch_siz, shuffle=True)
  eval = torch.utils.data.DataLoader(eval_dataset, batch_size=batch_siz)

  losses_train = []
  losses_valid = []
  f1_scores_avg = []
  EM_scores_avg = []

  ########### epoch loop ##########
  for epoch in range(epochs):
    ##############
    # training
    ##############
    model.train()
    losses = 0
    for items in tqdm(train):       
      # items[0]=input_ids,  items[1]=attention_mask,  items[2]=segment_ids,  items[3]=start_idx,  items[4]=end_idx
      input = items[0].squeeze(1).to(device)
      attention_mask = items[1].squeeze(1).to(device)
      seg_ids = items[2].to(device)
      ans_start = items[3].to(device)
      ans_end = items[4].to(device)

      model.zero_grad() # clear any previously calculated gradients

      #loss, logits = model(input, attention_mask=attention_mask, labels=labels)
      outputs = model(input, attention_mask=attention_mask, start_positions=ans_start, end_positions=ans_end)

      losses += outputs.loss.item()

      #Perform backpropagation starting from the loss calculated in this epoch
      outputs.loss.backward()

      torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

      #Update net's weights based on the gradients calculated during backprop
      optimizer.step()

    # save loss for train in this epoch
    losses_train += [ losses / len(train)]
    ############
    # evaluation
    ############
    model.eval()
    losses=0
    f1_scores = 0
    EM_scores = 0
    counter = 0
    for items in tqdm(eval):

      input = items[0].squeeze(1).to(device)
      attention_mask = items[1].squeeze(1).to(device)
      seg_ids = items[2].to(device)
      ans_start = items[3].to(device)
      ans_end = items[4].to(device)

      with torch.no_grad():

        
        outputs = model(input, attention_mask=attention_mask, start_positions=ans_start, end_positions=ans_end)

        pred_start = torch.argmax(outputs[1],axis=1)  # get the most likely beginning of answer with the argmax of the score
        pred_end = torch.argmax(outputs[2],axis=1) + 1 

        f1, EM = get_scores(input, ans_start ,ans_end, pred_start, pred_end)
        f1_scores += sum(f1)
        EM_scores += sum(EM)
        counter += len(f1)

        losses += outputs.loss.item()

    # save loss for valid in this epoch
    losses_valid += [losses / len(eval)]
    f1_scores_avg.append(f1_scores / counter)
    EM_scores_avg.append(EM_scores / counter)

    print(losses_train, losses_valid, f1_scores_avg, EM_scores_avg)
  return losses_train, losses_valid, f1_scores_avg, EM_scores_avg

  



def classify(traindf, valdf, model, lr=2e-5, epochs=2, batch_siz=16):
  # use AdamW optimizer  
  optimizer = torch.optim.AdamW(model.parameters(), lr = lr, eps = 1e-8)

  train_losses, test_batch_losses, f1_scores, EM_scores = nn_loop(model, optimizer, traindf, valdf, epochs=epochs, batch_siz=batch_siz)

  for i , vals in enumerate(zip(train_losses, test_batch_losses, f1_scores, EM_scores)):
    print("Epoch: ", i+1)
    print("train loss: ", vals[0])
    print("validation loss: ", vals[1])
    print("F1 score avg: ", vals[2])
    print("EM score avg: ", vals[3], '\n')




In [None]:
# define our model. I am currently using distillBert.
bertModel = DistilBertForQuestionAnswering.from_pretrained("distilbert-base-uncased", output_attentions=False, output_hidden_states=False)
# move it to gpu if needed
if flag_gpu and torch.cuda.is_available():
  bertModel.cuda()


In [18]:
# ~~~~~~~~~   HYPER-PARAMETERS   ~~~~~~~~~~ #
lr = 5e-5
epochs = 2
batch_siz = 32
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ #

In [None]:
classify(input_train, input_dev, bertModel, lr, epochs, batch_siz)