In [97]:
import os 

In [98]:
url = 'https://rajpurkar.github.io/SQuAD-explorer/dataset/'

In [99]:
import requests

In [100]:
for file in ['train-v2.0.json', 'dev-v2.0.json']:
  res = requests.get(f'{url}{file}')
  #after make request we save file to squad
  with open(f'squad/{file}', 'wb') as f: #wb for writing binary file
    for chunk in res.iter_content(chunk_size=4):
      f.write(chunk)
    res

In [101]:
#Again what we are really training is answer start and answer end, in the train file we have question, context, answer as 'text', and answer start
#defining the initial index
res

<Response [200]>

DATA PREP: All we want is the context, the questions, the answers! We are just gonna have a lists of stringest at the ends, we will also have the starting position.

In [102]:
import json

In [103]:
with open('squad/train-v2.0.json', 'rb') as f:
  squad_dict = json.load(f)

In [126]:
len(squad_dict['data']) 

442

In [105]:
squad_dict['data'][0]['paragraphs'][2]['context'] #as we can see from the top, all of the context info is within 'paragraphs'
#Then here we can extract the context 
#Seems like its dictionary->list->dictionary->list->dictionary for the value we need!

'A self-described "modern-day feminist", Beyoncé creates songs that are often characterized by themes of love, relationships, and monogamy, as well as female sexuality and empowerment. On stage, her dynamic, highly choreographed performances have led to critics hailing her as one of the best entertainers in contemporary popular music. Throughout a career spanning 19 years, she has sold over 118 million records as a solo artist, and a further 60 million with Destiny\'s Child, making her one of the best-selling music artists of all time. She has won 20 Grammy Awards and is the most nominated woman in the award\'s history. The Recording Industry Association of America recognized her as the Top Certified Artist in America during the 2000s decade. In 2009, Billboard named her the Top Radio Songs Artist of the Decade, the Top Female Artist of the 2000s and their Artist of the Millennium in 2011. Time listed her among the 100 most influential people in the world in 2013 and 2014. Forbes magaz

In [106]:
#So its clear to grab the context we have to use a for loop!

def read_data(path):
  with open(path, 'rb') as f:
    squad_dict = json.load(f)

    contexts = []
    questions = []
    answers = []
  for group in squad_dict['data']:
    for passage in group['paragraphs']:
      context = passage['context']
      for qa in passage['qas']:
        question = qa['question']
        if 'plausible_answers' in qa.keys():
          access = 'plausible_answers'
        else:
          access = 'answers'
        for answer in qa[access]:
          contexts.append(context)
          questions.append(question)
          answers.append(answer)
#We want to do this for both the training and validation set so lets insert this into a function.
    return contexts, questions, answers

In [107]:
train_contexts, train_questions, train_answers = read_data('squad/train-v2.0.json')
test_contexts, test_questions, test_answers = read_data('squad/dev-v2.0.json')

In [127]:
len(train_answers[0])

3

In [109]:
test_contexts[:2]

['The Normans (Norman: Nourmands; French: Normands; Latin: Normanni) were the people who in the 10th and 11th centuries gave their name to Normandy, a region in France. They were descended from Norse ("Norman" comes from "Norseman") raiders and pirates from Denmark, Iceland and Norway who, under their leader Rollo, agreed to swear fealty to King Charles III of West Francia. Through generations of assimilation and mixing with the native Frankish and Roman-Gaulish populations, their descendants would gradually merge with the Carolingian-based cultures of West Francia. The distinct cultural and ethnic identity of the Normans emerged initially in the first half of the 10th century, and it continued to evolve over the succeeding centuries.',
 'The Normans (Norman: Nourmands; French: Normands; Latin: Normanni) were the people who in the 10th and 11th centuries gave their name to Normandy, a region in France. They were descended from Norse ("Norman" comes from "Norseman") raiders and pirates 

In [110]:
#We have the starting positions, but we also need the end index
train_answers[:2] #We cant actually just grab length of text and add that on to the starting index because some of the answer starts are incorrect

[{'answer_start': 269, 'text': 'in the late 1990s'},
 {'answer_start': 207, 'text': 'singing and dancing'}]

In [111]:
def add_end_index(answers, contexts):
  for answer,context in zip(answers, contexts):
    gold_text = answer['text']
    start_idx = answer['answer_start']
    end_idx = start_idx + len(gold_text)
    #Now we add in the logic where the characters index are off 
    if context[start_idx:end_idx] == gold_text:
      answer['answer_end'] = end_idx 
    else:
      for n in [1,2]:
        if context[start_idx-n,end_idx-n] == gold_text:
          answer['answer_start'] = context[start_idx-n]
          answer['answer_end'] = context[end_idx - n]
add_end_index(train_answers, train_contexts)
add_end_index(test_answers, test_contexts)

In [112]:
train_answers[0]['answer_end']

286

Tokenize/Encode text

In [113]:
import transformers 
from transformers import DistilBertTokenizerFast 

In [114]:
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')

In [115]:
train_encoding = tokenizer(train_contexts, train_questions, truncation=True, padding=True)
#This is is now going to merge the two strings together but there will be a seperator token 
#Truncation and padding set to true of course to make sure they are all of the same size 
test_encoding = tokenizer(test_contexts, test_questions, truncation=True, padding=True)

In [116]:
train_encoding.keys() #We have the input ids and attention masks!
train_encoding['input_ids'][0] #As we can see the tokenizer swithced it from words to tokens to ids ! with a cls = 101, sep = 102


[101,
 20773,
 21025,
 19358,
 22815,
 1011,
 5708,
 1006,
 1013,
 12170,
 23432,
 29715,
 3501,
 29678,
 12325,
 29685,
 1013,
 10506,
 1011,
 10930,
 2078,
 1011,
 2360,
 1007,
 1006,
 2141,
 2244,
 1018,
 1010,
 3261,
 1007,
 2003,
 2019,
 2137,
 3220,
 1010,
 6009,
 1010,
 2501,
 3135,
 1998,
 3883,
 1012,
 2141,
 1998,
 2992,
 1999,
 5395,
 1010,
 3146,
 1010,
 2016,
 2864,
 1999,
 2536,
 4823,
 1998,
 5613,
 6479,
 2004,
 1037,
 2775,
 1010,
 1998,
 3123,
 2000,
 4476,
 1999,
 1996,
 2397,
 4134,
 2004,
 2599,
 3220,
 1997,
 1054,
 1004,
 1038,
 2611,
 1011,
 2177,
 10461,
 1005,
 1055,
 2775,
 1012,
 3266,
 2011,
 2014,
 2269,
 1010,
 25436,
 22815,
 1010,
 1996,
 2177,
 2150,
 2028,
 1997,
 1996,
 2088,
 1005,
 1055,
 2190,
 1011,
 4855,
 2611,
 2967,
 1997,
 2035,
 2051,
 1012,
 2037,
 14221,
 2387,
 1996,
 2713,
 1997,
 20773,
 1005,
 1055,
 2834,
 2201,
 1010,
 20754,
 1999,
 2293,
 1006,
 2494,
 1007,
 1010,
 2029,
 2511,
 2014,
 2004,
 1037,
 3948,
 3063,
 4969,
 1010,
 36

In [117]:
def add_token_positions(encodings, answer):
  start_positions = []
  end_positions = []
  for i in range(len(answer)):
    start_positions.append(encodings.char_to_token(i, answer[i]['answer_start']))
    end_positions.append(encodings.char_to_token(i, answer[i]['answer_end']))
    if start_positions[-1] is None:
      start_positions[-1] = tokenizer.model_max_length #Since there are a chances that context gets broken off to hit question in 
    go_back = -1
    while end_positions[-1] is None:
      end_positions[-1] = encodings.char_to_token(i, answer[i]['answer_end']-go_back)
      go_back +=1
  encodings.update({
      'start_positions':start_positions,
      'end_positions':end_positions
  })
add_token_positions(train_encoding, train_answers)
add_token_positions(test_encoding, test_answers)
train_encoding.keys()

dict_keys(['input_ids', 'attention_mask', 'start_positions', 'end_positions'])

In [118]:
import torch 
import tensorflow as tf
class SquadDataset(torch.utils.data.Dataset):
  def __init__(self, encodings):
    self.encodings = encodings 
  def __getitem__(self, idx):
    return {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  def __len__(self):
    return len(self.encodings.input_ids) #This is how we prototype with torch. We are going to have to use our brains to figure out how to do
    #this with tensorflow which is my goal.

In [119]:
train_datasets = SquadDataset(train_encoding)
val_datasets = SquadDataset(test_encoding)

Fine tuning

In [120]:
from transformers import DistilBertForQuestionAnswering
model = DistilBertForQuestionAnswering.from_pretrained('distilbert-base-uncased')

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForQuestionAnswering: ['vocab_layer_norm.weight', 'vocab_layer_norm.bias', 'vocab_projector.bias', 'vocab_projector.weight', 'vocab_transform.weight', 'vocab_transform.bias']
- This IS expected if you are initializing DistilBertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForQuestionAnswering were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['qa_outputs.weight', 'qa_outputs.bias']
You should probably TRAIN this mode

In [121]:
from torch.utils.data import DataLoader
from transformers import AdamW
from tqdm import tqdm #shows progress of training

In [122]:
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
model.to(device)
model.train()
optim = AdamW(model.parameters(), lr = 5e-5)

In [123]:
train_loader = DataLoader(train_datasets, batch_size = 16, shuffle=True)

In [124]:
train_encoding.keys()

dict_keys(['input_ids', 'attention_mask', 'start_positions', 'end_positions'])

In [125]:
for epoch in range(3):
  loop = tqdm(train_loader)
  for batch in loop:
    optim.zero_grad() #Resets gradients at start of each loop so after we update the models parameters accordingly we dont start from last point
    input_ids = batch['input_ids'].to(device)
    attention_mask = batch['attention_mask'].to(device)
    start_positions = batch['start_positions'].to(device)
    end_positions = batch['end_positions'].to(device)
    outputs = model(input_ids, attention_mask=attention_mask, start_positions = start_positions, end_positions = end_positions)

    loss = outputs[0]
    loss.backward() #gradient work
    optim.step() 

    loop.set_description(f'Epoch{epoch}')
    loop.set_postfix(loss = loss.item())

Epoch0: 100%|██████████| 48/48 [01:04<00:00,  1.34s/it, loss=3.41]
Epoch1: 100%|██████████| 48/48 [01:04<00:00,  1.34s/it, loss=1.93]
Epoch2: 100%|██████████| 48/48 [01:04<00:00,  1.34s/it, loss=2.55]
