How to build Custom Q&A transformer models in Python from https://www.youtube.com/watch?v=ZIRmXkHp0-c&ab_channel=JamesBriggs

In [3]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.19.2-py3-none-any.whl (4.2 MB)
[K     |████████████████████████████████| 4.2 MB 8.4 MB/s 
[?25hCollecting tokenizers!=0.11.3,<0.13,>=0.11.1
  Downloading tokenizers-0.12.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.6 MB)
[K     |████████████████████████████████| 6.6 MB 35.6 MB/s 
Collecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 44.7 MB/s 
Collecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.7.0-py3-none-any.whl (86 kB)
[K     |████████████████████████████████| 86 kB 6.0 MB/s 
Installing collected packages: pyyaml, tokenizers, huggingface-hub, transformers
  Attempting uninstall: pyyaml
    Found existing installation: PyYAML 3.13
    Uninstallin

In [4]:
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [5]:
import os

In [6]:
!mkdir squad

In [9]:
%cd squad

/content/squad


In [7]:
url = 'https://rajpurkar.github.io/SQuAD-explorer/dataset/'

In [10]:
import requests

In [11]:
res = requests.get(f'{url}train-v2.0.json')

In [12]:
for file in ['train-v2.0.json', 'dev-v2.0.json']:
    res = requests.get(f'{url}{file}')
    # write to file
    with open(f'{file}', 'wb') as f:
        for chunk in res.iter_content(chunk_size=4):
            f.write(chunk)

Data prep

In [13]:
import json

In [15]:
def read_squad(path):
  with open(path, 'rb') as f:
    squad_dict = json.load(f)
  contexts = []
  questions = []
  answers = []

  for group in squad_dict['data']:
    for passage in group['paragraphs']:
      context= passage['context']
      for qa in passage['qas']:
        question = qa['question']
        if 'plausible_answers' in qa.keys():
          access = 'plausible_answers'
        else:
          access = 'answers'
        for answer in qa[access]:
          contexts.append(context)
          questions.append(question)
          answers.append(answer)
  return contexts,questions,answers
    
    

In [16]:
train_contexts,train_questions,train_answers= read_squad('train-v2.0.json')
val_contexts,val_questions,val_answers= read_squad('dev-v2.0.json')

In [17]:
train_answers[0]

{'answer_start': 269, 'text': 'in the late 1990s'}

In [18]:
def add_end_idx(answers,contexts):
  for answer,context in zip(answers,contexts):
    gold_text=answer['text']
    start_idx=answer['answer_start']
    end_idx=start_idx+ len(gold_text) 

    if context[start_idx:end_idx] == gold_text:
      answer['answer_end'] = end_idx
    else:
      for n in [1,2]:
        if context[start_idx-n:end_idx-n] == gold_text:
          answer['answer_start'] = start_idx-n
          answer['answer_end'] = end_idx-n

add_end_idx(train_answers,train_contexts)
add_end_idx(val_answers,val_contexts) 
 

In [19]:
train_answers[:5]

[{'answer_end': 286, 'answer_start': 269, 'text': 'in the late 1990s'},
 {'answer_end': 226, 'answer_start': 207, 'text': 'singing and dancing'},
 {'answer_end': 530, 'answer_start': 526, 'text': '2003'},
 {'answer_end': 180, 'answer_start': 166, 'text': 'Houston, Texas'},
 {'answer_end': 286, 'answer_start': 276, 'text': 'late 1990s'}]

Tokenize/Encode

In [20]:
from transformers import DistilBertTokenizerFast

In [21]:
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased') 

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/455k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/483 [00:00<?, ?B/s]

In [22]:
train_encodings= tokenizer(train_contexts,train_questions, truncation=True, padding=True)
val_encodings= tokenizer(val_contexts,val_questions, truncation=True, padding=True)

In [23]:
train_encodings.keys()

dict_keys(['input_ids', 'attention_mask'])

In [24]:
tokenizer.decode(train_encodings['input_ids'][0])

'[CLS] beyonce giselle knowles - carter ( / biːˈjɒnseɪ / bee - yon - say ) ( born september 4, 1981 ) is an american singer, songwriter, record producer and actress. born and raised in houston, texas, she performed in various singing and dancing competitions as a child, and rose to fame in the late 1990s as lead singer of r & b girl - group destiny\'s child. managed by her father, mathew knowles, the group became one of the world\'s best - selling girl groups of all time. their hiatus saw the release of beyonce\'s debut album, dangerously in love ( 2003 ), which established her as a solo artist worldwide, earned five grammy awards and featured the billboard hot 100 number - one singles " crazy in love " and " baby boy ". [SEP] when did beyonce start becoming popular? [SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] 

In [37]:
def add_token_positions(encodings,answers):
  start_positions=[]
  end_positions=[]
  for i in range(len(answers)):
    start_positions.append(encodings.char_to_token(i,answers[i]['answer_start']))
    end_positions.append(encodings.char_to_token(i,answers[i]['answer_end']))
    if start_positions[-1] is None:
      start_positions[-1] = tokenizer.model_max_length
    go_back=1
    while end_positions[-1] is None:
      end_positions[-1] = encodings.char_to_token(i,answers[i]['answer_end']-go_back)
      go_back+=1
  encodings.update({
      'start_positions': start_positions,
      'end_positions': end_positions
  })

add_token_positions(train_encodings,train_answers)
add_token_positions(val_encodings,val_answers)


In [29]:
import torch

class squadDataset(torch.utils.data.Dataset):
  def __init__(self,encodings):
    self.encodings = encodings
  def __getitem__(self,idx):
    return {key: torch.tensor(val[idx]) for key,val in self.encodings.items()}
  def __len__(self):
    return len(self.encodings.input_ids)

In [30]:
train_datasets = squadDataset(train_encodings)
val_datasets = squadDataset(val_encodings)


Fine-Tuning

In [31]:
from transformers import DistilBertForQuestionAnswering
model = DistilBertForQuestionAnswering.from_pretrained('distilbert-base-uncased')

Downloading:   0%|          | 0.00/256M [00:00<?, ?B/s]

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForQuestionAnswering: ['vocab_layer_norm.bias', 'vocab_transform.weight', 'vocab_layer_norm.weight', 'vocab_transform.bias', 'vocab_projector.weight', 'vocab_projector.bias']
- This IS expected if you are initializing DistilBertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForQuestionAnswering were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this mode

In [32]:
from torch.utils.data import DataLoader
from transformers import AdamW
from tqdm import tqdm

In [33]:
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
model.to(device)
model.train()
optim = AdamW(model.parameters(), lr=5e-5)



In [34]:
train_loader = DataLoader(train_datasets, batch_size=16,shuffle=True)

In [38]:
for epoch in range(3):
  loop = tqdm(train_loader)
  for batch in loop:
    optim.zero_grad()

    input_ids = batch['input_ids'].to(device)
    attention_mask = batch['attention_mask'].to(device)
    start_positions = batch['start_positions'].to(device)
    end_positions = batch['end_positions'].to(device)

    outputs = model(input_ids, attention_mask = attention_mask,
                    start_positions = start_positions,
                    end_positions = end_positions)
    loss = outputs[0]
    loss.backward()
    optim.step()

    loop.set_description(f'Epoch{epoch}')
    loop.set_postfix(loss=loss.item())

Epoch0: 100%|██████████| 8145/8145 [58:46<00:00,  2.31it/s, loss=0.926]
Epoch1: 100%|██████████| 8145/8145 [58:42<00:00,  2.31it/s, loss=0.76]
Epoch2: 100%|██████████| 8145/8145 [58:45<00:00,  2.31it/s, loss=0.615]


In [39]:
model_path='model/distibert-custom'
model.save_pretrained(model_path)
tokenizer.save_pretrained(model_path)

('model/distibert-custom/tokenizer_config.json',
 'model/distibert-custom/special_tokens_map.json',
 'model/distibert-custom/vocab.txt',
 'model/distibert-custom/added_tokens.json',
 'model/distibert-custom/tokenizer.json')

In [40]:
model.eval()

DistilBertForQuestionAnswering(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0): TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
            

In [41]:
val_loader= DataLoader(val_datasets, batch_size=16)
acc=[]

loop = tqdm(val_loader)
for batch in loop:
  with torch.no_grad():
    input_ids = batch['input_ids'].to(device)
    attention_mask = batch['attention_mask'].to(device)
    start_true = batch['start_positions'].to(device)
    end_true = batch['end_positions'].to(device)

    outputs = model(input_ids, attention_mask = attention_mask)

    start_pred=torch.argmax(outputs['start_logits'], dim=1)
    end_pred=torch.argmax(outputs['end_logits'], dim=1)

    acc.append(((start_pred==start_true).sum()/len(start_pred)).item())
    acc.append(((end_pred==end_true).sum()/len(end_pred)).item())


100%|██████████| 1640/1640 [03:48<00:00,  7.17it/s]


In [42]:
sum(acc)/len(acc)

0.6413681402439024