In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/squad-2/train-v2.0.json
/kaggle/input/squad-2/dev-v2.0.json


In [2]:
import os
import requests
import json
import torch
from transformers import DistilBertTokenizerFast, DistilBertForQuestionAnswering, AdamW
from torch.utils.data import DataLoader
from tqdm import tqdm



In [3]:
# url = 'https://rajpurkar.github.io/SQuAD-explorer/dataset/'
# for file in ['train-v2.0.json', 'dev-v2.0.json']:
#     res = requests.get(f'{url}{file}')
#     with open(f'Transformer\Q&A\Data{file}', 'wb') as f:
#         for chunk in res.iter_content(chunk_size=4):
#             f.write(chunk)


def read_squad(path):
    with open(path, 'rb') as f:
        squad_dict = json.load(f)
    contexts = []
    questions = []
    answers = []
    for group in squad_dict['data']:
        for passage in group['paragraphs']:
            context = passage['context']
            for qa in passage['qas']:
                question = qa['question']
                if 'plausible_answers' in qa.keys():
                    access = 'plausible_answers'
                else:
                    access = 'answers'
                for answer in qa['answers']:
                    contexts.append(context)
                    questions.append(question)
                    answers.append(answer)
    return contexts, questions, answers


train_contexts, train_questions, train_answers = read_squad(
    '/kaggle/input/squad-2/train-v2.0.json')
val_contexts, val_questions, val_answers = read_squad(
    '/kaggle/input/squad-2/dev-v2.0.json')

print(train_answers[0])

{'text': 'in the late 1990s', 'answer_start': 269}


In [4]:
# train_contexts = train_contexts[:1000]
# train_questions = train_questions[:1000]
# train_answers = train_answers[:1000]

In [5]:
def add_end_idx(answers, contexts):
    for answer, context in zip(answers, contexts):
        gold_text = answer['text']
        start_idx = answer['answer_start']
        end_idx = start_idx + len(gold_text)
        if context[start_idx:end_idx] == gold_text:
            answer['answer_end'] = end_idx
        else:
            for i in [1, 2]:
                if context[start_idx - i:end_idx - i] == gold_text:
                    answer['answer_start'] = start_idx - i
                    answer['answer_end'] = end_idx - i


add_end_idx(train_answers, train_contexts)
add_end_idx(val_answers, val_contexts)

tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')

train_encodings = tokenizer(train_contexts, train_questions,
                            truncation=True, padding=True)
val_encodings = tokenizer(val_contexts, val_questions,
                          truncation=True, padding=True)

print(train_encodings.keys())

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

dict_keys(['input_ids', 'attention_mask'])


In [6]:
print(train_answers[:5])

[{'text': 'in the late 1990s', 'answer_start': 269, 'answer_end': 286}, {'text': 'singing and dancing', 'answer_start': 207, 'answer_end': 226}, {'text': '2003', 'answer_start': 526, 'answer_end': 530}, {'text': 'Houston, Texas', 'answer_start': 166, 'answer_end': 180}, {'text': 'late 1990s', 'answer_start': 276, 'answer_end': 286}]


In [7]:
print(train_encodings.char_to_token(0, train_answers[0]['answer_end'] - 1))

70


In [8]:
def add_token_positions(encodings, answers):
    start_positions = [] 
    end_positions = [] 
    for i in range(len(answers)):
        start_positions.append(encodings.char_to_token(i, answers[i]['answer_start']))
        end_positions.append(encodings.char_to_token(i, answers[i]['answer_end']))
        if start_positions[-1] is None:
            start_positions[-1] = tokenizer.model_max_length 
        go_back = 1 
        while end_positions[-1] is None:
            end_positions[-1] = encodings.char_to_token(i, answers[i]['answer_end'] - go_back)
            go_back += 1
    encodings.update({
        'start_positions': start_positions,
        'end_positions': end_positions
    })
    
add_token_positions(train_encodings, train_answers)
add_token_positions(val_encodings, val_answers)

In [9]:
class SquadDataset(torch.utils.data.Dataset):
    def __init__(self, encodings):
        self.encodings = encodings
    def __getitem__(self, idx):
        return {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
    def __len__(self):
        return len(self.encodings.input_ids)

In [10]:
train_dataset = SquadDataset(train_encodings)
val_dataset = SquadDataset(val_encodings)

In [11]:
model = DistilBertForQuestionAnswering.from_pretrained('distilbert-base-uncased')

Downloading pytorch_model.bin:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForQuestionAnswering: ['vocab_projector.weight', 'vocab_layer_norm.weight', 'vocab_layer_norm.bias', 'vocab_transform.weight', 'vocab_projector.bias', 'vocab_transform.bias']
- This IS expected if you are initializing DistilBertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForQuestionAnswering were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this mode

In [12]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'

model.to(device)
model.train() 
optim = AdamW(model.parameters(), lr=5e-5)
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
num_epochs = 3

for epoch in range(num_epochs):
    model.train() 
    loop = tqdm(train_loader, leave=True)
    for data in loop:
        optim.zero_grad() 
        input_ids = data['input_ids'].to(device)
        attention_mask = data['attention_mask'].to(device)
        start_positions = data['start_positions'].to(device)
        end_positions = data['end_positions'].to(device)
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, 
                        start_positions=start_positions, end_positions=end_positions)
        loss = outputs[0] 
        loss.backward() 
        optim.step() 
        loop.set_description(f'Epoch {epoch + 1}')
        loop.set_postfix(loss=loss.item())

Epoch 1: 100%|██████████| 2714/2714 [1:10:11<00:00,  1.55s/it, loss=0.519]
Epoch 2: 100%|██████████| 2714/2714 [1:10:17<00:00,  1.55s/it, loss=0.427]
Epoch 3: 100%|██████████| 2714/2714 [1:10:14<00:00,  1.55s/it, loss=0.759]


In [13]:
model_path = '/kaggle/working/distilbert_finetune'
model.save_pretrained(model_path)
tokenizer.save_pretrained(model_path)

('/kaggle/working/distilbert_finetune/tokenizer_config.json',
 '/kaggle/working/distilbert_finetune/special_tokens_map.json',
 '/kaggle/working/distilbert_finetune/vocab.txt',
 '/kaggle/working/distilbert_finetune/added_tokens.json',
 '/kaggle/working/distilbert_finetune/tokenizer.json')

In [14]:
model.eval() 
val_loader = DataLoader(val_dataset, batch_size=32)

acc = [] 

loop = tqdm(val_loader)
for data in loop:
    with torch.no_grad():
        input_ids = data['input_ids'].to(device)
        attention_mask = data['attention_mask'].to(device)
        start_true = data['start_positions'].to(device)
        end_true = data['end_positions'].to(device)
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        start_pred = torch.argmax(outputs['start_logits'], dim=1)
        end_pred = torch.argmax(outputs['end_logits'], dim=1) 
        acc.append(((start_pred == start_true).sum() / len(start_pred)).item())
        acc.append(((end_pred == end_true).sum() / len(end_pred)).item())
        
acc = sum(acc) / len(acc)
print(f'Accuracy: {acc}')

100%|██████████| 635/635 [05:57<00:00,  1.78it/s]

Accuracy: 0.6613540495063845





In [15]:
print('T/F\tstart\tend\n')
for i in range(10):
    print(f'true\t{start_true[i]}\t{end_true[i]}\n'
          f'pred\t{start_pred[i]}\t{end_pred[i]}\n')

T/F	start	end

true	67	68
pred	66	68

true	67	68
pred	66	68

true	67	68
pred	66	68

true	66	68
pred	66	68

true	171	173
pred	151	65

true	171	173
pred	151	65

true	171	173
pred	151	65

true	171	173
pred	151	65

true	171	173
pred	151	65

true	158	161
pred	2	4

