In [1]:
import json
from abc import ABC

import numpy as np
import random
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset
from transformers import AdamW, BertForQuestionAnswering, BertTokenizerFast

from tqdm.auto import tqdm
import csv

In [2]:
if torch.cuda.is_available():
    device = 'cuda'
    torch.cuda.manual_seed(666)
else:
    device = 'cpu'
torch.manual_seed(666)
random.seed(666)

In [3]:
train_data = json.load(open('hw7_train.json', 'r', encoding='utf-8'))
dev_data = json.load(open('hw7_dev.json', 'r', encoding='utf-8'))
test_data = json.load(open('hw7_test.json', 'r', encoding='utf-8'))

train_questions = train_data['questions']
train_paragraphs = train_data['paragraphs']

dev_questions = dev_data['questions']
dev_paragraphs = dev_data['paragraphs']

test_questions = test_data['questions']
test_paragraphs = test_data['paragraphs']

In [4]:
model = BertForQuestionAnswering.from_pretrained('bert-base-chinese')
tokenizer = BertTokenizerFast.from_pretrained('bert-base-chinese')

Some weights of the model checkpoint at bert-base-chinese were not used when initializing BertForQuestionAnswering: ['cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at bert-base-chinese a

In [5]:
tokenized_train_question = tokenizer([text['question_text'].replace(' ', '') for text in train_questions], add_special_tokens=False)
tokenized_dev_question = tokenizer([text['question_text'].replace(' ', '') for text in dev_questions], add_special_tokens=False)
tokenized_test_question = tokenizer([text['question_text'].replace(' ', '') for text in test_questions], add_special_tokens=False)
tokenized_train_paragraphs = tokenizer(list(map(lambda x : x.replace(' ', ''), train_paragraphs)), add_special_tokens=False)
tokenized_dev_paragraphs = tokenizer(list(map(lambda x : x.replace(' ', ''), dev_paragraphs)), add_special_tokens=False)
tokenized_test_paragraphs = tokenizer(list(map(lambda x : x.replace(' ', ''), test_paragraphs)), add_special_tokens=False)


Token indices sequence length is longer than the specified maximum sequence length for this model (570 > 512). Running this sequence through the model will result in indexing errors


In [6]:
# print(len(train_paragraphs))
# print(max([len(para) for para in train_paragraphs]))
# print(max([len(q['question_text']) for q in train_questions]))
# print(list(filter(lambda x : len(x) >= 221, [q['question_text'] for q in train_questions])))
# len(tokenized_train_paragraphs['input_ids'])

In [7]:
class MyDataset(Dataset):
    def __init__(self, mode='train'):
        self.mode = mode
        if self.mode == 'train':
            self.questions = train_questions
            self.tokenized_questions = tokenized_train_question
            self.paragraphs = tokenized_train_paragraphs
        elif self.mode == 'dev':
            self.questions = dev_questions
            self.tokenized_questions = tokenized_dev_question
            self.paragraphs = tokenized_dev_paragraphs
        else:
            self.questions = test_questions
            self.tokenized_questions = tokenized_test_question
            self.paragraphs = tokenized_test_paragraphs
        self.max_paragraph_len = 150
        self.max_question_len = 30

    def __len__(self):
        return len(self.questions)

    def __getitem__(self, index):
        question = self.questions[index]
        tokenized_paragraph = self.paragraphs[question['paragraph_id']]
        tokenized_question_text = self.tokenized_questions[index]

        answer = question['answer_text']
        input_ids_question = [101] + self.tokenized_questions[index].ids[0:self.max_question_len] + [102]
        if self.mode == 'train':
            start_pos = question['answer_start']
            end_pos = question['answer_end']
             # mid = (start_pos + end_pos) // 2
        # if len(tokenized_paragraph) > self.max_paragraph_len:
        #     para_start = max(0, mid - self.max_paragraph_len // 2)
        #     para_end = min(para_start + self.max_paragraph_len, len(tokenized_paragraph.ids))
        #     para_token_start = tokenized_paragraph.char_to_token(para_start)
        #     para_token_end = tokenized_paragraph.char_to_token(para_end)
            answer_start_in_token = tokenized_paragraph.char_to_token(start_pos)
            answer_end_in_token = tokenized_paragraph.char_to_token(end_pos)
            if answer_start_in_token is None or answer_end_in_token is None:
                answer_start_in_token = start_pos
                answer_end_in_token = end_pos
            mid = (answer_end_in_token + answer_start_in_token) // 2
            para_start = max(0, min(mid - self.max_question_len // 2, len(tokenized_paragraph) - self.max_paragraph_len))
            para_end = para_start + self.max_paragraph_len
            input_ids_paragraph = tokenized_paragraph.ids[para_start : para_end] + [102]

            # input_ids_paragraph = tokenized_paragraph.ids[para_token_start : para_token_end] + [102]
            # if len(input_ids_paragraph) == 431:
            #     print(tokenizer.decode(tokenized_paragraph.ids))
            #     print('para_start = {0}, para_end = {1}'.format(para_start, para_end))
            #     print('para_token_start = {0}, end = {1}'.format(para_token_start, para_token_end))
            # answer_start_in_token = len(input_ids_question) + start_pos - para_start
            # answer_end_in_token = len(input_ids_question) + end_pos - para_start
            answer_start_in_token += len(input_ids_question) - para_start
            answer_end_in_token += len(input_ids_question) - para_start

            input_ids, token_type_ids, attention_mask = self.padding(input_ids_question, input_ids_paragraph)
            return torch.tensor(input_ids), torch.tensor(token_type_ids),torch.tensor(attention_mask), answer_start_in_token, answer_end_in_token

        else:
            input_ids_paragraph_list = []
            m = len(tokenized_paragraph) // self.max_paragraph_len  + 1
            for i in range(m):
                input_ids_paragraph_list.append(tokenized_paragraph.ids[self.max_paragraph_len * i : min(self.max_paragraph_len * (i + 1), len(tokenized_paragraph))] + [102])
            token_type_ids_list = []
            attention_mask_list = []
            input_ids_list = []
            for p in input_ids_paragraph_list:
                input_ids, token_type, attention_mask = self.padding(input_ids_question, p)
                input_ids_list.append(input_ids)
                token_type_ids_list.append(token_type)
                attention_mask_list.append(attention_mask)
            return torch.tensor(input_ids_list), torch.tensor(token_type_ids_list), torch.tensor(attention_mask_list)

    def padding(self, q, p):
        padding_len = self.max_paragraph_len + self.max_question_len + 3 - len(q) - len(p)
        # if padding_len < 0:
        #     print(padding_len, len(q), len(p))
        input_ids = q + p + [0] * padding_len
        token_type_ids = [0] * len(q) + [1] * len(p) + [0] * padding_len
        attention_mask = [1] * (len(q) + len(p)) + [0] * padding_len
        return input_ids, token_type_ids, attention_mask

In [8]:
train_set = MyDataset('train')
dev_set = MyDataset('dev')
test_set = MyDataset('test')
train_loader = DataLoader(train_set, batch_size=64, shuffle=True)
dev_loader = DataLoader(dev_set, batch_size=1)
test_loader = DataLoader(test_set, batch_size=1, shuffle=False)

In [9]:
# len(list(train_loader)[27])

In [10]:
config = {'lr': 0.0001, 'epoch': 1}

In [11]:
def evaluate(data, output, index, mode='dev'):
    if mode == 'dev':
        questions = dev_questions
    else:
        questions = test_questions

    question = questions[index]
    answer_text = question['answer_text']
    start_pos = torch.argmax(output.start_logits, dim=1)
    end_pos = torch.argmax(output.end_logits, dim=1)
    sum = float('-inf')
    n = 0
    for i in range(output.start_logits.shape[0]):
        if sum < output.start_logits[i][start_pos[i]] + output.end_logits[i][end_pos[i]]:
            sum = output.start_logits[i][start_pos[i]] + output.end_logits[i][end_pos[i]]
            n = i
    # print(output.start_logits.shape[0], data[0].shape)
    # print(data[0])
    pred_text = tokenizer.decode(data[0][0][n][start_pos[n] : end_pos[n] + 1]) # plus 1 is important here
    if pred_text.replace(' ', '') == answer_text:
        return 1, pred_text.replace(' ', '')
    else:
        return 0, pred_text.replace(' ', '')
    # answer_start = question['answer_start']
    # answer_end = question['answer']

class Trainer():
    def __init__(self, config):
        self.model = model
        self.config = config
        self.optimizer = torch.optim.AdamW(params=self.model.parameters(), lr=self.config['lr'])



    def train(self):
        model.train()
        model.to(device)
        for epoch in range(self.config['epoch']):
            progress_train = tqdm(train_loader)
            progress_train.set_description('epoch {0}, training'.format(epoch+1))
            steps = 0
            acc = 0
            accumulated_acc = 0
            accumulated_loss = 0
            for data in progress_train:
                data = [i.to(device) for i in data]
                steps += 1
                output = model(input_ids=data[0], token_type_ids=data[1], attention_mask=data[2], start_positions=data[3], end_positions=data[4])
                loss = output.loss
                start_pos = torch.argmax(output.start_logits, dim=1)
                end_pos = torch.argmax(output.end_logits, dim=1)
                acc = torch.mean(((start_pos == data[3]) & (end_pos == data[4])).float(), dim=0).item()
                progress_train.set_postfix(loss=loss, acc=acc)
                accumulated_acc += acc
                accumulated_loss += loss.item()
                output.loss.backward()
                self.optimizer.step()
                self.optimizer.zero_grad()
            print('Average loss = {0}, Average acc = {1}'.format(accumulated_loss / steps, accumulated_acc / steps))

            ###########  validation   #################
            model.eval()
            progress_dev = tqdm(dev_loader)
            progress_dev.set_description('epoch {0}, validating'.format(epoch+1))
            with torch.no_grad():
                dev_acc = 0
                for i, data in enumerate(progress_dev):
                    # data = list(map(lambda x : list(map(lambda y : y.to(device), x)), data))


                    # batch size of dev is 1, after squeezing dim 0, the number of split paragraph becomes dim 0
                    output = model(input_ids=data[0].squeeze(dim=0).to(device), token_type_ids=data[1].squeeze(dim=0).to(device), attention_mask=data[2].squeeze(dim=0).to(device))
                    dev_acc += evaluate(data, output, i, 'dev')[0]
                print('epoch {0}, dev acc = {1}'.format(epoch, dev_acc / len(dev_loader)))
            model.train()
            model.save_pretrained('./saved_model')

In [12]:
def infer():
    model.eval()
    result = []
    with torch.no_grad():
        progress_test = tqdm(test_loader)
        progress_test.set_description('predicting')

        for i, data in enumerate(progress_test):
            # data = list(map(lambda x : list(map(lambda y : y.to(device), x)), data))

            output = model(input_ids=data[0].squeeze(dim=0).to(device), token_type_ids=data[1].squeeze(dim=0).to(device), attention_mask=data[2].squeeze(dim=0).to(device))
            result.append(evaluate(data, output, i, 'test')[1])
    with open('result.csv', 'w') as f:
        f.write("ID,Answer\n")
        # writer = csv.writer(f)
        #   writer.writerow(['ID', 'Answer'])
        for i, test_question in enumerate(test_questions):
        # Replace commas in answers with empty strings (since csv is separated by comma)
        # Answers in kaggle are processed in the same way
            f.write(f"{test_question['id']},{result[i].replace(',','')}\n")

    print(f"Completed!")

In [13]:
trainer = Trainer(config)
trainer.train()
infer()

  0%|          | 0/496 [00:00<?, ?it/s]

Average loss = 0.9106101567947096, Average acc = 0.6891822077093586


  0%|          | 0/4131 [00:00<?, ?it/s]

epoch 0, dev acc = 0.5039941902687001


  0%|          | 0/4957 [00:00<?, ?it/s]

Completed!


In [14]:
print(list(dev_loader)[1][1].squeeze(dim=0).shape)
print('----------------------------------')
print(dev_set[1])


torch.Size([4, 183])
----------------------------------
(tensor([[  101,  9246,  8156,  2399,  4635,  5909,  3136,  2530,  6629,  5412,
          4638,  5178,  3362,  3221,   136,   102,  1039,  3308,  2527,  3309,
          8024,  4294,  1162,  3221,  9146,  8129,  2399,   807,   704,  2527,
          3309,  5635, 12809,  2399,   807,  3309,  7279,  8024,   746,  3197,
           510,  4602,  4554,  5645,  3717,  4134,  3229,  2382,  4634,  4495,
          8024,  7941,  3777,  1765,  1281,  3717,  2642,  2215,  1071,  1713,
          7028,  8024,  5735,   809,  3644,   807,   704,  1751,  4374,  3308,
          4638,  3613,  3149,   868,  3683,  6733,  8024,  4912,  4031,  2398,
          1772,   129,   119,   129,  2399,   671,  3613,  8024,  1060,  2129,
          4158,   124,   119,   126,  2399,  8024,  1039,   807,  4158,   122,
           119,   127,  2399,  8024,  3209,   510,  3926,  1060,   807,  1772,
          4158,   123,   119,   129,  2399,   511,  5645,  3634,  1398,  3