In [50]:
import json
import numpy as np
import random
import torch
from torch.utils.data import DataLoader, Dataset
from transformers import AdamW, BertForQuestionAnswering, BertTokenizerFast

from tqdm.auto import tqdm

device = torch.device("cuda:2" if torch.cuda.is_available() else "cpu")

# Fix random seed for reproducibility
def same_seeds(seed):
	  torch.manual_seed(seed)
	  if torch.cuda.is_available():
		    torch.cuda.manual_seed(seed)
		    torch.cuda.manual_seed_all(seed)
	  np.random.seed(seed)
	  random.seed(seed)
	  torch.backends.cudnn.benchmark = False
	  torch.backends.cudnn.deterministic = True
same_seeds(0)

In [51]:
device

device(type='cuda', index=2)

In [7]:
# from accelerate import Accelerator
# accelerator = Accelerator(fp16=True)
# device = accelerator.device

In [52]:
model = BertForQuestionAnswering.from_pretrained("bert-base-chinese").to(device)
tokenizer = BertTokenizerFast.from_pretrained("bert-base-chinese")

Some weights of the model checkpoint at bert-base-chinese were not used when initializing BertForQuestionAnswering: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at bert-base-chinese a

In [53]:
def read_data(file):
    with open(file, 'r', encoding="utf-8") as reader:
        data = json.load(reader)
    return data["questions"], data["paragraphs"]

In [54]:
root_path = 'ML_study/chapter7/'
train_questions, train_paragraphs = read_data(root_path + "hw7_train.json")
dev_questions, dev_paragraphs = read_data(root_path + "hw7_dev.json")
test_questions, test_paragraphs = read_data(root_path + "hw7_test.json")

## 数据分questions 和 paragraphs
## question部分的每条数据有：
## id(词条的id int) paragraph_id(段落id int) question text(问句内容 str)
## answer_text(回答内容 str) answer start(回答在文章中的起始位置 int) answer end(回答在文章中的结束位置 int)


In [55]:
print('训练集QA对有： ', len(train_questions))
print('验证集QA对有： ', len(dev_questions))
print('测试集QA对有： ', len(test_questions))

训练集QA对有：  26936
验证集QA对有：  3524
测试集QA对有：  3493


In [56]:
train_questions[0]

{'id': 0,
 'paragraph_id': 3884,
 'question_text': '羅馬教皇利奧三世在800年正式加冕誰為羅馬人的皇帝?',
 'answer_text': '查理大帝',
 'answer_start': 141,
 'answer_end': 144}

In [57]:
len(train_paragraphs)

8014

In [58]:
train_paragraphs[0]

'2010年引進的廣州快速公交運輸系統，屬世界第二大快速公交系統，日常載客量可達100萬人次，高峰時期每小時單向客流高達26900人次，僅次於波哥大的快速交通系統，平均每10秒鐘就有一輛巴士，每輛巴士單向行駛350小時。包括橋樑在內的站台是世界最長的州快速公交運輸系統站台，長達260米。目前廣州市區的計程車和公共汽車主要使用液化石油氣作燃料，部分公共汽車更使用油電、氣電混合動力技術。2012年底開始投放液化天然氣燃料的公共汽車，2014年6月開始投放液化天然氣插電式混合動力公共汽車，以取代液化石油氣公共汽車。2007年1月16日，廣州市政府全面禁止在市區內駕駛摩托車。違反禁令的機動車將會予以沒收。廣州市交通局聲稱禁令的施行，使得交通擁擠問題和車禍大幅減少。廣州白雲國際機場位於白雲區與花都區交界，2004年8月5日正式投入運營，屬中國交通情況第二繁忙的機場。該機場取代了原先位於市中心的無法滿足日益增長航空需求的舊機場。目前機場有三條飛機跑道，成為國內第三個擁有三跑道的民航機場。比鄰近的香港國際機場第三跑道預計的2023年落成早8年。'

In [59]:
first_question_text = train_questions[0]['question_text']
first_answer_text = train_questions[0]['answer_text']

In [60]:
first_tokenzed_question = tokenizer(first_question_text)

In [61]:
first_tokenzed_question

{'input_ids': [101, 5397, 7679, 3136, 4640, 1164, 1953, 676, 686, 1762, 8280, 2399, 3633, 2466, 1217, 1089, 6306, 4158, 5397, 7679, 782, 4638, 4640, 2370, 136, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [84]:
first_paragraph = train_paragraphs[0]

In [85]:
len(first_paragraph)

471

In [64]:
first_paragraph_tokenizer = tokenizer(first_paragraph)
first_paragraph_tokenizer

{'input_ids': [101, 8166, 2399, 2471, 6868, 4638, 2451, 2336, 2571, 6862, 1062, 769, 6880, 6745, 5143, 5186, 8024, 2253, 686, 4518, 5018, 753, 1920, 2571, 6862, 1062, 769, 5143, 5186, 8024, 3189, 2382, 6734, 2145, 7030, 1377, 6888, 8135, 5857, 782, 3613, 8024, 7770, 2292, 3229, 3309, 3680, 2207, 3229, 1606, 1403, 2145, 3837, 7770, 6888, 11023, 8279, 782, 3613, 8024, 1006, 3613, 3176, 3797, 1520, 1920, 4638, 2571, 6862, 769, 6858, 5143, 5186, 8024, 2398, 1772, 3680, 8108, 4907, 7132, 2218, 3300, 671, 6739, 2349, 1894, 8024, 3680, 6739, 2349, 1894, 1606, 1403, 6121, 7691, 8612, 2207, 3229, 511, 1259, 2886, 3578, 3558, 1762, 1058, 4638, 4991, 1378, 3221, 686, 4518, 3297, 7269, 4638, 2336, 2571, 6862, 1062, 769, 6880, 6745, 5143, 5186, 4991, 1378, 8024, 7269, 6888, 9044, 5101, 511, 4680, 1184, 2451, 2336, 2356, 1281, 4638, 6243, 4923, 6722, 1469, 1062, 1066, 3749, 6722, 712, 6206, 886, 4500, 3890, 1265, 4767, 3779, 3706, 868, 4234, 3160, 8024, 6956, 1146, 1062, 1066, 3749, 6722, 3291, 886,

In [66]:
len(first_paragraph_tokenizer[0])

445

In [87]:
first_paragraph_tokenizer.char_to_token(470)


443

In [68]:
train_questions_tokenized = tokenizer([train_question["question_text"] for train_question in train_questions], add_special_tokens=False)
dev_questions_tokenized = tokenizer([dev_question["question_text"] for dev_question in dev_questions], add_special_tokens=False)
test_questions_tokenized = tokenizer([test_question["question_text"] for test_question in test_questions], add_special_tokens=False)

## 每个句子被变成了一个向量 input_ids是每个token在词典中的编号
## attention mask都是1 没有mask

In [79]:
train_questions_tokenized[0].ids

[5397,
 7679,
 3136,
 4640,
 1164,
 1953,
 676,
 686,
 1762,
 8280,
 2399,
 3633,
 2466,
 1217,
 1089,
 6306,
 4158,
 5397,
 7679,
 782,
 4638,
 4640,
 2370,
 136]

In [90]:
train_questions_tokenized[0].tokens

['羅',
 '馬',
 '教',
 '皇',
 '利',
 '奧',
 '三',
 '世',
 '在',
 '800',
 '年',
 '正',
 '式',
 '加',
 '冕',
 '誰',
 '為',
 '羅',
 '馬',
 '人',
 '的',
 '皇',
 '帝',
 '?']

In [69]:
train_paragraphs_tokenized = tokenizer(train_paragraphs, add_special_tokens=False)
dev_paragraphs_tokenized = tokenizer(dev_paragraphs, add_special_tokens=False)
test_paragraphs_tokenized = tokenizer(test_paragraphs, add_special_tokens=False)


Token indices sequence length is longer than the specified maximum sequence length for this model (570 > 512). Running this sequence through the model will result in indexing errors


In [91]:
train_paragraphs_tokenized[0]

Encoding(num_tokens=443, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing])

In [70]:
len(train_paragraphs_tokenized)

3

In [71]:
class QA_Dataset(Dataset):
    def __init__(self, split, questions, tokenized_questions, tokenized_paragraphs):
        self.split = split
        self.questions = questions
        self.tokenized_questions = tokenized_questions
        self.tokenized_paragraphs = tokenized_paragraphs
        self.max_question_len = 40
        self.max_paragraph_len = 150

        ##### TODO: Change value of doc_stride #####
        self.doc_stride = 150

        # Input sequence length = [CLS] + question + [SEP] + paragraph + [SEP]
        self.max_seq_len = 1 + self.max_question_len + 1 + self.max_paragraph_len + 1
		# 最大长度是CLS + max_question_len + SEP + max_paragraph_len + 1

    def __len__(self):
        return len(self.questions)

    def __getitem__(self, idx):
        question = self.questions[idx]
        tokenized_question = self.tokenized_questions[idx]
        tokenized_paragraph = self.tokenized_paragraphs[question["paragraph_id"]]

        ##### TODO: Preprocessing #####
        # Hint: How to prevent model from learning something it should not learn

        if self.split == "train":
            # Convert answer's start/end positions in paragraph_text to start/end positions in tokenized_paragraph
            answer_start_token = tokenized_paragraph.char_to_token(question["answer_start"])
            answer_end_token = tokenized_paragraph.char_to_token(question["answer_end"])
			# 因为tokenized之后的长度和句子长度不一定相同 所以使用char_to_token这个api对应到tokenized之后的位置


            # A single window is obtained by slicing the portion of paragraph containing the answer
            mid = (answer_start_token + answer_end_token) // 2
            paragraph_start = max(0, min(mid - self.max_paragraph_len // 2, len(tokenized_paragraph) - self.max_paragraph_len))
            paragraph_end = paragraph_start + self.max_paragraph_len
			# 使用max_paragraph_len限制长度 要做到把answer包起来


            # Slice question/paragraph and add special tokens (101: CLS, 102: SEP)
            input_ids_question = [101] + tokenized_question.ids[:self.max_question_len] + [102]
            input_ids_paragraph = tokenized_paragraph.ids[paragraph_start : paragraph_end] + [102]
            # question是从头到max_len 前面加上CLS后面加上SEP
            # paragraph是从上面计算的start到end 后面加上SEP

            # Convert answer's start/end positions in tokenized_paragraph to start/end positions in the window
            answer_start_token += len(input_ids_question) - paragraph_start
            answer_end_token += len(input_ids_question) - paragraph_start
            # 将start end从绝对意义上tokenized的位置换到window中的相对位置 (也就是真实值)

            # Pad sequence and obtain inputs to model
            input_ids, token_type_ids, attention_mask = self.padding(input_ids_question, input_ids_paragraph)
            return torch.tensor(input_ids), torch.tensor(token_type_ids), torch.tensor(attention_mask), answer_start_token, answer_end_token

        # Validation/Testing
        else:
            input_ids_list, token_type_ids_list, attention_mask_list = [], [], []

            # 以doc_stride为区间分割paragraph
            # Paragraph is split into several windows, each with start positions separated by step "doc_stride"
            for i in range(0, len(tokenized_paragraph), self.doc_stride):

                # Slice question/paragraph and add special tokens (101: CLS, 102: SEP)
                input_ids_question = [101] + tokenized_question.ids[:self.max_question_len] + [102]
                input_ids_paragraph = tokenized_paragraph.ids[i : i + self.max_paragraph_len] + [102]

                # Pad sequence and obtain inputs to model
                input_ids, token_type_ids, attention_mask = self.padding(input_ids_question, input_ids_paragraph)

                input_ids_list.append(input_ids)
                token_type_ids_list.append(token_type_ids)
                attention_mask_list.append(attention_mask)

            return torch.tensor(input_ids_list), torch.tensor(token_type_ids_list), torch.tensor(attention_mask_list)

    def padding(self, input_ids_question, input_ids_paragraph):
        ## padding要将[CLS] question [SEP] paragraph [SEP]后面加上[0] 以达到max_seq_len
        ## token_type_ids 用以区分question和paragraph [0]是question和padding [1]是paragraph
        ## attention_mask是为了在算attention的时候 padding的部分不加以计算


        padding_len = self.max_seq_len - len(input_ids_question) - len(input_ids_paragraph)
        # Indices of input sequence tokens in the vocabulary
        input_ids = input_ids_question + input_ids_paragraph + [0] * padding_len
        # Segment token indices to indicate first and second portions of the inputs. Indices are selected in [0, 1]
        token_type_ids = [0] * len(input_ids_question) + [1] * len(input_ids_paragraph) + [0] * padding_len
        # Mask to avoid performing attention on padding token indices. Mask values selected in [0, 1]
        attention_mask = [1] * (len(input_ids_question) + len(input_ids_paragraph)) + [0] * padding_len

        return input_ids, token_type_ids, attention_mask

In [72]:
train_set = QA_Dataset("train", train_questions, train_questions_tokenized, train_paragraphs_tokenized)
dev_set = QA_Dataset("dev", dev_questions, dev_questions_tokenized, dev_paragraphs_tokenized)
test_set = QA_Dataset("test", test_questions, test_questions_tokenized, test_paragraphs_tokenized)

In [73]:
train_batch_size = 16

train_loader = DataLoader(train_set, batch_size=train_batch_size, shuffle=True, pin_memory=True)
dev_loader = DataLoader(dev_set, batch_size=1, shuffle=False, pin_memory=True)
test_loader = DataLoader(test_set, batch_size=1, shuffle=False, pin_memory=True)

In [76]:
for data in enumerate(test_loader):
    print(data)
    break


(0, [tensor([[[ 101, 1894, 2135, 7269, 4638, 7531, 4666,  677, 3298, 3300,  862,
          6172, 7617, 4289,  136,  102,  671, 5663, 7386, 1519, 1350, 1199,
           678, 1894, 4959, 5865,  519,  676, 5682,  520, 1169, 3302, 8024,
           684, 3760, 3300,  877, 2965,  818,  862, 6725, 7389, 8024, 1912,
          6134, 7370,  749, 1169, 3302, 1920, 2207, 1912,  699, 4192,  679,
          1398,  511, 5635, 3176,  678, 1894, 4638, 1169, 3302, 8024, 1762,
          6153, 1366, 4638, 6956, 1146, 8024, 3300, 5148, 5682, 4638, 5226,
          5251, 2551, 4995, 8024,  684, 2898, 3300, 4638, 2776, 3291, 1008,
          5747, 4638, 3564, 2094,  511, 7531, 6956, 4638, 6956, 1146, 8024,
          6858, 2382, 3221, 7946, 5682, 6509, 7440, 2384, 8024, 2772, 3221,
          2380, 3300, 5148, 5682,  510, 4635, 5682,  510, 7941, 5682, 1469,
          7946, 5682,  809, 1350, 5166, 5682, 7858, 7852, 5417, 3688, 4638,
          7946, 5682, 4192, 7481, 7770, 7515, 4666,  677, 8024, 1184, 5442,
       

In [74]:
def evaluate(data, output):
    ##### TODO: Postprocessing #####
    # There is a bug and room for improvement in postprocessing
    # Hint: Open your prediction file to see what is wrong

    answer = ''
    max_prob = float('-inf')
    num_of_windows = data[0].shape[1]

    for k in range(num_of_windows):
        # Obtain answer by choosing the most probable start position / end position
        start_prob, start_index = torch.max(output.start_logits[k], dim=0)
        end_prob, end_index = torch.max(output.end_logits[k], dim=0)

        # Probability of answer is calculated as sum of start_prob and end_prob
        prob = start_prob + end_prob

        # Replace answer if calculated probability is larger than previous windows
        if prob > max_prob:
            max_prob = prob
            # Convert tokens to chars (e.g. [1920, 7032] --> "大 金")
            answer = tokenizer.decode(data[0][0][k][start_index : end_index + 1])

    # Remove spaces in answer (e.g. "大 金" --> "大金")
    return answer.replace(' ','')

In [96]:
num_epoch = 10
validation = True
logging_step = 100
learning_rate = 1e-4
optimizer = AdamW(model.parameters(), lr=learning_rate)

In [97]:
model.train()

print("Start Training ...")

for epoch in range(num_epoch):
    step = 1
    train_loss = train_acc = 0

    for data in tqdm(train_loader):
        # Load all data into GPU
        data = [i.to(device) for i in data]

        # Model inputs: input_ids, token_type_ids, attention_mask, start_positions, end_positions (Note: only "input_ids" is mandatory)
        # Model outputs: start_logits, end_logits, loss (return when start_positions/end_positions are provided)
        output = model(input_ids=data[0], token_type_ids=data[1], attention_mask=data[2], start_positions=data[3], end_positions=data[4])
        # Choose the most probable start position / end position
        start_index = torch.argmax(output.start_logits, dim=1)
        end_index = torch.argmax(output.end_logits, dim=1)

        # Prediction is correct only if both start_index and end_index are correct
        train_acc += ((start_index == data[3]) & (end_index == data[4])).float().mean()
        train_loss += output.loss

        output.loss.backward()

        optimizer.step()
        optimizer.zero_grad()
        step += 1

        ##### TODO: Apply linear learning rate decay #####

        # Print training loss and accuracy over past logging step
        if step % logging_step == 0:
            print(f"Epoch {epoch + 1} | Step {step} | loss = {train_loss.item() / logging_step:.3f}, acc = {train_acc / logging_step:.3f}")
            train_loss = train_acc = 0

    if validation:
        print("Evaluating Dev Set ...")
        model.eval()
        with torch.no_grad():
            dev_acc = 0
            for i, data in enumerate(tqdm(dev_loader)):
                output = model(input_ids=data[0].squeeze().to(device), token_type_ids=data[1].squeeze().to(device),
                       attention_mask=data[2].squeeze().to(device))
                # prediction is correct only if answer text exactly matches
                dev_acc += evaluate(data, output) == dev_questions[i]["answer_text"]
            print(f"Validation | Epoch {epoch + 1} | acc = {dev_acc / len(dev_loader):.3f}")
        model.train()

Epoch 1 | Step 200 | loss = 0.421, acc = 0.807
Epoch 1 | Step 300 | loss = 0.404, acc = 0.794
Epoch 1 | Step 400 | loss = 0.446, acc = 0.790
Epoch 1 | Step 500 | loss = 0.488, acc = 0.766
Epoch 1 | Step 600 | loss = 0.494, acc = 0.768
Epoch 1 | Step 700 | loss = 0.510, acc = 0.770
Epoch 1 | Step 800 | loss = 0.499, acc = 0.774
Epoch 1 | Step 900 | loss = 0.409, acc = 0.793
Epoch 1 | Step 1000 | loss = 0.510, acc = 0.761
Epoch 1 | Step 1100 | loss = 0.410, acc = 0.799
Epoch 1 | Step 1200 | loss = 0.430, acc = 0.799
Epoch 1 | Step 1300 | loss = 0.471, acc = 0.777
Epoch 1 | Step 1400 | loss = 0.455, acc = 0.781
Epoch 1 | Step 1500 | loss = 0.419, acc = 0.799
Epoch 1 | Step 1600 | loss = 0.491, acc = 0.779

Evaluating Dev Set ...


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=3524.0), HTML(value='')))


Validation | Epoch 1 | acc = 0.483
Start Training ...


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=1684.0), HTML(value='')))

Epoch 1 | Step 100 | loss = 0.326, acc = 0.821
Epoch 1 | Step 200 | loss = 0.327, acc = 0.833
Epoch 1 | Step 300 | loss = 0.356, acc = 0.832
Epoch 1 | Step 400 | loss = 0.374, acc = 0.821
Epoch 1 | Step 500 | loss = 0.426, acc = 0.799
Epoch 1 | Step 600 | loss = 0.398, acc = 0.797
Epoch 1 | Step 700 | loss = 0.372, acc = 0.817
Epoch 1 | Step 800 | loss = 0.377, acc = 0.809
Epoch 1 | Step 900 | loss = 0.488, acc = 0.790
Epoch 1 | Step 1000 | loss = 0.425, acc = 0.812
Epoch 1 | Step 1100 | loss = 0.452, acc = 0.784
Epoch 1 | Step 1200 | loss = 0.466, acc = 0.800
Epoch 1 | Step 1300 | loss = 0.487, acc = 0.783
Epoch 1 | Step 1400 | loss = 0.429, acc = 0.797
Epoch 1 | Step 1500 | loss = 0.469, acc = 0.775
Epoch 1 | Step 1600 | loss = 0.436, acc = 0.804

Evaluating Dev Set ...


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=3524.0), HTML(value='')))


Validation | Epoch 1 | acc = 0.454


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=1684.0), HTML(value='')))

Epoch 2 | Step 100 | loss = 0.371, acc = 0.812
Epoch 2 | Step 200 | loss = 0.310, acc = 0.853
Epoch 2 | Step 300 | loss = 0.300, acc = 0.847
Epoch 2 | Step 400 | loss = 0.338, acc = 0.836
Epoch 2 | Step 500 | loss = 0.394, acc = 0.819
Epoch 2 | Step 600 | loss = 0.387, acc = 0.811
Epoch 2 | Step 700 | loss = 0.328, acc = 0.839
Epoch 2 | Step 800 | loss = 0.362, acc = 0.837
Epoch 2 | Step 900 | loss = 0.311, acc = 0.846
Epoch 2 | Step 1000 | loss = 0.340, acc = 0.831
Epoch 2 | Step 1100 | loss = 0.338, acc = 0.824
Epoch 2 | Step 1200 | loss = 0.300, acc = 0.837
Epoch 2 | Step 1300 | loss = 0.347, acc = 0.832
Epoch 2 | Step 1400 | loss = 0.329, acc = 0.834
Epoch 2 | Step 1500 | loss = 0.435, acc = 0.794
Epoch 2 | Step 1600 | loss = 0.399, acc = 0.816

Evaluating Dev Set ...


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=3524.0), HTML(value='')))


Validation | Epoch 2 | acc = 0.399


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=1684.0), HTML(value='')))

Epoch 3 | Step 100 | loss = 0.469, acc = 0.776
Epoch 3 | Step 200 | loss = 0.326, acc = 0.831
Epoch 3 | Step 300 | loss = 0.313, acc = 0.836
Epoch 3 | Step 400 | loss = 0.310, acc = 0.850
Epoch 3 | Step 500 | loss = 0.281, acc = 0.859
Epoch 3 | Step 600 | loss = 0.292, acc = 0.855
Epoch 3 | Step 700 | loss = 0.264, acc = 0.862
Epoch 3 | Step 800 | loss = 0.249, acc = 0.864
Epoch 3 | Step 900 | loss = 0.284, acc = 0.861
Epoch 3 | Step 1000 | loss = 0.242, acc = 0.881
Epoch 3 | Step 1100 | loss = 0.288, acc = 0.851
Epoch 3 | Step 1200 | loss = 0.288, acc = 0.860
Epoch 3 | Step 1300 | loss = 0.286, acc = 0.859
Epoch 3 | Step 1400 | loss = 0.278, acc = 0.861
Epoch 3 | Step 1500 | loss = 0.259, acc = 0.869
Epoch 3 | Step 1600 | loss = 0.254, acc = 0.861

Evaluating Dev Set ...


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=3524.0), HTML(value='')))


Validation | Epoch 3 | acc = 0.490


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=1684.0), HTML(value='')))

Epoch 4 | Step 100 | loss = 0.165, acc = 0.891
Epoch 4 | Step 200 | loss = 0.149, acc = 0.917
Epoch 4 | Step 300 | loss = 0.174, acc = 0.902
Epoch 4 | Step 400 | loss = 0.179, acc = 0.907
Epoch 4 | Step 500 | loss = 0.193, acc = 0.891
Epoch 4 | Step 600 | loss = 0.252, acc = 0.860
Epoch 4 | Step 700 | loss = 0.279, acc = 0.864
Epoch 4 | Step 800 | loss = 0.250, acc = 0.874
Epoch 4 | Step 900 | loss = 0.235, acc = 0.873
Epoch 4 | Step 1000 | loss = 0.245, acc = 0.867
Epoch 4 | Step 1100 | loss = 0.231, acc = 0.877
Epoch 4 | Step 1200 | loss = 0.244, acc = 0.879
Epoch 4 | Step 1300 | loss = 0.229, acc = 0.879
Epoch 4 | Step 1400 | loss = 0.273, acc = 0.864
Epoch 4 | Step 1500 | loss = 0.239, acc = 0.868
Epoch 4 | Step 1600 | loss = 0.279, acc = 0.857

Evaluating Dev Set ...


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=3524.0), HTML(value='')))


Validation | Epoch 4 | acc = 0.472


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=1684.0), HTML(value='')))

Epoch 5 | Step 100 | loss = 0.184, acc = 0.894
Epoch 5 | Step 200 | loss = 0.188, acc = 0.904
Epoch 5 | Step 300 | loss = 0.218, acc = 0.882
Epoch 5 | Step 400 | loss = 0.193, acc = 0.892
Epoch 5 | Step 500 | loss = 0.231, acc = 0.885
Epoch 5 | Step 600 | loss = 0.215, acc = 0.896
Epoch 5 | Step 700 | loss = 0.228, acc = 0.891
Epoch 5 | Step 800 | loss = 0.258, acc = 0.875
Epoch 5 | Step 900 | loss = 0.247, acc = 0.869
Epoch 5 | Step 1000 | loss = 0.250, acc = 0.880
Epoch 5 | Step 1100 | loss = 0.217, acc = 0.889
Epoch 5 | Step 1200 | loss = 0.244, acc = 0.868
Epoch 5 | Step 1300 | loss = 0.229, acc = 0.881
Epoch 5 | Step 1400 | loss = 0.236, acc = 0.866
Epoch 5 | Step 1500 | loss = 0.260, acc = 0.861
Epoch 5 | Step 1600 | loss = 0.275, acc = 0.869

Evaluating Dev Set ...


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=3524.0), HTML(value='')))


Validation | Epoch 5 | acc = 0.464


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=1684.0), HTML(value='')))

Epoch 6 | Step 100 | loss = 0.197, acc = 0.890
Epoch 6 | Step 200 | loss = 0.165, acc = 0.904
Epoch 6 | Step 300 | loss = 0.166, acc = 0.909
Epoch 6 | Step 400 | loss = 0.228, acc = 0.873
Epoch 6 | Step 500 | loss = 0.182, acc = 0.908
Epoch 6 | Step 600 | loss = 0.201, acc = 0.897
Epoch 6 | Step 700 | loss = 0.234, acc = 0.889
Epoch 6 | Step 800 | loss = 0.209, acc = 0.885
Epoch 6 | Step 900 | loss = 0.235, acc = 0.887
Epoch 6 | Step 1000 | loss = 0.265, acc = 0.869
Epoch 6 | Step 1100 | loss = 0.270, acc = 0.869
Epoch 6 | Step 1200 | loss = 0.228, acc = 0.888
Epoch 6 | Step 1300 | loss = 0.219, acc = 0.874
Epoch 6 | Step 1400 | loss = 0.250, acc = 0.875
Epoch 6 | Step 1500 | loss = 0.215, acc = 0.891
Epoch 6 | Step 1600 | loss = 0.248, acc = 0.887

Evaluating Dev Set ...


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=3524.0), HTML(value='')))


Validation | Epoch 6 | acc = 0.454


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=1684.0), HTML(value='')))

Epoch 7 | Step 100 | loss = 0.160, acc = 0.907
Epoch 7 | Step 200 | loss = 0.126, acc = 0.937
Epoch 7 | Step 300 | loss = 0.154, acc = 0.920
Epoch 7 | Step 400 | loss = 0.192, acc = 0.902
Epoch 7 | Step 500 | loss = 0.221, acc = 0.886
Epoch 7 | Step 600 | loss = 0.189, acc = 0.900
Epoch 7 | Step 700 | loss = 0.201, acc = 0.887
Epoch 7 | Step 800 | loss = 0.198, acc = 0.887
Epoch 7 | Step 900 | loss = 0.194, acc = 0.899
Epoch 7 | Step 1000 | loss = 0.193, acc = 0.892
Epoch 7 | Step 1100 | loss = 0.269, acc = 0.876
Epoch 7 | Step 1200 | loss = 0.255, acc = 0.873
Epoch 7 | Step 1300 | loss = 0.238, acc = 0.894
Epoch 7 | Step 1400 | loss = 0.203, acc = 0.899
Epoch 7 | Step 1500 | loss = 0.222, acc = 0.887
Epoch 7 | Step 1600 | loss = 0.259, acc = 0.884

Evaluating Dev Set ...


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=3524.0), HTML(value='')))


Validation | Epoch 7 | acc = 0.461


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=1684.0), HTML(value='')))

Epoch 8 | Step 100 | loss = 0.181, acc = 0.899
Epoch 8 | Step 200 | loss = 0.164, acc = 0.907
Epoch 8 | Step 300 | loss = 0.172, acc = 0.915
Epoch 8 | Step 400 | loss = 0.158, acc = 0.915
Epoch 8 | Step 500 | loss = 0.179, acc = 0.907
Epoch 8 | Step 600 | loss = 0.195, acc = 0.908
Epoch 8 | Step 700 | loss = 0.196, acc = 0.906
Epoch 8 | Step 800 | loss = 0.170, acc = 0.915
Epoch 8 | Step 900 | loss = 0.302, acc = 0.856
Epoch 8 | Step 1000 | loss = 0.352, acc = 0.845
Epoch 8 | Step 1100 | loss = 0.268, acc = 0.874
Epoch 8 | Step 1200 | loss = 0.263, acc = 0.876
Epoch 8 | Step 1300 | loss = 0.270, acc = 0.865
Epoch 8 | Step 1400 | loss = 0.226, acc = 0.881
Epoch 8 | Step 1500 | loss = 0.262, acc = 0.877
Epoch 8 | Step 1600 | loss = 0.225, acc = 0.878

Evaluating Dev Set ...


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=3524.0), HTML(value='')))


Validation | Epoch 8 | acc = 0.464


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=1684.0), HTML(value='')))

Epoch 9 | Step 100 | loss = 0.131, acc = 0.921
Epoch 9 | Step 200 | loss = 0.134, acc = 0.934
Epoch 9 | Step 300 | loss = 0.121, acc = 0.932
Epoch 9 | Step 400 | loss = 0.183, acc = 0.899
Epoch 9 | Step 500 | loss = 0.172, acc = 0.912
Epoch 9 | Step 600 | loss = 0.215, acc = 0.896
Epoch 9 | Step 700 | loss = 0.197, acc = 0.896
Epoch 9 | Step 800 | loss = 0.195, acc = 0.891
Epoch 9 | Step 900 | loss = 0.196, acc = 0.896
Epoch 9 | Step 1000 | loss = 0.212, acc = 0.891
Epoch 9 | Step 1100 | loss = 0.209, acc = 0.891
Epoch 9 | Step 1200 | loss = 0.252, acc = 0.867
Epoch 9 | Step 1300 | loss = 0.217, acc = 0.882
Epoch 9 | Step 1400 | loss = 0.179, acc = 0.906
Epoch 9 | Step 1500 | loss = 0.214, acc = 0.883
Epoch 9 | Step 1600 | loss = 0.208, acc = 0.880

Evaluating Dev Set ...


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=3524.0), HTML(value='')))


Validation | Epoch 9 | acc = 0.425


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=1684.0), HTML(value='')))

Epoch 10 | Step 100 | loss = 0.104, acc = 0.931
Epoch 10 | Step 200 | loss = 0.134, acc = 0.927
Epoch 10 | Step 300 | loss = 0.142, acc = 0.924
Epoch 10 | Step 400 | loss = 0.139, acc = 0.929
Epoch 10 | Step 500 | loss = 0.169, acc = 0.923
Epoch 10 | Step 600 | loss = 0.151, acc = 0.923
Epoch 10 | Step 700 | loss = 0.128, acc = 0.932
Epoch 10 | Step 800 | loss = 0.202, acc = 0.897
Epoch 10 | Step 900 | loss = 0.179, acc = 0.907
Epoch 10 | Step 1000 | loss = 0.167, acc = 0.907
Epoch 10 | Step 1100 | loss = 0.190, acc = 0.904
Epoch 10 | Step 1200 | loss = 0.176, acc = 0.901
Epoch 10 | Step 1300 | loss = 0.254, acc = 0.882
Epoch 10 | Step 1400 | loss = 0.242, acc = 0.881
Epoch 10 | Step 1500 | loss = 0.221, acc = 0.886
Epoch 10 | Step 1600 | loss = 0.234, acc = 0.881

Evaluating Dev Set ...


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=3524.0), HTML(value='')))


Validation | Epoch 10 | acc = 0.412


In [81]:
print("Saving Model ...")
model_save_dir = root_path + 'model/'
model.save_pretrained(model_save_dir)

Saving Model ...


In [82]:
print("Evaluating Test Set ...")

result = []

model.eval()
with torch.no_grad():
    for data in tqdm(test_loader):
        output = model(input_ids=data[0].squeeze(dim=0).to(device), token_type_ids=data[1].squeeze(dim=0).to(device),
                       attention_mask=data[2].squeeze(dim=0).to(device))
        result.append(evaluate(data, output))

result_file = root_path + "result.csv"
with open(result_file, 'w') as f:
	  f.write("ID,Answer\n")
	  for i, test_question in enumerate(test_questions):
        # Replace commas in answers with empty strings (since csv is separated by comma)
        # Answers in kaggle are processed in the same way
		    f.write(f"{test_question['id']},{result[i].replace(',','')}\n")

print(f"Completed! Result is in {result_file}")

Evaluating Test Set ...


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=3493.0), HTML(value='')))


Completed! Result is in ML_study/chapter7/result.csv


In [92]:
model

BertForQuestionAnswering(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(21128, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_