# Bert

In [1]:
!ls

HW07.ipynb
HW07.pdf
bert_learn_demo.ipynb
data
ml2021-spring-hw7.zip
test.ipynb


In [2]:
# !tar -zxvf ml2021-spring-hw7.zip -C data/

In [3]:
def prints(arr):
    """Print a container followed by its size.

    Tensors and NumPy arrays are shown together with their ``shape``; lists
    and tuples together with their ``len``. Values of any other type are
    ignored and nothing is printed.
    """
    if isinstance(arr, (torch.Tensor, np.ndarray)):
        size = arr.shape
    elif isinstance(arr, (list, tuple)):
        size = len(arr)
    else:
        # Unsupported types fall through silently.
        return
    print(arr, size)

In [4]:
import json
import numpy as np
import random
import torch
from torch.utils.data import DataLoader, Dataset 
from transformers import AdamW, BertForQuestionAnswering, BertTokenizerFast

from tqdm.auto import tqdm

device = "cuda" if torch.cuda.is_available() else "cpu"

# Fix random seed for reproducibility
def same_seeds(seed):
    """Seed every random number generator this notebook relies on.

    Seeds Python's ``random``, NumPy and PyTorch (CPU, plus all CUDA devices
    when CUDA is available), then switches cuDNN to deterministic mode with
    benchmarking turned off.

    Args:
        seed: Integer seed shared by all generators.
    """
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed(seed)
        torch.cuda.manual_seed_all(seed)
    # cuDNN flags: deterministic kernels, no benchmarking.
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
                
same_seeds(0)

In [5]:
model = BertForQuestionAnswering.from_pretrained("bert-base-chinese").to(device)
tokenizer = BertTokenizerFast.from_pretrained("bert-base-chinese")

Some weights of the model checkpoint at bert-base-chinese were not used when initializing BertForQuestionAnswering: ['cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at bert-base-chinese a

In [6]:
!head data/hw7_train.json

{
    "questions": [
        {
            "id": 0,
            "paragraph_id": 3884,
            "question_text": "羅馬教皇利奧三世在800年正式加冕誰為羅馬人的皇帝?",
            "answer_text": "查理大帝",
            "answer_start": 141,
            "answer_end": 144
        },


In [7]:
import os

# Directory that holds the homework JSON files.
data_home = "data"


def read_data(file, home=data_home):
    """Load one homework JSON file and return its two sections.

    Args:
        file: File name, relative to ``home``.
        home: Directory containing the file. Defaults to the value
            ``data_home`` had when this function was defined.

    Returns:
        A ``(questions, paragraphs)`` tuple read from the "questions" and
        "paragraphs" keys of the decoded JSON document.
    """
    json_path = os.path.join(home, file)
    with open(json_path, 'r', encoding="utf-8") as handle:
        payload = json.load(handle)
    questions, paragraphs = payload["questions"], payload["paragraphs"]
    return questions, paragraphs


# Read the (questions, paragraphs) pair of every split: train, dev, test.
(
    (train_questions, train_paragraphs),
    (dev_questions, dev_paragraphs),
    (test_questions, test_paragraphs),
) = [
    read_data(split_file)
    for split_file in ("hw7_train.json", "hw7_dev.json", "hw7_test.json")
]

In [8]:
dev_questions[:3]

[{'id': 0,
  'paragraph_id': 538,
  'question_text': '哪一個訓練中心的設備被使用來訓練網絡城與媒體城勞工的未來知識?',
  'answer_text': '杜拜知識村',
  'answer_start': 312,
  'answer_end': 316},
 {'id': 1,
  'paragraph_id': 281,
  'question_text': '南加大單一科系所收到的捐贈最高紀錄是多少?',
  'answer_text': '3500萬美元',
  'answer_start': 634,
  'answer_end': 640},
 {'id': 2,
  'paragraph_id': 953,
  'question_text': '梅艷芳以哪一部電影獲得金馬獎?',
  'answer_text': '《胭脂扣》',
  'answer_start': 426,
  'answer_end': 430}]

In [9]:
len(train_questions), train_questions[0]

(26936,
 {'id': 0,
  'paragraph_id': 3884,
  'question_text': '羅馬教皇利奧三世在800年正式加冕誰為羅馬人的皇帝?',
  'answer_text': '查理大帝',
  'answer_start': 141,
  'answer_end': 144})

In [10]:
len(dev_questions), len(dev_paragraphs)

(3524, 1000)

In [11]:
dev_paragraphs[953], dev_paragraphs[953][426:431]

('梅艷芳，1963年10月10日－2003年12月30日，香港出生，香港歌影壇巨星，人稱「樂壇大姐大」和「舞台女王」。她的形象百變，更是華語樂壇首位在同一場演唱會換多套歌衫以及在每張專輯的封面和音樂影片都有不同造型的歌手，因此亦有「百變女歌神」之稱。梅艷芳的影響力不只在演藝界，她更代表了整個香港及華人社會，她去世後被傳媒廣泛稱為「香港之女」。梅艷芳跟徐小鳳和蔡琴一樣，是華人樂壇少有的女低音。梅艷芳自小與家人在荔園、啟德遊樂場賣唱。1982年，在無線電視及華星唱片合辦的第一屆新秀歌唱大賽中以一曲〈風的季節〉勝出，星途由此展開。在事業早期，歌曲路線已是冶艷前衛，舞台衣著華麗大膽，風格千變萬化。梅艷芳在1985年－1989年十大勁歌金曲頒獎典禮當中連續五屆奪得最受歡迎女歌星並於1989年叱吒樂壇流行榜頒獎典禮奪得叱吒樂壇女歌手金獎。而於1985年推出的專輯《壞女孩》銷量更達14白金，奠定了她在歌壇的地位。她在電影界也獲得多項大獎，1987年更以電影《胭脂扣》一舉拿下金馬獎、香港電影金像獎及亞太影展最佳女主角后冠，演技備受肯定。她同時也是華人女歌手中演唱會場次的最高紀錄保持者，共計292場。',
 '《胭脂扣》')

## tokenizer

In [12]:
train_questions[0].keys()

dict_keys(['id', 'paragraph_id', 'question_text', 'answer_text', 'answer_start', 'answer_end'])

In [13]:
# Questions and paragraphs are tokenized separately.
# add_special_tokens=False keeps [CLS] (101) and [SEP] (102) out of the ids;
# they are inserted later, when a question and a paragraph window are joined
# in the dataset's __getitem__.

# --- questions ---
train_questions_tokenized = tokenizer(
    [entry["question_text"] for entry in train_questions],
    add_special_tokens=False,
)
dev_questions_tokenized = tokenizer(
    [entry["question_text"] for entry in dev_questions],
    add_special_tokens=False,
)
test_questions_tokenized = tokenizer(
    [entry["question_text"] for entry in test_questions],
    add_special_tokens=False,
)

# --- paragraphs ---
train_paragraphs_tokenized = tokenizer(train_paragraphs, add_special_tokens=False)
dev_paragraphs_tokenized = tokenizer(dev_paragraphs, add_special_tokens=False)
test_paragraphs_tokenized = tokenizer(test_paragraphs, add_special_tokens=False)

# The "sequence length is longer than the specified maximum" warning can be
# ignored: the tokenized sequences are sliced further in the dataset's
# __getitem__ before they are passed to the model.

Token indices sequence length is longer than the specified maximum sequence length for this model (570 > 512). Running this sequence through the model will result in indexing errors


### add_special_tokens=False

`train_questions_tokenized`、 `train_paragraphs_tokenized`的ids 不是101开头、102结尾, 是因为设置了`add_special_tokens=False`

In [14]:
print(train_questions_tokenized[0].ids)

[5397, 7679, 3136, 4640, 1164, 1953, 676, 686, 1762, 8280, 2399, 3633, 2466, 1217, 1089, 6306, 4158, 5397, 7679, 782, 4638, 4640, 2370, 136]


In [15]:
train_paragraphs[0]

'2010年引進的廣州快速公交運輸系統，屬世界第二大快速公交系統，日常載客量可達100萬人次，高峰時期每小時單向客流高達26900人次，僅次於波哥大的快速交通系統，平均每10秒鐘就有一輛巴士，每輛巴士單向行駛350小時。包括橋樑在內的站台是世界最長的州快速公交運輸系統站台，長達260米。目前廣州市區的計程車和公共汽車主要使用液化石油氣作燃料，部分公共汽車更使用油電、氣電混合動力技術。2012年底開始投放液化天然氣燃料的公共汽車，2014年6月開始投放液化天然氣插電式混合動力公共汽車，以取代液化石油氣公共汽車。2007年1月16日，廣州市政府全面禁止在市區內駕駛摩托車。違反禁令的機動車將會予以沒收。廣州市交通局聲稱禁令的施行，使得交通擁擠問題和車禍大幅減少。廣州白雲國際機場位於白雲區與花都區交界，2004年8月5日正式投入運營，屬中國交通情況第二繁忙的機場。該機場取代了原先位於市中心的無法滿足日益增長航空需求的舊機場。目前機場有三條飛機跑道，成為國內第三個擁有三跑道的民航機場。比鄰近的香港國際機場第三跑道預計的2023年落成早8年。'

In [16]:
token_test = tokenizer(train_paragraphs[0][:50])
print(token_test[0].ids)

[101, 8166, 2399, 2471, 6868, 4638, 2451, 2336, 2571, 6862, 1062, 769, 6880, 6745, 5143, 5186, 8024, 2253, 686, 4518, 5018, 753, 1920, 2571, 6862, 1062, 769, 5143, 5186, 8024, 3189, 2382, 6734, 2145, 7030, 1377, 6888, 8135, 5857, 782, 3613, 8024, 7770, 2292, 3229, 3309, 102]


In [17]:
token_test2 = tokenizer(train_paragraphs[0][:50],add_special_tokens=False)
print(token_test2[0].ids)

[8166, 2399, 2471, 6868, 4638, 2451, 2336, 2571, 6862, 1062, 769, 6880, 6745, 5143, 5186, 8024, 2253, 686, 4518, 5018, 753, 1920, 2571, 6862, 1062, 769, 5143, 5186, 8024, 3189, 2382, 6734, 2145, 7030, 1377, 6888, 8135, 5857, 782, 3613, 8024, 7770, 2292, 3229, 3309]


### tokenize demo

In [18]:
s_list = dev_paragraphs[953].split('。')
len(s_list)

12

In [19]:
temp = tokenizer(s_list)

In [20]:
temp.keys()

dict_keys(['input_ids', 'token_type_ids', 'attention_mask'])

In [21]:
for idx, (token_id, s) in enumerate(zip(temp['input_ids'], s_list)):  
    print(idx, token_id, s)
    print(len(token_id),len(s))
    print('-' * 20 + 'decode' + '-' * 20)
    print(tokenizer.decode(token_id))
    break

0 [101, 3449, 5684, 5710, 8024, 9155, 2399, 8108, 3299, 8108, 3189, 8025, 8263, 2399, 8110, 3299, 8114, 3189, 8024, 7676, 3949, 1139, 4495, 8024, 7676, 3949, 3625, 2512, 1883, 2342, 3215, 8024, 782, 4935, 519, 3556, 1883, 1920, 1995, 1920, 520, 1469, 519, 5659, 1378, 1957, 4374, 520, 102] 梅艷芳，1963年10月10日－2003年12月30日，香港出生，香港歌影壇巨星，人稱「樂壇大姐大」和「舞台女王」
49 57
--------------------decode--------------------
[CLS] 梅 艷 芳 ， 1963 年 10 月 10 日 － 2003 年 12 月 30 日 ， 香 港 出 生 ， 香 港 歌 影 壇 巨 星 ， 人 稱 「 樂 壇 大 姐 大 」 和 「 舞 台 女 王 」 [SEP]


#### char_to_token

In [22]:
print(temp[0].offsets)

[(0, 0), (0, 1), (1, 2), (2, 3), (3, 4), (4, 8), (8, 9), (9, 11), (11, 12), (12, 14), (14, 15), (15, 16), (16, 20), (20, 21), (21, 23), (23, 24), (24, 26), (26, 27), (27, 28), (28, 29), (29, 30), (30, 31), (31, 32), (32, 33), (33, 34), (34, 35), (35, 36), (36, 37), (37, 38), (38, 39), (39, 40), (40, 41), (41, 42), (42, 43), (43, 44), (44, 45), (45, 46), (46, 47), (47, 48), (48, 49), (49, 50), (50, 51), (51, 52), (52, 53), (53, 54), (54, 55), (55, 56), (56, 57), (0, 0)]


In [23]:
print(temp[0].tokens)

['[CLS]', '梅', '艷', '芳', '，', '1963', '年', '10', '月', '10', '日', '－', '2003', '年', '12', '月', '30', '日', '，', '香', '港', '出', '生', '，', '香', '港', '歌', '影', '壇', '巨', '星', '，', '人', '稱', '「', '樂', '壇', '大', '姐', '大', '」', '和', '「', '舞', '台', '女', '王', '」', '[SEP]']


In [24]:
for idx,(v1,v2) in enumerate(temp[0].offsets):
    print((idx, v1, v2), end=',')

(0, 0, 0),(1, 0, 1),(2, 1, 2),(3, 2, 3),(4, 3, 4),(5, 4, 8),(6, 8, 9),(7, 9, 11),(8, 11, 12),(9, 12, 14),(10, 14, 15),(11, 15, 16),(12, 16, 20),(13, 20, 21),(14, 21, 23),(15, 23, 24),(16, 24, 26),(17, 26, 27),(18, 27, 28),(19, 28, 29),(20, 29, 30),(21, 30, 31),(22, 31, 32),(23, 32, 33),(24, 33, 34),(25, 34, 35),(26, 35, 36),(27, 36, 37),(28, 37, 38),(29, 38, 39),(30, 39, 40),(31, 40, 41),(32, 41, 42),(33, 42, 43),(34, 43, 44),(35, 44, 45),(36, 45, 46),(37, 46, 47),(38, 47, 48),(39, 48, 49),(40, 49, 50),(41, 50, 51),(42, 51, 52),(43, 52, 53),(44, 53, 54),(45, 54, 55),(46, 55, 56),(47, 56, 57),(48, 0, 0),

In [25]:
for i in [13,27,56,57]:
    print(temp[0].char_to_token(i), end=', ')

9, 18, 47, None, 

In [26]:
for i in [13,27,56,57]:
    print(temp.char_to_token(i), end=', ')

9, 18, 47, None, 

In [27]:
for i in [13,27,56,57]:
    print(temp[1].char_to_token(i), end=', ')

14, 28, 57, 58, 

#### idx

In [28]:
temp[0]

Encoding(num_tokens=49, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing])

In [29]:
print(temp[0].ids)

[101, 3449, 5684, 5710, 8024, 9155, 2399, 8108, 3299, 8108, 3189, 8025, 8263, 2399, 8110, 3299, 8114, 3189, 8024, 7676, 3949, 1139, 4495, 8024, 7676, 3949, 3625, 2512, 1883, 2342, 3215, 8024, 782, 4935, 519, 3556, 1883, 1920, 1995, 1920, 520, 1469, 519, 5659, 1378, 1957, 4374, 520, 102]


In [30]:
print(temp[0].type_ids)

[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]


In [31]:
print(temp[0].tokens)

['[CLS]', '梅', '艷', '芳', '，', '1963', '年', '10', '月', '10', '日', '－', '2003', '年', '12', '月', '30', '日', '，', '香', '港', '出', '生', '，', '香', '港', '歌', '影', '壇', '巨', '星', '，', '人', '稱', '「', '樂', '壇', '大', '姐', '大', '」', '和', '「', '舞', '台', '女', '王', '」', '[SEP]']


In [32]:
print(temp[0].offsets)

[(0, 0), (0, 1), (1, 2), (2, 3), (3, 4), (4, 8), (8, 9), (9, 11), (11, 12), (12, 14), (14, 15), (15, 16), (16, 20), (20, 21), (21, 23), (23, 24), (24, 26), (26, 27), (27, 28), (28, 29), (29, 30), (30, 31), (31, 32), (32, 33), (33, 34), (34, 35), (35, 36), (36, 37), (37, 38), (38, 39), (39, 40), (40, 41), (41, 42), (42, 43), (43, 44), (44, 45), (45, 46), (46, 47), (47, 48), (48, 49), (49, 50), (50, 51), (51, 52), (52, 53), (53, 54), (54, 55), (55, 56), (56, 57), (0, 0)]


In [33]:
print(temp[0].attention_mask, len(temp[0].attention_mask))

[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1] 49


In [34]:
print(temp[0].special_tokens_mask)

[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1]


In [35]:
print(temp[0].overflowing)

[]


overflowing

In [36]:
s = "梅艷芳，1963年10月10日－2003年12月30日，香港出生，香港歌影壇巨星，人稱「樂壇大姐大」和「舞台女王」。她的形象百變，更是華語樂壇首位在同一場演唱會換多套歌衫以及在每張專輯的封面和音樂影片都有不同造型的歌手，因此亦有「百變女歌神」之稱。梅艷芳的影響力不只在演藝界，她更代表了整個香港及華人社會，她去世後被傳媒廣泛稱為「香港之女」。梅艷芳跟徐小鳳和蔡琴一樣，是華人樂壇少有的女低音。梅艷芳自小與家人在荔園、啟德遊樂場賣唱。1982年，在無線電視及華星唱片合辦的第一屆新秀歌唱大賽中以一曲〈風的季節〉勝出，星途由此展開。在事業早期，歌曲路線已是冶艷前衛，舞台衣著華麗大膽，風格千變萬化。梅艷芳在1985年－1989年十大勁歌金曲頒獎典禮當中連續五屆奪得最受歡迎女歌星並於1989年叱吒樂壇流行榜頒獎典禮奪得叱吒樂壇女歌手金獎。而於1985年推出的專輯《壞女孩》銷量更達14白金，奠定了她在歌壇的地位。她在電影界也獲得多項大獎，1987年更以電影《胭脂扣》一舉拿下金馬獎、香港電影金像獎及亞太影展最佳女主角后冠，演技備受肯定。她同時也是華人女歌手中演唱會場次的最高紀錄保持者，共計292場。"

# Tokenize four inputs of increasing length; no truncation is requested here.
over_demo = tokenizer([s, s*2, s*3, s*1100])
print(over_demo.keys())
# Open question from the author: it is unclear what `overflowing` holds.
# It prints [] here even for the longest input -- presumably it is only
# filled when truncation is enabled; TODO confirm against the tokenizers
# `Encoding.overflowing` documentation.
print(over_demo[3].overflowing)

dict_keys(['input_ids', 'token_type_ids', 'attention_mask'])
[]


## dataset

In [37]:
q_paragraph_id = train_questions[0]["paragraph_id"]
q_paragraph_id

3884

In [38]:
train_paragraphs_tokenized[q_paragraph_id]

Encoding(num_tokens=680, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing])

In [39]:
# tokenized_paragraphs[question["paragraph_id"]]

### Dataset Demo

拿到训练集的第一个问题和第一个答案

In [40]:
train_questions[0]

{'id': 0,
 'paragraph_id': 3884,
 'question_text': '羅馬教皇利奧三世在800年正式加冕誰為羅馬人的皇帝?',
 'answer_text': '查理大帝',
 'answer_start': 141,
 'answer_end': 144}

In [41]:
train_paragraphs[3884], train_paragraphs[3884][141:144+1]

('利奧三世開創的伊蘇里亞王朝在8世紀末期走上了末路，隨後統治帝國的一群無能皇帝進一步加深了災難局面。這其中最著名的是伊琳娜女皇，她弄瞎了作為法定繼承人的兒子君士坦丁六世的眼睛，將其關入修道院，自己成為第一個大權獨攬的東羅馬女皇。此舉影響重大，導致羅馬教皇利奧三世在800年把法蘭克國王查理大帝加冕為羅馬人的皇帝，使西方帝國有了與東羅馬帝國分庭抗禮的藉口。此外，皇帝尼基弗魯斯在與多瑙河下游平原的保加利亞第一帝國的普利斯卡戰役中被殺，頭蓋骨更被保加利亞酋長克魯姆做成了酒杯。馬其頓王朝的誕生開創了東羅馬帝國歷史上第二個最輝煌的時期。馬其頓王朝開國皇帝巴西爾一世生於亞美尼亞，幼時全家被多瑙河下游平原的保加利亞第一帝國俘虜，發配到馬其頓去開墾土地。長大後，他成為皇宮馬倌，貌美而多力，受到阿莫利王朝末代皇帝麥可三世的注意和寵愛。麥可任命他為宮廷侍衛長，並於866年把他立為自己的繼承人和共帝。867年，巴西爾發覺自己有失寵的跡象，於是在9月23日晚上發動了政變，他先用手擰彎了皇帝寢室的門閂，然後在半夜帶著親信殺入皇帝睡房，迅速制服衛兵，並殺掉了麥可三世。雖然皇位為篡奪而來，但巴西爾一世很快以自己的英明行為讓大家刮目相看。他在軍事上的勝利使其躋身於東羅馬帝國帝國最偉大的軍事家之列。他嚴格貫徹席哈克略王朝時開始的軍事制改革，在巴爾幹半島建立新軍事州，向這些地區遷入新移民，並憑藉不斷增強的君主制國的國力鞏固國防建設，不僅在巴爾幹半島的多瑙河沿岸北部設立邊境要塞，成功阻擋了斯拉夫人南下，而且在小亞細亞擴充軍隊並反擊了阿拉伯人的侵略，在義大利南部，也收復了原屬於東羅馬帝國的領地。',
 '查理大帝')

In [42]:
train_paragraphs_tokenized[3884].char_to_token(141), \
train_paragraphs_tokenized[3884].char_to_token(144)

(139, 142)

In [43]:
print(train_paragraphs_tokenized[3884].tokens)

['利', '奧', '三', '世', '開', '創', '的', '伊', '蘇', '里', '亞', '王', '朝', '在', '8', '世', '紀', '末', '期', '走', '上', '了', '末', '路', '，', '隨', '後', '統', '治', '帝', '國', '的', '一', '群', '無', '能', '皇', '帝', '進', '一', '步', '加', '深', '了', '災', '難', '局', '面', '。', '這', '其', '中', '最', '著', '名', '的', '是', '伊', '琳', '娜', '女', '皇', '，', '她', '弄', '瞎', '了', '作', '為', '法', '定', '繼', '承', '人', '的', '兒', '子', '君', '士', '坦', '丁', '六', '世', '的', '眼', '睛', '，', '將', '其', '關', '入', '修', '道', '院', '，', '自', '己', '成', '為', '第', '一', '個', '大', '權', '獨', '攬', '的', '東', '羅', '馬', '女', '皇', '。', '此', '舉', '影', '響', '重', '大', '，', '導', '致', '羅', '馬', '教', '皇', '利', '奧', '三', '世', '在', '800', '年', '把', '法', '蘭', '克', '國', '王', '查', '理', '大', '帝', '加', '冕', '為', '羅', '馬', '人', '的', '皇', '帝', '，', '使', '西', '方', '帝', '國', '有', '了', '與', '東', '羅', '馬', '帝', '國', '分', '庭', '抗', '禮', '的', '藉', '口', '。', '此', '外', '，', '皇', '帝', '尼', '基', '弗', '魯', '斯', '在', '與', '多', '瑙', '河', '下', '游', '平', '原', '的', '保', '加', '利', '亞', '第', '一

In [44]:
train_paragraphs_tokenized[3884].tokens[139:142+1]

['查', '理', '大', '帝']

In [45]:
print(train_paragraphs_tokenized[0].ids)

[8166, 2399, 2471, 6868, 4638, 2451, 2336, 2571, 6862, 1062, 769, 6880, 6745, 5143, 5186, 8024, 2253, 686, 4518, 5018, 753, 1920, 2571, 6862, 1062, 769, 5143, 5186, 8024, 3189, 2382, 6734, 2145, 7030, 1377, 6888, 8135, 5857, 782, 3613, 8024, 7770, 2292, 3229, 3309, 3680, 2207, 3229, 1606, 1403, 2145, 3837, 7770, 6888, 11023, 8279, 782, 3613, 8024, 1006, 3613, 3176, 3797, 1520, 1920, 4638, 2571, 6862, 769, 6858, 5143, 5186, 8024, 2398, 1772, 3680, 8108, 4907, 7132, 2218, 3300, 671, 6739, 2349, 1894, 8024, 3680, 6739, 2349, 1894, 1606, 1403, 6121, 7691, 8612, 2207, 3229, 511, 1259, 2886, 3578, 3558, 1762, 1058, 4638, 4991, 1378, 3221, 686, 4518, 3297, 7269, 4638, 2336, 2571, 6862, 1062, 769, 6880, 6745, 5143, 5186, 4991, 1378, 8024, 7269, 6888, 9044, 5101, 511, 4680, 1184, 2451, 2336, 2356, 1281, 4638, 6243, 4923, 6722, 1469, 1062, 1066, 3749, 6722, 712, 6206, 886, 4500, 3890, 1265, 4767, 3779, 3706, 868, 4234, 3160, 8024, 6956, 1146, 1062, 1066, 3749, 6722, 3291, 886, 4500, 3779, 7442, 

In [46]:
print(train_questions_tokenized[0].ids)

[5397, 7679, 3136, 4640, 1164, 1953, 676, 686, 1762, 8280, 2399, 3633, 2466, 1217, 1089, 6306, 4158, 5397, 7679, 782, 4638, 4640, 2370, 136]


In [47]:
class QA_Dataset(Dataset):
    """Turn (question, paragraph) pairs into fixed-length BERT QA inputs.

    Every sequence is laid out as ``[CLS] question [SEP] window [SEP]`` and
    zero-padded to ``max_seq_len``.

    * ``split == "train"``: one window per question, placed around the answer
      span. The item also carries the answer's start/end token positions
      inside the assembled sequence.
    * any other split: the paragraph is cut into windows whose start
      positions are ``doc_stride`` tokens apart, and all windows of the
      question are returned stacked along a leading dimension.

    Args:
        split: ``"train"`` selects the training behaviour; any other value
            selects the windowed validation/test behaviour.
        questions: Sequence of dicts with a ``"paragraph_id"`` key and, for
            training, ``"answer_start"`` / ``"answer_end"`` character offsets
            into the paragraph text.
        tokenized_questions: Indexable by question index; each item exposes
            ``.ids``.
        tokenized_paragraphs: Indexable by paragraph id; each item exposes
            ``.ids``, ``len()`` and ``char_to_token()``.
        max_question_len: Maximum number of question tokens kept.
        max_paragraph_len: Maximum number of paragraph tokens per window.
        doc_stride: Distance between consecutive window starts (non-train
            splits only).
    """

    def __init__(self, split, questions, tokenized_questions, tokenized_paragraphs,
                 max_question_len=40, max_paragraph_len=150, doc_stride=150):
        self.split = split
        self.questions = questions
        self.tokenized_questions = tokenized_questions
        self.tokenized_paragraphs = tokenized_paragraphs
        self.max_question_len = max_question_len
        self.max_paragraph_len = max_paragraph_len

        ##### TODO: Change value of doc_stride #####
        # Distance, in tokens, between the starts of consecutive paragraph
        # windows in the validation/test branch.
        self.doc_stride = doc_stride

        # Input sequence length = [CLS] + question + [SEP] + paragraph + [SEP]
        self.max_seq_len = 1 + self.max_question_len + 1 + self.max_paragraph_len + 1

    def __len__(self):
        """Return the number of questions (one dataset item per question)."""
        return len(self.questions)

    def __getitem__(self, idx):
        """Build the model inputs for question ``idx``.

        Returns:
            For the train split: ``(input_ids, token_type_ids,
            attention_mask, answer_start_token, answer_end_token)`` where the
            first three are 1-D tensors of length ``max_seq_len`` and the
            last two are positions inside ``input_ids``.
            Otherwise: ``(input_ids, token_type_ids, attention_mask)``, each
            a 2-D tensor with one row per paragraph window.
        """
        question = self.questions[idx]
        tokenized_question = self.tokenized_questions[idx]
        tokenized_paragraph = self.tokenized_paragraphs[question["paragraph_id"]]

        # Truncated question with special tokens (101: CLS, 102: SEP). The
        # tokens are added by hand because tokenization was done with
        # add_special_tokens=False. Identical for every window.
        input_ids_question = [101] + tokenized_question.ids[:self.max_question_len] + [102]

        ##### TODO: Preprocessing #####
        # Hint: How to prevent model from learning something it should not learn

        # Training reads the answer position and builds a single window
        # around it; the validation/test branch never reads the answer, so it
        # has to cover the whole paragraph with several windows.
        if self.split == "train":
            # Convert answer's start/end positions in paragraph_text to
            # start/end positions in tokenized_paragraph
            answer_start_token = tokenized_paragraph.char_to_token(question["answer_start"])
            answer_end_token = tokenized_paragraph.char_to_token(question["answer_end"])

            # A single window is obtained by slicing the portion of the
            # paragraph containing the answer: centre it on the middle of the
            # answer, then clamp the start so the window neither begins
            # before token 0 nor runs past the end of the paragraph.
            mid = (answer_start_token + answer_end_token) // 2
            paragraph_start = max(
                0,
                min(mid - self.max_paragraph_len // 2,
                    len(tokenized_paragraph) - self.max_paragraph_len))
            paragraph_end = paragraph_start + self.max_paragraph_len

            # The second segment needs no leading [CLS]; only a closing [SEP].
            input_ids_paragraph = tokenized_paragraph.ids[paragraph_start:paragraph_end] + [102]

            # Convert answer's start/end positions in tokenized_paragraph
            # to start/end positions in the assembled sequence
            answer_start_token += len(input_ids_question) - paragraph_start
            answer_end_token += len(input_ids_question) - paragraph_start

            # Pad sequence and obtain inputs to model
            input_ids, token_type_ids, attention_mask = self.padding(
                input_ids_question, input_ids_paragraph)

            return torch.tensor(input_ids), torch.tensor(token_type_ids), \
                torch.tensor(attention_mask), answer_start_token, answer_end_token

        # Validation/Testing
        input_ids_list, token_type_ids_list, attention_mask_list = [], [], []

        # Paragraph is split into several windows,
        # each with start positions separated by step "doc_stride"
        for i in range(0, len(tokenized_paragraph), self.doc_stride):
            input_ids_paragraph = tokenized_paragraph.ids[i:i + self.max_paragraph_len] + [102]

            # Pad sequence and obtain inputs to model
            input_ids, token_type_ids, attention_mask = self.padding(
                input_ids_question, input_ids_paragraph)

            input_ids_list.append(input_ids)
            token_type_ids_list.append(token_type_ids)
            attention_mask_list.append(attention_mask)

        return torch.tensor(input_ids_list), torch.tensor(token_type_ids_list), \
            torch.tensor(attention_mask_list)

    def padding(self, input_ids_question, input_ids_paragraph):
        """Concatenate both segments and zero-pad them to ``max_seq_len``.

        Args:
            input_ids_question: Token ids of ``[CLS] question [SEP]``.
            input_ids_paragraph: Token ids of ``window [SEP]``.

        Returns:
            ``(input_ids, token_type_ids, attention_mask)`` as plain lists of
            length ``max_seq_len``.
        """
        # Pad zeros if sequence length is shorter than max_seq_len
        padding_len = self.max_seq_len - len(input_ids_question) - len(input_ids_paragraph)
        # Indices of input sequence tokens in the vocabulary
        input_ids = input_ids_question + input_ids_paragraph + [0] * padding_len
        # Segment ids: 0 for the question segment and the padding, 1 for the paragraph segment
        token_type_ids = [0] * len(input_ids_question) + [1] * len(input_ids_paragraph) + [0] * padding_len
        # Attention mask: 1 for real tokens, 0 for padding
        attention_mask = [1] * (len(input_ids_question) + len(input_ids_paragraph)) + [0] * padding_len

        return input_ids, token_type_ids, attention_mask


# train_set = QA_Dataset("valid", train_questions, train_questions_tokenized, train_paragraphs_tokenized)
# input_ids_list_, token_type_ids_list_, attention_mask_list = train_set[0]

In [48]:
"""
class QA_Dataset(Dataset):
    def __init__(self, split, questions, tokenized_questions, tokenized_paragraphs):
        self.split = split
        self.questions = questions
        self.tokenized_questions = tokenized_questions
        self.tokenized_paragraphs = tokenized_paragraphs
        self.max_question_len = 40
        self.max_paragraph_len = 150
        
        ##### TODO: Change value of doc_stride #####
        self.doc_stride = 150

        # Input sequence length = [CLS] + question + [SEP] + paragraph + [SEP]
        self.max_seq_len = 1 + self.max_question_len + 1 + self.max_paragraph_len + 1

    def __len__(self):
        return len(self.questions)

    def __getitem__(self, idx):
        question = self.questions[idx]
        tokenized_question = self.tokenized_questions[idx]
        tokenized_paragraph = self.tokenized_paragraphs[question["paragraph_id"]]

        ##### TODO: Preprocessing #####
        # Hint: How to prevent model from learning something it should not learn

        if self.split == "train":
            # Convert answer's start/end positions in paragraph_text to start/end positions in tokenized_paragraph  
            answer_start_token = tokenized_paragraph.char_to_token(question["answer_start"])
            answer_end_token = tokenized_paragraph.char_to_token(question["answer_end"])

            # A single window is obtained 
                # by slicing the portion of paragraph containing the answer
            mid = (answer_start_token + answer_end_token) // 2
            paragraph_start = max(0, 
                                  min(mid - self.max_paragraph_len // 2, 
                                      len(tokenized_paragraph) - self.max_paragraph_len))
            paragraph_end = paragraph_start + self.max_paragraph_len
            
            # Slice question/paragraph and add special tokens (101: CLS, 102: SEP)
            input_ids_question = [101] + tokenized_question.ids[:self.max_question_len] + [102] 
            input_ids_paragraph = tokenized_paragraph.ids[paragraph_start : paragraph_end] + [102]		
            
            # Convert answer's start/end positions in tokenized_paragraph to start/end positions in the window  
            answer_start_token += len(input_ids_question) - paragraph_start
            answer_end_token += len(input_ids_question) - paragraph_start
            
            # Pad sequence and obtain inputs to model 
            input_ids, token_type_ids, attention_mask = self.padding(
                    input_ids_question, input_ids_paragraph)
            return torch.tensor(input_ids), torch.tensor(token_type_ids), torch.tensor(attention_mask), answer_start_token, answer_end_token

        # Validation/Testing
        else:
            input_ids_list, token_type_ids_list, attention_mask_list = [], [], []
            
            # Paragraph is split into several windows, each with start positions separated by step "doc_stride"
            for i in range(0, len(tokenized_paragraph), self.doc_stride):
                
                # Slice question/paragraph and add special tokens (101: CLS, 102: SEP)
                input_ids_question = [101] + tokenized_question.ids[:self.max_question_len] + [102]
                input_ids_paragraph = tokenized_paragraph.ids[i : i + self.max_paragraph_len] + [102]
                
                # Pad sequence and obtain inputs to model
                input_ids, token_type_ids, attention_mask = self.padding(input_ids_question, input_ids_paragraph)
                
                input_ids_list.append(input_ids)
                token_type_ids_list.append(token_type_ids)
                attention_mask_list.append(attention_mask)
            
            return torch.tensor(input_ids_list), torch.tensor(token_type_ids_list), \
                        torch.tensor(attention_mask_list)

    def padding(self, input_ids_question, input_ids_paragraph):
        # Pad zeros if sequence length is shorter than max_seq_len
        padding_len = self.max_seq_len - len(input_ids_question) - len(input_ids_paragraph)
        # Indices of input sequence tokens in the vocabulary
        input_ids = input_ids_question + input_ids_paragraph + [0] * padding_len
        # Segment token indices to indicate first and second portions of the inputs. Indices are selected in [0, 1]
        token_type_ids = [0] * len(input_ids_question) + [1] * len(input_ids_paragraph) + [0] * padding_len
        # Mask to avoid performing attention on padding token indices. Mask values selected in [0, 1]
        attention_mask = [1] * (len(input_ids_question) + len(input_ids_paragraph)) + [0] * padding_len
        
        return input_ids, token_type_ids, attention_mask

train_set = QA_Dataset("train", train_questions, train_questions_tokenized, train_paragraphs_tokenized)
"""

'\nclass QA_Dataset(Dataset):\n    def __init__(self, split, questions, tokenized_questions, tokenized_paragraphs):\n        self.split = split\n        self.questions = questions\n        self.tokenized_questions = tokenized_questions\n        self.tokenized_paragraphs = tokenized_paragraphs\n        self.max_question_len = 40\n        self.max_paragraph_len = 150\n        \n        ##### TODO: Change value of doc_stride #####\n        self.doc_stride = 150\n\n        # Input sequence length = [CLS] + question + [SEP] + paragraph + [SEP]\n        self.max_seq_len = 1 + self.max_question_len + 1 + self.max_paragraph_len + 1\n\n    def __len__(self):\n        return len(self.questions)\n\n    def __getitem__(self, idx):\n        question = self.questions[idx]\n        tokenized_question = self.tokenized_questions[idx]\n        tokenized_paragraph = self.tokenized_paragraphs[question["paragraph_id"]]\n\n        ##### TODO: Preprocessing #####\n        # Hint: How to prevent model from le

In [49]:
train_batch_size = 16

# One dataset per split; only the "train" split yields answer positions.
train_set = QA_Dataset(
    "train", train_questions, train_questions_tokenized, train_paragraphs_tokenized
)
dev_set = QA_Dataset(
    "dev", dev_questions, dev_questions_tokenized, dev_paragraphs_tokenized
)
test_set = QA_Dataset(
    "test", test_questions, test_questions_tokenized, test_paragraphs_tokenized
)

# Note: Do NOT change the batch size of dev_loader / test_loader!
# A batch of size 1 already holds every window of one QA pair.
train_loader = DataLoader(
    train_set,
    batch_size=train_batch_size,
    shuffle=True,
    pin_memory=True,
)
dev_loader = DataLoader(
    dev_set,
    batch_size=1,
    shuffle=False,
    pin_memory=True,
)
test_loader = DataLoader(
    test_set,
    batch_size=1,
    shuffle=False,
    pin_memory=True,
)

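## evaluate

TODO: define `evaluate(data, output)` in this section. The validation loop in the training cell below calls it and compares its return value with each question's `answer_text`.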

## train

In [None]:
# ---- Hyperparameters ----
num_epoch = 1
validation = True
logging_step = 100
learning_rate = 1e-4

# Mixed-precision switch. No visible cell defines it, so default to False
# while still honouring a value set by an earlier cell.
fp16_training = globals().get("fp16_training", False)

# Fail fast on names this cell needs but does not define. Without this check
# a missing `evaluate` only surfaces after a whole training epoch, before the
# model has been saved.
if validation and "evaluate" not in globals():
    raise NameError("validation=True requires an `evaluate(data, output)` function to be defined first")
if fp16_training and "accelerator" not in globals():
    raise NameError("fp16_training=True requires an `accelerator` object to be created first")

optimizer = AdamW(model.parameters(), lr=learning_rate)

if fp16_training:
    model, optimizer, train_loader = accelerator.prepare(model, optimizer, train_loader)

model.train()

print("Start Training ...")

for epoch in range(num_epoch):
    step = 1
    # Running sums over the current logging window, kept as plain floats.
    train_loss = train_acc = 0

    for data in tqdm(train_loader):
        # Move every tensor of the batch to the training device
        data = [tensor.to(device) for tensor in data]

        # Batch layout as produced by QA_Dataset for the train split:
        # input_ids, token_type_ids, attention_mask, start position, end position.
        # The model returns start_logits, end_logits and, because the
        # positions are supplied, a loss.
        output = model(
            input_ids=data[0],
            token_type_ids=data[1],
            attention_mask=data[2],
            start_positions=data[3],
            end_positions=data[4]
        )

        # Choose the most probable start position / end position
        start_index = torch.argmax(output.start_logits, dim=1)
        end_index = torch.argmax(output.end_logits, dim=1)

        # Prediction is correct only if both start_index and end_index are correct.
        # .item() turns the statistics into Python floats so the running sums
        # do not keep autograd-tracked tensors alive between steps.
        train_acc += ((start_index == data[3]) & (end_index == data[4])).float().mean().item()
        train_loss += output.loss.item()

        if fp16_training:
            accelerator.backward(output.loss)
        else:
            output.loss.backward()

        optimizer.step()
        optimizer.zero_grad()
        step += 1

        ##### TODO: Apply linear learning rate decay #####

        # Print training loss and accuracy over past logging step
        if step % logging_step == 0:
            # Adjacent f-strings are concatenated into one line of output; a
            # single-quoted f-string cannot span several physical lines.
            print(
                f"Epoch {epoch + 1} | Step {step} | "
                f"loss = {train_loss / logging_step:.3f}, "
                f"acc = {train_acc / logging_step:.3f}"
            )
            train_loss = train_acc = 0

    if validation:
        print("Evaluating Dev Set ...")
        model.eval()
        with torch.no_grad():
            dev_acc = 0
            for i, data in enumerate(tqdm(dev_loader)):
                # dev_loader uses batch_size=1, so squeezing dim 0 leaves one
                # row per paragraph window of this question.
                output = model(
                    input_ids=data[0].squeeze(dim=0).to(device),
                    token_type_ids=data[1].squeeze(dim=0).to(device),
                    attention_mask=data[2].squeeze(dim=0).to(device),
                )
                # prediction is correct only if answer text exactly matches
                dev_acc += evaluate(data, output) == dev_questions[i]["answer_text"]
            print(f"Validation | Epoch {epoch + 1} | acc = {dev_acc / len(dev_loader):.3f}")
        model.train()

# Save a model and its configuration file to the directory 「saved_model」
# i.e. there are two files under the directory 「saved_model」: 「pytorch_model.bin」 and 「config.json」
# Saved model can be re-loaded using 「model = BertForQuestionAnswering.from_pretrained("saved_model")」
print("Saving Model ...")
model_save_dir = "saved_model"
model.save_pretrained(model_save_dir)