# 安裝套件

In [1]:
%%capture
!pip install transformers

In [2]:
import torch
from transformers import AutoTokenizer,BertTokenizerFast

In [3]:
import json
from pathlib import Path
import torch
from torch.utils.data import DataLoader
import time

#連接雲端硬碟

In [4]:
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [5]:
%cd /content/drive/MyDrive/讀書會/bert/SQuAD20
%ls

/content/drive/MyDrive/讀書會/bert/SQuAD20
dev-v2.0.json  test_model  test_model2  test_model3  train-v2.0.json


In [6]:
import json
from pprint import pprint
with open('dev-v2.0.json') as file:
  val_data = json.load(file)


#讀取資料

In [7]:
val_path = Path('/content/drive/MyDrive/讀書會/bert/SQuAD20/dev-v2.0.json')

def read_data(path,limit):
  with open(path, 'rb') as f:
      squad_dict = json.load(f)

  contexts = []
  questions = []
  answers = []
  for group in squad_dict['data']:
        for passage in group['paragraphs']:
            context = passage['context']
            for qa in passage['qas']:
                question = qa['question']
                # 檢查答案在'answers'或'plausible_answers'
                if 'plausible_answers' in qa.keys():
                    access = 'plausible_answers'
                else:
                    access = 'answers'
                for answer in qa[access]:
                    contexts.append(context)
                    questions.append(question)
                    answers.append(answer)
                if limit != None and len(contexts) > limit:
                    return contexts, questions, answers

  return contexts, questions, answers

In [8]:
val_contexts, val_questions, val_answers = read_data(val_path,2000)

#新增 answer 的結束位置
( 這是指在context中的位置 )

In [9]:
def add_end_idx(answers):
    for answer in answers:
        gold_text = answer['text']
        start_idx = answer['answer_start']
        if gold_text == '':
          end_idx = 0
        else:
          end_idx = start_idx + len(gold_text) # Find end character index of answer in context
        answer['answer_end'] = end_idx

add_end_idx(val_answers)

#Tokenizer
將 input 資料轉換成 input_ids、token_type_ids 與 attention_mask

* **input_ids**：每一個單字(包含標點符號)對應到一個數字，這就是所謂的 token。
* **token_type_ids**：因為context會跟question併在一起，所以用不同id來區分。
* **attention_mask**：表示模型該注意的部分，padding 的部分會標記為0。

In [10]:
from transformers import AutoTokenizer,BertTokenizerFast, BertTokenizer, AdamW, BertForQuestionAnswering
tokenizer_auto = AutoTokenizer.from_pretrained("bert-base-uncased")
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
tokenizer_fast = BertTokenizerFast.from_pretrained("bert-base-uncased")

val_encodings = tokenizer_fast(val_contexts, val_questions, truncation=True, padding=True)

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/466k [00:00<?, ?B/s]

#新增 answer 的 start_position 跟 end_positions 
( 這是找到開始跟結束的token位置 )

In [11]:
def add_token_positions(encodings, answers):
    cnt1 = 0
    cnt2 = 0
    # 初始化列表以包含答案start/end的标记索引
    start_positions = []
    end_positions = []
    for i in range(len(answers)):
        start_positions.append(encodings.char_to_token(i, answers[i]['answer_start']))
        end_positions.append(encodings.char_to_token(i, answers[i]['answer_end']))
        # print("* ",start_positions)
        # print("# ",answers[i]['answer_start'])
        
        # 如果開始位置為None，代表答案已被截断
        if start_positions[-1] is None:
            start_positions[-1] = tokenizer.model_max_length
            
            
        # 如果結束位置為None，往左移直到找到不為None的值
        shift = 1
        while end_positions[-1] is None:
            end_positions[-1] = encodings.char_to_token(i, answers[i]['answer_end'] - shift)
            shift += 1
            

        # 如果結束位置仍為None，代表答案已被截断
        if end_positions[-1] is None:
            end_positions[-1] = tokenizer.model_max_length
            
        
    # 更新開始與结束位置
    encodings.update({'start_positions': start_positions, 'end_positions': end_positions})
    

add_token_positions(val_encodings, val_answers)

In [12]:
val_encodings.keys()

dict_keys(['input_ids', 'token_type_ids', 'attention_mask', 'start_positions', 'end_positions'])

#定義Dataset，並轉換成 tensor 格式

In [13]:
class SquadDataset(torch.utils.data.Dataset):
    def __init__(self, encodings):
        self.encodings = encodings

    def __getitem__(self, idx):
        return {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}

    def __len__(self):
        return len(self.encodings.input_ids)

# 建dataset
val_dataset = SquadDataset(val_encodings)

In [14]:
print(val_encodings)

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



In [15]:
pprint(val_dataset[0])

{'attention_mask': tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0

In [16]:
print("input_ids\n", val_encodings['input_ids'][0])
print("input_ids to tokens\n",tokenizer.convert_ids_to_tokens(val_encodings['input_ids'][0]))
print("input_ids_decode\n", tokenizer.decode(val_encodings['input_ids'][0]))
print("attention_mask\n", val_encodings['attention_mask'][0])

print("start_positions\n", val_encodings['start_positions'][0])
print("end_positions\n", val_encodings['end_positions'][0])

input_ids
 [101, 1996, 5879, 2015, 1006, 5879, 1024, 2053, 3126, 2386, 5104, 1025, 2413, 1024, 5879, 5104, 1025, 3763, 1024, 5879, 3490, 1007, 2020, 1996, 2111, 2040, 1999, 1996, 6049, 1998, 6252, 4693, 2435, 2037, 2171, 2000, 13298, 1010, 1037, 2555, 1999, 2605, 1012, 2027, 2020, 9287, 2013, 15342, 1006, 1000, 5879, 1000, 3310, 2013, 1000, 15342, 2386, 1000, 1007, 10642, 1998, 8350, 2013, 5842, 1010, 10399, 1998, 5120, 2040, 1010, 2104, 2037, 3003, 4897, 2080, 1010, 3530, 2000, 8415, 10768, 2389, 3723, 2000, 2332, 2798, 3523, 1997, 2225, 23151, 7405, 1012, 2083, 8213, 1997, 27574, 1998, 6809, 2007, 1996, 3128, 26165, 1998, 3142, 1011, 26522, 4509, 7080, 1010, 2037, 8481, 2052, 6360, 13590, 2007, 1996, 8594, 2075, 2937, 1011, 2241, 8578, 1997, 2225, 23151, 7405, 1012, 1996, 5664, 3451, 1998, 5636, 4767, 1997, 1996, 5879, 2015, 6003, 3322, 1999, 1996, 2034, 2431, 1997, 1996, 6049, 2301, 1010, 1998, 2009, 2506, 2000, 19852, 2058, 1996, 13034, 4693, 1012, 102, 1999, 2054, 2406, 2003, 1329

# 環境設定
* **Batch size:** 8, 16
* **Learning rate (lr):**  5e−5, 3e−5, 2e−5
* **epochs:**  3

In [17]:
from torch.utils.data import DataLoader
from transformers import AdamW
from tqdm import tqdm

# 使用GPU
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# 決定一次要放多少訓練資料集給模型訓練
# 每個iteration以8或16筆做計算
# batch_size = 8
batch_size = 16

# 決定模型要看整個訓練資料集幾遍
# 決定訓練要跑幾回合 
epochs = 3

#載入fine tuned model

In [18]:
# Define the bert tokenizer
#tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')

# Load the fine-tuned modeol
model = torch.load("test_model",map_location=torch.device('cuda'))
model.eval()

BertForQuestionAnswering(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_

#Evaluation
* pre: 預測答案
* ref: 真實答案

In [19]:
model.eval()

# 將資料丟入DataLoader
val_loader = DataLoader(val_dataset, batch_size=batch_size)

acc = []
pre = []
index = 0
answer_pred = []

for batch in val_loader:
    # 不需要計算梯度，因為沒有訓練
    with torch.no_grad():
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        # 使用正確答案的開始和結束位置
        start_true = batch['start_positions'].to(device)
        end_true = batch['end_positions'].to(device)
        
        outputs = model(input_ids, attention_mask=attention_mask)

        # 預測
        # start_logits跟end_logits: 每個Bert預測答案開始和結束位置的confidence level
        start_pred = torch.argmax(outputs['start_logits'], dim=1)
        end_pred = torch.argmax(outputs['end_logits'], dim=1)
        
        answer_pred = [tokenizer.decode(input_ids[i][start_pred[i]: end_pred[i]+1]) for i in range(len(input_ids))]
        
        for i in range(len(answer_pred)):
          text = answer_pred[i]
          if text and (text[-1] == '.' or text[-1] == ','):
            text = text[:-1]
          pre.append({'prediction_text': text,'id': str(index),'no_answer_probability':0})
          index += 1
        
        acc.append(((start_pred == start_true).sum()/len(start_pred)).item())
        acc.append(((end_pred == end_true).sum()/len(end_pred)).item())

        
# 計算acc
acc = sum(acc)/len(acc)
print(pre)

[{'prediction_text': 'france', 'id': '0', 'no_answer_probability': 0}, {'prediction_text': 'france', 'id': '1', 'no_answer_probability': 0}, {'prediction_text': 'france', 'id': '2', 'no_answer_probability': 0}, {'prediction_text': 'france', 'id': '3', 'no_answer_probability': 0}, {'prediction_text': 'the 10th and 11th centuries', 'id': '4', 'no_answer_probability': 0}, {'prediction_text': 'the 10th and 11th centuries', 'id': '5', 'no_answer_probability': 0}, {'prediction_text': 'the 10th and 11th centuries', 'id': '6', 'no_answer_probability': 0}, {'prediction_text': 'the 10th and 11th centuries', 'id': '7', 'no_answer_probability': 0}, {'prediction_text': 'france', 'id': '8', 'no_answer_probability': 0}, {'prediction_text': 'france', 'id': '9', 'no_answer_probability': 0}, {'prediction_text': 'france', 'id': '10', 'no_answer_probability': 0}, {'prediction_text': 'france', 'id': '11', 'no_answer_probability': 0}, {'prediction_text': 'rollo', 'id': '12', 'no_answer_probability': 0}, {'p

In [20]:
pip install evaluate

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting evaluate
  Downloading evaluate-0.4.0-py3-none-any.whl (81 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m81.4/81.4 KB[0m [31m10.1 MB/s[0m eta [36m0:00:00[0m
Collecting multiprocess
  Downloading multiprocess-0.70.14-py38-none-any.whl (132 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m132.0/132.0 KB[0m [31m17.5 MB/s[0m eta [36m0:00:00[0m
Collecting xxhash
  Downloading xxhash-3.2.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (213 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m213.0/213.0 KB[0m [31m29.2 MB/s[0m eta [36m0:00:00[0m
Collecting datasets>=2.0.0
  Downloading datasets-2.8.0-py3-none-any.whl (452 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m452.9/452.9 KB[0m [31m45.7 MB/s[0m eta [36m0:00:00[0m
Collecting responses<0.19
  Downloading responses-0.18.0-py3-non

In [21]:
ref = []
for i in range(len(val_answers)):
  start = val_answers[i]['answer_start']
  text = val_answers[i]['text']
  ref.append({'answers':{'answer_start':[start],'text':[text]},'id':str(i)})

print(ref)

[{'answers': {'answer_start': [159], 'text': ['France']}, 'id': '0'}, {'answers': {'answer_start': [159], 'text': ['France']}, 'id': '1'}, {'answers': {'answer_start': [159], 'text': ['France']}, 'id': '2'}, {'answers': {'answer_start': [159], 'text': ['France']}, 'id': '3'}, {'answers': {'answer_start': [94], 'text': ['10th and 11th centuries']}, 'id': '4'}, {'answers': {'answer_start': [87], 'text': ['in the 10th and 11th centuries']}, 'id': '5'}, {'answers': {'answer_start': [94], 'text': ['10th and 11th centuries']}, 'id': '6'}, {'answers': {'answer_start': [94], 'text': ['10th and 11th centuries']}, 'id': '7'}, {'answers': {'answer_start': [256], 'text': ['Denmark, Iceland and Norway']}, 'id': '8'}, {'answers': {'answer_start': [256], 'text': ['Denmark, Iceland and Norway']}, 'id': '9'}, {'answers': {'answer_start': [256], 'text': ['Denmark, Iceland and Norway']}, 'id': '10'}, {'answers': {'answer_start': [256], 'text': ['Denmark, Iceland and Norway']}, 'id': '11'}, {'answers': {'

# 計算EM與F1
https://huggingface.co/spaces/evaluate-metric/squad_v2


In [22]:
from evaluate import load
squad_v2_metric = load("squad_v2")
# pred = [{'prediction_text': '1976', 'id': '56e10a3be3433e1400422b22', 'no_answer_probability': 0.}]
# refe = [{'answers': {'answer_start': [97], 'text': ['1976']}, 'id': '56e10a3be3433e1400422b22'}]
results = squad_v2_metric.compute(predictions=pre, references=ref)
results

Downloading builder script:   0%|          | 0.00/6.47k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/11.3k [00:00<?, ?B/s]

{'exact': 36.43178410794603,
 'f1': 49.78741895476965,
 'total': 2001,
 'HasAns_exact': 36.43178410794603,
 'HasAns_f1': 49.78741895476965,
 'HasAns_total': 2001,
 'best_exact': 36.43178410794603,
 'best_exact_thresh': 0.0,
 'best_f1': 49.78741895476965,
 'best_f1_thresh': 0.0}