## Practice 4 - Question & Answering with BERT

### 실습 4.1 - Load SQuAD Raw Data with JSON
### SQuAD 데이터 살펴보기

In [1]:
import os
import json

filename = "/kaggle/input/2024-1-nlp-4/train-v2.json"

with open(filename, "r", encoding='utf-8') as reader:
    input_data = json.load(reader)["data"]
    
for entry in input_data:
    for paragraph in entry["paragraphs"]:
        context = paragraph['context']
        print(context)
        print()
        
        for qa in paragraph['qas']:
            is_impossible = qa['is_impossible']

            if not is_impossible:
                answer = qa['answers'][0]
                original_answer = answer['text']
                answer_start = answer['answer_start']
                
            qid=qa['id'],
            question=qa['question'],
                
            print(qid, question, answer)
        
    
        break
    break

Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ bee-YON-say) (born September 4, 1981) is an American singer, songwriter, record producer and actress. Born and raised in Houston, Texas, she performed in various singing and dancing competitions as a child, and rose to fame in the late 1990s as lead singer of R&B girl-group Destiny's Child. Managed by her father, Mathew Knowles, the group became one of the world's best-selling girl groups of all time. Their hiatus saw the release of Beyoncé's debut album, Dangerously in Love (2003), which established her as a solo artist worldwide, earned five Grammy Awards and featured the Billboard Hot 100 number-one singles "Crazy in Love" and "Baby Boy".

('56be85543aeaaa14008c9063',) ('When did Beyonce start becoming popular?',) {'text': 'in the late 1990s', 'answer_start': 269}
('56be85543aeaaa14008c9065',) ('What areas did Beyonce compete in when she was growing up?',) {'text': 'singing and dancing', 'answer_start': 207}
('56be85543aeaaa14008c9066',) 

### 실습 4.2 - SQuAD Dataset Class 생성 (from raw data to tokenized version)

In [2]:
import os
import json
import torch
from torch.utils.data import Dataset, TensorDataset

# import module we'll need to import our custom module
from shutil import copyfile

# copy our file into the working directory (make sure it has .py suffix)
copyfile(src = "/kaggle/input/2024-1-nlp-4/feature.py", dst = "/kaggle/working/feature.py")
from feature import convert_examples_to_features

def is_whitespace(c):
    if c == " " or c == "\t" or c == "\r" or c == "\n" or ord(c) == 0x202F:
        return True
    return False

class SquadExample():
    def __init__(self, qid, context, question, answer, start, end, is_impossible):
        self.qid = qid
        self.context = context
        self.question = question
        self.answer = answer
        self.start = start
        self.end = end
        self.is_impossible = is_impossible
        
    def __repr__(self):
        #return self.context[self.start:self.end]
        #if self.context[self.start:self.end] != self.answer:
        #    return 'NA!! {} - {}'.format(self.context[self.start:self.end], answer)
        return 'id:{}  question:{}...  answer:{}...  is_impossible:{}'.format(
            self.qid,
            self.question[:10],
            self.answer[:10],
            self.is_impossible)

class SquadDataset(Dataset):
    def __init__(self, path, tokenizer, is_train=True, is_inference=False):
        '''
        path: SquadDataset 데이터셋 위치
        tokenizer: Squad 데이터셋을 토크나이징할 토크나이저, ex) BertTokenizer
        is_train: SquadDataset을 정의하는 목적이 모델 학습용일 경우 True, 그렇지 않으면 False
        is_inference: SquadDataset을 정의하는 목적이 인퍼런스용일 경우 True, 그렇지 않으면 False
        '''
        
        if is_train:
            filename = os.path.join(path, 'train-v2.json')
        else:
            if is_inference:
                filename = os.path.join(path, 'test-v2.json')
            else:
                filename = os.path.join(path, 'dev-v2.json')

        cached_features_file = os.path.join(os.path.dirname(filename), 'cached_{}_64.cache'.format('train' if is_train else 'valid'))
        #cached_examples_file = os.path.join(os.path.dirname(filename), 'cached_example_{}_64.cache'.format('train' if is_train else 'valid'))

        if os.path.exists(cached_features_file):
            print('cache file exists')
            self.features = torch.load(cached_features_file)
        else:
            print('cache file does not exist')

            with open(filename, "r", encoding='utf-8') as reader:
                input_data = json.load(reader)["data"]

            self.examples = []
            number_of_examples = 100
            for entry in input_data[:number_of_examples]:
                for paragraph in entry["paragraphs"]:
                    context = paragraph['context']
                    
                    doc_tokens = []
                    char_to_word_offset = []
                    prev_is_whitespace = True
                    for c in context:
                        if is_whitespace(c):
                            prev_is_whitespace = True
                        else:
                            if prev_is_whitespace:
                                doc_tokens.append(c)
                            else:
                                doc_tokens[-1] += c
                            prev_is_whitespace = False
                        char_to_word_offset.append(len(doc_tokens) - 1)
                            
                            
                    for qa in paragraph['qas']:
                        is_impossible = qa['is_impossible']
                        
                        if not is_impossible:
                            answer = qa['answers'][0]
                            original_answer = answer['text']
                            answer_start = answer['answer_start']
                            
                            answer_length = len(original_answer)
                            start_pos = char_to_word_offset[answer_start]
                            end_pos = char_to_word_offset[answer_start + answer_length - 1]

                            answer_end = answer_start + len(original_answer)
                        else:
                            original_answer = ''
                            start_pos = 1
                            end_pos = -1

                        example = SquadExample(
                            qid=qa['id'],
                            context=doc_tokens,
                            question=qa['question'],
                            answer=original_answer,
                            start=start_pos,
                            end=end_pos,
                            is_impossible=is_impossible)
                        self.examples.append(example)
            print('examples: {}'.format(len(self.examples)))

            self.features = convert_examples_to_features(
                examples=self.examples,
                tokenizer=tokenizer,
                max_seq_length=384,
                doc_stride=128,
                max_query_length=64,
                is_training=True if not is_inference else False)
            print('is_training: {}'.format(True if not is_inference else False))

            # torch.save(self.examples, cached_examples_file)
            # torch.save(self.features, cached_features_file)

        '''
        # Convert to Tensors and build dataset
        all_input_ids = torch.tensor([f.input_ids for f in self.features], dtype=torch.long)
        all_input_mask = torch.tensor([f.input_mask for f in self.features], dtype=torch.long)
        all_segment_ids = torch.tensor([f.segment_ids for f in self.features], dtype=torch.long)
        all_cls_index = torch.tensor([f.cls_index for f in self.features], dtype=torch.long)
        all_p_mask = torch.tensor([f.p_mask for f in self.features], dtype=torch.float)
        if is_train:
            all_start_positions = torch.tensor([f.start_position for f in self.features], dtype=torch.long)
            all_end_positions = torch.tensor([f.end_position for f in self.features], dtype=torch.long)
            dataset = TensorDataset(all_input_ids, all_input_mask, all_segment_ids,
                                    all_start_positions, all_end_positions,
                                    all_cls_index, all_p_mask)
        else:
            all_example_index = torch.arange(all_input_ids.size(0), dtype=torch.long)
            dataset = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_example_index, all_cls_index, all_p_mask)
        return dataset
        '''


    def __len__(self):
        return len(self.features)

    def __getitem__(self, idx):
        return self.features[idx]

### 실습 4.3 - SQuAD DataLoader 생성 (from raw data to tokenized version)

In [3]:
import torch
from torch.utils.data import DataLoader

class SquadDataLoader(DataLoader):
    def __init__(self, dataset, batch_size, is_inference=False, shuffle=True):
        '''
        dataset: SquadDataset으로 정의한 데이터셋 객체
        batch_size: 배치 사이즈
        is_inference: SquadDataLoader를 인퍼런스 목적으로 사용할 경우 True, 그렇지 않으면 False
        shuffle: 데이터의 순서를 섞을 경우 True, 그렇지 않으면 False
        '''
        self.is_inference = is_inference
        super().__init__(dataset, collate_fn=self.squad_collate_fn, batch_size=batch_size, shuffle=shuffle)
        
    def squad_collate_fn(self, features):
        all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)
        all_input_mask = torch.tensor([f.input_mask for f in features], dtype=torch.long)
        all_segment_ids = torch.tensor([f.segment_ids for f in features], dtype=torch.long)
        all_cls_index = torch.tensor([f.cls_index for f in features], dtype=torch.long)
        all_p_mask = torch.tensor([f.p_mask for f in features], dtype=torch.float)

        # return 6 tensors
        if self.is_inference:
            all_example_index = torch.arange(all_input_ids.size(0), dtype=torch.long)
            return all_input_ids, all_input_mask, all_segment_ids, all_cls_index, all_p_mask, all_example_index
        # return 7 tensors
        else:
            all_start_positions = torch.tensor([f.start_position for f in features], dtype=torch.long)
            all_end_positions = torch.tensor([f.end_position for f in features], dtype=torch.long)
            return all_input_ids, all_input_mask, all_segment_ids, all_cls_index, all_p_mask, all_start_positions, all_end_positions

### 실습 4.4 Load Dataset

In [4]:
import torch
from tqdm import tqdm, trange
from torch import nn
from torch.utils.data import DataLoader
from transformers import BertTokenizer

path = "/kaggle/input/2024-1-nlp-4/"

print("Tokenizer Loading")
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

#print("Dataset Loading")
#train_dataset = SquadDataset(path, tokenizer, is_train=True) # 153,000

#print("Data Loader")
#train_dataloader = SquadDataLoader(train_dataset, batch_size=32, is_inference=False, shuffle=True)

Tokenizer Loading


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]



config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

### 실습 4.5 - Load Pre-trained BERT
### 과제 4.1 - BERT for Question Answering 모델 이해하고 설명하기 / Tokenizer 변경해보기

#### BERT for Question Answering 참고
#### https://huggingface.co/docs/transformers/v4.41.0/en/model_doc/bert#transformers.BertForQuestionAnswering

#### BERT Tokenizer 참고
#### https://huggingface.co/docs/transformers/v4.41.0/en/model_doc/bert#transformers.BertTokenizer

In [6]:
# pytoch model import from huggingface
from transformers import BertTokenizer, BertForQuestionAnswering, AdamW

# GPU 이용 방법 - Notebook Option - Session Options - ACCELRATOR 설정 (GPU P100)
# .cuda() 옵션을 제거하면 cpu에서도 학습 가능
model = BertForQuestionAnswering.from_pretrained('bert-base-uncased').cuda()

model.train()

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForQuestionAnswering(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, 

### 실습 4.6 - Fine-tuning with your SQuAD Dataset

In [7]:
# train fucntion
def train(model, dataloader, optimizer):
    tbar = tqdm(dataloader, desc='Training', leave=True)
    
    total_loss = 0.0
    for i, batch in enumerate(tbar):
        optimizer.zero_grad()
        
        # cls_index와 p_mask는 XLNet 모델에 사용되므로 BERT에서는 사용하지 않는다.
        input_ids, input_mask, segment_ids, cls_index, p_mask, start_positions, end_positions = batch
        
        # to cuda (gpu 사용 시)
        input_ids = input_ids.cuda()
        input_mask = input_mask.cuda()
        segment_ids = segment_ids.cuda()
        start_positions = start_positions.cuda()
        end_positions = end_positions.cuda()
        
        # train model
        #out = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        inputs = {
            'input_ids': input_ids,
            'token_type_ids': segment_ids,
            'attention_mask': input_mask,
        }
        out = model(**inputs, start_positions=start_positions, end_positions=end_positions)
        loss = out.loss

        loss.backward()
        optimizer.step()
        
        total_loss += loss.data.item()
        tbar.set_description("Average Loss = {:.4f})".format(total_loss/(i+1)))

In [8]:
"""
Train (Fine-tune) your BERT with SQuAD dataset
"""

optimizer = AdamW(model.parameters(), lr = 2e-5, eps = 1e-8)
loss = nn.CrossEntropyLoss()
n_epoch = 3

# actual training
for i in range(n_epoch):
    train(model, train_dataloader, optimizer)


# save model
# torch.save(model.state_dict(), 'squad_model.bin')

Average Loss = 1.7845): 100%|██████████| 937/937 [19:02<00:00,  1.22s/it]
Average Loss = 0.9764): 100%|██████████| 937/937 [19:03<00:00,  1.22s/it]
Average Loss = 0.6722): 100%|██████████| 937/937 [19:04<00:00,  1.22s/it]


In [9]:
torch.save(model.state_dict(), 'squad_model.bin')

### 과제 4.2 Inference 및 Evaluate

- 파인튜닝을 마치고 dev-v2.json 파일을 불러와 Inference를 위한 코드를 실행한다.
- 예측한 span과 정답 span을 비교해본다.
- F1을 이용하여 dev-v2.json의 샘플 1000개를 대상으로 예측한 span과 정답 span을 평가하는 코드를 작성한다.

아래 평가용 코드 참고

- https://github.com/jinkilee/hello-transformer/blob/master/research/chapter4/squad/run_evaluate.py
- https://github.com/jinkilee/hello-transformer/blob/master/research/chapter4/squad/evaluate.py

In [5]:
valid_dataset = SquadDataset(path, tokenizer, is_train=False) # 11,873
valid_dataloader = SquadDataLoader(valid_dataset, batch_size=32, is_inference=False, shuffle=True)

cache file does not exist
examples: 11873


100%|██████████| 11873/11873 [02:18<00:00, 85.55it/s] 

is_training: True





In [6]:
from transformers import BertTokenizer, BertForQuestionAnswering
model = BertForQuestionAnswering.from_pretrained('bert-base-uncased').cuda()
model.load_state_dict(torch.load('/kaggle/input/squad_model/pytorch/squad/1/squad_model.bin'))

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


<All keys matched successfully>

In [None]:
device = torch.device("cuda")
batch_size = 32

In [8]:
# 학습된 모델이 예측한 결과와 주어진 validation 데이터셋과 비교해본다.
import pandas as pd
def inference(model, tokenizer):
  answer_sheet = []
  num_batches = 3
  for i in range(num_batches):
    all_input_ids, all_input_mask, all_segment_ids, all_cls_index, all_p_mask, all_start_positions, all_end_positions = next(iter(valid_dataloader))
    for j in range(batch_size):
        input_ids = all_input_ids[j].unsqueeze(0).to(device)
        token_type_ids = all_segment_ids[j].unsqueeze(0).to(device)
        attention_mask = all_input_mask[j].unsqueeze(0).to(device)
        output = model(input_ids=input_ids, token_type_ids=token_type_ids, attention_mask=attention_mask)
        predict_start_index = output.start_logits.argmax()
        predict_end_index = output.end_logits.argmax()
        predict_answer_tokens = all_input_ids[j][predict_start_index:predict_end_index+1]
        pred = tokenizer.decode(predict_answer_tokens, skip_special_tokens=True)
        
        answ = tokenizer.decode(all_input_ids[j][all_start_positions[j]:all_end_positions[j]+1])
    
        answer_sheet.append([answ, pred])
  
  df = pd.DataFrame(answer_sheet, columns=['Answer', 'Prediction'])
  df.to_csv('/kaggle/working/answer.csv', index=False)
  print(df)
  return


In [None]:
inference(model, tokenizer)

In [None]:
# Fine-tuned된 데이터셋을 평가한다.

def evaluate(model, tokenizer):
"""
Write your code here
"""

    
def main():
    # 모델 정의
    model = BertForQuestionAnswering.from_pretrained("bert-base-uncased", num_labels = 2).to(device)
    #model.load_state_dict(torch.load('models/squad_model.bin'))
    model.eval()

    model.to(args.device)
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)
    evaluate(model, tokenizer)