In [None]:
!pip install transformers

Collecting transformers
  Downloading transformers-4.17.0-py3-none-any.whl (3.8 MB)
[K     |████████████████████████████████| 3.8 MB 14.7 MB/s 
Collecting sacremoses
  Downloading sacremoses-0.0.49-py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 75.2 MB/s 
Collecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 65.6 MB/s 
[?25hCollecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.4.0-py3-none-any.whl (67 kB)
[K     |████████████████████████████████| 67 kB 6.5 MB/s 
Collecting tokenizers!=0.11.3,>=0.11.1
  Downloading tokenizers-0.11.6-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.5 MB)
[K     |████████████████████████████████| 6.5 MB 78.4 MB/s 
Installing collected packages: pyyaml, tokenizers, sacremoses, huggingface-hub, transformers
  Attempting uninstall: pyyaml
    Found e

In [112]:
import os
import json
import random
import numpy as np
import pandas as pd

from tqdm import tqdm

import torch
from torch.optim import Adam, AdamW
from torch.utils.data import TensorDataset, RandomSampler, DataLoader
from transformers import BertTokenizer, BertTokenizerFast, BertForQuestionAnswering
from transformers import get_linear_schedule_with_warmup
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import  f1_score

import gc

from google.colab import drive
# Mount Google Drive at /content/gdrive (Colab only). force_remount=True
# re-mounts even when a mount from an earlier run is still present.
drive.mount('/content/gdrive', force_remount=True)

Mounted at /content/gdrive


In [None]:
# Work from the project folder on the mounted drive: later cells use the
# relative paths datas/, model/ and sample/.
%cd /content/gdrive/MyDrive/goorm/02.qa

/content/gdrive/MyDrive/goorm/02.qa


In [None]:
# Fix every random number generator used in this notebook so runs are repeatable.
seed_val = 42
for _seed_fn in (random.seed,
                 np.random.seed,
                 torch.manual_seed,
                 torch.cuda.manual_seed_all):
    _seed_fn(seed_val)

# Data Load

In [113]:
# Load the provided (competition) dataset
def data_load(path, test = False):
    """Read a SQuAD-style JSON file from ``datas/`` into a DataFrame.

    Every question in the file becomes one row. For labeled data only the
    first entry of ``answers`` is used, and the end offset is computed as
    ``answer_start + len(text)`` (exclusive character offset).

    Args:
        path: File name relative to the ``datas/`` directory.
        test: Pass True for unlabeled data; only contexts and questions are
            collected and no ``answers`` field is read.

    Returns:
        A ``(df, guids)`` tuple. ``df`` has the columns ``contexts`` and
        ``questions``, plus ``answers``, ``start_ids`` and ``end_ids`` when
        ``test`` is false. ``guids`` lists each question's ``guid`` in row
        order. A file without any questions yields an empty DataFrame that
        still carries the expected columns.
    """
    with open('datas/'+path, 'rb') as f:
        squad_dict = json.load(f)

    contexts = []
    questions = []
    answers = []
    guids = []
    start_ids = []
    end_ids = []

    for datas in squad_dict['data']:
        for paragraphs in datas['paragraphs']:
            context = paragraphs['context']
            for qas in paragraphs['qas']:
                contexts.append(context)
                questions.append(qas['question'])
                guids.append(qas['guid'])

                if not test:
                    answer = qas['answers'][0]
                    start_index = answer['answer_start']

                    answers.append(answer['text'])
                    start_ids.append(start_index)
                    end_ids.append(start_index + len(answer['text']))

    # Build the column mapping once, after the loops, so it exists even when
    # the file contains no questions at all.
    if test:
        data = {'contexts' : contexts, 'questions' : questions}
    else:
        data = {'contexts' : contexts, 'questions' : questions, 'answers' : answers, 'start_ids': start_ids,'end_ids': end_ids}

    return pd.DataFrame(data, columns=list(data)), guids

In [125]:
# Load the AI Hub dataset
def aihub_data_load(path):
    """Flatten a SQuAD-style AI Hub JSON file from ``datas/`` into a DataFrame.

    One row is produced per question, using only the first listed answer.
    ``end_ids`` is ``answer_start + len(answer text)``.

    Args:
        path: File name relative to the ``datas/`` directory.

    Returns:
        DataFrame with the columns ``contexts``, ``questions``, ``answers``,
        ``start_ids`` and ``end_ids``.
    """
    with open('datas/'+path, 'rb') as f:
        squad_dict = json.load(f)

    column_names = ('contexts', 'questions', 'answers', 'start_ids', 'end_ids')
    rows = []

    for article in squad_dict['data']:
        for paragraph in article['paragraphs']:
            passage = paragraph['context']
            for qa in paragraph['qas']:
                question_text = qa['question']
                first_answer = qa['answers'][0]
                begin = first_answer['answer_start']
                finish = begin + len(first_answer['text'])
                rows.append((passage, question_text, first_answer['text'], begin, finish))

    # Transpose the row tuples back into one list per column.
    data = {name: [row[position] for row in rows]
            for position, name in enumerate(column_names)}

    return pd.DataFrame(data, columns=data.keys())

In [None]:
# Hold out 10% of the AI Hub data for validation; the rest is the training set.
# random_state makes the split identical every time this cell is re-run.
train_df = aihub_data_load('ko_nia_normal_squad_all.json')
valid = train_df.sample(frac = 0.1, random_state = seed_val)

train_df = train_df.drop(valid.index)

# Validation set = the provided train.json plus the AI Hub hold-out.
valid_df, valid_guid = data_load('train.json')
# pd.concat replaces DataFrame.append (deprecated in pandas 1.4, removed in 2.0).
# ignore_index=True gives the result unique 0..n-1 labels; without it both
# frames keep their own labels and the positional indices returned by
# qa_preprocess would make a later valid_df.drop(...) remove the wrong rows.
valid_df = pd.concat([valid_df, valid], ignore_index=True)

# Tokenizer를 이용한 데이터 전처리

In [117]:
# Run on the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Tokenizer and QA model both come from the same pretrained checkpoint.
path = 'klue/bert-base'
tokenizer = BertTokenizerFast.from_pretrained(path)
model = BertForQuestionAnswering.from_pretrained(path).to(device)

Some weights of the model checkpoint at klue/bert-base were not used when initializing BertForQuestionAnswering: ['cls.predictions.bias', 'cls.predictions.decoder.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForQuestionAnswering were not initialized from the model chec

In [None]:
# Tokenize a DataFrame and wrap it in a DataLoader for train / valid / test use
def qa_preprocess(df, batch_size=16, method='train'):
    """Tokenize (context, question) pairs and build a DataLoader.

    Uses the module-level ``tokenizer``. Inputs are truncated to the
    tokenizer's maximum length and padded to the longest sequence in ``df``.

    For ``'train'`` and ``'valid'`` the character offsets in ``start_ids`` /
    ``end_ids`` are converted to token positions with ``char_to_token``.
    Rows for which either position cannot be mapped (``char_to_token``
    returns None, e.g. when the answer was truncated away) are dropped.

    Args:
        df: DataFrame with ``contexts`` and ``questions`` columns, plus
            ``start_ids`` and ``end_ids`` for ``'train'`` / ``'valid'``.
        batch_size: Batch size of the returned DataLoader.
        method: ``'train'`` (shuffled), ``'valid'`` or ``'test'`` (in order).

    Returns:
        ``(dataloader, deleting_list)`` for ``'train'`` / ``'valid'``, where
        ``deleting_list`` holds the positional row numbers of ``df`` that were
        dropped; batches are ``(input_ids, attention_mask, token_type_ids,
        start_positions, end_positions)``.
        ``dataloader`` alone for ``'test'``; batches are
        ``(input_ids, attention_mask, token_type_ids)``.

    Raises:
        ValueError: If ``method`` is not one of the three supported values.
    """
    if method not in ('train', 'valid', 'test'):
        raise ValueError(f"method must be 'train', 'valid' or 'test', got {method!r}")

    batch_input = tokenizer(df['contexts'].tolist(), df['questions'].tolist(), truncation=True, padding=True)

    if method == 'test':
        batch_input = {key : torch.tensor(value) for key, value in batch_input.items()}

        dataset = TensorDataset(batch_input['input_ids'], batch_input['attention_mask'], batch_input['token_type_ids'])
        dataloader = DataLoader(dataset, batch_size=batch_size)
        return dataloader

    start_chars = df['start_ids'].tolist()
    end_chars = df['end_ids'].tolist()

    # Map character offsets to token positions. end_ids is exclusive, so the
    # last answer character sits at end - 1.
    start_positions = [batch_input.char_to_token(i, char) for i, char in enumerate(start_chars)]
    end_positions = [batch_input.char_to_token(i, char - 1) for i, char in enumerate(end_chars)]

    # Drop a row when EITHER boundary could not be mapped; keeping a row with
    # an unmapped start would feed a meaningless label to the model.
    deleting_list = [i for i, (start, end) in enumerate(zip(start_positions, end_positions))
                     if start is None or end is None]
    dropped = set(deleting_list)  # O(1) membership checks while filtering

    features = dict(batch_input.items())
    features['start_positions'] = start_positions
    features['end_positions'] = end_positions
    features = {key : torch.tensor([v for i, v in enumerate(value) if i not in dropped], dtype=torch.long)
                for key, value in features.items()}

    dataset = TensorDataset(features['input_ids'],
                            features['attention_mask'],
                            features['token_type_ids'],
                            features['start_positions'],
                            features['end_positions'])

    if method == 'train':
        # Honour the requested batch size (it used to be fixed at 16 here).
        dataloader = DataLoader(dataset, sampler=RandomSampler(dataset), batch_size=batch_size)
    else:
        dataloader = DataLoader(dataset, batch_size=batch_size)

    return dataloader, deleting_list

In [None]:
# Build the loaders; each *_delete list holds the positional row numbers that
# qa_preprocess dropped from the corresponding DataFrame.
train_dataloader, train_delete = qa_preprocess(train_df, method='train', batch_size=32)
valid_dataloader, valid_delete = qa_preprocess(valid_df, method='valid', batch_size=32)

# 모델 학습

In [None]:
# In-house evaluation metric based on edit distance
def edit_distance(s:str, t: str):
    """Return the Levenshtein distance between ``s`` and ``t``.

    Insertions, deletions and substitutions each cost 1. Only two rows of the
    dynamic-programming table are kept at a time.
    """
    # Row 0: turning the empty prefix of ``t`` into each prefix of ``s``.
    previous = list(range(len(s) + 1))

    for row, t_char in enumerate(t, start=1):
        current = [row]
        for col, s_char in enumerate(s, start=1):
            substitute = previous[col - 1] + (1 if s_char != t_char else 0)
            current.append(min(current[col - 1] + 1, previous[col] + 1, substitute))
        previous = current

    return previous[-1]

In [None]:
# Clean up memory before training: collect unreachable Python objects first,
# then release PyTorch's cached CUDA memory.
gc.collect()
torch.cuda.empty_cache()

In [None]:
epochs = 4
path = 'qa'  # sub-directory of model/ where the checkpoint is written

optimizer = AdamW(model.parameters() , lr=1e-5, eps=1e-8)

total_steps = len(train_dataloader) * epochs

# Linear decay of the learning rate to zero over all training steps, no warmup.
scheduler = get_linear_schedule_with_warmup(optimizer,
                                            num_warmup_steps = 0,
                                            num_training_steps = total_steps)


# Per-epoch history of the training loss and both validation metrics.
score = {'loss' : [],
         'f1' : [],
         'edit_score' : []}

# Gold answers aligned with valid_dataloader. valid_delete holds POSITIONAL row
# numbers, so the rows are removed by position rather than by index label;
# this stays correct even if valid_df's index is not 0..n-1.
valid_keep = np.ones(len(valid_df), dtype=bool)
valid_keep[np.asarray(valid_delete, dtype=int)] = False
valid_answers = valid_df['answers'].to_numpy()[valid_keep].tolist()

# Strings stripped from decoded predictions before scoring, applied in this order.
strip_tokens = [tokenizer.unk_token, tokenizer.pad_token, tokenizer.cls_token, tokenizer.sep_token, '##']

for epoch in range(epochs):
    # ==================================================================
    #                            model train
    # ==================================================================
    model.train()

    train_loss = 0.0

    for batchs in tqdm(train_dataloader):
        batch = tuple(b.to(device) for b in batchs)

        inputs = {
                "input_ids": batch[0],
                "attention_mask": batch[1],
                "token_type_ids": batch[2],
                "start_positions": batch[3],
                "end_positions": batch[4],
            }

        optimizer.zero_grad()

        outputs = model(**inputs)

        # With start/end positions supplied, the first output is the loss.
        loss = outputs[0]

        loss.backward()

        # Clip the gradient norm to 1.0 before the optimizer step.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        optimizer.step()
        scheduler.step()

        train_loss += loss.item()

    avg_train_loss = train_loss / len(train_dataloader)
    score['loss'].append(avg_train_loss)

    print(f'Train Loss : {avg_train_loss}')

    # ==================================================================
    #                            model evaluation
    # ==================================================================
    model.eval()

    pred_answers = []

    for batchs in tqdm(valid_dataloader):
        batch = tuple(b.to(device) for b in batchs)

        # Labels are not needed for prediction, so only the inputs are passed.
        with torch.no_grad():
            outputs = model(input_ids=batch[0],
                            attention_mask=batch[1],
                            token_type_ids=batch[2])

        # Most likely start / end token per example.
        start_preds = outputs['start_logits'].argmax(dim=-1).tolist()
        end_preds = outputs['end_logits'].argmax(dim=-1).tolist()

        # Decode the predicted span; end < start yields an empty answer.
        for token_ids, s, e in zip(batch[0].tolist(), start_preds, end_preds):
            pred_answers.append(tokenizer.decode(token_ids[s:e+1]))

    # Remove special tokens and word-piece markers from the decoded text.
    for token in strip_tokens:
        pred_answers = [p.replace(token, '') for p in pred_answers]

    # Both metrics use the same cleaned predictions. With string labels,
    # micro-averaged F1 amounts to the exact-match rate.
    f1 = f1_score(valid_answers, pred_answers, average='micro')

    edit_score = [edit_distance(i,j) for i,j in zip(valid_answers, pred_answers)]
    edit_score = sum(edit_score)/len(edit_score)

    score['f1'].append(f1)
    score['edit_score'].append(edit_score)

    print('f1 score : ', f1)
    print('edit_distance : ', edit_score)

    # Save the model and tokenizer; each epoch overwrites the previous checkpoint.
    model.save_pretrained(f'model/{path}')
    tokenizer.save_pretrained(f'model/{path}')

----

# TEST

In [None]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Fine-tuned checkpoint directory used for inference.
# NOTE(review): the training cell saves to model/qa; this path is presumably a
# renamed copy of that checkpoint. Confirm before running.
path = 'model/klue_QA_bais_batchsize_32_lr_1e-5_epoch4'

# The tokenizer is loaded first because qa_preprocess reads the global `tokenizer`.
tokenizer = BertTokenizerFast.from_pretrained(path)
model = BertForQuestionAnswering.from_pretrained(path).to(device)

test_df, test_guid = data_load('test.json', test=True)
test_dataloader = qa_preprocess(test_df, method='test')

In [None]:
# Predict an answer span for every test question.
pred_answers = []

for batchs in tqdm(test_dataloader):
    batch = tuple(b.to(device) for b in batchs)

    inputs = {
            "input_ids": batch[0],
            "attention_mask": batch[1],
            "token_type_ids": batch[2]
        }

    with torch.no_grad():
        outputs = model(**inputs)

    # Most likely start / end token per example.
    start_preds = outputs['start_logits'].argmax(dim=-1).tolist()
    end_preds = outputs['end_logits'].argmax(dim=-1).tolist()

    # Decode the predicted span; end < start yields an empty answer.
    for token_ids, s, e in zip(inputs['input_ids'].tolist(), start_preds, end_preds):
        pred_answers.append(tokenizer.decode(token_ids[s:e+1]))

# Post-processing: remove special tokens and '##' word-piece markers.
# NOTE: an earlier comment here said answers longer than 20 characters are
# blanked out; no such filtering is performed by this code.
for token in (tokenizer.unk_token, tokenizer.pad_token, tokenizer.cls_token, tokenizer.sep_token, '##'):
    pred_answers = [p.replace(token, '') for p in pred_answers]

# Submission table: one row per test question, in file order.
sample = pd.DataFrame({'id': test_guid, 'Predicted': pred_answers})

100%|██████████| 251/251 [00:40<00:00,  6.20it/s]


In [None]:
path = 'sample.csv'
# to_csv does not create missing directories, so make sure sample/ exists.
os.makedirs('sample', exist_ok=True)
sample.to_csv(f'sample/{path}', index = False)