In [1]:
from dp import is_running_on_ipython, KoMRC, Preprocessor, Postprocessor
if is_running_on_ipython():
    from tqdm.notebook import tqdm
else:
    from tqdm import tqdm

from transformers import (
    ElectraForQuestionAnswering,
    ElectraTokenizerFast
)
import torch

import numpy as np
import pandas as pd
import random

# Optional (Google Colab only): mount Google Drive so checkpoints can be read from it.
# from google.colab import drive
# drive.mount('/content/drive')
import os
from os.path import join
# Checkpoint directory -- adjust this to match your environment.
# Example for a Google Drive folder:
# ckpt_dir = join('/', 'content', 'drive', 'My Drive', 'goorm K-Digital', '자연어처리', 'Project2')
ckpt_dir = 'ckpt'

# Seed every RNG used in this notebook (NumPy, Python, PyTorch) with the same value.
seed = 42
np.random.seed(seed)
random.seed(seed)
# Kept as the last expression: the cell echoes the torch Generator it returns.
torch.manual_seed(seed)

<torch._C.Generator at 0x1f9db58d8f0>

# Load the tokenizer and the test dataset

In [2]:
# Directory that contains the dataset files.
data_dir = 'data'

# Tokenizer shared by the pre-/post-processing helpers and the inference loop.
tokenizer = ElectraTokenizerFast.from_pretrained(
    'monologg/koelectra-base-v3-finetuned-korquad'
)

# NOTE(review): the positional arguments look like (max length, stride, padding side),
# matching the values used in the tokenizer call at inference time -- confirm
# against dp.Preprocessor.
preprocessor = Preprocessor(tokenizer, 512, 128, 'right')
postprocessor = Postprocessor(tokenizer)

# Read the test set from <data_dir>/test.json.
test_json_path = join(data_dir, 'test.json')
test_dataset = KoMRC.load(test_json_path)

# Load the model checkpoint

In [7]:
# Restore the QA model from the 'KJY' checkpoint inside ckpt_dir and move it to the GPU.
checkpoint_path = join(ckpt_dir, 'KJY')
model = ElectraForQuestionAnswering.from_pretrained(checkpoint_path)
model = model.cuda()
# Switch to evaluation mode; as the last expression this also echoes the model summary.
model.eval()

ElectraForQuestionAnswering(
  (electra): ElectraModel(
    (embeddings): ElectraEmbeddings(
      (word_embeddings): Embedding(35000, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): ElectraEncoder(
      (layer): ModuleList(
        (0): ElectraLayer(
          (attention): ElectraAttention(
            (self): ElectraSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): ElectraSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768

In [9]:
# Run the model over the whole test set and write one predicted answer per
# question to a CSV file with the columns ID and Predicted.
output_dir = 'output'
output_path = join(output_dir, 'KJY.csv')
os.makedirs(output_dir, exist_ok=True)

verbose = False
# Build the inputs on the device the model actually lives on instead of
# assuming "cuda", so this cell keeps working wherever the model was placed.
device = model.device

rows = []
with torch.no_grad():
    for data in tqdm(test_dataset, "Testing"):
        if verbose: print(f'질문: {data["question"]}')

        # Long contexts are split into overlapping windows (only the context is
        # truncated); every window is padded to max_length so that they can be
        # stacked into a single batch tensor.
        tokenized_example = tokenizer(
            data['question'],
            data['context'],
            truncation="only_second",
            max_length=512,
            stride=128,
            return_overflowing_tokens=True,
            return_offsets_mapping=True,
            padding='max_length',
        )
        input_ids = torch.tensor(tokenized_example["input_ids"], dtype=torch.long, device=device)
        token_type_ids = torch.tensor(tokenized_example["token_type_ids"], dtype=torch.long, device=device)
        offset_mapping = tokenized_example['offset_mapping']

        # NOTE(review): no attention_mask is passed although the windows are padded
        # to max_length, so padding positions are attended to. Passing
        # tokenized_example['attention_mask'] would mask them, but that may change
        # the predictions of this checkpoint -- confirm how it was trained first.
        start_logits, end_logits = model(
            input_ids=input_ids,
            token_type_ids=token_type_ids,
            return_dict=False,
        )
        start_logits = start_logits.cpu()
        end_logits = end_logits.cpu()

        # max_len=24 is forwarded to the postprocessor (presumably the maximum
        # answer length -- confirm against dp.Postprocessor.eval).
        answer = postprocessor.eval(input_ids, start_logits, end_logits, data['context'], offset_mapping, max_len=24, verbose=verbose)
        if verbose: print(answer, '\n')

        rows.append([data['guid'], answer])

# Building and saving the table needs no autograd context, so it happens
# after the no_grad block.
df = pd.DataFrame(rows, columns=['ID', 'Predicted'])
df.to_csv(output_path, index=False, sep=',', encoding='utf-8-sig')

Testing:   0%|          | 0/4008 [00:00<?, ?it/s]