In [1]:
! pip install transformers wandb
! git clone https://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@github.com/Koowater/goorm-Magicians.git
%cd goorm-Magicians
%ls -a
! wandb login

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
fatal: destination path 'goorm-Magicians' already exists and is not an empty directory.
/content/goorm-Magicians
[0m[01;34m.[0m/                   [01;34mgoorm-Magicians[0m/
[01;34m..[0m/                  koelectra_v3_baseline.py
[01;34mdata[0m/                koelectra_v3_baseline_stride.ipynb
dp.py                koelectra_v3_baseline_truncation.ipynb
edit_distance.ipynb  KorQuAD_v1_dataloader.ipynb
[01;34mexamples[0m/            [01;34m__pycache__[0m/
[01;34m.git[0m/                README.md
[34m[1mwandb[0m: Currently logged in as: [33mkoowater[0m ([33mteam_koowater[0m). Use [1m`wandb login --relogin`[0m to force relogin


In [2]:
! nvidia-smi

Wed Jul 27 05:55:48 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.32.03    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla V100-SXM2...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   40C    P0    25W / 300W |      0MiB / 16160MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [3]:
from dp import is_running_on_ipython, KoMRC, Preprocessor, Postprocessor, collator
if is_running_on_ipython():
    from tqdm.notebook import tqdm
else:
    from tqdm import tqdm

from transformers import (
    ElectraForQuestionAnswering,
    ElectraTokenizerFast,
    get_linear_schedule_with_warmup,
    AdamW
)
import torch
from torch.utils.data import DataLoader
import torch.nn.functional as F
from torch.nn.utils import clip_grad_norm_
from statistics import mean

import wandb
import numpy as np
import pandas as pd
import csv
import random

# Google drive mount
import os
from os.path import join
from google.colab import drive
drive.mount('/content/drive')
# change for your path
ckpt_dir = join('/', 'content', 'drive', 'My Drive', 'goorm K-Digital', '자연어처리', 'Project2')

# seed
seed = 42
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


<torch._C.Generator at 0x7f61c6433470>

In [4]:
# training_name = f'LR:{config_defaults}, SD:{apply_scheduler}, WUS:{warmup_steps}, WD:{weight_decay}, BS:{train_batch_size}/{accumulation}, DA:{data_augmentation}'
base_dir = 'data'
dataset = KoMRC.load(join(base_dir, 'train.json'))
train_dataset, dev_dataset = dataset.split(dataset, eval_ratio=0.2)

# Sweep

In [5]:
sweep_config = {
    'method': 'grid',
    'metric': {
      'name': 'levenshtein_distance',
      'goal': 'minimize'   
    },
    'parameters': {
        'initial_LR': {
            'values': [1e-5, 3e-5, 5e-5, 1e-4]
        },
        'weight_decay': {
            'values': [0.1, 0.01, 0.001]
        },
        'warmup_ratio': {
            'values': [0.5, 1.0]
        },
        'doc_stride': {
            'values': [128, 64]
        }
    }
}


# Training

In [None]:
sweep_id = wandb.sweep(sweep_config, entity='team_koowater', project='KoMRC_koowater')
def train():
    config_defaults = {
        'model': 'monologg/koelectra-base-v3-finetuned-korquad',
        'initial_LR': 5e-5,
        'optimizer': 'AdanW',
        'weight_decay': 0.01,
        'scheduler': True,
        'warmup_ratio': 0.2,
        'data_augmentation': True,
        'train_epoch': 1,
        'train_batch_size': 64,
        'val_batch_size': 64,
        'accumulation': 4,
        'max_len': 512,
        'doc_stride': 128,
        'loss_fn': 'CrossEntropyLoss'
    }
    wandb.init(
        name=f'LR WD WR DS',
        group='sweep',
        config=config_defaults
    )
    config = wandb.config
    accumulation = config.accumulation
    loss_fn = torch.nn.CrossEntropyLoss()
    ckpt_name = f'LR{config.initial_LR}_WD{config.weight_decay}_WR{config.warmup_ratio}_DS{config.doc_stride}'
    
    # HP 4. Doc_stride
    tokenizer = ElectraTokenizerFast.from_pretrained(config.model)
    preprocessor = Preprocessor(tokenizer, config.max_len, config.doc_stride, 'right')

    preprocessor.load_dataset(train_dataset)
    train_examples = preprocessor.tokenize()
    preprocessor.load_dataset(dev_dataset)
    dev_examples = preprocessor.tokenize()

    postprocessor = Postprocessor(tokenizer)

    train_loader = DataLoader(train_examples, batch_size=config.train_batch_size//accumulation, shuffle=True, collate_fn=collator, num_workers=2)
    dev_loader = DataLoader(dev_examples, batch_size=config.train_batch_size//accumulation, shuffle=False, collate_fn=collator, num_workers=2)

    train_iter = len(train_loader)
    total_training_steps = config.train_epoch * np.ceil(len(train_loader) / accumulation)

    # Model 
    model = ElectraForQuestionAnswering.from_pretrained(config.model)
    model.cuda()
    wandb.watch(model, loss_fn, log='all', log_freq=int(len(train_loader) / 10))

    # HP 1. Initial LR
    # HP 2. Weight decay
    optimizer = AdamW(model.parameters(), 
                      lr=config.initial_LR,
                      weight_decay=config.weight_decay) 

    # HP 3. Warmup ratio
    #       warmup_ratio를 조절하여 warmup_steps를 정해주세요.
    #       Ex) warmup_ratio = 0.2라면 0.2 epoch까지 warmup 진행
    warmup_ratio = config.warmup_ratio
    warmup_steps = np.ceil((train_iter * warmup_ratio) / accumulation)
    apply_scheduler = config.scheduler
    if apply_scheduler:
        scheduler = get_linear_schedule_with_warmup(optimizer=optimizer,
                                                    num_training_steps=total_training_steps,
                                                    num_warmup_steps=warmup_steps)
        
    os.makedirs(ckpt_dir, exist_ok=True)
    train_losses = []
    total_distance = {
        'epoch': [],
        'iter': [],
        'dist': []
    }
    lowest_val_loss = 9999.
    lowest_dist = 9999.

    # Training loop
    for epoch in range(config.train_epoch):
        print("- Epoch", epoch)
        running_loss = []
        losses = []
        progress_bar = tqdm(train_loader, desc='Train')

        for iter, batch in enumerate(progress_bar):
            del batch['guid'], batch['offset_mapping'], batch['overflow_to_sample_mapping'] #_#
            batch = {key: value.cuda() for key, value in batch.items()}
            start = batch.pop('start_positions')
            end = batch.pop('end_positions')
            
            start_logits, end_logits = model(**batch, return_dict=False)
            loss = loss_fn(start_logits, start) + loss_fn(end_logits, end)
            (loss / accumulation).backward()
            running_loss.append(loss.item())
            del batch, start, end, start_logits, end_logits, loss
            
            is_accumulation = (iter+1) % accumulation

            if is_accumulation:
                pass
            else:
                clip_grad_norm_(model.parameters(), max_norm=1.)
                optimizer.step()
                if apply_scheduler:
                    scheduler.step()
                optimizer.zero_grad(set_to_none=True)

                losses.append(mean(running_loss))
                progress_bar.set_description(f"Train - loss: {losses[-1]:.3f}")

            # Validation
            is_validation = (iter+1) % int(len(train_loader) / 5) == 0
            if is_validation:
                dev_bar = tqdm(dev_loader, desc="Val")
                dev_losses = []
                distances = []
                for batch in dev_bar:
                    del batch['guid'], batch['offset_mapping'], batch['overflow_to_sample_mapping'] #_#
                    batch = {key: value.cuda() for key, value in batch.items()}
                    true_s = batch.pop('start_positions')
                    true_e = batch.pop('end_positions')
                    
                    with torch.no_grad():
                        start_logits, end_logits = model(**batch, return_dict=False)
                    dev_loss = loss_fn(start_logits, true_s) + loss_fn(end_logits, true_e)

                    dev_losses.append(dev_loss.item())

                    pred_s = torch.argmax(start_logits, dim=1)
                    pred_e = torch.argmax(end_logits, dim=1)

                    _, _, dist = postprocessor.postprocess(batch['input_ids'], (pred_s, pred_e), (true_s, true_e), True)

                    dev_bar.set_description(f"Val - loss: {mean(dev_losses):.3f}, dist: {mean(dist):.3f}")
                    distances.append(mean(dist))    
                    
                    del batch, pred_s, pred_e, start_logits, end_logits, true_s, true_e, dev_loss, dist

                # Let's check best ckpt.
                if lowest_val_loss > mean(dev_losses):
                    model.save_pretrained(join(f'{ckpt_dir}', f'{ckpt_name}.bin'))
                    lowest_val_loss = mean(dev_losses)
                    print('Lowest val loss... weights are saved.')
                if lowest_dist > mean(distances):
                    model.save_pretrained(join(f'{ckpt_dir}', f'{ckpt_name}.bin'))
                    lowest_dist = mean(distances)
                    print('Lowest distance... weights are saved.')
                    
                total_distance['epoch'].append(epoch)
                total_distance['iter'].append(iter)
                total_distance['dist'].append(mean(distances))
                best_idx = total_distance['dist'].index(min(total_distance['dist']))
                best_iter = (total_distance['epoch'][best_idx]+1)*10000 + total_distance['iter'][best_idx]
                wandb.log({
                    'loss': mean(running_loss),
                    'val_loss': mean(dev_losses),
                    'levenshtein_distance': mean(distances),
                    'best_distance': total_distance['dist'][best_idx],
                    'best_iter': best_iter,
                    'lr': optimizer.param_groups[0]["lr"],
                    'epoch': round(iter/len(train_loader)+epoch, 4)
                })


            if not is_validation and not is_accumulation:
                wandb.log({
                    'loss': mean(running_loss),
                    'lr': optimizer.param_groups[0]["lr"],
                    'epoch': round(iter/len(train_loader)+epoch, 4)
                })

            if is_accumulation:
                pass
            else:
                running_loss = []
            
            
        train_losses.append(mean(losses))
        print(f"Total train loss: {train_losses[-1]:.3f}\n")

wandb.agent(sweep_id, train)

Create sweep with ID: xx2204qh
Sweep URL: https://wandb.ai/team_koowater/KoMRC_koowater/sweeps/xx2204qh


[34m[1mwandb[0m: Agent Starting Run: m8kh4tsj with config:
[34m[1mwandb[0m: 	doc_stride: 128
[34m[1mwandb[0m: 	learning_rate: 1e-05
[34m[1mwandb[0m: 	warmup_ratio: 0.5
[34m[1mwandb[0m: 	weight_decay: 0.1
[34m[1mwandb[0m: Currently logged in as: [33mkoowater[0m ([33mteam_koowater[0m). Use [1m`wandb login --relogin`[0m to force relogin


Tokenizing...


  0%|          | 0/15203 [00:00<?, ?it/s]

Tokenizing...


  0%|          | 0/3780 [00:00<?, ?it/s]

- Epoch 0




Train:   0%|          | 0/951 [00:00<?, ?it/s]

Val:   0%|          | 0/237 [00:00<?, ?it/s]

Lowest val loss... weights are saved.
Lowest distance... weights are saved.


Val:   0%|          | 0/237 [00:00<?, ?it/s]

Lowest val loss... weights are saved.
Lowest distance... weights are saved.


Val:   0%|          | 0/237 [00:00<?, ?it/s]

Lowest val loss... weights are saved.
Lowest distance... weights are saved.


Val:   0%|          | 0/237 [00:00<?, ?it/s]

Lowest val loss... weights are saved.
Lowest distance... weights are saved.


Val:   0%|          | 0/237 [00:00<?, ?it/s]

Lowest val loss... weights are saved.
Lowest distance... weights are saved.
Total train loss: 5.106



VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
epoch,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
levenshtein_distance,█▇▃▁▁
loss,▆▇█▆▆▆▆▅▇▅▅▅▃▄▃▂▃▃▃▃▃▂▂▃▃▂▃▂▂▃▂▂▂▃▂▂▂▂▂▁
lr,▁▁▂▂▃▃▃▄▄▄▅▅▅▆▆▇▇▇████▇▇▇▆▆▅▅▅▄▄▄▃▃▃▂▂▁▁
val_loss,█▂▁▁▁

0,1
epoch,0.9979
levenshtein_distance,2.49842
loss,1.36394
lr,0.0
val_loss,3.23375


[34m[1mwandb[0m: Agent Starting Run: 1dwfetmt with config:
[34m[1mwandb[0m: 	doc_stride: 128
[34m[1mwandb[0m: 	learning_rate: 1e-05
[34m[1mwandb[0m: 	warmup_ratio: 0.5
[34m[1mwandb[0m: 	weight_decay: 0.01


Tokenizing...


  0%|          | 0/15203 [00:00<?, ?it/s]

Tokenizing...


  0%|          | 0/3780 [00:00<?, ?it/s]

- Epoch 0




Train:   0%|          | 0/951 [00:00<?, ?it/s]

Val:   0%|          | 0/237 [00:00<?, ?it/s]

Lowest val loss... weights are saved.
Lowest distance... weights are saved.


Val:   0%|          | 0/237 [00:00<?, ?it/s]

Lowest val loss... weights are saved.
Lowest distance... weights are saved.


Val:   0%|          | 0/237 [00:00<?, ?it/s]

Lowest val loss... weights are saved.
Lowest distance... weights are saved.


Val:   0%|          | 0/237 [00:00<?, ?it/s]

Lowest val loss... weights are saved.
Lowest distance... weights are saved.


Val:   0%|          | 0/237 [00:00<?, ?it/s]

Lowest val loss... weights are saved.
Total train loss: 5.115



VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
epoch,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
levenshtein_distance,█▇▃▁▁
loss,█▇█▇▆▆▇▅▅▅▆▅▄▄▄▃▃▃▃▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▃▂▃▂▁
lr,▁▁▂▂▃▃▃▄▄▄▅▅▅▆▆▇▇▇████▇▇▇▆▆▅▅▅▄▄▄▃▃▃▂▂▁▁
val_loss,█▂▁▁▁

0,1
epoch,0.9979
levenshtein_distance,2.5058
loss,1.4107
lr,0.0
val_loss,3.22144


[34m[1mwandb[0m: Agent Starting Run: t63e7il5 with config:
[34m[1mwandb[0m: 	doc_stride: 128
[34m[1mwandb[0m: 	learning_rate: 1e-05
[34m[1mwandb[0m: 	warmup_ratio: 0.5
[34m[1mwandb[0m: 	weight_decay: 0.001


Tokenizing...


  0%|          | 0/15203 [00:00<?, ?it/s]

Tokenizing...


  0%|          | 0/3780 [00:00<?, ?it/s]

- Epoch 0




Train:   0%|          | 0/951 [00:00<?, ?it/s]

Val:   0%|          | 0/237 [00:00<?, ?it/s]

Lowest val loss... weights are saved.
Lowest distance... weights are saved.


Val:   0%|          | 0/237 [00:00<?, ?it/s]

Lowest val loss... weights are saved.
Lowest distance... weights are saved.


Val:   0%|          | 0/237 [00:00<?, ?it/s]

Lowest val loss... weights are saved.
Lowest distance... weights are saved.


Val:   0%|          | 0/237 [00:00<?, ?it/s]

Lowest val loss... weights are saved.
Lowest distance... weights are saved.


Val:   0%|          | 0/237 [00:00<?, ?it/s]

Lowest val loss... weights are saved.
Lowest distance... weights are saved.
Total train loss: 5.201



VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
epoch,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
levenshtein_distance,█▇▃▂▁
loss,▆▇▄▇█▄▆▆▄▅▄▄▃▄▃▃▃▃▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▁
lr,▁▁▂▂▃▃▃▄▄▄▅▅▅▆▆▇▇▇████▇▇▇▆▆▅▅▅▄▄▄▃▃▃▂▂▁▁
val_loss,█▂▁▁▁

0,1
epoch,0.9979
levenshtein_distance,2.48523
loss,1.67083
lr,0.0
val_loss,3.23928


[34m[1mwandb[0m: Agent Starting Run: xubfcdft with config:
[34m[1mwandb[0m: 	doc_stride: 128
[34m[1mwandb[0m: 	learning_rate: 1e-05
[34m[1mwandb[0m: 	warmup_ratio: 1
[34m[1mwandb[0m: 	weight_decay: 0.1


Tokenizing...


  0%|          | 0/15203 [00:00<?, ?it/s]

Tokenizing...


  0%|          | 0/3780 [00:00<?, ?it/s]

- Epoch 0




Train:   0%|          | 0/951 [00:00<?, ?it/s]

Val:   0%|          | 0/237 [00:00<?, ?it/s]

# Evaluation

In [8]:
# choose your best saved_parameters
model = ElectraForQuestionAnswering.from_pretrained(join(ckpt_dir, 'model.1'))
model.cuda()
model.eval()

ElectraForQuestionAnswering(
  (electra): ElectraModel(
    (embeddings): ElectraEmbeddings(
      (word_embeddings): Embedding(35000, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): ElectraEncoder(
      (layer): ModuleList(
        (0): ElectraLayer(
          (attention): ElectraAttention(
            (self): ElectraSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): ElectraSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768

In [None]:
# !!!!!!!!!! sweep에 맞게 training loop를 수정하느라 이 코드는 바로 사용 어렵습니다.
#
# (X) dev_loader에 대한 inference result를 직접 확인하고 싶다면 이 셀을 실행하세요.

for idx, batch in zip(range(10), dev_loader):
    print(f'------{idx}------')
    with torch.no_grad():
        start_logits, end_logits = model(input_ids=batch['input_ids'].cuda(), return_dict=False)
    
    start = torch.argmax(start_logits, dim=1)
    end = torch.argmax(end_logits, dim=1)

    dist = []
    result = postprocessor.postprocess(batch['input_ids'], (start, end), (batch['start_positions'], batch['end_positions']), True)
    for r in list(zip(result[0], result[1], result[2])):
        print(r)
        dist.append(r[2])

print(f'Mean Distance: {mean(dist)}')

------0------
('', '서강대 메리홀 대극장', 11)
('', '', 0)
('중앙회의', '저축은행중앙회', 5)
('구문론을 포함한 문법', '구문론', 8)
('1956년', '1956년', 0)
('', '', 0)
('32명', '1부', 3)
('이베이', '이베이', 0)
('103개', '50개', 2)
('애플 아이패드', '‘ 쿠리어 ’', 7)
('스타일태그', '스타일태그', 0)
('', '‘ 은행 계좌이동제 ’', 12)
('', '', 0)
('', '', 0)
('‘ 최종면접에서의 역량 부족 ’', '‘ 최종면접에서의 역량 부족 ’', 0)
('‘ 최종면접에서의 역량 부족 ’', '‘ 최종면접에서의 역량 부족 ’', 0)
('바트러첸코 이반', '바트러첸코 이반', 0)
('', '', 0)
('게라심 콜라파코프스키', '게라심 콜라파코프스키', 0)
('경북대', '경북대', 0)
('금관문화훈장', '은관문화훈장', 1)
('19명', '19명', 0)
('', '', 0)
('1844년 9월', '9월', 6)
('', '', 0)
('2011년', '2011년부터', 2)
('', '', 0)
('', '', 0)
('1년7개월', '1년7개월', 0)
('', '', 0)
('임시저장시설', '', 6)
('', '‘ 중간저장 ’', 8)
('2억달러', '6000만달러', 5)
('10월1일', '10월1일', 0)
('사운드클라우드', '사운드클라우드', 0)
('지방자치단체', '정부', 6)
('윤진숙', '최낙정 해양수산부 장관', 12)
('시몬스', '템퍼', 3)
('', '1사1병영 캠페인', 9)
('', '', 0)
('파란색', '파란색', 0)
('파란색', '파란색', 0)
('', '‘ 골드 올리브 ’', 10)
('', '', 0)
('노원구와 강북구', '강북구', 5)
('', '', 0)
('', '동아시아정상회의 ( EAS )', 16)
('5달러', '5달러', 0)
('

In [7]:
# 이 부분도 sweep와 연동이 되어있지 않습니다. HP를 수동으로 설정해주세요.

tokenizer = ElectraTokenizerFast.from_pretrained('monologg/koelectra-base-v3-finetuned-korquad')
preprocessor = Preprocessor(tokenizer, 512, 128, 'right')
postprocessor = Postprocessor(tokenizer)
test_dataset = KoMRC.load(join(base_dir, 'test.json'))
preprocessor.load_dataset(test_dataset, eval=True)
test_examples = preprocessor.tokenize_eval()

Downloading:   0%|          | 0.00/111 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/257k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/591 [00:00<?, ?B/s]

Tokenizing...


In [16]:
test_dataset = KoMRC.load(join(base_dir, 'test.json'))
output_path = join(base_dir, 'output', 'result.csv')
os.makedirs(join(base_dir, 'output'), exist_ok=True)

with torch.no_grad(), open(output_path, 'w', encoding = 'utf-8-sig') as fd:
    writer = csv.writer(fd)
    writer.writerow(['Id', 'Predicted'])

    rows = []
    for data in tqdm(test_dataset, "Testing"):
        tokenized_example = tokenizer(data['question'], 
                                      data['context'], 
                                      truncation="only_second",
                                      max_length=512,
                                      stride=128, 
                                      return_overflowing_tokens=True,
                                      return_offsets_mapping=True,
                                      padding='max_length'
                                      )
        input_ids, token_type_ids = [
            torch.tensor(tokenized_example[key], dtype=torch.long, device="cuda")
            for key in ("input_ids", "token_type_ids")
        ]

        with torch.no_grad():
            start_logits, end_logits = model(input_ids=input_ids, token_type_ids=token_type_ids, return_dict=False)
    
        start = torch.argmax(start_logits, dim=1)
        end = torch.argmax(end_logits, dim=1)
        answer_vector = list(zip(start, end))
        
        predicts = []
        # 지문이 여러 개로 나뉘어졌다면?
        if len(answer_vector) != 1:
            num_inputs = len(answer_vector)
            # 지문 개수만큼
            for i in range(num_inputs):
                # decode 수행 후 리스트에 보관
                predict = tokenizer.decode(tokenized_example['input_ids'][i][start[i]:end[i]+1])
                predicts.append(predict)
            
            # 여러 개의 지문에 대한 정답 중 
            # 어떤 것이 가장 정답으로 적절한지 판단한다.
            choiced_answer = []
            for i in range(num_inputs):
                # start, end가 0이면 정답이 존재하지 않는다.
                if answer_vector[i][0] == 0 and answer_vector[i][1] == 0:
                    continue
                # start가 end보다 늦게 등장하면 정답이 존재하지 않는다.
                if answer_vector[i][0] > answer_vector[i][1]:
                    continue
                choiced_answer.append([i, answer_vector[i]])
            # 선택된 정답이 2개 이상이라면?
            if len(choiced_answer) > 1:
                # 정답의 길이를 비교한다.
                # 이 부분 코드가 너무 비효율적인 것 같다... 개선이 필요하다.
                answer_len = list(map(lambda x: abs(x[1][0] - x[1][1]), choiced_answer))
                min_idx = answer_len.index(min(answer_len))
                min_idx = choiced_answer[min_idx][0]
                predicts = [predicts[min_idx]]
        else:
            predicts.append(tokenizer.decode(tokenized_example['input_ids'][0][start[0]:end[0]+1]))
        
        rows.append([data['guid'], predicts[0]])
    
    writer.writerows(rows)

Testing:   0%|          | 0/4008 [00:00<?, ?it/s]