In [None]:
! pip install transformers wandb
! git clone https://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@github.com/Koowater/goorm-Magicians.git
%cd goorm-Magicians
%ls -a
! wandb login

In [1]:
from dp import is_running_on_ipython, KoMRC, Preprocessor, collator
if is_running_on_ipython():
    from tqdm.notebook import tqdm
else:
    from tqdm import tqdm

from transformers import (
    ElectraForQuestionAnswering,
    ElectraTokenizerFast,
    get_linear_schedule_with_warmup
)
import torch
from torch.utils.data import DataLoader
import torch.nn.functional as F
from torch.nn.utils import clip_grad_norm_
from statistics import mean

import wandb
import numpy as np
import json
import random
from typing import List, Tuple, Dict, Any

# Google drive mount
import os
from os.path import join
from google.colab import drive
# Mount Google Drive under /content/drive.
drive.mount('/content/drive')

# Checkpoint directory on the mounted drive; change for your path.
ckpt_path_parts = ['/', 'content', 'drive', 'My Drive', 'goorm K-Digital', '자연어처리', 'Project2']
ckpt_dir = join(*ckpt_path_parts)

# Use one seed for the Python, NumPy and PyTorch random number generators.
seed = 42
for seed_rng in (random.seed, np.random.seed):
    seed_rng(seed)
torch.manual_seed(seed)

In [None]:
# Tokenizer loaded from the same checkpoint name that the model cell uses.
tokenizer = ElectraTokenizerFast.from_pretrained("monologg/koelectra-base-v3-finetuned-korquad")

# Load the training file and split it into train / dev parts (eval_ratio=0.2).
base_dir = 'data'
train_json_path = join(base_dir, 'train.json')
dataset = KoMRC.load(train_json_path)
train_dataset, dev_dataset = dataset.split(dataset, eval_ratio=0.2)

# Hyper-parameter
max_length, doc_stride, padding_side = 512, 128, 'right'

# A single preprocessor is reused: load a split, tokenize it, then move on to the next split.
preprocessor = Preprocessor(tokenizer, max_length, doc_stride, padding_side)
tokenized_splits = []
for split in (train_dataset, dev_dataset):
    preprocessor.load_dataset(split)
    tokenized_splits.append(preprocessor.tokenize())
train_examples, dev_examples = tokenized_splits

# Hyper-parameter

In [None]:
# Training hyper-parameters.
learning_rate = 5e-5
weight_decay = 0.01
apply_scheduler = True
train_epoch = 10
train_batch_size = 64
dev_batch_size = 64
accumulation = 4
data_augmentation = False

# Load the question-answering model from the named checkpoint and move it to the GPU.
model_name = 'monologg/koelectra-base-v3-finetuned-korquad'
model = ElectraForQuestionAnswering.from_pretrained(model_name).cuda()

# AdamW over every model parameter.
optimizer = torch.optim.AdamW(model.parameters(), weight_decay=weight_decay, lr=learning_rate)

# Run name summarising the main knobs, plus the configuration recorded with the W&B run.
training_name = f'LR:{learning_rate}, SD:{apply_scheduler}, WD:{weight_decay}, BS:{train_batch_size}/{accumulation}, DA:{data_augmentation}'
run_config = {
    'model': model.__class__.__name__,
    'learning_rate': learning_rate,
    'optimizer': optimizer.__class__.__name__,
    'weight_decay': weight_decay,
    'scheduler': apply_scheduler,
    'data_augmentation': data_augmentation,
    'train_epoch': train_epoch,
    'train_batch_size': train_batch_size,
    'val_batch_size': dev_batch_size,
    'max_length': max_length,
    'doc_stride': doc_stride,
}
wandb.init(entity='team_koowater', project='KoMRC', name=training_name, config=run_config)

# DataLoader

In [None]:
# Build the data loaders and the (optional) learning-rate schedule.
#
# Each training DataLoader batch is one micro-batch; `accumulation` micro-batches are
# accumulated into one optimizer step worth `train_batch_size` examples.
micro_batch_size = train_batch_size // accumulation
train_loader = DataLoader(train_examples, batch_size=micro_batch_size, shuffle=True, collate_fn=collator, num_workers=2)
# Validation uses its own batch size (the one reported to W&B as `val_batch_size`).
dev_loader = DataLoader(dev_examples, batch_size=dev_batch_size, shuffle=False, collate_fn=collator, num_workers=2)

# `len(train_loader)` already counts micro-batches, so the number of optimizer steps per epoch
# is that count divided by `accumulation` only (rounded up to cover a trailing partial group).
steps_per_epoch = (len(train_loader) + accumulation - 1) // accumulation
total_training_steps = train_epoch * steps_per_epoch
warmup_steps = 200

if apply_scheduler:
    scheduler = get_linear_schedule_with_warmup(optimizer=optimizer,
                                                num_training_steps=total_training_steps,
                                                num_warmup_steps=warmup_steps)
else:
    # Keep the name defined so later cells can refer to it without a NameError.
    scheduler = None

# Training

In [None]:
# Fine-tuning loop: gradient accumulation during training, then one validation pass and one
# checkpoint at the end of every epoch.
#
# NOTE(review): this overrides the Google Drive `ckpt_dir` configured in the setup cell, so
# checkpoints are written to a local `dump/` directory — confirm that this is intended.
ckpt_dir = 'dump'
os.makedirs(ckpt_dir, exist_ok=True)
train_losses = []  # mean training loss of each epoch
dev_losses = []    # mean validation loss of each epoch

# Fields produced by the collator that are removed before the batch is fed to the model.
_NON_MODEL_KEYS = ('guid', 'offset_mapping', 'overflow_to_sample_mapping')


def _qa_loss(batch):
    """Return the start + end cross-entropy loss of `model` on one collated batch.

    The batch is copied to the GPU (minus the non-model fields), the gold start/end
    positions are split off, and the remaining entries are passed to the model.
    """
    inputs = {key: value.cuda() for key, value in batch.items() if key not in _NON_MODEL_KEYS}
    start = inputs.pop('start_positions')
    end = inputs.pop('end_positions')
    start_logits, end_logits = model(**inputs, return_dict=False)
    return F.cross_entropy(start_logits, start) + F.cross_entropy(end_logits, end)


num_micro_batches = len(train_loader)
optimizer.zero_grad(set_to_none=True)

for epoch in range(train_epoch):
    print("- Epoch", epoch)

    # `from_pretrained` hands the model back in eval mode; switch dropout on for training.
    model.train()
    running_loss = 0.
    pending = 0  # micro-batches accumulated since the last optimizer step
    losses = []
    progress_bar = tqdm(train_loader, desc='Train')

    for micro_step, batch in enumerate(progress_bar, start=1):
        loss = _qa_loss(batch)
        # Scale so that the accumulated gradient is the average over the micro-batches.
        (loss / accumulation).backward()
        running_loss += loss.item()
        pending += 1
        del batch, loss

        # Only update every `accumulation` micro-batches. The final group of an epoch may be
        # shorter; it is still applied so that no data is dropped.
        if micro_step % accumulation and micro_step != num_micro_batches:
            continue

        clip_grad_norm_(model.parameters(), max_norm=1.)
        optimizer.step()
        if apply_scheduler:
            scheduler.step()
        optimizer.zero_grad(set_to_none=True)

        losses.append(running_loss / pending)
        progress_bar.set_description(f"Train - loss: {losses[-1]:.3f}")
        wandb.log({
            'loss': losses[-1],
            'lr': optimizer.param_groups[0]["lr"]
        })

        running_loss = 0.
        pending = 0

    # Validation: once per epoch, with dropout disabled and without building a graph.
    model.eval()
    epoch_dev_losses = []
    with torch.no_grad():
        for batch in tqdm(dev_loader, desc="Validation"):
            epoch_dev_losses.append(_qa_loss(batch).item())

    dev_loss = mean(epoch_dev_losses) if epoch_dev_losses else float('nan')
    dev_losses.append(dev_loss)
    print(f"Validation loss: {dev_loss:.3f}")

    # One checkpoint per epoch.
    model.save_pretrained(join(ckpt_dir, f'model.{epoch}'))

    wandb.log({
        'dev_loss': dev_loss,
        'epoch': epoch
    })

    # An empty loader would leave `losses` empty; avoid StatisticsError in that case.
    train_losses.append(mean(losses) if losses else float('nan'))
    print(f"Total train loss: {train_losses[-1]:.3f}")