- WandB sweep을 통한 hyperparameter search training code입니다.
- 본 notebook은 Colab 환경에 맞게 작성되었습니다.

In [1]:
! pip install transformers sacremoses wandb sentencepiece
! git clone https://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@github.com/Koowater/goorm-Magicians.git
%cd goorm-Magicians
%ls -a
! wandb login # Please insert your WandB code

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.21.0-py3-none-any.whl (4.7 MB)
[K     |████████████████████████████████| 4.7 MB 13.2 MB/s 
[?25hCollecting sacremoses
  Downloading sacremoses-0.0.53.tar.gz (880 kB)
[K     |████████████████████████████████| 880 kB 81.6 MB/s 
[?25hCollecting wandb
  Downloading wandb-0.12.21-py2.py3-none-any.whl (1.8 MB)
[K     |████████████████████████████████| 1.8 MB 69.3 MB/s 
[?25hCollecting sentencepiece
  Downloading sentencepiece-0.1.96-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.2 MB)
[K     |████████████████████████████████| 1.2 MB 74.6 MB/s 
[?25hCollecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 75.2 MB/s 
Collecting tokenizers!=0.11.3,<0.13,>=0.11.1
  Downloading tok

In [1]:
# Show the GPU, driver/CUDA version and current memory usage visible to this runtime.
! nvidia-smi

Mon Aug 22 19:02:25 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 516.59       Driver Version: 516.59       CUDA Version: 11.7     |
|-------------------------------+----------------------+----------------------+
| GPU  Name            TCC/WDDM | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA GeForce ... WDDM  | 00000000:26:00.0  On |                  N/A |
| 29%   44C    P8    22W / 175W |   1386MiB /  8192MiB |     28%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [None]:
from dp import is_running_on_ipython, KoMRC, Preprocessor, Postprocessor, collator, FocalLoss
if is_running_on_ipython():
    from tqdm.notebook import tqdm
else:
    from tqdm import tqdm

from transformers import (
    ElectraForQuestionAnswering,
    ElectraTokenizerFast,
    DebertaV2ForQuestionAnswering,
    DebertaV2TokenizerFast,
    get_linear_schedule_with_warmup,
    AdamW
)

import torch
from torch.utils.data import DataLoader
import torch.nn.functional as F
from torch.nn.utils import clip_grad_norm_
from statistics import mean

import wandb
import numpy as np
import pandas as pd
import csv
import random

# Google drive mount
import os
from os.path import join
from google.colab import drive
# Mount Google Drive; the dataset is read from it and checkpoints are written to it.
drive.mount('/content/drive')

# Adjust these two locations to your own Drive layout.
# The data folder lives inside the project folder that also receives the checkpoints.
ckpt_dir = join('/', 'content', 'drive', 'My Drive', 'goorm K-Digital', '자연어처리', 'Project2')
data_dir = join(ckpt_dir, 'goorm-Magicians', 'data')

# Seed the Python, NumPy and PyTorch random number generators with the same value.
seed = 42
for _seed_fn in (random.seed, np.random.seed, torch.manual_seed):
    _seed_fn(seed)

# Load the training split first, then the split used for validation.
# NOTE(review): the validation set is read from 'test.json' — confirm this is the intended file.
train_dataset, dev_dataset = (
    KoMRC.load(join(data_dir, file_name)) for file_name in ('train.json', 'test.json')
)

Mounted at /content/drive


<torch._C.Generator at 0x7f1cb834f490>

# Hyper-parameter

In [None]:
# --- Model, tokenizer and loss -------------------------------------------------------------
model_name = 'monologg/koelectra-base-v3-finetuned-korquad'
model = ElectraForQuestionAnswering.from_pretrained(model_name)
model.cuda()
tokenizer = ElectraTokenizerFast.from_pretrained(model_name)
# Applied separately to the start logits and the end logits in the training loop.
loss_fn = torch.nn.CrossEntropyLoss()

# --- Optimisation --------------------------------------------------------------------------
learning_rate = 5e-5
apply_scheduler = True
weight_decay = 0.01
optimizer = AdamW(model.parameters(), weight_decay=weight_decay, lr=learning_rate)

# --- Tokenisation --------------------------------------------------------------------------
max_length = 512
doc_stride = 64

# --- Schedule ------------------------------------------------------------------------------
train_epoch = 10
# Effective batch sizes; the data loaders use `train_batch_size // accumulation` per forward pass.
train_batch_size = 256
dev_batch_size = 256
accumulation = 4
# Warm-up length expressed in optimizer steps (one step every `accumulation` micro-batches).
warmup_steps = 100 / accumulation

# `data_augmentation` is interpolated into the run name below but was never defined in this
# notebook, so the cell raised NameError on a fresh kernel. No augmentation step is visible
# in the notebook, hence False — NOTE(review): set to True if augmented data is loaded.
data_augmentation = False

# --- Experiment tracking -------------------------------------------------------------------
training_name = f'LR:{learning_rate}, SD:{apply_scheduler}, WD:{weight_decay}, BS:{train_batch_size}/{accumulation}, DA:{data_augmentation}'
wandb.init(
    entity='team_koowater',
    project='KoMRC_koowater',
    name=training_name,
    config={
        'model': model_name,
        'learning_rate': learning_rate,
        'optimizer': optimizer.__class__.__name__,
        'weight_decay': weight_decay,
        'scheduler': apply_scheduler,
        'train_epoch': train_epoch,
        'train_batch_size': train_batch_size,
        'val_batch_size': dev_batch_size,
        'max_length': max_length,
        'doc_stride': doc_stride,
        # Both values are part of the run name, so record them in the config as well.
        'accumulation': accumulation,
        'data_augmentation': data_augmentation
})

In [None]:
# Tokenize both splits with the same preprocessor settings.
preprocessor = Preprocessor(tokenizer, max_length, doc_stride, 'right')
preprocessor.load_dataset(train_dataset)
train_examples = preprocessor.tokenize()
preprocessor.load_dataset(dev_dataset)
dev_examples = preprocessor.tokenize()

# Each forward pass sees 1/accumulation of the effective training batch.
micro_batch_size = train_batch_size // accumulation
train_loader = DataLoader(train_examples, batch_size=micro_batch_size, shuffle=True, collate_fn=collator, num_workers=2)
# NOTE(review): the dev loader also uses the training micro-batch size, while `dev_batch_size`
# is what gets logged to W&B as 'val_batch_size'. Left unchanged here because switching to
# `dev_batch_size` may not fit in GPU memory at max_length tokens — confirm which is intended.
dev_loader = DataLoader(dev_examples, batch_size=micro_batch_size, shuffle=False, collate_fn=collator, num_workers=2)

# Total number of optimizer steps: one per `accumulation` micro-batches (rounded up), per epoch.
# Cast to int because the scheduler expects integer step counts.
total_training_steps = int(train_epoch * np.ceil(len(train_loader) / accumulation))

# Linear warm-up followed by linear decay.
# The warm-up length now comes from the `warmup_steps` hyper-parameter; previously a separate
# hard-coded `200/accumulation` was used here and `warmup_steps` was silently ignored.
if apply_scheduler:
    scheduler = get_linear_schedule_with_warmup(optimizer=optimizer,
                                                num_training_steps=total_training_steps,
                                                num_warmup_steps=int(warmup_steps))

postprocessor = Postprocessor(tokenizer)

# Training

In [None]:
# Train for the number of epochs declared in the hyper-parameter cell. The LR schedule length
# and the W&B config are both derived from `train_epoch`; a separate hard-coded `epochs = 3`
# made the run stop long before the schedule finished and mis-reported the run in W&B.
epochs = train_epoch

ckpt_name = f'{model_name}_lr{learning_rate}'

os.makedirs(ckpt_dir, exist_ok=True)
train_losses = []
# History of every validation pass, used to report the best distance seen so far.
total_distance = {
    'epoch': [],
    'iter': [],
    'dist': []
}
lowest_val_loss = float('inf')
lowest_dist = float('inf')

batches_per_epoch = len(train_loader)
# Validate five times per epoch; max(1, ...) avoids a modulo-by-zero on very small loaders.
validation_interval = max(1, batches_per_epoch // 5)

# Training loop
for epoch in range(epochs):
    print("- Epoch", epoch)
    running_loss = []  # micro-batch losses since the last optimizer step
    losses = []        # one averaged loss per optimizer step
    # from_pretrained() returns the model in eval mode, so dropout stays disabled
    # unless training mode is enabled explicitly.
    model.train()
    optimizer.zero_grad(set_to_none=True)
    progress_bar = tqdm(train_loader, desc='Train')

    for step, batch in enumerate(progress_bar):
        # Drop the bookkeeping fields that the model's forward() does not accept.
        del batch['guid'], batch['offset_mapping'], batch['overflow_to_sample_mapping']
        batch = {key: value.cuda() for key, value in batch.items()}
        start = batch.pop('start_positions')
        end = batch.pop('end_positions')

        start_logits, end_logits = model(**batch, return_dict=False)
        loss = loss_fn(start_logits, start) + loss_fn(end_logits, end)
        # Scale so that the accumulated gradient is the average over the micro-batches.
        (loss / accumulation).backward()
        running_loss.append(loss.item())
        del batch, start, end, start_logits, end_logits, loss

        # Step every `accumulation` micro-batches, and also on the last batch of the epoch so
        # that leftover gradients are applied instead of leaking into the next epoch. This
        # matches the ceil() used when the total number of scheduler steps was computed.
        is_last_batch = (step + 1) == batches_per_epoch
        is_update_step = (step + 1) % accumulation == 0 or is_last_batch

        if is_update_step:
            clip_grad_norm_(model.parameters(), max_norm=1.)
            optimizer.step()
            if apply_scheduler:
                scheduler.step()
            optimizer.zero_grad(set_to_none=True)

            losses.append(mean(running_loss))
            progress_bar.set_description(f"Train - loss: {losses[-1]:.3f}")

        # Validation
        is_validation = (step + 1) % validation_interval == 0
        if is_validation:
            model.eval()  # disable dropout while measuring validation metrics
            dev_bar = tqdm(dev_loader, desc="Val")
            dev_losses = []
            distances = []
            for dev_batch in dev_bar:
                del dev_batch['guid'], dev_batch['offset_mapping'], dev_batch['overflow_to_sample_mapping']
                dev_batch = {key: value.cuda() for key, value in dev_batch.items()}
                true_s = dev_batch.pop('start_positions')
                true_e = dev_batch.pop('end_positions')

                with torch.no_grad():
                    start_logits, end_logits = model(**dev_batch, return_dict=False)
                    dev_loss = loss_fn(start_logits, true_s) + loss_fn(end_logits, true_e)

                dev_losses.append(dev_loss.item())

                pred_s = torch.argmax(start_logits, dim=1)
                pred_e = torch.argmax(end_logits, dim=1)

                results = postprocessor.postprocess(dev_batch['input_ids'], (pred_s, pred_e), (true_s, true_e), True)
                # Keep the distance (results[2]) only for entries whose flag in results[1] is truthy.
                dist = [results[2][i] for i, result in enumerate(results[1]) if result]

                if dist:
                    distances.append(mean(dist))
                    dev_bar.set_description(f"Val - loss: {mean(dev_losses):.3f}, dist: {mean(dist):.3f}")
                else:
                    # A batch without any scorable example would make mean() raise; skip it.
                    dev_bar.set_description(f"Val - loss: {mean(dev_losses):.3f}")

                del dev_batch, pred_s, pred_e, start_logits, end_logits, true_s, true_e, dev_loss, dist

            model.train()  # back to training mode for the remaining batches

            val_loss = mean(dev_losses)
            val_dist = mean(distances)

            # Let's check best ckpt.
            # The two "best" checkpoints used to share one path and overwrote each other.
            # The lowest-distance model keeps the original path (it is the one tracked by
            # 'best_distance' / 'best_iter'); the lowest-loss model gets its own directory.
            if lowest_val_loss > val_loss:
                model.save_pretrained(join(f'{ckpt_dir}', f'{ckpt_name}_val_loss.bin'))
                lowest_val_loss = val_loss
                print('Lowest val loss... weights are saved.')
            if lowest_dist > val_dist:
                model.save_pretrained(join(f'{ckpt_dir}', f'{ckpt_name}.bin'))
                lowest_dist = val_dist
                print('Lowest distance... weights are saved.')

            total_distance['epoch'].append(epoch)
            total_distance['iter'].append(step)
            total_distance['dist'].append(val_dist)
            best_idx = total_distance['dist'].index(min(total_distance['dist']))
            # Encode (epoch, iteration) of the best validation pass into a single number.
            best_iter = (total_distance['epoch'][best_idx]+1)*10000 + total_distance['iter'][best_idx]
            wandb.log({
                'loss': mean(running_loss),
                'val_loss': val_loss,
                'levenshtein_distance': val_dist,
                'best_distance': total_distance['dist'][best_idx],
                'best_iter': best_iter,
                'lr': optimizer.param_groups[0]["lr"],
                'epoch': round(step/batches_per_epoch+epoch, 4)
            })

        # On plain optimizer steps (no validation) log the training metrics only.
        if not is_validation and is_update_step:
            wandb.log({
                'loss': mean(running_loss),
                'lr': optimizer.param_groups[0]["lr"],
                'epoch': round(step/batches_per_epoch+epoch, 4)
            })

        # Start a fresh loss window after every optimizer step.
        if is_update_step:
            running_loss = []

    train_losses.append(mean(losses))
    print(f"Total train loss: {train_losses[-1]:.3f}\n")
