In [1]:
import argparse, sys, os, random, string
from typing import Optional, Union
from dataclasses import dataclass
# import logging

import numpy as np
import pandas as pd
import torch
from torch.utils.data import Dataset

from sklearn.model_selection import KFold

import transformers
from transformers import AutoModelForMultipleChoice, TrainingArguments, Trainer
from transformers.tokenization_utils_base import PreTrainedTokenizerBase, PaddingStrategy
# !pip install sentencepiece
# !pip install 'git+https://github.com/SKTBrain/KoBERT.git#egg=kobert_tokenizer&subdirectory=kobert_hf'
from kobert_tokenizer import KoBERTTokenizer

import wandb
## Run `wandb login --relogin` in a terminal if you want to use your own wandb profile.


# Report which device (GPU if available, otherwise CPU) and which torch build this kernel is using.
print(f"using {torch.device('cuda' if torch.cuda.is_available() else 'cpu')} with {torch.__version__}")

  from .autonotebook import tqdm as notebook_tqdm


unsing cuda with 1.10.0+cu111


In [2]:
class TextDataset(Dataset):
    """Multiple-choice dataset built from a frame of observation / hypothesis sentences.

    Every row must provide the columns ``OBS1``, ``OBS2`` and ``label`` plus one or
    more columns whose name contains ``hyp`` (one column per answer option).

    Args:
        data: frame holding all samples.
        tokenizer: callable invoked as ``tokenizer(first_sentences, second_sentences, ...)``
            whose result exposes ``input_ids``, ``token_type_ids`` and ``attention_mask``.
        indices: positional row indices to keep (e.g. one K-fold split). ``None`` keeps
            every row, which is how the test split is built.
        max_length: optional cap on the encoded pair length, forwarded to the tokenizer.
            ``None`` leaves the limit to the tokenizer's own configuration.
    """

    def __init__(self, data: pd.DataFrame, tokenizer, indices=None, max_length: Optional[int] = None):
        if indices is None:     ## when testing, not training, use the whole data without sampling indices
            indices = range(len(data))

        self.data = data.iloc[indices]
        # Names of every answer-option column (any column whose name matches 'hyp').
        self.hyps = self.data.filter(regex='hyp').keys()
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of samples selected by ``indices``."""
        return len(self.data)

    def __getitem__(self, idx):
        """Encode the ``idx``-th sample as one (OBS1, hypothesis + OBS2) pair per option.

        Returns:
            dict with ``input_ids``, ``token_type_ids`` and ``attention_mask`` exactly as
            produced by the tokenizer for the list of pairs, and ``labels`` as a
            one-element tensor holding the row's ``label`` value.
        """
        # Fetch the row once instead of re-indexing the frame for every column access.
        row = self.data.iloc[idx]

        # obs1 sentences: the shared "question" sentence, repeated once per option.
        obs1_sentences = [row['OBS1']] * len(self.hyps)
        # hyp_obs2 sentences: the options. OBS2 is fixed for the row; only the hypothesis differs.
        obs2_sentence = row['OBS2']
        hyp_obs2_sentences = [f"{row[hyp]} {obs2_sentence}" for hyp in self.hyps]

        # Tokenize all pairs together so they are padded to a common length.
        # truncation=True keeps over-long pairs within the tokenizer's / max_length limit
        # instead of producing sequences the encoder cannot accept.
        text_embed = self.tokenizer(
            obs1_sentences,
            hyp_obs2_sentences,
            padding=True,
            truncation=True,
            max_length=self.max_length,
            return_tensors='pt',
        )

        # answers - 0, 1, 2, ..
        label = torch.tensor(row['label']).unsqueeze(0)

        return {'input_ids': text_embed['input_ids'],
                'token_type_ids': text_embed['token_type_ids'],
                'attention_mask': text_embed['attention_mask'],
                'labels': label}
    

@dataclass
class DataCollatorForMultipleChoice:
    """
    Data collator that dynamically pads multiple-choice inputs to a common length.

    Each incoming feature is a dict holding a label (under ``label`` or ``labels``)
    and, for every other key, one entry per answer choice. The collator flattens the
    choices, pads them together through ``tokenizer.pad`` and reshapes the result to
    ``(batch_size, num_choices, seq_length)``.
    """
    # Tokenizer whose ``pad`` method performs the padding.
    tokenizer: PreTrainedTokenizerBase
    # Forwarded unchanged to ``tokenizer.pad``.
    padding: Union[bool, str, PaddingStrategy] = True
    max_length: Optional[int] = None
    pad_to_multiple_of: Optional[int] = None

    def __call__(self, features):
        """Collate ``features`` into one padded batch.

        Args:
            features: non-empty list of per-sample dicts as described on the class.
                The dicts are read only; they are not modified.

        Returns:
            dict of tensors viewed as ``(batch_size, num_choices, -1)`` plus
            ``labels`` as an int64 tensor.
        """
        label_name = "label" if "label" in features[0].keys() else "labels"
        # Read the labels without popping them, so the caller's dicts stay intact.
        labels = [feature[label_name] for feature in features]
        batch_size = len(features)
        num_choices = len(features[0]["input_ids"])

        # Flatten to one dict per (sample, choice) in a single pass; the label key is
        # left out because it is not a per-choice field.
        flattened_features = [
            {k: v[i] for k, v in feature.items() if k != label_name}
            for feature in features
            for i in range(num_choices)
        ]

        # This will return a dictionary with tensors of shape `(batch_size * num_choices) x seq_length`
        batch = self.tokenizer.pad(
                flattened_features,
                padding=self.padding,
                max_length=self.max_length,
                pad_to_multiple_of=self.pad_to_multiple_of,
                return_tensors="pt"
                )

        # Un-flatten
        batch = {k: v.view(batch_size, num_choices, -1) for k, v in batch.items()}
        # Add back labels
        batch["labels"] = torch.tensor(labels, dtype=torch.int64)
        return batch



def get_tokenizer(text_model: str):
    """Load the tokenizer that matches the checkpoint name ``text_model``.

    Names containing ``"kobert"`` use ``KoBERTTokenizer``; every other name uses
    ``AutoTokenizer`` with ``use_fast=True``. Names that contain neither ``"kobert"``
    nor ``"klue"`` additionally print a notice, since they are not one of the
    expected model families.

    Args:
        text_model: checkpoint name or path passed to ``from_pretrained``.

    Returns:
        The tokenizer returned by the selected class's ``from_pretrained``.
    """
    if "kobert" in text_model:
        # !pip install sentencepiece
        # !pip install 'git+https://github.com/SKTBrain/KoBERT.git#egg=kobert_tokenizer&subdirectory=kobert_hf'
        from kobert_tokenizer import KoBERTTokenizer
        return KoBERTTokenizer.from_pretrained(text_model)

    # Import before branching: both remaining paths need AutoTokenizer. Importing it only
    # inside the "klue" branch left the fallback path with an unbound local name.
    from transformers import AutoTokenizer

    if "klue" not in text_model:
        print("Got unexpected text model and load AutoTokenizer. Please Check the tokenizer model")
    return AutoTokenizer.from_pretrained(text_model, use_fast=True)

In [3]:
class KFoldTrainer :
    """Fine-tune and evaluate a multiple-choice model on one fold of a K-fold split.

    Construction builds the train / valid datasets for ``fold_idx`` out of
    ``train_set``, wraps all of ``test_set`` as the test dataset, loads the model named
    by ``configs['text_model']`` and wires everything into a ``Trainer``.

    Args:
        train_set: frame that is split into ``configs['kfolds']`` folds.
        test_set: held-out frame evaluated after training.
        fold_idx: zero-based index of the fold used for validation.
        tokenizer: tokenizer shared by the datasets and the data collator.
        configs: mapping that provides at least ``'text_model'`` and ``'kfolds'``.
        args: training arguments handed to ``Trainer`` and recorded in the wandb config.
    """

    def __init__(self, train_set: pd.DataFrame, test_set:pd.DataFrame, fold_idx: int, tokenizer, configs, args) :
        self.train_set = train_set
        self.test_set = test_set
        self.args = args

        self.tokenizer = tokenizer
        self.data_collator = DataCollatorForMultipleChoice(tokenizer)

        # Fixed random_state: every fold index always maps to the same train/valid rows,
        # no matter how many trainer objects are created.
        self.k = configs['kfolds']
        self.kfold_split = list(KFold(n_splits=self.k, shuffle=True, random_state=42).split(train_set))
        train_idx, val_idx = self.kfold_split[fold_idx]

        self.dataset_dict = {'train': TextDataset(self.train_set, self.tokenizer, train_idx),
                       'valid' : TextDataset(self.train_set, self.tokenizer, val_idx),
                       'test': TextDataset(self.test_set, self.tokenizer)}

        # ``self.model`` only ever holds the loaded model, never the checkpoint name.
        self.model = AutoModelForMultipleChoice.from_pretrained(configs['text_model'])

        wandb.watch(self.model)
        # The caller changes ``args.output_dir`` between folds inside one wandb run, so
        # the config update must be allowed to overwrite previously recorded values.
        wandb.config.update(self.args, allow_val_change=True)

        self.trainer = Trainer(
            model = self.model,
            args = self.args,
            train_dataset = self.dataset_dict['train'],
            eval_dataset = self.dataset_dict['valid'],
            data_collator = self.data_collator,
            compute_metrics = self.compute_metrics,
            )


    def compute_metrics(self, eval_predictions : transformers.EvalPrediction):
        """Return ``{"accuracy": ...}``: the fraction of samples whose argmax matches the label.

        ``eval_predictions`` is unpacked into ``(predictions, label_ids)``; the argmax is
        taken over axis 1 (presumably the choice axis — confirm against the model output).
        """
        predictions, label_ids = eval_predictions
        preds = np.argmax(predictions, axis=1)
        return {"accuracy": (preds == label_ids).astype(np.float32).mean().item()}


    def train_fold(self, fold_idx):
        """Train on this fold, evaluate on the test dataset and return the test metrics.

        Args:
            fold_idx: zero-based fold index; used only for the progress messages.

        Returns:
            The metrics dict produced by ``Trainer.evaluate`` on the test dataset.

        Note:
            ``self.dataset_dict`` is deleted at the end, so this method can be called
            only once per instance.
        """
        # logging.info(f'{fold}/{self.k} - fold started')
        print(f'===== {fold_idx+1}/{self.k} - fold TRAINING started =====')

        self.model.train()
        self.trainer.train()

        print(f'===== {fold_idx+1}/{self.k} - fold TESTING started =====')
        self.model.eval()
        # NOTE(review): evaluate() is called without a custom metric prefix, so the test
        # metrics are reported under the same "eval_*" keys as the validation metrics.
        metrics = self.trainer.evaluate(self.dataset_dict['test'])
        print(f'== {fold_idx+1}th fold metric is {metrics} ==')

        # Drops only this object's reference to the datasets; the Trainer was given the
        # train/valid datasets at construction time.
        del self.dataset_dict
        torch.cuda.empty_cache()    ## empty CUDA memory before starting next fold

        return metrics
        

In [4]:
## !! CHECK !! main() overrides `args.output_dir` for every fold -- verify that path before running.
configs = {
    'text_model': 'skt/kobert-base-v1',
    'kfolds': 3,
    'train_path': '/kovar-vol/kovar/dataset/train.json',
    'test_path': '/kovar-vol/kovar/dataset/photo_test.json',
}


args = TrainingArguments(
    # --- output / checkpoints ---
    # Placeholder directory only; main() replaces it per fold so folds do not overwrite each other.
    output_dir=f"{configs['text_model']}-ft_base",
    overwrite_output_dir=False,
    save_strategy='epoch',
    save_total_limit=2,
    load_best_model_at_end=True,
    # --- evaluation ---
    # Evaluation runs once per epoch. NOTE(review): eval_steps is documented as applying
    # only to the "steps" strategy, so it should have no effect here -- confirm.
    evaluation_strategy="epoch",
    eval_steps=500,
    per_device_eval_batch_size=8,
    # --- logging ---
    # The training loss is logged every `logging_steps` optimizer updates.
    logging_strategy="steps",
    logging_steps=500,
    report_to="wandb",
    # --- optimisation ---
    learning_rate=1e-6,
    weight_decay=0.01,  # applied to all layers except bias and LayerNorm weights in AdamW
    per_device_train_batch_size=8,
    num_train_epochs=5,
)

In [5]:
def main():
    """Run the full K-fold fine-tuning experiment described by ``configs`` and ``args``.

    Loads the train / test JSON-lines files and the tokenizer, opens one wandb run and,
    for each fold, points ``args.output_dir`` at a fold-specific directory, then trains
    and tests a fresh ``KFoldTrainer``. The wandb run is always closed: normally on
    success, with a non-zero exit code if any fold raises.
    """
    import gc  # local import: needed only for the per-fold cleanup below

    # Load datasets
    train_set = pd.read_json(configs['train_path'], lines=True)
    test_set = pd.read_json(configs['test_path'], lines=True)

    # Load Tokenizer
    tokenizer = get_tokenizer(configs['text_model'])

    wandb.init()
    try:
        ## !! If training is interrupted midway, resume by narrowing the fold range, e.g. range(2, 3) !!
        for fold_idx in range(configs['kfolds']):
            args.output_dir = f"/kovar-vol/kovar/models/koBERT_ft/koBERT_ft-fold_{fold_idx+1}"
            kfold_trainer = KFoldTrainer(train_set, test_set, fold_idx, tokenizer, configs, args)
            kfold_trainer.train_fold(fold_idx)

            # Release this fold's trainer and model before the next fold is built, so the
            # old and new models are not both kept alive.
            del kfold_trainer
            gc.collect()
            torch.cuda.empty_cache()
    except BaseException:
        # Close the run as failed (also on a notebook interrupt), then re-raise.
        wandb.finish(exit_code=1)
        raise
    else:
        wandb.finish()
    

In [6]:
# Entry point: start the full K-fold run when this cell / file is executed as the main module.
if __name__ == '__main__':
    main()

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'XLNetTokenizer'. 
The class this function is called from is 'KoBERTTokenizer'.
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mrkfcl226[0m ([33mkovar[0m). Use [1m`wandb login --relogin`[0m to force relogin


Some weights of BertForMultipleChoice were not initialized from the model checkpoint at skt/kobert-base-v1 and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


===== 1/3 - fold TRAINING started =====




Epoch,Training Loss,Validation Loss,Accuracy
1,No log,0.838621,0.61355
2,0.971500,0.760526,0.657724
3,0.800400,0.728964,0.674797
4,0.748900,0.710971,0.682656
5,0.724900,0.703673,0.687263




===== 1/3 - fold TESTING started =====




== 1th fold metric is {'eval_loss': 0.7144871950149536, 'eval_accuracy': 0.6735197305679321, 'eval_runtime': 30.5858, 'eval_samples_per_second': 39.757, 'eval_steps_per_second': 2.485, 'epoch': 5.0} ==


Some weights of BertForMultipleChoice were not initialized from the model checkpoint at skt/kobert-base-v1 and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


===== 2/3 - fold TRAINING started =====


Epoch,Training Loss,Validation Loss,Accuracy
1,No log,0.838288,0.595554
2,0.949400,0.774014,0.624017
3,0.805000,0.741688,0.650854
4,0.759600,0.726926,0.666034
5,0.738200,0.721747,0.668203




===== 2/3 - fold TESTING started =====




== 2th fold metric is {'eval_loss': 0.7215868234634399, 'eval_accuracy': 0.6710526347160339, 'eval_runtime': 30.5661, 'eval_samples_per_second': 39.783, 'eval_steps_per_second': 2.486, 'epoch': 5.0} ==


Some weights of BertForMultipleChoice were not initialized from the model checkpoint at skt/kobert-base-v1 and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


===== 3/3 - fold TRAINING started =====


Epoch,Training Loss,Validation Loss,Accuracy
1,No log,0.839068,0.616969
2,0.947100,0.764857,0.644077
3,0.806200,0.732778,0.654107
4,0.757600,0.714738,0.663323
5,0.735800,0.710654,0.666034




===== 3/3 - fold TESTING started =====




== 3th fold metric is {'eval_loss': 0.7198613882064819, 'eval_accuracy': 0.671875, 'eval_runtime': 30.5899, 'eval_samples_per_second': 39.752, 'eval_steps_per_second': 2.484, 'epoch': 5.0} ==


0,1
eval/accuracy,▂▆▇██▇▁▃▅▆▇▇▃▅▅▆▆▇
eval/loss,█▄▂▁▁▂█▅▃▂▂▂█▄▃▂▁▂
eval/runtime,▂▂▅██▁█████▁█████▁
eval/samples_per_second,█▇▃▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
eval/steps_per_second,█▇▃▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
train/epoch,▁▁▃▃▅▅▆▇███▁▁▃▃▅▅▆▇███▁▁▃▃▅▅▆▇███
train/global_step,▁▁▃▃▅▅▆▇███▁▁▃▃▅▅▆▇███▁▁▃▃▅▅▆▇███
train/learning_rate,█▆▃▁█▆▃▁█▆▃▁
train/loss,█▃▂▁▇▃▂▁▇▃▂▁
train/total_flos,▁█▁

0,1
eval/accuracy,0.67188
eval/loss,0.71986
eval/runtime,30.5899
eval/samples_per_second,39.752
eval/steps_per_second,2.484
train/epoch,5.0
train/global_step,2310.0
train/learning_rate,0.0
train/loss,0.7358
train/total_flos,3698461342361952.0
