In [1]:
import sys
sys.path.append("..")
import os
import torch
import random
import numpy as np

from transformers import AutoTokenizer, AutoConfig,Trainer, TrainingArguments
from datasets import load_metric
from easydict import EasyDict

from model import RobertaForStsRegression
from dataset import KlueStsWithSentenceMaskDataset
from utils import read_json

In [2]:
def seed_everything(seed: int = 42):
    random.seed(seed)
    np.random.seed(seed)
    os.environ["PYTHONHASHSEED"] = str(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = True

seed_everything(42)

In [3]:
args = EasyDict({
    "data_dir" : "./data",
    "model_dir": "./model",
    "output_dir": "./output",
    "train_filename" : "klue-sts-v1.1_train.json",
    "valid_filename" : "klue-sts-v1.1_dev.json",
    
    "num_workers" : 4,
    "max_seq_length":512,
    
    "batch_size": 32,
    "learning_rate": 2e-5,
    "num_train_epochs":5,
    "save_total_limit":2,
    "gradient_accumulation_steps":1,
    "weight_decay" : 0.01,
    "evaluation_strategy" : "steps",
    "save_steps": 250,
    "eval_steps": 250
})

In [4]:
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

In [5]:
model_name_or_path = 'klue/roberta-base'
config = AutoConfig.from_pretrained(model_name_or_path)
config.num_labels = 1
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path) 

In [6]:
train_file_path = os.path.join(args.data_dir, args.train_filename)
valid_file_path = os.path.join(args.data_dir, args.valid_filename)

train_json = read_json(train_file_path)
valid_json = read_json(valid_file_path)

train_dataset = KlueStsWithSentenceMaskDataset(train_json, tokenizer, 510)
valid_dataset = KlueStsWithSentenceMaskDataset(train_json, tokenizer, 510)

In [7]:
model = RobertaForStsRegression.from_pretrained(model_name_or_path, config=config)
model.to(device)

Some weights of the model checkpoint at klue/roberta-base were not used when initializing RobertaForStsRegression: ['lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.decoder.bias', 'lm_head.dense.weight', 'lm_head.decoder.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaForStsRegression from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForStsRegression from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForStsRegression were not initialized from the model checkpoint at klue/roberta-base and are newly initialized: ['roberta.pooler.dense.weight', 'label_classifier.linear.bias', 'sentence_fc_layer.linear.bias', 'la

RobertaForStsRegression(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(32000, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0): RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): Lay

In [8]:
pearson = load_metric("pearsonr").compute
def compute_metrics(pred):
    references = pred.label_ids
    predictions = pred.predictions
    metric = pearson(predictions=predictions, references=references)
    return metric

In [9]:
training_args = TrainingArguments(
    output_dir= args.model_dir,
    save_total_limit=args.save_total_limit,
    save_steps=args.save_steps,
    num_train_epochs=args.num_train_epochs,
    learning_rate=args.learning_rate,
    per_device_train_batch_size=args.batch_size,
    per_device_eval_batch_size=args.batch_size,
    #warmup_steps=500,                
    gradient_accumulation_steps = args.gradient_accumulation_steps,
    weight_decay=args.weight_decay,
    logging_dir='./logs',
    logging_steps = args.save_steps, 
    evaluation_strategy= args.evaluation_strategy,
    metric_for_best_model = 'pearsonr',
    fp16=True,
    fp16_opt_level='O1',
    eval_steps = args.save_steps,
    load_best_model_at_end = True 
  )

In [10]:
trainer = Trainer(
    model=model,                         # the instantiated 🤗 Transformers model to be trained
    args=training_args,                  # training arguments, defined above
    train_dataset=train_dataset,         # training dataset
    eval_dataset=valid_dataset,             # evaluation dataset
    compute_metrics=compute_metrics         # define metrics function
  )

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Using amp fp16 backend


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [11]:
trainer.train()

***** Running training *****
  Num examples = 11668
  Num Epochs = 5
  Instantaneous batch size per device = 32
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 1
  Total optimization steps = 1825
Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mddobokki[0m (use `wandb login --relogin` to force relogin)


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


[34m[1mwandb[0m: wandb version 0.12.9 is available!  To upgrade, please run:
[34m[1mwandb[0m:  $ pip install wandb --upgrade


Step,Training Loss,Validation Loss,Pearsonr
250,0.659,0.148391,0.958859
500,0.1654,0.113508,0.964891
750,0.1353,0.095553,0.970332
1000,0.0965,0.130414,0.970599
1250,0.0817,0.152467,0.972884
1500,0.067,0.144197,0.973835
1750,0.0557,0.138358,0.974617


***** Running Evaluation *****
  Num examples = 11668
  Batch size = 32
Saving model checkpoint to ./model/checkpoint-250
Configuration saved in ./model/checkpoint-250/config.json
Model weights saved in ./model/checkpoint-250/pytorch_model.bin
Deleting older checkpoint [model/checkpoint-1500] due to args.save_total_limit
***** Running Evaluation *****
  Num examples = 11668
  Batch size = 32
Saving model checkpoint to ./model/checkpoint-500
Configuration saved in ./model/checkpoint-500/config.json
Model weights saved in ./model/checkpoint-500/pytorch_model.bin
Deleting older checkpoint [model/checkpoint-1750] due to args.save_total_limit
***** Running Evaluation *****
  Num examples = 11668
  Batch size = 32
Saving model checkpoint to ./model/checkpoint-750
Configuration saved in ./model/checkpoint-750/config.json
Model weights saved in ./model/checkpoint-750/pytorch_model.bin
Deleting older checkpoint [model/checkpoint-250] due to args.save_total_limit
***** Running Evaluation *****
 

TrainOutput(global_step=1825, training_loss=0.1749032746929012, metrics={'train_runtime': 320.2896, 'train_samples_per_second': 182.148, 'train_steps_per_second': 5.698, 'total_flos': 3907652956376400.0, 'train_loss': 0.1749032746929012, 'epoch': 5.0})

In [12]:
model.save_pretrained(args.model_dir)
tokenizer.save_pretrained(args.model_dir)

Configuration saved in ./model/config.json
Model weights saved in ./model/pytorch_model.bin
tokenizer config file saved in ./model/tokenizer_config.json
Special tokens file saved in ./model/special_tokens_map.json


('./model/tokenizer_config.json',
 './model/special_tokens_map.json',
 './model/vocab.txt',
 './model/added_tokens.json',
 './model/tokenizer.json')