In [1]:
from model import SkeletonAwareBERT, SkeletonAwareRoberta
from transformers import AutoTokenizer, AutoConfig,Trainer, TrainingArguments
from easydict import EasyDict
from dataset import KlueReProcessor
from utils import compute_metrics,SKRelationExtractionDataset
import json
import os
import torch
import random
import numpy as np

In [2]:
def seed_everything(seed: int = 42):
    """Seed every random number generator used by this notebook.

    Seeds Python's ``random`` module, NumPy's legacy global generator and
    PyTorch (CPU and every CUDA device), exports ``PYTHONHASHSEED`` and
    configures cuDNN for deterministic behaviour.

    Args:
        seed: Value applied to every generator. Defaults to 42.
    """
    random.seed(seed)
    np.random.seed(seed)
    os.environ["PYTHONHASHSEED"] = str(seed)
    torch.manual_seed(seed)
    # Seed all visible GPUs, not only the current one; this is a no-op
    # when CUDA is unavailable.
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # cuDNN benchmark mode auto-tunes convolution algorithms and may select
    # different ones from run to run, which defeats the deterministic flag
    # set above, so it must be disabled for reproducible runs.
    torch.backends.cudnn.benchmark = False


seed_everything(42)

In [3]:
# Run configuration: directories, file names and loader settings.
args = EasyDict(
    {
        "batch_size": 32,
        "data_dir": "./data",
        "model_dir": "./model",
        "model_tarname": "klue-re.tar.gz",
        "output_dir": os.environ.get("SM_OUTPUT_DATA_DIR", "/output"),
        "max_seq_length": 512,
        "relation_filename": "relation_list.json",
        "train_filename": "klue-re-v1.1_train.json",
        "train_aug_filename1": "train_aug_entity_swap.json",
        "train_aug_filename2": "train_aug_aeda.json",
        "valid_filename": "klue-re-v1.1_dev.json",
        "num_workers": 4,
    }
)

# Resolve every data file against the data directory, in this order:
# relation list, training set, the two augmented training sets, validation set.
(
    relation_class_file_path,
    train_file_path,
    train_aug_file_path1,
    train_aug_file_path2,
    valid_file_path,
) = (
    os.path.join(args.data_dir, filename)
    for filename in (
        args.relation_filename,
        args.train_filename,
        args.train_aug_filename1,
        args.train_aug_filename2,
        args.valid_filename,
    )
)

# Read the relation labels stored under the "relations" key of the JSON file.
with open(relation_class_file_path, "r", encoding="utf-8") as f:
    relation_class = json.load(f)["relations"]


In [4]:
# Pretrained checkpoint name; reused below for the tokenizer and the model.
model_name_or_path = 'klue/roberta-large'
#model_name_or_path = "monologg/kobigbird-bert-base"
config = AutoConfig.from_pretrained(model_name_or_path)
# One output label per entry in the relation list loaded from relation_list.json.
config.num_labels = len(relation_class)

In [5]:
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path,use_fast=False) 
# What differs with use_fast? With use_fast=True an error is raised at the
# `batch_encoding = self.tokenizer.batch_encode_plus` call, so the slow tokenizer is used.
# tokenizer.add_special_tokens(
#     {"additional_special_tokens": ["<subj>","</subj>","<obj>","</obj>"]})
# Processor that turns the KLUE-RE JSON files into examples and features.
# NOTE(review): the special tokens are presumably added to the tokenizer inside
# KlueReProcessor, since the call above is commented out — confirm.
krp = KlueReProcessor(args,tokenizer)

In [6]:
# Tokenize the data and create the entity masks and the Syntactic Indicator (si) masks.
# Original training examples plus the first augmentation file; the second
# augmentation file is currently disabled (commented out).
train_example = krp._create_examples(train_file_path)
aug_example1 = krp._create_examples(train_aug_file_path1)
# aug_example2 = krp._create_examples(train_aug_file_path2)
# Merge the augmented examples into the training examples in place.
train_example.extend(aug_example1)
# train_example.extend(aug_example2)

# Convert the merged training examples and the validation examples to features.
train_features = krp._convert_features(train_example)
#train_aug_features = krp._convert_features(krp._create_examples(train_aug_file_path))
valid_features = krp._convert_features(krp._create_examples(valid_file_path))

In [7]:
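# Example counts recorded by the author:
# train_data = 32470
# aug_data = 15306
# total = 47776
# NOTE: the training log in this notebook reports "Num examples = 36304",
# so the counts above may be stale — re-check after changing the augmentation files.
# Check that the sets were merged correctly:
#print(train_features[0])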

In [8]:
# Load the datasets: wrap the training and validation features, in that order.
train_dataset, valid_dataset = (
    SKRelationExtractionDataset(feature_set)
    for feature_set in (train_features, valid_features)
)

In [9]:
# model = Rbert.from_pretrained(model_name_or_path, config=config)
# model.roberta.resize_token_embeddings(tokenizer.vocab_size + 4)
# Load the model
model = SkeletonAwareRoberta.from_pretrained(model_name_or_path, config=config)
# Grow the encoder's token embedding matrix to make room for the added special tokens.
# NOTE(review): 8 rows are added here, but the note below lists only 6 added tokens —
# confirm which number is intended (sizing from len(tokenizer) would avoid the magic constant).
model.roberta.resize_token_embeddings(tokenizer.vocab_size + 8)
# Six special tokens were added, so the token embedding of the model's inner roberta is enlarged -> added tokens: <obj>, </obj>, <subj>, </subj>, <si>, </si>

# model = SkeletonAwareBERT.from_pretrained(model_name_or_path, config=config)
# model.bert.resize_token_embeddings(tokenizer.vocab_size + 8)

Some weights of the model checkpoint at klue/roberta-large were not used when initializing SkeletonAwareRoberta: ['lm_head.dense.bias', 'lm_head.bias', 'lm_head.layer_norm.weight', 'lm_head.decoder.weight', 'lm_head.decoder.bias', 'lm_head.layer_norm.bias', 'lm_head.dense.weight']
- This IS expected if you are initializing SkeletonAwareRoberta from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing SkeletonAwareRoberta from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of SkeletonAwareRoberta were not initialized from the model checkpoint at klue/roberta-large and are newly initialized: ['si_fc_layer.linear.weight', 'label_classifier.linear.weight', 'cls_fc_layer.linear.bias', 'cls_fc_layer.line

Embedding(32008, 1024)

In [10]:
# Use the first CUDA device when one is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda:0')
else:
    device = torch.device('cpu')
# Last expression of the cell, so the notebook displays the moved model's repr.
model.to(device)

SkeletonAwareRoberta(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(32008, 1024)
      (position_embeddings): Embedding(514, 1024, padding_idx=1)
      (token_type_embeddings): Embedding(1, 1024)
      (LayerNorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0): RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=1024, out_features=1024, bias=True)
              (key): Linear(in_features=1024, out_features=1024, bias=True)
              (value): Linear(in_features=1024, out_features=1024, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=1024, out_features=1024, bias=True)
              (LayerNorm): LayerNorm

In [11]:
# Single step interval shared by checkpoint saving, logging and evaluation.
save_steps = 100
training_args = TrainingArguments(
    output_dir=args.model_dir,                    # checkpoints are written under the model directory
    save_total_limit=2,                           # keep at most this many checkpoints on disk
    save_steps=save_steps,                        # checkpoint interval, in steps
    num_train_epochs=5,                           # total number of training epochs
    learning_rate=2e-5,                           # initial learning rate
    # Batch sizes come from the shared config instead of a second hard-coded 32,
    # so changing args.batch_size actually changes training and evaluation.
    per_device_train_batch_size=args.batch_size,
    per_device_eval_batch_size=args.batch_size,
    #warmup_steps=500,                # number of warmup steps for learning rate scheduler
    gradient_accumulation_steps=1,
    weight_decay=0.01,                            # strength of weight decay
    logging_dir='./logs',                         # directory for storing logs
    logging_steps=save_steps,                     # logging interval, in steps
    evaluation_strategy='steps',                  # evaluate on a step schedule during training
    eval_steps=save_steps,                        # evaluation interval, in steps
    # Metric used to pick the best checkpoint; the training log of this notebook
    # reports a matching "Micro f1 score" column.
    metric_for_best_model="micro f1 score",
    fp16=True,                                    # mixed-precision training
    fp16_opt_level='O1',
    load_best_model_at_end=True,
)

In [12]:
# Trainer class in use: Trainer.
# (The original note also lists LDAMLossTrainer — presumably an alternative trainer; confirm.)
trainer = Trainer(
    args=training_args,                # training arguments defined in the previous cell
    model=model,                       # the instantiated model to be trained
    compute_metrics=compute_metrics,   # metrics function applied to evaluation predictions
    train_dataset=train_dataset,       # training split
    eval_dataset=valid_dataset,        # validation split
)

Using amp fp16 backend


In [13]:
# Start training; evaluation and checkpointing run every `save_steps` steps.
# NOTE(review): the recorded output of this cell ends with a KeyboardInterrupt,
# i.e. the logged run was stopped manually before completing all epochs.
trainer.train()
# 72.744661  — NOTE(review): presumably a score from an earlier run; confirm what it measures.

***** Running training *****
  Num examples = 36304
  Num Epochs = 5
  Instantaneous batch size per device = 32
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 1
  Total optimization steps = 5675
  weights = torch.tensor(weights.clone().detach()).float()


Step,Training Loss,Validation Loss,Micro f1 score,Auprc,Accuracy
100,0.2372,0.089706,0.0,5.625169,0.596394
200,0.1057,0.063463,13.23693,14.461786,0.61378
300,0.0743,0.057317,48.194311,23.270801,0.653703
400,0.0605,0.04743,51.47129,29.365398,0.667611
500,0.0498,0.040617,51.302176,34.220401,0.700966
600,0.0446,0.039929,53.170189,38.765834,0.685254
700,0.042,0.037787,56.254955,42.080562,0.711269
800,0.0387,0.037762,55.435343,43.353993,0.717965
900,0.037,0.034468,59.244057,46.631231,0.740889
1000,0.0365,0.038086,58.277176,51.45339,0.699292


***** Running Evaluation *****
  Num examples = 7765
  Batch size = 32
Saving model checkpoint to ./model/checkpoint-100
Configuration saved in ./model/checkpoint-100/config.json
Model weights saved in ./model/checkpoint-100/pytorch_model.bin
  weights = torch.tensor(weights.clone().detach()).float()
***** Running Evaluation *****
  Num examples = 7765
  Batch size = 32
Saving model checkpoint to ./model/checkpoint-200
Configuration saved in ./model/checkpoint-200/config.json
Model weights saved in ./model/checkpoint-200/pytorch_model.bin
  weights = torch.tensor(weights.clone().detach()).float()
***** Running Evaluation *****
  Num examples = 7765
  Batch size = 32
Saving model checkpoint to ./model/checkpoint-300
Configuration saved in ./model/checkpoint-300/config.json
Model weights saved in ./model/checkpoint-300/pytorch_model.bin
Deleting older checkpoint [model/checkpoint-100] due to args.save_total_limit
  weights = torch.tensor(weights.clone().detach()).float()
***** Running Ev

KeyboardInterrupt: 

In [None]:
# Persist the current in-memory model and the tokenizer to the model directory.
# NOTE(review): the training cell above was interrupted, so these are presumably the
# weights at the moment of interruption, not the best checkpoint — confirm before reuse.
model.save_pretrained(args.model_dir)
tokenizer.save_pretrained(args.model_dir)

Configuration saved in ./model/config.json
Model weights saved in ./model/pytorch_model.bin
tokenizer config file saved in ./model/tokenizer_config.json
Special tokens file saved in ./model/special_tokens_map.json
added tokens file saved in ./model/added_tokens.json


('./model/tokenizer_config.json',
 './model/special_tokens_map.json',
 './model/vocab.txt',
 './model/added_tokens.json')