In [1]:
import ast
import pickle as pickle

import pandas as pd
import torch
import torch.nn as nn
from sklearn.model_selection import train_test_split
from transformers import AutoModel
from transformers import AutoTokenizer, AutoConfig, AutoModelForSequenceClassification, Trainer, TrainingArguments, RobertaConfig, RobertaTokenizer, RobertaForSequenceClassification, BertTokenizer, EarlyStoppingCallback

In [2]:
# Checkpoint shared by the tokenizer and the classifier.
# Other checkpoints noted by the author: "bert-base-uncased", "klue/bert-base".
MODEL_NAME = "klue/roberta-large"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# Register the entity-type tags as additional special tokens and remember how
# many were added (used later when resizing the embedding matrix).
# NOTE(review): TEMP_preprocessing_dataset joins its markers with spaces
# ("*[ ORG ]*"), so these bracketed tokens may never appear verbatim in the
# model input - confirm whether that is intended.
added_token_num = tokenizer.add_special_tokens(
    {
        "additional_special_tokens": [
            "[LOC]", "[DAT]", "[NOH]", "[PER]", "[ORG]", "[POH]",
        ]
    }
)

In [3]:
def load_data(dataset_dir):
  """Read the CSV at ``dataset_dir`` and return it in preprocessed form.

  The raw frame is passed through ``TEMP_preprocessing_dataset`` (the typed
  entity marker variant); the earlier ``preprocessing_dataset`` helper is no
  longer used here.
  """
  raw_frame = pd.read_csv(dataset_dir)
  return TEMP_preprocessing_dataset(raw_frame)

def label_to_num(label):
  """Translate an iterable of label strings into their integer class ids.

  The mapping is read from ``dict_label_to_num.pkl`` in the current working
  directory on every call.  A label missing from the mapping raises
  ``KeyError``.
  """
  # NOTE: pickle.load executes arbitrary code from the file; only use a
  # mapping file that comes from a trusted source.
  with open('dict_label_to_num.pkl', 'rb') as mapping_file:
    mapping = pickle.load(mapping_file)

  return [mapping[name] for name in label]

In [4]:
# Typed Entity Marker (punct) applied to the query entities only
def TEMP_preprocessing_dataset(dataset, apply_corrections=None):
  """Convert the raw CSV DataFrame into typed-entity-marker form.

  Every subject entity is rendered as ``@ *[ TYPE ]* word @`` and every
  object entity as ``# ^[ TYPE ]^ word #``.  When corrections are enabled a
  hand-curated list is applied first: duplicated rows are dropped, a few
  labels are replaced and a few entity types are overridden.

  Args:
    dataset: DataFrame with ``id``, ``sentence``, ``subject_entity``,
      ``object_entity`` and ``label`` columns.  The two entity columns hold
      dict literals stored as strings with at least ``word`` and ``type``.
    apply_corrections: Whether to apply the curated corrections.  ``None``
      (the default) keeps the original heuristic: only frames with more than
      30000 rows - i.e. the training split - are corrected, so inference
      data is left untouched.

  Returns:
    A new DataFrame with the columns ``id``, ``sentence``,
    ``subject_entity``, ``object_entity`` and ``label``.

  Raises:
    ValueError / SyntaxError: if an entity cell is not a valid literal.
  """
  if apply_corrections is None:
    apply_corrections = len(dataset) > 30000

  remove_ids = frozenset()
  relabel_by_id = {}
  retype_by_id = {}
  if apply_corrections:
    # Duplicated rows to drop.  (The author left an open question about
    # id 17142 for both this list and the relabel list.)
    remove_ids = frozenset([
        3547, 3296, 10202, 27920, 22222, 7168, 15776, 19571, 25368, 10616,
        25094, 20898, 18171, 27325, 22772, 8693, 12829, 14658, 31786, 24788,
        8364, 29180, 31896, 10043, 22090, 32282, 14094, 22258, 31785, 28350,
        21757, 31510, 29511, 20062, 27116, 31038, 26044, 22641, 24373, 30640,
        28608, 29854, 28730, 28010, 29674, 30378, 32274,
        18458, 27755, 20838, 7080, 25673,
    ])

    # Rows whose label is replaced.
    relabel_by_id = {
        6749: 'org:top_members/employees',
        7276: 'per:employee_of',
        13371: 'per:employee_of',
    }

    # Rows whose entity type is replaced: id -> (which entity, new type).
    # PER: person / LOC: location / ORG: organisation / POH: other proper noun
    # DAT: date / TIM: time / DUR: duration / MNY: currency / PNT: ratio
    # NOH: other numeric expression
    retype_by_id = {
        2464: ('obj_entity', 'ORG'),
        30258: ('obj_entity', 'POH'),
        6530: ('obj_entity', 'PER'),
        7264: ('obj_entity', 'ORG'),
        15128: ('obj_entity', 'POH'),
        11554: ('obj_entity', 'LOC'),
        28644: ('obj_entity', 'PER'),
        28281: ('sub_entity', 'ORG'),
    }

  ids = []
  sentences = []
  subject_entity = []
  object_entity = []
  labels = []
  for id_, sentence, sub_ent, obj_ent, label in zip(dataset['id'],
                                                    dataset['sentence'],
                                                    dataset['subject_entity'],
                                                    dataset['object_entity'],
                                                    dataset['label']):
    # Drop duplicated rows.
    if id_ in remove_ids:
      continue

    # literal_eval parses the dict literal without executing arbitrary code,
    # unlike eval; each cell is parsed once.
    sub_info = ast.literal_eval(sub_ent)
    obj_info = ast.literal_eval(obj_ent)
    sub_type = sub_info["type"]
    obj_type = obj_info["type"]

    # Type correction must happen BEFORE the marker string is built.  The
    # previous code overwrote the finished marker with the bare type
    # (e.g. 'ORG'), which discarded the entity word and its markers.
    if id_ in retype_by_id:
      target, new_type = retype_by_id[id_]
      if target == 'sub_entity':
        sub_type = new_type
      else:
        obj_type = new_type

    ids.append(id_)
    sentences.append(sentence)
    subject_entity.append(' '.join(['@', '*[', sub_type, ']*', sub_info["word"], '@']))
    object_entity.append(' '.join(['#', '^[', obj_type, ']^', obj_info["word"], '#']))
    # Label correction falls back to the original label.
    labels.append(relabel_by_id.get(id_, label))

  out_dataset = pd.DataFrame({'id':ids, 'sentence':sentences,'subject_entity':subject_entity,'object_entity':object_entity,'label':labels,})
  return out_dataset

In [5]:
# Load and preprocess the training split.
TRAIN_CSV_PATH = "/opt/ml/dataset/train/train.csv"
dataset = load_data(TRAIN_CSV_PATH)

# Integer class ids, aligned row-for-row with `dataset`.
label = label_to_num(dataset['label'].values)
dataset.head()

Unnamed: 0,id,sentence,subject_entity,object_entity,label
0,0,〈Something〉는 조지 해리슨이 쓰고 비틀즈가 1969년 앨범 《Abbey R...,@ *[ ORG ]* 비틀즈 @,# ^[ PER ]^ 조지 해리슨 #,no_relation
1,1,호남이 기반인 바른미래당·대안신당·민주평화당이 우여곡절 끝에 합당해 민생당(가칭)으...,@ *[ ORG ]* 민주평화당 @,# ^[ ORG ]^ 대안신당 #,no_relation
2,2,K리그2에서 성적 1위를 달리고 있는 광주FC는 지난 26일 한국프로축구연맹으로부터...,@ *[ ORG ]* 광주FC @,# ^[ ORG ]^ 한국프로축구연맹 #,org:member_of
3,3,균일가 생활용품점 (주)아성다이소(대표 박정부)는 코로나19 바이러스로 어려움을 겪...,@ *[ ORG ]* 아성다이소 @,# ^[ PER ]^ 박정부 #,org:top_members/employees
4,4,1967년 프로 야구 드래프트 1순위로 요미우리 자이언츠에게 입단하면서 등번호는 8...,@ *[ ORG ]* 요미우리 자이언츠 @,# ^[ DAT ]^ 1967 #,no_relation


In [6]:
# Stratified 80/20 train/dev split; the fixed seed keeps it reproducible.
train_dataset, dev_dataset, train_label, dev_label = train_test_split(
    dataset,
    label,
    test_size=0.2,
    shuffle=True,
    stratify=label,
    random_state=34,
)

In [7]:
# train_dataset.iloc[0]["sentence"]

In [8]:
# print(train_dataset.iloc[0]["subject_entity"])

In [9]:
def tokenized_dataset(dataset, tokenizer):
  """Tokenize (entity query, sentence) pairs with the given tokenizer.

  The first segment is a short query built from the two entity markers
  ("<subject>과<object>의 관계"); the second segment is the sentence.
  Whatever the tokenizer call returns is handed back unchanged.
  """
  entity_queries = [
      subj + '과' + obj + '의 관계'
      for subj, obj in zip(dataset['subject_entity'], dataset['object_entity'])
  ]
  sentences = list(dataset['sentence'])

  encode_options = dict(
      return_tensors="pt",
      padding=True,
      truncation=True,
      max_length=256,
      add_special_tokens=True,
      return_token_type_ids=False,
  )
  return tokenizer(entity_queries, sentences, **encode_options)

In [10]:
# Tokenize the train and dev splits (in that order) with the same tokenizer.
tokenized_train, tokenized_dev = (
    tokenized_dataset(split, tokenizer) for split in (train_dataset, dev_dataset)
)

In [11]:
# tokenized_train[0]

In [12]:
# print(tokenized_train[0].tokens)

In [13]:
# print(tokenizer.decode(tokenized_train[0].ids))

In [14]:
# print(tokenizer.decode(tokenized_train[0].ids).count(train_dataset.iloc[0]["subject_entity"].split()[-2]))

In [15]:
# cnt = 0
# for key, values in tokenized_train.items():
#     print(key)
#     print(len(values))
#     print(values)
#     print(len(values[0]))
#     if cnt == 0:
#         break

In [16]:
# 토크나이징을 했을 때 키워드가 사라지는 ID확인
# print('토크나이징을 했을 때 키워드가 사라지는 ID')
# print("train")
# for i in range(len(tokenized_train)):
#     sentence = tokenizer.decode(tokenized_train[i].ids)
#     id_ = train_dataset.iloc[i]["id"]
#     sub_word = train_dataset.iloc[i]["subject_entity"].split()[-2]
#     obj_word = train_dataset.iloc[i]["object_entity"].split()[-2]
#     if sentence.count(sub_word)==1 or sentence.count(obj_word)==1:
#         print('ID:',id_)

# print('dev')
# for i in range(len(tokenized_dev)):
#     sentence = tokenizer.decode(tokenized_dev[i].ids)
#     id_ = dev_dataset.iloc[i]["id"]
#     sub_word = dev_dataset.iloc[i]["subject_entity"].split()[-2]
#     obj_word = dev_dataset.iloc[i]["object_entity"].split()[-2]
#     if sentence.count(sub_word)==1 or sentence.count(obj_word)==1:
#         print('ID:',id_)
        

In [17]:
# for i in range(len(train_dataset)):
#     if train_dataset.iloc[i]["id"] == 2843:
#         print(train_dataset.iloc[i]["sentence"])

In [18]:
# train_sentence_len = []
# for i in range(len(train_dataset)):
#     train_sentence_len.append(len(train_dataset.iloc[i]["sentence"]))
# print('train_sentence 최대길이:',sorted(train_sentence_len,reverse=True)[:50])
    
# dev_sentence_len = []
# for i in range(len(dev_dataset)):
#     dev_sentence_len.append(len(dev_dataset.iloc[i]["sentence"]))
# print('dev_sentence 최대길이:',sorted(dev_sentence_len,reverse=True)[:50])

In [19]:
class RE_Dataset(torch.utils.data.Dataset):
  """Torch dataset pairing tokenizer encodings with integer labels.

  ``pair_dataset`` is a mapping from field name to an indexable tensor and
  ``labels`` is an indexable sequence of class ids; the dataset length is
  the number of labels.
  """

  def __init__(self, pair_dataset, labels):
    self.pair_dataset = pair_dataset
    self.labels = labels

  def __len__(self):
    return len(self.labels)

  def __getitem__(self, idx):
    # Copy every encoded field for this row so callers never alias the
    # underlying encoding tensors.
    sample = {}
    for field, values in self.pair_dataset.items():
      sample[field] = values[idx].clone().detach()
    sample['labels'] = torch.tensor(self.labels[idx])
    return sample

In [20]:
# Wrap the encodings and integer labels as torch datasets.
RE_train_dataset = RE_Dataset(pair_dataset=tokenized_train, labels=train_label)
RE_dev_dataset = RE_Dataset(pair_dataset=tokenized_dev, labels=dev_label)

In [21]:
# Prefer the first GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
  device = torch.device('cuda:0')
else:
  device = torch.device('cpu')
print(device)

cuda:0


In [22]:
# # BiGRU
# class Model_BiGRU(nn.Module):
#   def __init__(self, MODEL_NAME):
#     super().__init__()
#     self.model_config =  AutoConfig.from_pretrained(MODEL_NAME)
#     self.model_config.num_labels = 30
#     self.model = AutoModel.from_pretrained(MODEL_NAME, config = self.model_config)
#     self.hidden_dim = self.model_config.hidden_size
#     self.gru= nn.GRU(input_size= self.hidden_dim, hidden_size= self.hidden_dim, num_layers= 1, batch_first= True, bidirectional= True)
#     self.fc = nn.Linear(self.hidden_dim * 2, self.model_config.num_labels)
#     print('*** Model_BiGRU init....')
  
#   def forward(self, input_ids, attention_mask):
#     output = self.model(input_ids=input_ids, attention_mask=attention_mask).last_hidden_state
#     # (batch, max_len, hidden_dim)

#     hidden, last_hidden = self.gru(output)
#     output = torch.cat((last_hidden[0], last_hidden[1]), dim=1)
#     # hidden : (batch, max_len, hidden_dim * 2)
#     # last_hidden : (2, batch, hidden_dim)
#     # output : (batch, hidden_dim * 2)

#     logits = self.fc(output)
#     # logits : (batch, num_labels)

#     return {'logits' : logits}

In [23]:
import torch.nn.functional as F

class ModelStatic(nn.Module):
    """Sequence classifier whose logits are nudged by running prediction statistics.

    ``static_metrics[p][t]`` counts how often class ``p`` was predicted while
    the true label was ``t`` (filled in by ``update_static_metrics``).  In
    ``forward`` the row belonging to each sample's predicted class is turned
    into a softmax distribution, divided by ``static_divisor`` and added to
    the raw logits.  The statistics are cleared every
    ``static_reset_interval`` forward calls.
    """

    # Divisor applied to the softmaxed statistics before they are added.
    static_divisor = 10
    # Number of forward calls between two resets of the statistics.
    static_reset_interval = 50

    def __init__(self, MODEL_NAME):
        """Load the pretrained classifier ``MODEL_NAME`` with 30 output labels."""
        super().__init__()
        print(MODEL_NAME, 'model loading..')
        self.model_config =  AutoConfig.from_pretrained(MODEL_NAME)
        self.model_config.num_labels = 30
        self.model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, config = self.model_config)
        # Registered as a buffer so it follows the module across .to(device);
        # non-persistent so the state_dict / checkpoints keep their old keys.
        self.register_buffer(
            'static_metrics',
            self.init_static_metrics(self.model_config.num_labels),
            persistent=False,
        )
        self.static_cnt = 0
        print('*** Model_Static initialized..!!')

    def forward(self, input_ids, token_type_ids=None, attention_mask=None, labels=None,
                position_ids=None, head_mask=None):
        """Return ``{'logits': ...}`` with the statistics-based bias applied.

        ``labels`` is accepted for interface compatibility but is not used.
        """
        logits = self.model(input_ids, position_ids=position_ids, token_type_ids=token_type_ids,
                            attention_mask=attention_mask, head_mask=head_mask).logits

        # One statistics row per sample, selected by the predicted class.
        predicted = logits.argmax(dim=-1).to(self.static_metrics.device)
        static_p = F.softmax(self.static_metrics[predicted], dim=-1) / self.static_divisor
        # Use the logits' own device instead of a notebook-level global.
        logits = logits + static_p.to(logits.device)

        self.static_cnt += 1
        if self.static_cnt % self.static_reset_interval == 0:
            # Previously the freshly created zero matrix was discarded, so the
            # statistics were never actually reset.  Zero in place to keep the
            # tensor on its current device.
            self.static_metrics.zero_()

        return {'logits' : logits}

    def init_static_metrics(self,num_labels):
        """Return a zeroed ``(num_labels, num_labels)`` float count matrix."""
        return torch.zeros([num_labels, num_labels], dtype=torch.float)

    def update_static_metrics(self,logits,labels):
        """Add one count at ``[argmax(logit), label]`` for every sample.

        Always returns 0.
        """
        with torch.no_grad():
            stats = self.static_metrics
            predicted = logits.argmax(dim=-1).to(stats.device)
            targets = torch.as_tensor(labels, device=stats.device).long()
            increments = torch.ones_like(predicted, dtype=stats.dtype)
            # accumulate=True adds repeated (predicted, label) pairs correctly.
            stats.index_put_((predicted, targets), increments, accumulate=True)
        return 0

In [24]:
# Instantiate the model wrapper around the pretrained classifier.
model = ModelStatic(MODEL_NAME)
# Resize the embedding matrix to the tokenizer's full size.  len(tokenizer)
# counts the base vocabulary plus every added token, so it stays correct even
# if some of the special tokens already existed and were not newly added
# (tokenizer.vocab_size + added_token_num would then be wrong).
model.model.resize_token_embeddings(len(tokenizer))
model.to(device)

klue/roberta-large model loading..


Some weights of the model checkpoint at klue/roberta-large were not used when initializing RobertaForSequenceClassification: ['lm_head.layer_norm.bias', 'lm_head.layer_norm.weight', 'lm_head.decoder.bias', 'lm_head.dense.weight', 'lm_head.decoder.weight', 'lm_head.dense.bias', 'lm_head.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at klue/roberta-large and are newly initialized: ['classifier.out_proj.weight', 'classifier.dense.weight', 'cla

*** Model_Static initialized..!!


ModelStatic(
  (model): RobertaForSequenceClassification(
    (roberta): RobertaModel(
      (embeddings): RobertaEmbeddings(
        (word_embeddings): Embedding(32006, 1024)
        (position_embeddings): Embedding(514, 1024, padding_idx=1)
        (token_type_embeddings): Embedding(1, 1024)
        (LayerNorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (encoder): RobertaEncoder(
        (layer): ModuleList(
          (0): RobertaLayer(
            (attention): RobertaAttention(
              (self): RobertaSelfAttention(
                (query): Linear(in_features=1024, out_features=1024, bias=True)
                (key): Linear(in_features=1024, out_features=1024, bias=True)
                (value): Linear(in_features=1024, out_features=1024, bias=True)
                (dropout): Dropout(p=0.1, inplace=False)
              )
              (output): RobertaSelfOutput(
                (dense): Linear(in_featu

In [25]:
class CustomTrainer(Trainer):
    """Trainer that uses plain cross-entropy and feeds the model's running statistics.

    After every loss computation the logits and labels are passed to the
    model's ``update_static_metrics``.
    """

    def compute_loss(self, model, inputs, return_outputs= False):
        """Compute the cross-entropy loss for one batch.

        Args:
            model: the (possibly wrapped) model; its output must be a mapping
                with a ``'logits'`` entry.
            inputs: batch dict; ``'labels'`` is popped before the forward pass.
            return_outputs: when true, return ``(loss, outputs)``.

        Returns:
            The loss tensor, or ``(loss, outputs)`` if ``return_outputs``.
        """
        labels= inputs.pop('labels')
        # forward pass
        outputs= model(**inputs)

        # Keep the past state when the arguments ask for it.
        if self.args.past_index >=0:
            self._past= outputs[self.args.past_index]

        # CrossEntropyLoss holds no parameters, so no device placement is
        # needed.  (The previous device selection also tested the function
        # object `torch.cuda.is_available` instead of calling it, which is
        # always truthy.)
        # NOTE(review): the loss is computed directly here, so
        # TrainingArguments.label_smoothing_factor is not applied - confirm
        # that this is intended.
        custom_loss= torch.nn.CrossEntropyLoss()
        loss= custom_loss(outputs['logits'], labels)

        # Unwrap DataParallel/DDP-style wrappers, which expose the real model
        # as `.module`; detach because this is bookkeeping only.
        # NOTE(review): this also runs during evaluation, so dev-set labels
        # enter the statistics - confirm that this is acceptable.
        stats_model= getattr(model, 'module', model)
        stats_model.update_static_metrics(outputs['logits'].detach(), labels)
        return (loss, outputs) if return_outputs else loss

In [26]:
def compute_metrics(pred):
  """Compute validation metrics and mirror them to Weights & Biases.

  ``pred`` must expose ``label_ids`` and ``predictions``; the predicted class
  is the argmax over the last axis of ``predictions``.  Returns a dict with
  'micro f1 score', 'auprc' and 'accuracy'.
  """
  # NOTE(review): wandb, accuracy_score, klue_re_micro_f1 and klue_re_auprc
  # are not imported or defined in the visible cells - confirm they are in
  # scope before evaluation runs.
  gold = pred.label_ids
  scores = pred.predictions
  predicted = scores.argmax(-1)

  f1 = klue_re_micro_f1(predicted, gold)
  auprc = klue_re_auprc(scores, gold)
  # Accuracy is reported for reference; it is not part of the leaderboard score.
  acc = accuracy_score(gold, predicted)

  wandb.log({'micro f1 score': f1})
  wandb.log({'accuracy': acc})

  # wandb table of (prediction, label) pairs for error analysis.
  record_table = wandb.Table(columns=["preds", "labels"])
  for pair in zip(predicted, gold):
    record_table.add_data(*pair)
  wandb.log({"predictions" : record_table})

  results = {}
  results['micro f1 score'] = f1
  results['auprc'] = auprc
  results['accuracy'] = acc
  return results

In [27]:
# Training configuration: checkpoint and evaluate once per epoch, train for up
# to 7 epochs with mixed precision, and reload the best checkpoint at the end.
training_args = TrainingArguments(
    output_dir='./results',          # where checkpoints are written
    save_total_limit=5,              # keep at most this many checkpoints
    save_strategy='epoch',           # save a checkpoint at the end of every epoch
    save_steps=500,                 # NOTE(review): presumably ignored while save_strategy='epoch' - confirm
    num_train_epochs=7,              # total number of training epochs
    learning_rate=5e-5,               # peak learning rate
    per_device_train_batch_size=29,  # batch size per device during training
    per_device_eval_batch_size=29,   # batch size per device during evaluation
    warmup_ratio=0.1,                # fraction of the steps used for learning-rate warmup
    # warmup_steps=500,                # number of warmup steps for learning rate scheduler
    weight_decay=0.01,               # strength of weight decay
    label_smoothing_factor=0.1,      # NOTE(review): CustomTrainer.compute_loss builds its own CrossEntropyLoss, so this may have no effect - confirm
    logging_dir='./logs',            # directory for storing logs
    logging_steps=500,              # log every this many steps
    evaluation_strategy='epoch', # evaluation strategy to adopt during training
                                # `no`: No evaluation during training.
                                # `steps`: Evaluate every `eval_steps`.
                                # `epoch`: Evaluate every end of epoch.
    # eval_steps = 500,            # evaluation step.
    load_best_model_at_end = True,   # reload the best checkpoint when training finishes
    fp16=True,                       # mixed-precision training
    # report_to="wandb",  # enable logging to W&B
    # run_name="bert-base-high-lr"
  )

# Trainer wiring: the custom loss lives in CustomTrainer, and early stopping
# is configured with a patience of 2.
trainer = CustomTrainer(
    model=model,                         # the ModelStatic instance created above
    args=training_args,                  # training arguments, defined above
    train_dataset=RE_train_dataset,      # training dataset
    eval_dataset=RE_dev_dataset,         # evaluation dataset
    compute_metrics=compute_metrics,      # metrics function run at evaluation time
    callbacks = [EarlyStoppingCallback(early_stopping_patience=2)]
  )

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Using amp fp16 backend


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [28]:
# Start fine-tuning.  The recorded run below was interrupted manually
# (KeyboardInterrupt) before the first epoch finished reporting.
trainer.train()

***** Running training *****
  Num examples = 25934
  Num Epochs = 7
  Instantaneous batch size per device = 29
  Total train batch size (w. parallel, distributed & accumulation) = 29
  Gradient Accumulation steps = 1
  Total optimization steps = 6265
Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"
[34m[1mwandb[0m: Currently logged in as: [33mboostcamp-nlp10-level2[0m (use `wandb login --relogin` to force relogin)


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Epoch,Training Loss,Validation Loss


KeyboardInterrupt: 


