In [11]:
import pickle as pickle
import os
import pandas as pd
import numpy as np
import random
import torch
from sklearn.metrics import accuracy_score
from transformers import AutoTokenizer, BertForSequenceClassification, Trainer, TrainingArguments, AutoModelForSequenceClassification, BertConfig, AutoConfig, ElectraConfig
from transformers import AdamW, get_constant_schedule, get_constant_schedule_with_warmup, get_cosine_schedule_with_warmup
from load_data import *
import argparse
from importlib import import_module
from pathlib import Path
import glob
import re
import neptune
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import StratifiedShuffleSplit
# project_qualified_name = 'jh951229/Pstage-2-EntityRelationExtraction'
# API_Token = 'eyJhcGlfYWRkcmVzcyI6Imh0dHBzOi8vYXBwLm5lcHR1bmUuYWkiLCJhcGlfdXJsIjoiaHR0cHM6Ly9hcHAubmVwdHVuZS5haSIsImFwaV9rZXkiOiIzMmYwY2E4Ny0yYTg5LTRiZmQtODNjZC1mMzRmN2Q5ODFkNDkifQ=='

# neptune.init(project_qualified_name=project_qualified_name,api_token=API_Token)

# seed 고정 
def seed_everything(seed):
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)  # if use multi-GPU
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    np.random.seed(seed)
    random.seed(seed)


# 평가를 위한 metrics function.
def compute_metrics(pred):
  labels = pred.label_ids
  preds = pred.predictions.argmax(-1)
  # calculate accuracy using sklearn's function
  acc = accuracy_score(labels, preds)
  return {
      'accuracy': acc,
  }


def increment_output_dir(output_path, exist_ok=False):
  
  path = Path(f'./results/{output_path}')
  print(path)
  if (path.exists() and exist_ok) or (not path.exists()):
    return str(path)
  else:
    dirs = glob.glob(f"{path}*")
    matches = [re.search(rf"%s(\d+)" %path.stem, d) for d in dirs]
    i = [int(m.groups()[0]) for m in matches if m]
    n = max(i) + 1 if i else 1
    return f"{path}{n}"

def lower_dir_search(dirname):  # 모델저장된 체크포인트 폴더 경로들을 찾아주는 함수
    filenames = os.listdir(dirname)
    lower_dir_list = []
    for filename in filenames:
        full_filename = os.path.join(dirname, filename)
        lower_dir_list.append(full_filename)
    return lower_dir_list
    
def preprocessing_oversampling_dataset(dataset, label_type, val_ratio):
  label = []
  for i in dataset[8]:
    if i == 'blind':
      label.append(100)
    else:
      label.append(label_type[i])
  out_dataset = pd.DataFrame({'sentence':dataset[1],'entity_01':dataset[2],'entity_02':dataset[5],'label':label,})
  
  # 단순오버샘플링
  label = out_dataset.label
  threshold = int(1/val_ratio)
  need_oversample_label = label.value_counts()[label.value_counts()<threshold].index
  need_oversample_label = np.ceil(threshold/label.value_counts()[need_oversample_label])
  need_oversample_label = dict(need_oversample_label)

  for label,oversample in need_oversample_label.items():
    data_from_label = out_dataset.loc[out_dataset.label==label]
    for _ in range(int(oversample)):
      out_dataset = pd.concat([out_dataset,data_from_label],axis=0)
    out_dataset.index = range(out_dataset.shape[0])
  out_dataset['sentence'] = out_dataset.apply(lambda x: x['entity_01']+"[SEP]"+x['entity_02']+"[SEP]"+x['sentence'], axis=1)
  return out_dataset

def load_oversample_data(dataset_dir):
  # load label_type, classes
  with open('/opt/ml/input/data/label_type.pkl', 'rb') as f:
    label_type = pickle.load(f)
  # load dataset
  dataset = pd.read_csv(dataset_dir, delimiter='\t', header=None)
  # preprecessing dataset
  # dataset = preprocessing_dataset(dataset, label_type)
  dataset = preprocessing_oversampling_dataset(dataset, label_type, 0.2)
  
  return dataset

# bert input을 위한 tokenizing.
# tip! 다양한 종류의 tokenizer와 special token들을 활용하는 것으로도 새로운 시도를 해볼 수 있습니다.
# baseline code에서는 2가지 부분을 활용했습니다.
def tokenized_dataset(dataset, tokenizer):
  concat_entity = []
  for e01, e02 in zip(dataset['entity_01'], dataset['entity_02']):
    temp = ''
    temp = e01 + '[SEP]' + e02
    concat_entity.append(temp)
  tokenized_sentences = tokenizer(
      concat_entity,
      list(dataset['sentence']),
      return_tensors="pt",
      padding=True,
      truncation=True,
      # max_length=100,
      max_length=128,

      add_special_tokens=True,
      )
  return tokenized_sentences


In [13]:
model_name_from_pretrained = "monologg/koelectra-base-v3-discriminator"
model_type_getattr = "Electra"

tokenizer = AutoTokenizer.from_pretrained(model_name_from_pretrained)
model_config = ElectraConfig.from_pretrained(model_name_from_pretrained)
model_config.num_labels = 42
model_config.hidden_dropout_prob = 0.5

model_module = getattr(import_module("transformers"), f'{model_type_getattr}ForSequenceClassification')
# model = model_module.from_pretrained(model_name_from_pretrained, num_labels=42)
model = model_module.from_pretrained(model_name_from_pretrained, config=model_config)

Some weights of the model checkpoint at monologg/koelectra-base-v3-discriminator were not used when initializing ElectraForSequenceClassification: ['discriminator_predictions.dense.weight', 'discriminator_predictions.dense.bias', 'discriminator_predictions.dense_prediction.weight', 'discriminator_predictions.dense_prediction.bias']
- This IS expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at monologg/koelectra-base-v3-discriminator and are newly initialized: 

In [15]:
model

ElectraForSequenceClassification(
  (electra): ElectraModel(
    (embeddings): ElectraEmbeddings(
      (word_embeddings): Embedding(35000, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.5, inplace=False)
    )
    (encoder): ElectraEncoder(
      (layer): ModuleList(
        (0): ElectraLayer(
          (attention): ElectraAttention(
            (self): ElectraSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): ElectraSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm

In [29]:


dataset = load_oversample_data("/opt/ml/input/data/train/train.tsv")
split = StratifiedShuffleSplit(n_splits=1, test_size=0.1, random_state=42)
label = dataset['label']
train_idx, val_idx = next(split.split(dataset, label))
train_dataset = dataset.iloc[train_idx]
val_dataset = dataset.iloc[val_idx]
#val_dataset = load_data("./dataset/train/val.tsv")

train_label = train_dataset['label'].values
val_label = val_dataset['label'].values

# tokenizing dataset
tokenized_train = tokenized_dataset(train_dataset, tokenizer)
tokenized_val = tokenized_dataset(val_dataset, tokenizer)

# make dataset for pytorch.
RE_train_dataset = RE_Dataset(tokenized_train, train_label)
RE_val_dataset = RE_Dataset(tokenized_val, val_label)

device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME,num_labels = 42)
model.parameters
model.to(device)

# total_num_epochs = train_dataset.shape[0]*epochs//batch_size+1
# model_saved_dir = increment_output_dir(MODEL_NAME.replace('/','_')) # f'./results/{output_path}'
# training_args = TrainingArguments(
# output_dir=model_saved_dir,          # output directory
# save_total_limit=3,              # number of total save model.
# save_steps=100,               # model saving step.
# num_train_epochs=epochs,              # total number of training epochs
# learning_rate=5e-5,            # learning_rate
# per_device_train_batch_size=batch_size, #batch size per device during training
# per_device_eval_batch_size=32,  # batch size for evaluation
# warmup_steps=total_num_epochs*0.01,                # number of warmup steps for learning rate scheduler
# weight_decay=0.01,               # strength of weight decay
# logging_dir='./logs',            # directory for storing logs
# logging_steps=50,              # log saving step.
# evaluation_strategy='steps', # evaluation strategy to adopt during training
#                             # `no`: No evaluation during training.
#                             # `steps`: Evaluate every `eval_steps`.
#                             # `epoch`: Evaluate every end of epoch.
# eval_steps = 50,            # evaluation step.
# max_grad_norm=1,



# )

# trainer = Trainer(

# model=model,                         # the instantiated 🤗 Transformers model to be trained
# args=training_args,                  # training arguments, defined above
# train_dataset=RE_train_dataset,         # training dataset
# eval_dataset=RE_val_dataset,             # evaluation dataset
# compute_metrics=compute_metrics         # define metrics function
# )



BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(119547, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elemen

(9019, 4)
(8117, 4)
(902, 4)
