## 모델선정 3
- 첫 BERT 모델인 BERT_model의 코드를 더 간결하게 만들고 성능을 향상시키기 위한 작업을 수행하였습니다
- Hugging Face라이브러리를 도입하여 더욱 간결한 코드 작성 및 학습을 추구하였습니다
- 기존 KoBERT 모델의 마지막 Classification 단계에 층을 하나 더 추가하여 보다 복잡한 분류를 수행하고자 하였습니다

## 발견한 점
- 정확도가 0.986으로 이전 BERT모델에 비하여 성능 향상을 꾀할 수 있었습니다
- 모델의 틀이 확정되어 하이퍼 파라미터 튜닝을 위한 작업을 계획하게 되었습니다

In [None]:
import os, sys
from google.colab import drive

# Mount the user's Google Drive into the Colab filesystem.
drive.mount('/content/drive')

my_path = '/content/notebooks'
# Packages are stored on Drive under "Colab Notebooks/py_env".
# This cell has to be re-run in every session so that the packages in that
# directory can be imported.
# os.symlink raises FileExistsError when the link is already there, so only
# create it when nothing exists at my_path; this keeps the cell re-runnable.
if not os.path.lexists(my_path):
  os.symlink('/content/drive/MyDrive/Colab Notebooks/py_env', my_path)
sys.path.insert(0, my_path)

Mounted at /content/drive


In [None]:
import sys
# Add the DistilKoBERT-master folder on Drive to the import path; this has to
# happen before the tokenization_kobert import below can succeed.
sys.path.append('/content/drive/MyDrive/Colab Notebooks/py_env/DistilKoBERT-master')
from tokenization_kobert import KoBertTokenizer

# Load the KoBERT tokenizer identified by 'monologg/kobert'.
tf_tok = KoBertTokenizer.from_pretrained('monologg/kobert')

HBox(children=(HTML(value='Downloading'), FloatProgress(value=0.0, max=371391.0), HTML(value='')))




HBox(children=(HTML(value='Downloading'), FloatProgress(value=0.0, max=77779.0), HTML(value='')))




In [None]:
import torch
from torch import nn
import torch.nn.functional as F
import torch.optim as optim
import numpy as np
from tqdm import tqdm, tqdm_notebook

In [None]:
#전처리 작업을 일원화 하는 함수 만들기

#데이터셋 클래스 선언
class nh_news_Dataset(torch.utils.data.Dataset):
  """Dataset that pairs tokenizer encodings with their labels.

  ``encodings`` is a mapping from field name to an indexable collection of
  per-example values, and ``labels`` is an indexable collection with one
  label per example. Each item is a dict of tensors: one entry per encoding
  field plus a ``'labels'`` entry.
  """

  def __init__(self, encodings, labels):
    """Store the encodings mapping and the label sequence as given."""
    self.labels = labels
    self.encodings = encodings

  def __len__(self):
    """Return the number of examples, taken from the label sequence."""
    return len(self.labels)

  def __getitem__(self, idx):
    """Return example ``idx`` as a dict of tensors."""
    sample = {}
    for field, values in self.encodings.items():
      sample[field] = torch.tensor(values[idx])
    # The label is added last under the fixed key 'labels'.
    sample['labels'] = torch.tensor(self.labels[idx])
    return sample

In [None]:
#데이터셋 생성 작업 일원화 함수 생성
import pandas as pd
from sklearn.model_selection import train_test_split

def make_nh_dataset(file_path, tokenizer, eval = False):
  """Read a CSV file and turn it into tokenized dataset(s).

  The CSV is loaded with pandas; column ``tle_cont`` supplies the texts and
  column ``info`` supplies the labels.

  Args:
    file_path: Path handed to ``pd.read_csv``.
    tokenizer: Callable invoked as
      ``tokenizer(texts, truncation=True, padding=True, max_length=128)``.
    eval: When truthy, split the data into train/validation parts
      (10% validation, stratified on the labels, ``random_state=156``).
      The name shadows the builtin but is kept for existing callers.

  Returns:
    ``(train_dataset, val_dataset)`` when ``eval`` is truthy, otherwise a
    single dataset covering the whole file. All are ``nh_news_Dataset``.
  """

  def _encode(texts):
    # One place for the tokenizer settings so every split is encoded alike.
    return tokenizer(texts, truncation=True, padding=True, max_length=128)

  df = pd.read_csv(file_path)

  df_texts = df['tle_cont'].tolist()
  df_labels = df['info'].tolist()

  if not eval:
    return nh_news_Dataset(_encode(df_texts), df_labels)

  train_texts, val_texts, train_labels, val_labels = train_test_split(
      df_texts, df_labels,
      test_size=0.1, random_state=156,
      stratify=df_labels)

  train_dataset = nh_news_Dataset(_encode(train_texts), train_labels)
  val_dataset = nh_news_Dataset(_encode(val_texts), val_labels)

  return train_dataset, val_dataset

In [None]:
# Build the train and validation datasets from the training file
# (eval=True makes make_nh_dataset hold out a stratified 10% for validation).
tr_dataset, val_dataset = make_nh_dataset('/content/drive/MyDrive/dacon/train_c_df.txt', tf_tok, eval=True)

In [None]:
# Build a single dataset from the test file (no train/validation split).
test_dataset = make_nh_dataset('/content/drive/MyDrive/dacon/test_c_df.txt', tf_tok)

In [None]:
#커스텀 BERT_Classifier 정의
#소스코드는 https://huggingface.co/transformers/v3.0.2/_modules/transformers/modeling_bert.html#BertForSequenceClassification.forward
#버전을 제대로 확인하지 못하고 최신버전의 소스코드를 가져올 경우 에러 발생
#3.0.2 버전 기준으로 소스코드를 가져와서 구현함

from transformers import BertPreTrainedModel
from transformers import BertModel
from torch.nn import CrossEntropyLoss, MSELoss

class custom_bert_clf(BertPreTrainedModel):
    """BERT sequence classifier with a two-layer classification head.

    The head is: dropout -> Linear(hidden_size, 256) -> GELU ->
    Linear(256, num_labels), applied to the second element of the BERT
    outputs (named the pooled output in the original code).
    """

    def __init__(self, config):
        """Create the BERT encoder and the head from ``config``."""
        super().__init__(config)
        self.num_labels = config.num_labels

        # Encoder followed by the head; sub-modules are registered in the
        # same order as before.
        self.bert = BertModel(config)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.classifier = nn.Linear(config.hidden_size, 256)
        self.classifier2 = nn.Linear(256, config.num_labels)
        self.gelu = nn.GELU()
        self.init_weights()

    def forward(
        self,
        input_ids=None,
        attention_mask=None,
        token_type_ids=None,
        position_ids=None,
        head_mask=None,
        inputs_embeds=None,
        labels=None,
        output_attentions=None,
        output_hidden_states=None,
    ):
        """Run the encoder and the head, optionally computing a loss.

        Returns a tuple ``(loss), logits, (hidden_states), (attentions)``;
        ``loss`` is present only when ``labels`` is given. With
        ``num_labels == 1`` the loss is MSE (regression), otherwise
        cross-entropy.
        """
        encoder_out = self.bert(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
        )

        # Head: dropout on the pooled output, then the two linear layers
        # with a GELU in between.
        features = self.dropout(encoder_out[1])
        hidden = self.gelu(self.classifier(features))
        logits = self.classifier2(hidden)

        # Keep any extra encoder outputs (hidden states / attentions) after
        # the logits.
        result = (logits,) + encoder_out[2:]

        if labels is None:
            return result

        if self.num_labels == 1:
            # Single output: treated as regression.
            criterion = MSELoss()
            loss = criterion(logits.view(-1), labels.view(-1))
        else:
            criterion = CrossEntropyLoss()
            loss = criterion(logits.view(-1, self.num_labels), labels.view(-1))

        return (loss,) + result

In [None]:
# Load the 'monologg/kobert' checkpoint into the custom classifier. The two
# classifier layers are not part of the checkpoint and are newly initialized
# (see the warning printed below), so the model must be fine-tuned before use.
tf_bertmodel = custom_bert_clf.from_pretrained('monologg/kobert')

Some weights of custom_bert_clf were not initialized from the model checkpoint at monologg/kobert and are newly initialized: ['classifier.weight', 'classifier2.bias', 'classifier.bias', 'classifier2.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Use the GPU when one is available; otherwise fall back to the CPU so that
# moving the model does not fail on a runtime without CUDA.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

tf_bertmodel = tf_bertmodel.to(device)

In [None]:
#Trainer 로 학습하기

from transformers import Trainer, TrainingArguments

# Hyper-parameters for the Hugging Face Trainer.
# warmup_steps: the original note explains 1670 as (total data / batch size)
# and 0.1 as the warmup ratio carried over from the previous training run.
# NOTE(review): the training log in this notebook shows 3340 iterations per
# epoch (global_step=10020 over 3 epochs), so 1670 does not match the steps
# actually run with this configuration -- confirm the intended warmup length.
training_args = TrainingArguments(
    output_dir='/content/drive/MyDrive/dacon/results_1223',
    overwrite_output_dir=True,
    num_train_epochs=3,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=64,
    warmup_steps=int(1670 * 0.1),
    weight_decay=0.01,
    save_steps=500,
    evaluate_during_training=True,
)

In [None]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

def compute_metrics(pred):
    """Compute accuracy, F1, precision and recall for a prediction object.

    ``pred`` must expose ``label_ids`` (true labels) and ``predictions``
    (per-class scores); the predicted class is the argmax over the last
    axis. Precision/recall/F1 use ``average='binary'``.

    Returns a dict with keys ``accuracy``, ``f1``, ``precision``, ``recall``.
    """
    y_true = pred.label_ids
    y_pred = pred.predictions.argmax(-1)
    # The fourth value returned by sklearn (support) is not needed.
    prec, rec, f_score = precision_recall_fscore_support(
        y_true, y_pred, average='binary')[:3]
    scores = {'accuracy': accuracy_score(y_true, y_pred)}
    scores['f1'] = f_score
    scores['precision'] = prec
    scores['recall'] = rec
    return scores

In [None]:
# Wire the model, the training arguments, both datasets and the metric
# callback into a Trainer.
trainer = Trainer(
    model=tf_bertmodel,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=tr_dataset,
    eval_dataset=val_dataset,
)

In [None]:
# Start fine-tuning; the result (global step and training loss) is shown in
# the TrainOutput line of the cell output.
trainer.train()

HBox(children=(HTML(value='Epoch'), FloatProgress(value=0.0, max=3.0), HTML(value='')))

HBox(children=(HTML(value='Iteration'), FloatProgress(value=0.0, max=3340.0), HTML(value='')))

HBox(children=(HTML(value='Evaluation'), FloatProgress(value=0.0, max=186.0), HTML(value='')))






HBox(children=(HTML(value='Evaluation'), FloatProgress(value=0.0, max=186.0), HTML(value='')))




HBox(children=(HTML(value='Evaluation'), FloatProgress(value=0.0, max=186.0), HTML(value='')))




HBox(children=(HTML(value='Evaluation'), FloatProgress(value=0.0, max=186.0), HTML(value='')))




HBox(children=(HTML(value='Evaluation'), FloatProgress(value=0.0, max=186.0), HTML(value='')))




HBox(children=(HTML(value='Evaluation'), FloatProgress(value=0.0, max=186.0), HTML(value='')))





HBox(children=(HTML(value='Iteration'), FloatProgress(value=0.0, max=3340.0), HTML(value='')))

HBox(children=(HTML(value='Evaluation'), FloatProgress(value=0.0, max=186.0), HTML(value='')))




HBox(children=(HTML(value='Evaluation'), FloatProgress(value=0.0, max=186.0), HTML(value='')))




HBox(children=(HTML(value='Evaluation'), FloatProgress(value=0.0, max=186.0), HTML(value='')))




HBox(children=(HTML(value='Evaluation'), FloatProgress(value=0.0, max=186.0), HTML(value='')))




HBox(children=(HTML(value='Evaluation'), FloatProgress(value=0.0, max=186.0), HTML(value='')))




HBox(children=(HTML(value='Evaluation'), FloatProgress(value=0.0, max=186.0), HTML(value='')))




HBox(children=(HTML(value='Evaluation'), FloatProgress(value=0.0, max=186.0), HTML(value='')))





HBox(children=(HTML(value='Iteration'), FloatProgress(value=0.0, max=3340.0), HTML(value='')))

HBox(children=(HTML(value='Evaluation'), FloatProgress(value=0.0, max=186.0), HTML(value='')))




HBox(children=(HTML(value='Evaluation'), FloatProgress(value=0.0, max=186.0), HTML(value='')))




HBox(children=(HTML(value='Evaluation'), FloatProgress(value=0.0, max=186.0), HTML(value='')))




HBox(children=(HTML(value='Evaluation'), FloatProgress(value=0.0, max=186.0), HTML(value='')))




HBox(children=(HTML(value='Evaluation'), FloatProgress(value=0.0, max=186.0), HTML(value='')))




HBox(children=(HTML(value='Evaluation'), FloatProgress(value=0.0, max=186.0), HTML(value='')))




HBox(children=(HTML(value='Evaluation'), FloatProgress(value=0.0, max=186.0), HTML(value='')))






TrainOutput(global_step=10020, training_loss=0.04666464655079062)

## 런타임 튕김 방지용

In [None]:
# Evaluate using the test dataset: switch the model to eval mode, then run
# prediction through the trainer.
tf_bertmodel.eval()

pred = trainer.predict(test_dataset= test_dataset)

HBox(children=(HTML(value='Prediction'), FloatProgress(value=0.0, max=2228.0), HTML(value='')))




  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
# pred[0] holds the per-example class scores returned by trainer.predict
# (assumed to be a 2-D array of shape (examples, classes) -- TODO confirm).
# Take the index of the highest score for every example in one vectorized
# call instead of converting each row to a torch tensor in a Python loop.
pred_result = np.asarray(pred[0]).argmax(axis=-1).tolist()

In [None]:
# Load the sample submission file and set its 'info' column to the predicted
# labels.
test_submission = pd.read_csv('/content/drive/MyDrive/dacon/sample_submission.csv')

test_submission['info'] = pred_result

In [None]:
# Write the submission to Drive without the DataFrame index column.
test_submission.to_csv('/content/drive/MyDrive/dacon/mh_4_submission_bert.csv', index = False)
  # Accuracy rose by about 0.05, to 0.98636.
  # Possible reasons for the improvement:
  # - the KoBERT model was used instead of DistilBERT
  # - one more layer was stacked for classification
  # - the input data combines the title and the content
  # However, the run time exceeded roughly 150 (presumably minutes), taking almost 3 hours.