In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

import warnings
warnings.filterwarnings('ignore')

import os
import re
import random
import json
import datetime
import platform
from tqdm import tqdm
import datasets

from transformers import BertConfig, BertModel, BertTokenizer, BertForSequenceClassification, get_linear_schedule_with_warmup, Trainer, TrainingArguments, DataCollatorWithPadding, DataCollatorForLanguageModeling
from transformers import AutoModel, AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, AdamW, AutoModelForCausalLM, BitsAndBytesConfig, AutoModelForMaskedLM
from transformers import StoppingCriteria, StoppingCriteriaList

## 1. 데이터 로드

In [None]:
# Location of the intermediate medical training corpus (relative to this notebook)
TRAINING_CSV_PATH = '../data/intermediate/Training_medical.csv'
df = pd.read_csv(TRAINING_CSV_PATH)

In [None]:
# Preview the first five rows of the loaded frame
df.head(n=5)

In [None]:
# Row count per value of the 'category' column
category_counts = df['category'].value_counts()
category_counts

In [None]:
# The task concerns mental illness, so keep only rows whose category is '정신과학'
is_psychiatry = df['category'].eq('정신과학')
mental_df = df.loc[is_psychiatry]

## 2. 모델 로드

In [None]:
# Base checkpoint used for continued masked-language-model training
model_id = "monologg/kobert"
# NOTE(review): this checkpoint's tokenizer is believed to be a custom class hosted in
# the Hub repository rather than one built into transformers, which is why
# trust_remote_code=True is passed. This executes Python code downloaded from the
# repository — confirm the repository is trusted (or pin a revision) before running.
tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
model = AutoModelForMaskedLM.from_pretrained(model_id)

In [None]:
# Wrap the 'text' column in a datasets.Dataset so it can be preprocessed with .map()
text_only_df = mental_df[['text']]
dataset = datasets.Dataset.from_pandas(text_only_df)

## 3. tokenize

In [None]:
def tokenize_function(samples):
    """Tokenize the 'text' field of a batch of examples.

    Args:
        samples: Mapping with a 'text' entry. Because the dataset is mapped with
            ``batched=True`` below, this is a list of strings per call; a single
            string is also accepted by the tokenizer call.

    Returns:
        The tokenizer output, truncated and padded to exactly 512 tokens per text.
    """
    tokens = tokenizer(samples['text'], truncation=True, max_length=512, padding='max_length')
    return tokens

# batched=True hands the tokenizer many rows per call instead of one row at a time,
# which removes per-example overhead while producing the same columns.
tokenized_dataset = dataset.map(tokenize_function, batched=True)

## 4. Train

In [None]:
# Collator for masked-language-model training: mlm enabled, masking probability 0.15
collator_kwargs = dict(tokenizer=tokenizer, mlm=True, mlm_probability=0.15)
data_collator = DataCollatorForLanguageModeling(**collator_kwargs)

In [None]:
# Training configuration
training_args = TrainingArguments(
    output_dir="./results/intermediate",    # directory where outputs are saved
    evaluation_strategy="epoch",            # run evaluation at the end of every epoch
    # NOTE(review): 3e-4 looks high for continuing training of a pretrained BERT
    # checkpoint — confirm this value is intentional.
    learning_rate=3e-4,                     # learning rate
    per_device_train_batch_size=8,          # training batch size per device
    per_device_eval_batch_size=8,           # evaluation batch size per device
    report_to=["tensorboard"],              # monitor with tensorboard
    num_train_epochs=3,                     # number of training epochs
    logging_steps=10,                       # log every 10 steps
    weight_decay=0.01,                      # regularization
    logging_dir="./logs",                   # directory where logs are saved
)

# Hold out 10% of the examples so the per-epoch evaluation loss is measured on text
# the model was not trained on. Evaluating on the training set itself cannot reveal
# overfitting. The seed keeps the split identical across notebook re-runs.
split_dataset = tokenized_dataset.train_test_split(test_size=0.1, seed=42)

# Trainer definition
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=split_dataset["train"],
    eval_dataset=split_dataset["test"],
    data_collator=data_collator
)

In [None]:
# Switch off the model config's use_cache flag before training
# (the original note says this is done to silence a warning)
model.config.use_cache = False
# Run training; the returned training output is shown as the cell result
train_output = trainer.train()
train_output

In [None]:
# Evaluation
# model.eval() only switches the module out of training mode; it does not compute
# any metric. Keep the mode switch, then run the Trainer's evaluation loop on the
# eval_dataset the Trainer was constructed with.
model.eval()
eval_metrics = trainer.evaluate()
# Display the metrics returned by the evaluation run as the cell output
eval_metrics