In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import glob

import warnings
warnings.filterwarnings('ignore')

import os
import re
import random
import json
import datetime
import platform
from tqdm import tqdm
import datasets
from datasets import load_dataset, DatasetDict
import gc

from transformers import BertConfig, BertModel, BertTokenizer, BertForSequenceClassification, get_linear_schedule_with_warmup, Trainer, TrainingArguments, DataCollatorWithPadding, DataCollatorForLanguageModeling
from transformers import AutoModel, AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, AdamW, AutoModelForCausalLM, BitsAndBytesConfig, AutoModelForMaskedLM
from transformers import StoppingCriteria, StoppingCriteriaList

import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset, TensorDataset, RandomSampler, SequentialSampler
from torch.nn.utils.rnn import pad_sequence

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, \
                            roc_auc_score, confusion_matrix, classification_report, \
                            matthews_corrcoef, cohen_kappa_score, log_loss, confusion_matrix

In [None]:
# Collect Python garbage first: tensors that are only kept alive by reference
# cycles must be freed before the CUDA caching allocator can give their memory
# back. Emptying the cache before collecting would leave that memory cached.
gc.collect()
torch.cuda.empty_cache()

In [None]:
seed = 42


def seed_everything(seed_value):
    """Seed every random number generator this notebook imports.

    Covers Python's built-in ``random`` module, NumPy's legacy global
    generator and PyTorch (CPU, plus all CUDA devices when CUDA is available).

    Args:
        seed_value: Integer seed applied to each generator.
    """
    # `random` is imported by the notebook, so it has to be seeded as well;
    # otherwise anything drawing from it differs from run to run.
    random.seed(seed_value)
    np.random.seed(seed_value)
    torch.manual_seed(seed_value)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed_value)


seed_everything(seed)

In [None]:
os_name = platform.system()

# macOS is probed for the 'mps' device; every other OS (Windows included)
# is probed for 'cuda'. If the probed accelerator is unavailable, use the CPU.
if os_name == 'Darwin':
    accelerator, has_accelerator = 'mps', torch.backends.mps.is_available()
else:
    accelerator, has_accelerator = 'cuda', torch.cuda.is_available()

device = torch.device(accelerator if has_accelerator else 'cpu')

device

## 1. 데이터 로드

In [None]:
# def convert_df(file_path):
#     records = []
#     with open(file_path, 'r') as f:
#         json_data = json.load(f)
#     for entry in json_data['data']:
#         record = {
#             'book_id': entry['book_id'],
#             'category': entry['category'],
#             'popularity': entry['popularity'],
#             'text': entry['text'],
#             'word_segment': entry['word_segment'],
#             'publication_ymd': entry['publication_ymd']
#         }
#         records.append(record)
        
#     return pd.DataFrame(records)

In [None]:
# file_path = '../data/intermediate/Training_medical.json'
# df = convert_df(file_path)

In [None]:
# mental_df = df[df['category']=='정신과학']

In [None]:
def txtToDf(path, encoding='utf-8'):
    """Read a text file into a one-column DataFrame, one row per line.

    Each line has its surrounding whitespace stripped and then any trailing
    commas removed. Blank lines are kept and become empty strings.

    Args:
        path: Location of the text file to read.
        encoding: Text encoding used to decode the file. It is passed
            explicitly so the result does not depend on the platform's
            default encoding (which is not UTF-8 on every OS).

    Returns:
        A pandas DataFrame with a single ``text`` column.
    """
    with open(path, 'r', encoding=encoding) as file:
        # Strip whitespace first, then the trailing comma(s) of each line.
        cleaned = [line.strip().rstrip(',') for line in file]
    return pd.DataFrame(cleaned, columns=['text'])

In [None]:
# Run from here.
mental1 = pd.read_csv('../data/intermediate/mental1.csv') # medical textbook corpus
mental2 = txtToDf('../data/intermediate/mental2.txt')     # DSM-5
mental3 = txtToDf('../data/intermediate/mental3.txt')     # National Health Information Portal corpus
mental4 = pd.read_csv('../data/intermediate/mental4.csv') # specialised-domain Korean-English corpus

# The train/eval split further down reads `df`, which is otherwise only
# assigned in the commented-out cells above, so a fresh "Restart & Run All"
# stops with a NameError. Build `df` here from the corpora that were loaded.
# NOTE(review): assumes all four corpora are meant to be used and that the two
# CSV files expose their sentences in a `text` column — confirm.
corpora = {
    'mental1': mental1,
    'mental2': mental2,
    'mental3': mental3,
    'mental4': mental4,
}
missing_text = [name for name, frame in corpora.items() if 'text' not in frame.columns]
if missing_text:
    raise KeyError(f"corpora without a 'text' column: {missing_text}")

df = (
    pd.concat([frame[['text']] for frame in corpora.values()], ignore_index=True)
    .dropna(subset=['text'])
    .assign(text=lambda frame: frame['text'].astype(str).str.strip())
    # Blank lines of the .txt corpora arrive as empty strings; they carry no
    # training signal, so drop them.
    .loc[lambda frame: frame['text'] != '']
    .reset_index(drop=True)
)

## 2. 모델 로드

In [None]:
# Model ID: the checkpoint identifier that both the tokenizer and the
# masked-language model below are loaded from.
model_id = "snunlp/KR-BERT-char16424"

# Load the tokenizer and the model (with a masked-LM head) from the same ID.
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForMaskedLM.from_pretrained(model_id)

In [None]:
# Hold out 20% of the rows for evaluation; the fixed random_state keeps the
# split identical between runs.
train_df, eval_df = train_test_split(
    df,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Convert each split into a `datasets.Dataset`, keeping only the `text` column.
train_dataset, eval_dataset = (
    datasets.Dataset.from_pandas(split_df[['text']])
    for split_df in (train_df, eval_df)
)

## 3. tokenize

In [None]:
def tokenize_function(samples):
    """Tokenize the ``text`` field of a dataset example or batch.

    Every sequence is truncated and padded to exactly 512 tokens.

    Args:
        samples: A mapping with a ``text`` entry, as passed in by
            ``Dataset.map`` (a list of strings when ``batched=True``).

    Returns:
        The tokenizer output for ``samples['text']``.
    """
    return tokenizer(samples['text'], truncation=True, padding='max_length', max_length=512)


# batched=True hands the tokenizer many rows per call instead of one row at a
# time; the resulting columns are the same, the mapping is just faster.
tokenized_train_dataset = train_dataset.map(tokenize_function, batched=True)
tokenized_eval_dataset = eval_dataset.map(tokenize_function, batched=True)

## 4. Train

In [None]:
# Collator configured for masked-language modelling (mlm=True) with a
# masking probability of 0.15.
mlm_collator_kwargs = dict(tokenizer=tokenizer, mlm=True, mlm_probability=0.15)
data_collator = DataCollatorForLanguageModeling(**mlm_collator_kwargs)

In [None]:
# Training configuration, grouped by purpose.
# NOTE(review): no `save_total_limit` is set — confirm the number of
# checkpoints written over 200 epochs is acceptable for the available disk.
training_args = TrainingArguments(
    # Output and logging
    output_dir="./results/intermediate",
    logging_dir="./logs",
    report_to=["tensorboard"],
    logging_steps=20,
    # Schedule
    num_train_epochs=200,
    evaluation_strategy="epoch",
    # Optimisation
    learning_rate=3e-4,
    weight_decay=0.05,
    # Batching
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
)

# Trainer wiring: model, arguments, MLM collator and the tokenized splits.
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_eval_dataset,
)

In [None]:
# `use_cache` is switched off on the model config before training starts.
# NOTE(review): confirm this flag has any effect for this masked-LM model.
model.config.use_cache = False
# Launch training; the returned object is kept in `result` so it can be
# displayed in the next cell.
result = trainer.train()

In [None]:
# Display what `trainer.train()` returned.
result

In [None]:
# Put the model into evaluation mode now that training has finished.
model.eval()

## 5. Save

In [None]:
# Save the trained weights (state dict only) as a .pth file.
intermediate_model = trainer.model
weights_path = './model/mental_intertxt_dsm/intermediate_model_weights.pth'
# Nothing earlier in the notebook creates this directory; create it here so
# the save cannot fail on a missing folder after the whole training run.
os.makedirs(os.path.dirname(weights_path), exist_ok=True)
torch.save(intermediate_model.state_dict(), weights_path)

In [None]:
# Save the model with `save_pretrained` into the intermediate_model directory.
model.save_pretrained("./model/mental_intertxt_dsm/intermediate_model")
# NOTE(review): the tokenizer is written to a `tokenize` sub-directory rather
# than next to the model files, so it has to be loaded from that sub-path —
# confirm this is what downstream loaders expect.
tokenizer.save_pretrained("./model/mental_intertxt_dsm/intermediate_model/tokenize")