In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

import warnings
warnings.filterwarnings('ignore')

import os
import re
import random
import datetime
import platform
from tqdm import tqdm


from transformers import BertConfig, BertModel, BertTokenizer, BertForSequenceClassification, get_linear_schedule_with_warmup
from transformers import AutoModel, AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, AdamW

import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset, TensorDataset, RandomSampler, SequentialSampler

from keras.preprocessing.sequence import pad_sequences
from torch.nn.utils.rnn import pad_sequence

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, \
                            roc_auc_score, confusion_matrix, classification_report, \
                            matthews_corrcoef, cohen_kappa_score, log_loss, confusion_matrix

- KoBERT : monologg/kobert
- KR-BERT : snunlp/KR-BERT-char16424
- KoELECTRA : monologg/koelectra-base-v3-discriminator
    - input : token id, attention mask, token type id
- Mental_BERT : AIMH/mental-bert-base-cased (https://huggingface.co/AIMH/mental-bert-base-cased)
- Klue-RoBERTa : klue/roberta-base
- KorBERT(후보)

In [None]:
# !pip install ipython-autotime

In [None]:
# 맥북인 경우 mps 할당량 설정
# import torch.mps
# torch.mps.set_per_process_memory_fraction(0.8)

# cache 정리 
torch.cuda.empty_cache()
print(torch.cuda.is_available())
print(torch.cuda.get_device_name(0))

# 셀 별로 러닝타임 측정
%load_ext autotime

In [None]:
# Lookup table: short model alias -> Hugging Face hub checkpoint id
bert_models = dict([
    ("KoBERT", "monologg/kobert"),
    ("KR-BERT", "snunlp/KR-BERT-char16424"),
    ("KoELECTRA", "monologg/koelectra-base-v3-discriminator"),
    ("Klue-RoBERTa", "klue/roberta-base"),
    ("Mental_BERT", "AIMH/mental-bert-base-cased"),
])

In [None]:
# ---- Training hyperparameters (read as globals by the training code) ----

# Reproducibility and tokenization
seed_val = 42
MAX_LEN = 512

# Optimization
BATCH_SIZE = 32        # a smaller value (4) was tried previously
epoch = 10             # a much longer run (500) was tried previously
learning_rate = 5e-5
ep = 1e-8              # passed as `eps` to the AdamW optimizer

# Early stopping: number of epochs without validation-loss improvement to tolerate
patience = 3

In [None]:
class BertModel():
    '''
    Fine-tuning wrapper around a Hugging Face sequence-classification checkpoint.

    NOTE: this class shadows ``transformers.BertModel`` imported at the top of the
    notebook. The name is kept because other cells instantiate it.

    @def : __init__        load model / tokenizer and choose the device
           convert_data    tokenize texts into padded tensors
           convert_tensor  legacy list -> TensorDataset conversion
           train           train / validate with early stopping
           model_save      persist model, state_dict and optimizer state
    '''

    def __init__(self, MODEL_NAME, TOKEN=None, num_labels=11) :
        '''
        Load the pretrained model and tokenizer.
        @param MODEL_NAME : Hugging Face hub id (or local path) of the checkpoint
        @param TOKEN      : optional Hugging Face access token
        @param num_labels : number of target classes (defaults to the 11 labels of this task)
        '''
        # The Auto* classes pick the architecture from the checkpoint config, so
        # ELECTRA / RoBERTa checkpoints are loaded with their own model class
        # instead of being forced into BertForSequenceClassification.
        # NOTE(review): checkpoints that ship a custom tokenizer (possibly KoBERT)
        # may need extra handling - confirm before training with them.
        self.model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=num_labels, token=TOKEN)
        self.tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, token=TOKEN)
        self.seed_val = seed_val
        self.max_len = MAX_LEN
        self.best_loss = float('inf')  # best validation loss seen so far
        self.optimizer = None          # created in train()

        # Device selection: MPS on macOS, CUDA elsewhere, CPU as the fallback
        if platform.system() == 'Darwin' :
            self.device = torch.device('mps' if torch.backends.mps.is_available() else 'cpu')
        else :
            self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    def convert_data(self, texts, targets):
        '''
        Tokenize every text, padding / truncating each one to self.max_len.
        @param texts   : iterable of input strings (must support len())
        @param targets : iterable of integer labels, aligned with texts
        @return tensor : token_ids_tensor      (n, max_len)
                         attention_mask_tensor (n, max_len)
                         targets_tensor        (n,) dtype long
        '''
        token_ids_list, attention_masks, targets_list = [], [], []

        for text, target in tqdm(zip(texts, targets), total=len(texts)):
            # Calling the tokenizer directly with padding='max_length' replaces the
            # deprecated encode_plus(..., pad_to_max_length=True) combination.
            encoded = self.tokenizer(
                text,
                add_special_tokens=True,      # add the model's start / end special tokens
                max_length=self.max_len,
                padding='max_length',
                truncation=True,
                return_attention_mask=True,
                return_tensors='pt',
            )

            token_ids_list.append(encoded['input_ids'])
            attention_masks.append(encoded['attention_mask'])
            targets_list.append(target)

        # torch.cat() rejects an empty list, so return correctly shaped empty tensors
        if not token_ids_list:
            return (torch.empty((0, self.max_len), dtype=torch.long),
                    torch.empty((0, self.max_len), dtype=torch.long),
                    torch.empty((0,), dtype=torch.long))

        token_ids_tensor = torch.cat(token_ids_list, dim=0)
        attention_mask_tensor = torch.cat(attention_masks, dim=0)
        targets_tensor = torch.tensor(targets_list, dtype=torch.long)

        return token_ids_tensor, attention_mask_tensor, targets_tensor

    def convert_tensor(self, token_ids_list, attention_mask, target) :
        '''
        Legacy helper: flatten nested per-text lists into a TensorDataset.
        convert_data() already returns tensors, so the current pipeline does not
        call this method; it is kept only for backward compatibility.
        @param list : token_ids_list, attention_mask, target
        @return TensorDataset : tensor_data
        '''
        token_ids_tensor = torch.tensor([np.array(item) for sublist in token_ids_list for item in sublist], dtype=torch.long)
        attention_mask_tensor = torch.tensor([np.array(item) for sublist in attention_mask for item in sublist], dtype=torch.float)
        targets_tensor = torch.tensor(np.array(target), dtype=torch.long)

        # Size check
        print("Token IDs Tensor Size: ", token_ids_tensor.size())
        print("Attention Mask Tensor Size: ", attention_mask_tensor.size())
        print("Targets Tensor Size: ", targets_tensor.size())

        tensor_data = TensorDataset(token_ids_tensor, attention_mask_tensor, targets_tensor)

        return tensor_data

    def train(self, train_loader, val_loader):
        '''
        Run the train / validation loop with early stopping on validation loss.
        Reads the notebook-level globals learning_rate, ep, epoch and patience.
        @param DataLoader : train_loader, val_loader - batches of (input_ids, attention_mask, labels)
        @return None
        '''
        model = self.model
        seed_val = self.seed_val
        device = self.device
        print("현재 device 정보:", device)

        self.optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate, eps=ep)
        scheduler = get_linear_schedule_with_warmup(self.optimizer, num_warmup_steps=0, num_training_steps=len(train_loader) * epoch)

        # Fix every RNG that is in play
        random.seed(seed_val)
        np.random.seed(seed_val)
        torch.manual_seed(seed_val)
        torch.cuda.manual_seed_all(seed_val)

        model.to(device)
        model.zero_grad()

        # Must exist before the first comparison: a NaN validation loss, or a
        # best_loss left over from an earlier train() call, would otherwise hit
        # the `+= 1` branch with the counter still undefined.
        early_stopping_counter = 0

        for e in range(epoch):
            model.train()
            total_loss, total_correct, total_samples = 0, 0, 0

            for step, batch in tqdm(enumerate(train_loader), total=len(train_loader), desc=f'Epoch {e+1}/{epoch}', leave=False):
                batch_input_ids, batch_input_mask, batch_labels = (item.to(device) for item in batch)

                model.zero_grad()
                outputs = model(batch_input_ids, token_type_ids=None, attention_mask=batch_input_mask, labels=batch_labels)
                loss = outputs.loss
                total_loss += loss.item()
                loss.backward()
                torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
                self.optimizer.step()
                scheduler.step()

                predictions = torch.argmax(outputs.logits.detach(), dim=-1)
                total_correct += (predictions == batch_labels).sum().item()
                total_samples += batch_labels.size(0)

            # max(..., 1) keeps an empty loader from raising ZeroDivisionError
            avg_train_loss = total_loss / max(len(train_loader), 1)
            train_accuracy = total_correct / max(total_samples, 1)
            print(f'Epoch {e+1}: Average train loss: {avg_train_loss:.4f} / Accuracy: {train_accuracy:.4f}')

            # Validation
            model.eval()
            val_loss, val_correct, val_samples = 0, 0, 0
            val_true, val_pred = [], []

            with torch.no_grad():
                for batch in val_loader:
                    batch_input_ids, batch_input_mask, batch_labels = (item.to(device) for item in batch)

                    outputs = model(batch_input_ids, token_type_ids=None, attention_mask=batch_input_mask, labels=batch_labels)
                    val_loss += outputs.loss.item()

                    predictions = torch.argmax(outputs.logits, dim=-1)
                    val_correct += (predictions == batch_labels).sum().item()
                    val_samples += batch_labels.size(0)
                    val_true.extend(batch_labels.cpu().tolist())
                    val_pred.extend(predictions.cpu().tolist())

            avg_val_loss = val_loss / max(len(val_loader), 1)
            val_accuracy = val_correct / max(val_samples, 1)
            print(f'Epoch {e+1}: Average validation loss: {avg_val_loss:.4f} / Accuracy: {val_accuracy:.4f}')

            # Early stopping bookkeeping
            if avg_val_loss < self.best_loss:
                self.best_loss = avg_val_loss
                early_stopping_counter = 0
            else:
                early_stopping_counter += 1

            stop_early = early_stopping_counter >= patience
            if stop_early:
                print(">>> Early stopping triggered!")

            # Print the report exactly once, on whichever epoch ends training
            if stop_early or e == epoch - 1:
                print('Final Validation Classification Report:\n', classification_report(val_true, val_pred))

            if stop_early:
                break

    def model_save(self, curr_model):
        '''
        Save the trained model under ./<curr_model>/ using fixed file names.
        @param curr_model : short model alias, used as the output directory name
        @return None

        Files written:
            bert_model.pt            - the whole pickled model object
            bert_model_state_dict.pt - the model state_dict only
            all.tar                  - dict with 'model' and 'optimizer' state
                                       ('optimizer' is None if train() never ran)
        '''
        PATH = f'./{curr_model}'
        os.makedirs(PATH, exist_ok=True)

        model_file_path = os.path.join(PATH, 'bert_model.pt')
        state_dict_path = os.path.join(PATH, 'bert_model_state_dict.pt')
        all_data_path = os.path.join(PATH, 'all.tar')

        torch.save(self.model, model_file_path)
        torch.save(self.model.state_dict(), state_dict_path)

        # The optimizer only exists after train(); saving must not depend on it
        optimizer = getattr(self, 'optimizer', None)
        torch.save({
            'model': self.model.state_dict(),
            'optimizer': optimizer.state_dict() if optimizer is not None else None
        }, all_data_path)


In [None]:
# class name - Run
# Runs the whole pipeline, from data preprocessing through training and saving.
class Run() :
    """End-to-end runner: split the data, tokenize, train, validate and save.

    Parameters
    ----------
    df : DataFrame with a text column 'HS' and an integer column 'label'
    model_name : Hugging Face hub id (or local path) of the checkpoint
    token : Hugging Face access token; an empty string means "no token"
    curr_model : short model alias, used as the output directory name
    """

    def __init__(self, df, model_name, token, curr_model) :
        self.df = df
        self.model_name = model_name
        self.token = token
        self.curr_model = curr_model

    def run(self) :
        """Execute the pipeline; prints progress and returns None."""
        print(f'{self.curr_model} Start >>>>>>>>>> ')

        # Build the model wrapper. The stored token is forwarded (it used to be
        # silently dropped); an empty string is turned into None so that no
        # blank credential is handed to the loader.
        bert_model = BertModel(self.model_name, TOKEN=self.token or None)

        # Train / validation split
        train_x, val_x, train_y, val_y = train_test_split(self.df['HS'], self.df['label'], test_size=0.2, random_state=seed_val)

        # Tokenize both splits
        train_token_ids, train_attention_mask, train_targets = bert_model.convert_data(train_x, train_y)
        val_token_ids, val_attention_mask, val_targets = bert_model.convert_data(val_x, val_y)

        # convert_data already returns tensors; as_tensor avoids the copy (and the
        # warning) that torch.tensor(<tensor>) produces, while still accepting lists.
        train_data = TensorDataset(train_token_ids, train_attention_mask, torch.as_tensor(train_targets, dtype=torch.long))
        val_data = TensorDataset(val_token_ids, val_attention_mask, torch.as_tensor(val_targets, dtype=torch.long))

        # Shuffle only the training batches; keep validation order stable
        train_loader = DataLoader(train_data, batch_size=BATCH_SIZE, shuffle=True)
        val_loader = DataLoader(val_data, batch_size=BATCH_SIZE, shuffle=False)

        # Train
        bert_model.train(train_loader, val_loader)

        # Save under ./<curr_model>/
        bert_model.model_save(self.curr_model)

In [None]:
# Load the training data
df = pd.read_csv('../data/train.csv')

# Diagnosis label (as spelled in the CSV) -> integer class id used for training
LABEL_TO_ID = {
    'ADHD'                                : 0,
    'PTSD(posttraumatic_stress_disorder)' : 1,
    'bipolar_disorder'                    : 2,
    'obsessive_compulsive_disorder'       : 3,
    'normal'                              : 4,
    'paranoid_personality_disorder'       : 5,
    'avoidant_personality_disorder'       : 6,
    'seperation_anxiety_disorder'         : 7,
    'MDD(major_depressive_disorder)'      : 8,
    'generalized_anxiety_disorder'        : 9,
    'neurocognitive_disorders'            : 10,
}

# Series.map turns every label missing from the table into NaN. Fail here with a
# clear message instead of letting NaN labels reach tensor construction later.
label_ids = df['label'].map(LABEL_TO_ID)
if label_ids.isna().any():
    unknown_labels = df.loc[label_ids.isna(), 'label'].unique().tolist()
    raise ValueError(f'Labels without a class id in LABEL_TO_ID: {unknown_labels}')
df['label'] = label_ids.astype('int64')

# Drop the unused column
df.drop('profile_persona_id', axis=1, inplace=True)

# Use only part of the dataset (10% sample); seeded with the notebook-wide seed
df_sampled = df.sample(frac=0.1, random_state=seed_val)

In [None]:
# Checkpoint to fine-tune
MODEL_NAME = bert_models['Mental_BERT']
# Alias of the current model, passed along so it can be used as the save directory name
curr_model = 'Mental_BERT'
# Hugging Face access token: read from the environment so that no secret is ever
# written into the notebook. Falls back to "" (no token), the previous value.
TOKEN = os.environ.get('HF_TOKEN', "")
# Runner that performs preprocessing and training
bert_run = Run(df_sampled, MODEL_NAME, TOKEN, curr_model)

In [None]:
# Run the pipeline: split the data, tokenize, train / validate, then save the model
bert_run.run()