In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

import os
import re
import random
import datetime
import platform
from tqdm import tqdm


from transformers import BertConfig, BertModel, BertTokenizer, BertForSequenceClassification, get_linear_schedule_with_warmup
from transformers import AutoModel, AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, AdamW

import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset, TensorDataset, RandomSampler, SequentialSampler

from keras.preprocessing.sequence import pad_sequences
from torch.nn.utils.rnn import pad_sequence

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, \
                            roc_auc_score, confusion_matrix, classification_report, \
                            matthews_corrcoef, cohen_kappa_score, log_loss, confusion_matrix

- KoBERT : monologg/kobert
- KR-BERT : snunlp/KR-BERT-char16424
- KoELECTRA : monologg/koelectra-base-v3-discriminator
    - input : token id, attention mask, token type id
- Mental_BERT
- Klue-RoBERTa : klue/roberta-base
- KorBERT(후보)

In [None]:
# On a MacBook, cap the MPS memory fraction (left disabled)
# import torch.mps
# torch.mps.set_per_process_memory_fraction(0.8)

# Clear the CUDA cache before starting
torch.cuda.empty_cache()

# Measure the running time of each cell (IPython magic; requires the autotime extension)
%load_ext autotime

In [None]:
# Candidate BERT-family models: display name -> identifier handed to from_pretrained()
bert_models = {
    'KoBERT': 'monologg/kobert',
    'KR-BERT': 'snunlp/KR-BERT-char16424',
    'KoELECTRA': 'monologg/koelectra-base-v3-discriminator',
    'Klue-RoBERTa': 'klue/roberta-base',
}

In [None]:
# --- Reproducibility / data shape ---
seed_val = 42        # seed for random, numpy, torch and train_test_split
MAX_LEN = 512        # every tokenised sentence is padded/truncated to this length
BATCH_SIZE = 4       # DataLoader batch size

# --- Optimisation ---
epoch = 500          # upper bound on training epochs (early stopping may end sooner)
learning_rate = 5e-5
ep = 1e-8            # epsilon passed to AdamW

# --- Early stopping ---
patience = 10                 # epochs without validation-loss improvement before stopping
early_stopping_epochs = 5     # NOTE(review): not referenced anywhere in the visible code
best_loss = float('inf')      # initial "best" validation loss

In [None]:
'''
BERT 수행 위한 class
@def : __init__
      convert_data
      convert_tensor
      train
'''
class BertModel():

    def __init__(self, MODEL_NAME) :
        self.model = BertForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=11)
        self.tokenizer = BertTokenizer.from_pretrained(MODEL_NAME)
        self.seed_val = seed_val
        self.max_len = MAX_LEN
        
        # 디바이스 설정
        os_name = platform.system()
        if os_name == 'Darwin' :  # MacOS 
            self.device = torch.device('mps' if torch.backends.mps.is_available() else 'cpu')
        elif os_name == 'Windows' :
            self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        else :
            self.device = torch.device('cpu')

    
    '''
    문장 전처리
    @param series : tests, targets
    @return list : token_ids_list, 
                   attention_mask, 
                   targets_list
    '''
    def convert_data(self, texts, targets):
        token_ids_list, attention_mask, targets_list = [], [], []
        for text, target in tqdm(zip(texts, targets), total=len(texts)):
            tokens = []
            sentences = text.split('.')
            # 문장 구분 토큰 생성
            sentences = ['[CLS]' + sentence + '[SEP]' for sentence in sentences]
            # tokenize
            tokens = [self.tokenizer.tokenize(sentence) for sentence in sentences]
            token_ids = [self.tokenizer.convert_tokens_to_ids(token) for token in tokens]

            # Padding
            input_ids = pad_sequences(token_ids, maxlen=self.max_len, dtype='long', truncating='post', padding='post')

            # attention masking
            attention_mask.append([[float(i>0) for i in seq] for seq in input_ids])
            token_ids_list.append(input_ids)
            targets_list.extend([target] * len(input_ids))
        
        return token_ids_list, attention_mask, targets_list
    
    '''
    tensor로 변환
    @param list : token_ids_list, attention_mask, target
    @return tensor : tensor_data
    '''
    def convert_tensor(self, token_ids_list, attention_mask, target) :
        token_ids_tensor = torch.tensor([np.array(item) for sublist in token_ids_list for item in sublist], dtype=torch.long)
        attention_mask_tensor = torch.tensor([np.array(item) for sublist in attention_mask for item in sublist], dtype=torch.float)
        targets_tensor = torch.tensor(np.array(target), dtype=torch.long)

        # 사이즈 확인
        print("Token IDs Tensor Size: ", token_ids_tensor.size())
        print("Attention Mask Tensor Size: ", attention_mask_tensor.size())
        print("Targets Tensor Size: ", targets_tensor.size())

        tensor_data = TensorDataset(token_ids_tensor, attention_mask_tensor, targets_tensor)

        return tensor_data
    
    '''
    train/valid 수행
    @param TensorDataset : train_loader, val_loader
    @return 
    '''
    def train(self, train_loader, val_loader) :
        model = self.model
        seed_val = self.seed_val
        device = self.device

        self.optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate, eps=ep)
        scheduler = get_linear_schedule_with_warmup(self.optimizer, num_warmup_steps=0, num_training_steps=len(train_loader) * epoch)
        
        # seed 고정
        random.seed(seed_val)
        np.random.seed(seed_val)
        torch.manual_seed(seed_val)
        if self.device == 'cuda' :
            torch.cuda.manual_seed(seed_val)
        elif self.device == 'mps' :
            torch.mps.manual_seed(seed_val)
        
        model.to(device)
        model.zero_grad()
        train_len = len(train_loader)

        for e in tqdm(range(0, epoch)) :
            model.train()
            total_loss, total_accuracy = 0, 0
            train_true, train_pred = [], []
            print(f'Epoch : {e+1} in {epoch} >>>>>>>>>>>>>>>>> ')
            
            for step, batch in tqdm(enumerate(train_loader)):
                batch = tuple(item.to(self.device) for item in batch)
                batch_input_ids, batch_input_mask, batch_labels = batch
                outputs = model(input_ids=batch_input_ids, token_type_ids=None, attention_mask=batch_input_mask, labels=batch_labels)
                loss = outputs.loss
                total_loss += loss.item()
                loss.backward()
                torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
                self.optimizer.step()
                scheduler.step()
                model.zero_grad()

                train_true.extend(batch_labels.tolist())
                train_pred.extend(np.argmax(outputs.logits.detach().cpu().numpy(), axis=1).tolist())
            
            avg_loss = total_loss / train_len
            avg_accuracy = total_accuracy / train_len
            print(f'Epoch {e+1} Average train loss : {avg_loss}    /   accuracy : {avg_accuracy}')

            train_confusion = confusion_matrix(train_true, train_pred)
            print('Train Confusion Matrix:\n', train_confusion)

            train_classification_report = classification_report(train_true, train_pred)
            print('Train Classification Report:\n', train_classification_report)

        
            print(f'Running Validation...........')

            model.eval()
            val_len = len(val_loader)
            val_loss, val_accuracy = 0, 0
            val_true, val_pred = [], []

            for batch in val_loader :
                batch_input_ids, batch_input_mask, batch_labels = [item.to(self.device) for item in batch]
                with torch.no_grad() :
                    outputs = model(batch_input_ids, token_type_ids=None, attention_mask=batch_input_mask, labels=batch_labels)
                    loss = outputs.loss
                    val_loss += loss.item()

                    val_true.extend(batch_labels.tolist())
                    val_pred.extend(np.argmax(outputs.logits.detach().cpu().numpy(), axis=1).tolist())    

            eval_avg_loss = val_loss / val_len
            eval_avg_accuracy = val_accuracy / val_len
            print(f'Epoch {e+1} Average Validataion loss : {eval_avg_loss}  /   accuracy : {eval_avg_accuracy}')

            val_confusion = confusion_matrix(val_true, val_pred)
            print('Validation Confusion Matrix:\n', val_confusion)

            val_classification_report = classification_report(val_true, val_pred)
            print('Validation Classification Report:\n', val_classification_report)

            # Early Stopping
            if eval_avg_loss < best_loss:
                best_loss = eval_avg_loss
                early_stopping_counter = 0
            else:
                early_stopping_counter += 1
                
            if early_stopping_counter >= patience:
                print("Early stopping")
                break     

    '''
    학습 모델 저장
    @param  PATH 지정 필요
    @return 
    '''
    def model_save(self) :
        PATH = './bert_model'
        if not os.path.exists(PATH):
            os.makedirs(PATH)
        torch.save(self.model, f'{PATH}/{self.model}_model.pt')  # 모델 저장
        torch.save(self.model.state_dict(), f'{PATH}/{self.model}_model_state_dict.pt')  # 모델 객체의 state_dict 저장
        torch.save({
            'model': self.model.state_dict(),
            'optimizer': self.optimizer.state_dict()
        }, PATH + 'all.tar')  

In [None]:
class Run():
    """Run the whole pipeline, from preprocessing through training and saving.

    Args:
        df: DataFrame to train on; ``run`` reads its 'HS' (text) and 'label'
            columns.
        model_name: model identifier forwarded to ``BertModel``.
    """

    def __init__(self, df, model_name):
        self.df = df
        self.model_name = model_name

    def run(self):
        """Split the data, encode it, train the model and save it."""
        # Instantiate the model wrapper
        bert_model = BertModel(self.model_name)

        # 80/20 train/validation split
        train_x, val_x, train_y, val_y = train_test_split(
            self.df['HS'], self.df['label'], test_size=0.2, random_state=seed_val)

        # Preprocess, then convert to tensors
        train_data = bert_model.convert_tensor(*bert_model.convert_data(train_x, train_y))
        val_data = bert_model.convert_tensor(*bert_model.convert_data(val_x, val_y))

        # Only the training set needs shuffling; validation metrics do not
        # depend on batch order, so keep that loader deterministic.
        train_loader = DataLoader(train_data, batch_size=BATCH_SIZE, shuffle=True)
        val_loader = DataLoader(val_data, batch_size=BATCH_SIZE, shuffle=False)

        # Train
        bert_model.train(train_loader, val_loader)

        # Save the model (uses model_save's default location)
        bert_model.model_save()

In [None]:
# Load the training data
df = pd.read_csv('../data/train.csv')

In [None]:
# Convert the string labels to integer class ids for training.
# NOTE(review): keys must match the dataset spelling exactly (including
# 'seperation_anxiety_disorder') — verify against the CSV before "fixing" them.
LABEL_TO_ID = {
    'ADHD': 0,
    'PTSD(posttraumatic_stress_disorder)': 1,
    'bipolar_disorder': 2,
    'obsessive_compulsive_disorder': 3,
    'normal': 4,
    'paranoid_personality_disorder': 5,
    'avoidant_personality_disorder': 6,
    'seperation_anxiety_disorder': 7,
    'MDD(major_depressive_disorder)': 8,
    'generalized_anxiety_disorder': 9,
    'neurocognitive_disorders': 10,
}

_mapped_labels = df['label'].map(LABEL_TO_ID)
# Series.map yields NaN for values missing from the mapping (including when
# this cell is re-run on already-converted labels); fail loudly instead of
# feeding NaN targets into training.
_unmapped_labels = df.loc[_mapped_labels.isna(), 'label'].unique()
if len(_unmapped_labels) > 0:
    raise ValueError(f'Unmapped label values: {list(_unmapped_labels)}')
df['label'] = _mapped_labels.astype('int64')

In [None]:
# Remove the column that is not used for training
df.drop(columns='profile_persona_id', inplace=True)

In [None]:
# Select the model to fine-tune
MODEL_NAME = bert_models['KR-BERT']
# Create the runner that handles preprocessing and training
bert_run = Run(df, MODEL_NAME)

In [None]:
# Run the pipeline
bert_run.run()