### 1. Environment Settings

#### 1.1 Import Library

In [3]:
import pandas as pd 
import numpy as np
import random
import os 
import re
import time
import math
import argparse
import pymysql
import json
import torch
import pickle
import matplotlib.pyplot as plt 

from transformers import AdamW, get_linear_schedule_with_warmup
from torch.utils.data import TensorDataset, Dataset
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler
from fastprogress.fastprogress import master_bar, progress_bar
from attrdict import AttrDict
from transformers import ElectraConfig, ElectraTokenizer, ElectraForSequenceClassification

#### 1.2 Setting Default Value

In [49]:
data_path = "/home/lamda_00/Depression_paper/data/"
model_path = "/home/lamda_00/Depression_paper/model/"
model_name = os.path.join(model_path, 'electra_class.pt')
ckpt_path = "/home/lamda_00/Depression_paper/ckpt/"
config_path = "/home/lamda_00/Depression_paper/config/"
log_path = "/home/lamda_00/Depression_paper/log/"
config_file = "koelectra-base.json"

#### 1.3 Load Dataset 

In [5]:
with open(os.path.join(data_path, 'model1_label.pickle'), 'rb') as f:
    label = pickle.load(f)

In [6]:
with open(os.path.join(data_path, 'model_train.pickle'), 'rb') as f:
    train_data = pickle.load(f)
    
with open(os.path.join(data_path, 'model_dev.pickle'), 'rb') as f:
    val_data = pickle.load(f)
    
with open(os.path.join(data_path, 'model_test.pickle'), 'rb') as f:
    test_data = pickle.load(f)

#### 1.4 Load Pretrained model & tokenizer 

In [29]:
with open(os.path.join(config_path, config_file)) as f:
    args = AttrDict(json.load(f))
    
args.device = torch.device("cuda") if torch.cuda.is_available() else "cpu"

In [30]:
tokenizer = ElectraTokenizer.from_pretrained(args.model_name, do_lower_case=False)

In [8]:
vocab = ['우울감', '무기력', '외로움', '자신감', '자존감', '죄책감', '초조함']
tokenizer.add_tokens(vocab)

7

In [9]:
config = ElectraConfig.from_pretrained(   
        args.model_name,
        num_labels=20,   
        id2label={str(i): label for i, label in enumerate(label.keys())},   # labels: ['0', '1', '2', ... ,'18']
        label2id={label: i for i, label in enumerate(label.keys())},
)

In [11]:
model = ElectraForSequenceClassification.from_pretrained(args.model_name, config=config)

Some weights of the model checkpoint at monologg/koelectra-base-discriminator were not used when initializing ElectraForSequenceClassification: ['discriminator_predictions.dense.bias', 'discriminator_predictions.dense_prediction.weight', 'discriminator_predictions.dense_prediction.bias', 'discriminator_predictions.dense.weight']
- This IS expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at monologg/koelectra-base-discriminator and are newly initialized: ['clas

In [12]:
model.resize_token_embeddings(len(tokenizer))

Embedding(32207, 768)

In [13]:
model.to(args.device)

ElectraForSequenceClassification(
  (electra): ElectraModel(
    (embeddings): ElectraEmbeddings(
      (word_embeddings): Embedding(32207, 768)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): ElectraEncoder(
      (layer): ModuleList(
        (0): ElectraLayer(
          (attention): ElectraAttention(
            (self): ElectraSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): ElectraSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

### 2. Define Class

In [14]:
class ElectraDataset(Dataset):
    def __init__(self, data_file):
        self.data = data_file
    
    def __len__(self):
        return len(self.data.label)
    
    def reset_index(self):
        self.data.reset_index(inplace=True, drop=True)
    
    # def clear_text(self)  => 전처리 코드를 여기에 넣을 경우 상당히 느려짐
    
    def __getitem__(self, idx):
        '''
        return text, label
        '''
        self.reset_index()
        text = self.data.text[idx]
        label = self.data.label[idx]
        return text, label

In [15]:
class ElectraProcessor():
    def __init__(self, args, tokenizer, truncation=True):
        self.tokenizer = tokenizer 
        self.max_len = args.max_seq_len
        self.pad = args.pad
        self.batch_size = args.train_batch_size
        self.truncation = truncation
    
    def convert_data(self, data_file):
        context2 = None    # single sentence classification
        batch_encoding = self.tokenizer.batch_encode_plus(
            [(data_file[idx][0], context2) for idx in range(len(data_file))], 
            max_length = self.max_len,
            padding = self.pad,
            truncation = self.truncation
        )
        
        features = []
        for i in range(len(data_file)):
            inputs = {k: batch_encoding[k][i] for k in batch_encoding}
            try:
                inputs['label'] = data_file[i][1] 
            except:
                inputs['label'] = 0 
            features.append(inputs)
        
        all_input_ids = torch.tensor([f['input_ids'] for f in features], dtype=torch.long)
        all_attention_mask = torch.tensor([f['attention_mask'] for f in features], dtype=torch.long)
        all_token_type_ids = torch.tensor([f['token_type_ids'] for f in features], dtype=torch.long)
        all_labels = torch.tensor([f['label'] for f in features], dtype=torch.long)

        dataset = TensorDataset(all_input_ids, all_attention_mask, all_token_type_ids, all_labels)
        return dataset
    
    def convert_sentence(self, sentence_list):   # 사용자 입력 문장 1개 -> 입력 형태 변환
        pass
    
    def shuffle_data(self, dataset, data_type):
        if data_type == 'train':
            return RandomSampler(dataset)
        elif data_type == 'eval' or data_type == 'test':
            return SequentialSampler(dataset)
        
    def load_data(self, dataset, sampler):
        return DataLoader(dataset, sampler=sampler, batch_size=self.batch_size)

In [16]:
class ElectraTrainer():
    def __init__(self, args, model, train_dataloader, eval_dataloader):
        self.args = args 
        self.model = model
        self.train_dataloader = train_dataloader
        self.eval_dataloader = eval_dataloader
        self.test_dataloader = test_dataloader 
        
    def set_seed(self):
        random.seed(self.args.seed)
        np.random.seed(self.args.seed)
        torch.manual_seed(self.args.seed)
        if not self.args.no_cuda and torch.cuda.is_available():
            torch.cuda.manual_seed_all(self.args.seed)
    
    def train(self):
        train_acc_list = []; eval_acc_list = [] 
        train_loss_list = []; eval_loss_list = []
        best_acc = 0; best_loss = 3000
        nb_eval_steps = 0
        t_total = len(self.train_dataloader) // self.args.gradient_accumulation_steps * self.args.num_epochs

        optimizer = AdamW(self.model.parameters(), lr=self.args.learning_rate, eps=self.args.adam_epsilon)
        scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=int(t_total * self.args.warmup_proportion), \
                                                    num_training_steps=t_total)

        self.model.zero_grad()
        for epoch in range(int(self.args.num_epochs)):
            train_acc = 0.0; eval_acc = 0.0
            train_loss = 0.0; eval_loss = 0.0 

            for step, batch in enumerate(self.train_dataloader):
                self.model.train()
                batch = tuple(t.to(self.args.device) for t in batch)
                inputs = {
                    "input_ids": batch[0],
                    "attention_mask": batch[1],
                    "token_type_ids": batch[2],
                    "labels": batch[3]
                }
                outputs = self.model(**inputs)
                loss = outputs[0]
                loss.backward()
                train_loss += loss.item()
                train_acc += self.calc_accuracy(outputs[1], batch[3])

                optimizer.step()
                scheduler.step()
                self.model.zero_grad()

            train_acc = train_acc / (step + 1)
            print(f'epoch: {epoch}, train_acc: {train_acc}')
            train_acc_list.append(train_acc)
            train_loss_list.append(train_loss)

            for step2, batch2 in enumerate(self.eval_dataloader):
                self.model.eval()
                batch2 = tuple(t.to(self.args.device) for t in batch2)

                with torch.no_grad():
                    inputs = {
                        "input_ids": batch2[0],
                        "attention_mask": batch2[1],
                        "token_type_ids": batch2[2],
                        "labels": batch2[3]
                    }
                    outputs = self.model(**inputs)
                    tmp_eval_loss, logits = outputs[:2]
                    eval_loss += tmp_eval_loss.mean().item()
                    eval_acc += self.calc_accuracy(outputs[1], batch2[3]) 

            eval_loss = eval_loss / (step2 + 1)
            eval_acc = eval_acc / (step2 + 1)
            eval_acc_list.append(eval_acc)
            eval_loss_list.append(eval_loss)

        return train_acc_list, train_loss_list, eval_acc_list, eval_loss_list

    def calc_accuracy(self, X,Y):
        max_vals, max_indices = torch.max(X, 1)
        train_acc = (max_indices == Y).sum().data.cpu().numpy()/max_indices.size()[0]
        return train_acc
    
    def compute_metrics(self, labels, preds):
        assert len(preds) == len(labels)
        acc = (labels == preds).mean()
        return {"acc": acc}
    
    def save_model(self, model_name):
        torch.save(self.model.state_dict(), model_name)

In [48]:
class ElectraTester():
    def __init__(self, args, vocab, load_model):
        self.tokenizer = ElectraTokenizer.from_pretrained(args.model_name, do_lower_case=False)
        self.tokenizer.add_tokens(vocab)
        self.config = ElectraConfig.from_pretrained(
            args.model_name,
            num_labels=20,   
            id2label={str(i): label for i, label in enumerate(label.keys())},
            label2id={label: i for i, label in enumerate(label.keys())}
        )
        self.model = ElectraForSequenceClassification.from_pretrained(args.model_name, config=self.config)
        self.model.resize_token_embeddings(len(self.tokenizer))
        self.model.load_state_dict(torch.load(load_model))
        self.model.to(args.device)
        
    def get_label(self, args, test_dataloader):
        results = {}

        preds = None
        labels = None

        for batch in progress_bar(test_dataloader):
            self.model.eval()
            batch = tuple(t.to(args.device) for t in batch)   # args.device: cuda 

            with torch.no_grad():
                inputs = {
                    "input_ids": batch[0],
                    "attention_mask": batch[1],
                    "token_type_ids": batch[2],
                    "labels": batch[3]
                }
                outputs = self.model(**inputs)
                test_loss, logits = outputs[:2]  
        
            if preds is None:   # 초기 
                preds = logits.detach().cpu().numpy()   # 예측 확률 
                labels = inputs["labels"].detach().cpu().numpy()
            else:
                preds = np.append(preds, logits.detach().cpu().numpy(), axis=0)  
                out_label_ids = np.append(labels, inputs["labels"].detach().cpu().numpy(), axis=0)

        preds = np.argmax(preds, axis=1)        
        return preds, labels 
    
    def get_f1_score(self, args, test_dataloader):
        y_pred, y_true = self.get_label(args, test_dataloader)
        return round(f1_score(y_true, y_pred, average='micro'), 3) 
     
    def get_cl_report(self, args, test_dataloader):
        y_pred, y_true = self.get_label(args, test_dataloader)
        cr = classification_report(y_true, y_pred).split('\n')
        clr_df = []

        for idx, line in enumerate(cr):
            clr_df.append([])
            if line == '':
                continue

            word_list = line.strip().split(' ')

            for word in word_list:
                if word != '':
                    clr_df[idx].append(word)

        clr_df[-2][0] = ' '.join([clr_df[-2][0], clr_df[-2][1]])
        clr_df[-3][0] = ' '.join([clr_df[-3][0], clr_df[-3][1]])
        clr_df[-4].insert(1, ' ')
        clr_df[-4].insert(2, ' ')
        clr_df[0].insert(0, 'index')

        clr_df[-2].pop(1)
        clr_df[-3].pop(1)
        clr_df.pop(1)
        clr_df.pop(-1)
        clr_df.pop(-4)
        clr_df = pd.DataFrame(clr_df[1:], columns=clr_df[0])
        clr_df.index = clr_df['index']

        del clr_df['index']
        return clr_df

In [17]:
train_file = ElectraDataset(train_data)
val_file = ElectraDataset(val_data)
test_file = ElectraDataset(test_data)

In [18]:
len(train_file), len(val_file), len(test_file)

(27496, 6874, 8593)

In [19]:
electra_processor = ElectraProcessor(args, tokenizer)

In [20]:
train_dataset = electra_processor.convert_data(train_file)
val_dataset = electra_processor.convert_data(val_file)
test_dataset = electra_processor.convert_data(test_file)

In [21]:
train_sampler = electra_processor.shuffle_data(train_dataset, 'train')
val_sampler = electra_processor.shuffle_data(val_dataset, 'eval')
test_sampler = electra_processor.shuffle_data(test_dataset, 'test')

In [22]:
train_dataloader = electra_processor.load_data(train_dataset, train_sampler)
val_dataloader = electra_processor.load_data(val_dataset, val_sampler)
test_dataloader = electra_processor.load_data(test_dataset, test_sampler)

In [24]:
electra_trainer = ElectraTrainer(args, model, train_dataloader, val_dataloader, test_dataloader)

In [25]:
train_acc, train_loss, eval_acc, eval_loss = electra_trainer.train()



epoch: 0, train_acc: 0.695203488372093
epoch: 1, train_acc: 0.8572311046511628
epoch: 2, train_acc: 0.9139898255813953
epoch: 3, train_acc: 0.9427325581395349
epoch: 4, train_acc: 0.9581758720930232
epoch: 5, train_acc: 0.9695494186046512
epoch: 6, train_acc: 0.9750363372093023
epoch: 7, train_acc: 0.9806686046511628
epoch: 8, train_acc: 0.9812136627906977
epoch: 9, train_acc: 0.9866279069767442
epoch: 10, train_acc: 0.9877543604651163
epoch: 11, train_acc: 0.9894985465116279
epoch: 12, train_acc: 0.9897892441860465
epoch: 13, train_acc: 0.9907703488372093
epoch: 14, train_acc: 0.9912790697674418
epoch: 15, train_acc: 0.9921511627906977
epoch: 16, train_acc: 0.9936773255813953
epoch: 17, train_acc: 0.993422965116279
epoch: 18, train_acc: 0.9949127906976745
epoch: 19, train_acc: 0.9952398255813953
epoch: 20, train_acc: 0.9944040697674419
epoch: 21, train_acc: 0.9949491279069768
epoch: 22, train_acc: 0.9967659883720931
epoch: 23, train_acc: 0.9967659883720931
epoch: 24, train_acc: 0.9962

([0.695203488372093,
  0.8572311046511628,
  0.9139898255813953,
  0.9427325581395349,
  0.9581758720930232,
  0.9695494186046512,
  0.9750363372093023,
  0.9806686046511628,
  0.9812136627906977,
  0.9866279069767442,
  0.9877543604651163,
  0.9894985465116279,
  0.9897892441860465,
  0.9907703488372093,
  0.9912790697674418,
  0.9921511627906977,
  0.9936773255813953,
  0.993422965116279,
  0.9949127906976745,
  0.9952398255813953,
  0.9944040697674419,
  0.9949491279069768,
  0.9967659883720931,
  0.9967659883720931,
  0.9962936046511628,
  0.9962936046511628,
  0.997202034883721,
  0.996875,
  0.997202034883721,
  0.9976380813953488,
  0.9974200581395349,
  0.9981831395348837,
  0.9981831395348837,
  0.998328488372093,
  0.9989098837209303,
  0.9987281976744186,
  0.9990188953488373,
  0.9989098837209303,
  0.9994549418604651,
  0.9995276162790697,
  0.9996002906976744,
  0.9990915697674418,
  0.9992369186046511,
  0.9994549418604651,
  0.9996002906976744,
  0.9995639534883721,
  0

In [27]:
electra_trainer.save_model(model_path, 'electra_class.pt')

In [42]:
electra_tester = ElectraTester(args, vocab, model_name)

Some weights of the model checkpoint at monologg/koelectra-base-discriminator were not used when initializing ElectraForSequenceClassification: ['discriminator_predictions.dense.bias', 'discriminator_predictions.dense_prediction.weight', 'discriminator_predictions.dense_prediction.bias', 'discriminator_predictions.dense.weight']
- This IS expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at monologg/koelectra-base-discriminator and are newly initialized: ['clas

In [44]:
y_pred, y_true = electra_tester.get_label(args, test_dataloader)

In [46]:
from sklearn.metrics import precision_score , recall_score , confusion_matrix, f1_score, classification_report

confusion_mt = pd.DataFrame(confusion_matrix(y_true, y_pred))
confusion_mt

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
0,281,3,3,1,5,1,1,5,1,6,4,2,4,1,0,6,2,2,1,2
1,4,297,2,9,1,5,1,2,1,1,2,0,17,0,0,5,5,0,2,4
2,4,6,121,1,3,0,0,0,0,0,3,1,0,0,0,3,2,1,0,10
3,0,11,2,163,2,1,0,1,0,3,9,0,7,0,1,1,2,3,1,10
4,14,6,1,1,177,4,0,4,0,5,5,13,1,1,2,5,6,4,0,9
5,3,3,2,4,2,30,0,0,0,0,5,0,0,0,1,2,2,0,0,0
6,0,1,0,0,0,0,42,0,0,0,0,0,0,0,0,0,0,0,0,1
7,1,1,0,1,4,0,0,201,5,0,2,0,0,0,0,0,0,0,0,4
8,0,0,1,0,1,0,0,6,72,0,1,0,0,0,0,0,0,0,0,6
9,5,3,2,0,1,0,0,0,1,334,5,12,0,0,0,0,1,0,5,5
