In [3]:
import os 
import random
import torch
import numpy as np
import copy 
import json
import argparse 
import glob 
import argparse
import json

from torch import nn
from torch.utils.data import TensorDataset
from attrdict import AttrDict
from scipy.stats import pearsonr, spearmanr
from seqeval import metrics as seqeval_metrics
from sklearn import metrics as sklearn_metrics
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler
# from fastprogress.fastprogress import master_bar, progress_bar
from transformers import (
    AdamW,
    get_linear_schedule_with_warmup
)
from src import (
    CONFIG_CLASSES,
    TOKENIZER_CLASSES,
    MODEL_FOR_REGRESSION,
    set_seed
)

from processor import seq_reg_load_and_cache_examples as load_and_cache_examples  
from processor import seq_reg_processors as processors
from processor import seq_reg_output_modes as output_modes

2022-09-08 10:42:29.836557: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcudart.so.11.0


In [4]:
# Task name and location of the JSON run configuration.
task = 'wellness'
config_dir = '/home/ubuntu/chatbot/code/config/'
config_file = 'koelectra-base.json'

# Parse the JSON file, then wrap it so settings are reachable as attributes.
config_path = os.path.join(config_dir, task, config_file)
with open(config_path) as f:
    raw_config = json.load(f)
args = AttrDict(raw_config)

# Place the output directory underneath the checkpoint directory.
args.output_dir = os.path.join(args.ckpt_dir, args.output_dir)
# Project helper from `src`; presumably seeds the RNGs from `args` -- confirm in src.
set_seed(args)

In [5]:
# Look up the processor class registered for this task and instantiate it.
processor_cls = processors[args.task]
processor = processor_cls(args)
# Last expression of the cell: shows the training examples returned by the processor.
processor.get_examples('train')

[{
   "guid": "train-0",
   "label": "\ucd08\uc870\ud568",
   "text_a": "0011122",
   "text_b": null
 },
 {
   "guid": "train-1",
   "label": "\uc678\ub85c\uc6c0",
   "text_a": "0003694",
   "text_b": null
 },
 {
   "guid": "train-2",
   "label": "\uc2ac\ud514",
   "text_a": "0003391",
   "text_b": null
 },
 {
   "guid": "train-3",
   "label": "\uc790\uc2e0\uac10\uc800\ud558",
   "text_a": "0015876",
   "text_b": null
 },
 {
   "guid": "train-4",
   "label": "\uc2ac\ud514",
   "text_a": "0002858",
   "text_b": null
 },
 {
   "guid": "train-5",
   "label": "\ubd88\uba74",
   "text_a": "0008926",
   "text_b": null
 },
 {
   "guid": "train-6",
   "label": "\uc2ac\ud514",
   "text_a": "0002508",
   "text_b": null
 },
 {
   "guid": "train-7",
   "label": "\uc8c4\ucc45\uac10",
   "text_a": "0015416",
   "text_b": null
 },
 {
   "guid": "train-8",
   "label": "\uc790\uc0b4\ucda9\ub3d9",
   "text_a": "0018166",
   "text_b": null
 },
 {
   "guid": "train-9",
   "label": "\uc808\ub9dd\uac10",
  

In [6]:
# Load the model configuration for the configured model type.
config_cls = CONFIG_CLASSES[args.model_type]
config = config_cls.from_pretrained(args.model_name_or_path)

In [7]:
# Tokenizer for the configured model type (model_type: koelectra-base, ElectraTokenizer).
tokenizer_cls = TOKENIZER_CLASSES[args.model_type]
tokenizer = tokenizer_cls.from_pretrained(
    args.model_name_or_path,
    do_lower_case=args.do_lower_case,
)

# Three separately loaded backbones, one per label-specific regressor.
# The generator is consumed in order, so model0 is loaded first and model2 last.
model_cls = MODEL_FOR_REGRESSION[args.model_type]
model0, model1, model2 = (
    model_cls.from_pretrained(args.model_name_or_path, config=config)
    for _ in range(3)
)

Some weights of the model checkpoint at monologg/koelectra-base-discriminator were not used when initializing ElectraModel: ['discriminator_predictions.dense.bias', 'discriminator_predictions.dense_prediction.bias', 'discriminator_predictions.dense.weight', 'discriminator_predictions.dense_prediction.weight']
- This IS expected if you are initializing ElectraModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ElectraModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of the model checkpoint at monologg/koelectra-base-discriminator were not used when initializing ElectraModel: ['discriminator_predictions.dense.bias', 'discriminator_predictions.dense_prediction.bias', 'discriminator

In [8]:
class ElectraRegressor(nn.Module):
    """Scalar regression head stacked on an ELECTRA encoder.

    The encoder's hidden state at sequence position 0 is projected to 128
    features by ``cls_layer`` and then reduced to one value per sequence by
    ``regressor`` (dropout followed by a linear layer).

    Args:
        electra: Encoder module; its output must expose ``last_hidden_state``.
        config: Object providing ``hidden_size``, the encoder's feature width.
    """

    def __init__(self, electra, config):
        # nn.Module is initialised without arguments; the original author
        # noted that calling super().__init__(config) raises an error.
        super().__init__()
        self.electra = electra
        self.cls_layer = nn.Linear(config.hidden_size, 128)
        self.regressor = nn.Sequential(nn.Dropout(0.1), nn.Linear(128, 1))

    def forward(self, input_ids, attention_mask, token_type_ids):
        """Run the encoder and return the regression output for each sequence."""
        encoded = self.electra(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
        )
        # Keep only the hidden state at position 0 of every sequence.
        first_token_state = encoded.last_hidden_state[:, 0, :]
        projected = self.cls_layer(first_token_state)
        return self.regressor(projected)

In [9]:
# Use the GPU only when one is available and it has not been disabled in the config.
use_cuda = torch.cuda.is_available() and not args.no_cuda
args.device = "cuda" if use_cuda else "cpu"

# Wrap each backbone in its own regression head and move it to the chosen device.
electra_model0, electra_model1, electra_model2 = (
    ElectraRegressor(backbone, config=config).to(args.device)
    for backbone in (model0, model1, model2)
)

In [10]:
# Directory holding the fine-tuned checkpoints, and one checkpoint file name per label.
model_path = '/home/ubuntu/chatbot/code/model/'
model_0, model_1, model_2 = (
    'KoELECTRA_label0_50.pt',
    'KoELECTRA_label1_50.pt',
    'KoELECTRA_label2_50.pt',
)

In [11]:
# Restore the fine-tuned weights of each label-specific regressor.
# `map_location` remaps the stored tensors onto the device selected for this run,
# so checkpoints saved from a GPU also load when `args.device` is "cpu".
# `load_state_dict` is strict by default and raises if any key is missing or unexpected.
for regressor, checkpoint_name in (
    (electra_model0, model_0),
    (electra_model1, model_1),
    (electra_model2, model_2),
):
    state_dict = torch.load(
        os.path.join(model_path, checkpoint_name),
        map_location=args.device,
    )
    regressor.load_state_dict(state_dict)

<All keys matched successfully>

In [12]:
# Display the module tree of the first regressor to inspect its layers.
electra_model0

ElectraRegressor(
  (electra): ElectraModel(
    (embeddings): ElectraEmbeddings(
      (word_embeddings): Embedding(32200, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): ElectraEncoder(
      (layer): ModuleList(
        (0): ElectraLayer(
          (attention): ElectraAttention(
            (self): ElectraSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): ElectraSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-

In [13]:
class InputExample(object):
    """One unlabeled text example (optionally a text pair) identified by a guid."""

    def __init__(self, guid, text_a, text_b):
        self.guid, self.text_a, self.text_b = guid, text_a, text_b

    def __repr__(self):
        # The JSON form doubles as the printable representation.
        return str(self.to_json_string())

    def to_dict(self):
        """Return a deep copy of the instance attributes as a dictionary."""
        return copy.deepcopy(vars(self))

    def to_json_string(self):
        """Return the attributes as indented, key-sorted JSON ending in a newline."""
        serialized = json.dumps(self.to_dict(), indent=2, sort_keys=True)
        return serialized + "\n"

In [14]:
class InputFeatures(object):
    """Tokenizer outputs for one example: input ids, attention mask, token type ids."""

    def __init__(self, input_ids, attention_mask, token_type_ids):
        self.input_ids, self.attention_mask, self.token_type_ids = (
            input_ids,
            attention_mask,
            token_type_ids,
        )

    def __repr__(self):
        # The JSON form doubles as the printable representation.
        return str(self.to_json_string())

    def to_dict(self):
        """Return a deep copy of the instance attributes as a dictionary."""
        return copy.deepcopy(vars(self))

    def to_json_string(self):
        """Return the attributes as indented, key-sorted JSON ending in a newline."""
        serialized = json.dumps(self.to_dict(), indent=2, sort_keys=True)
        return serialized + "\n"

In [15]:
def convert_examples_to_features(args, examples, tokenizer, max_length, task):
    """Tokenize examples into fixed-length ``InputFeatures``.

    Args:
        args: Unused; kept so existing callers keep working.
        examples: Sequence of objects exposing ``text_a`` and ``text_b``.
        tokenizer: Tokenizer exposing ``batch_encode_plus``.
        max_length: Length every sequence is padded or truncated to.
        task: Unused; kept so existing callers keep working.

    Returns:
        A list with one ``InputFeatures`` per example, in input order.
        An empty input yields an empty list.
    """
    # The previous version instantiated ``processors[task](args)`` here without
    # ever using the result. Features for unlabeled text do not need a
    # processor, so the lookup was dropped.
    if len(examples) == 0:
        return []

    batch_encoding = tokenizer.batch_encode_plus(
        [(example.text_a, example.text_b) for example in examples],
        max_length=max_length,
        padding="max_length",
        add_special_tokens=True,
        truncation=True,
    )

    features = []
    for index in range(len(examples)):
        # Slice the batched encoding down to the fields of a single example.
        inputs = {key: batch_encoding[key][index] for key in batch_encoding}
        if "token_type_ids" not in inputs:
            inputs["token_type_ids"] = [0] * len(inputs["input_ids"])  # For xlm-roberta

        features.append(InputFeatures(**inputs))
    return features

In [16]:
def convert_score(score_list):
    """Rescale each row of raw scores to whole-number percentages of the row total.

    Args:
        score_list: Iterable of score rows, e.g. ``[(21, 30, 50), (12, 30, 46)]``.
            Rows may have any length.

    Returns:
        A list of lists of floats, one inner list per row, where each value is
        the score's share of the row total rounded to a whole percent
        (e.g. ``21.0``). Because every value is rounded independently, a row may
        sum to 99 or 101 rather than exactly 100. A row whose total is zero is
        returned as all zeros instead of raising ``ZeroDivisionError``.
    """
    converted = []
    for scores in score_list:   # [(21, 30, 50), (12, 30, 46), ...]
        scores = list(scores)
        total = sum(scores)
        if total == 0:
            # No meaningful proportions exist; avoid dividing by zero.
            converted.append([0.0] * len(scores))
            continue
        # Multiply before dividing and round once at the end. The previous
        # ``round(x, 2) * 100`` produced values such as 28.999999999999996.
        converted.append([float(round(100.0 * value / total)) for value in scores])

    return converted

In [17]:
def get_score(args, model0, model1, model2, test_dataset):
    """Score every example in ``test_dataset`` with three regressors.

    Args:
        args: Settings object providing ``eval_batch_size`` and ``device``.
        model0, model1, model2: Callables accepting ``input_ids``,
            ``attention_mask`` and ``token_type_ids`` keyword arguments and
            returning one value per example in the batch.
        test_dataset: Dataset yielding ``(input_ids, attention_mask,
            token_type_ids)`` tensors.

    Returns:
        The result of ``convert_score`` applied to the per-example
        ``(score0, score1, score2)`` triples, in dataset order. An empty
        dataset yields an empty list.
    """
    test_sampler = SequentialSampler(test_dataset)
    test_dataloader = DataLoader(test_dataset, sampler=test_sampler, batch_size=args.eval_batch_size)

    models = (model0, model1, model2)
    # Switching to eval mode once is enough; it does not need repeating per batch.
    for model in models:
        model.eval()

    # One growing list of scores per model. The previous version overwrote its
    # outputs on every iteration, so only the final batch was ever scored.
    per_model_scores = [[] for _ in models]
    with torch.no_grad():
        for batch in test_dataloader:
            batch = tuple(t.to(args.device) for t in batch)   # args.device: cuda
            inputs = {
                "input_ids": batch[0],
                "attention_mask": batch[1],
                "token_type_ids": batch[2],
            }
            for collected, model in zip(per_model_scores, models):
                # Iterating the returned tensor yields one entry per example.
                collected.extend(map(float, model(**inputs)))

    pred_score = list(zip(*per_model_scores))
    return convert_score(pred_score)

In [18]:
# comments[1] is expected to score highest on label 2.
# NOTE(review): the recorded run shows comments[1] dominated by the second score
# (78.0), i.e. label_1 -- confirm which label numbering this note refers to.
comments = [
    '왜 나한테만 이러는거냐고',
    '내가 너무 한심하게 느껴져',
    '왜 이렇게 집중 안되고 슬프지',
]

In [19]:
# Wrap every demo sentence in an InputExample with a "test-<index>" guid and no second text.
examples = [
    InputExample(guid="%s-%s" % ("test", idx), text_a=sentence, text_b=None)
    for idx, sentence in enumerate(comments)
]

examples

[{
   "guid": "test-0",
   "text_a": "\uc65c \ub098\ud55c\ud14c\ub9cc \uc774\ub7ec\ub294\uac70\ub0d0\uace0",
   "text_b": null
 },
 {
   "guid": "test-1",
   "text_a": "\ub0b4\uac00 \ub108\ubb34 \ud55c\uc2ec\ud558\uac8c \ub290\uaef4\uc838",
   "text_b": null
 },
 {
   "guid": "test-2",
   "text_a": "\uc65c \uc774\ub807\uac8c \uc9d1\uc911 \uc548\ub418\uace0 \uc2ac\ud504\uc9c0",
   "text_b": null
 }]

In [20]:
# Tokenize the demo sentences. The first argument is the run configuration
# `args`, matching the function's `args` parameter; the model `config` object
# was passed here before.
features = convert_examples_to_features(
    args, examples, tokenizer, max_length=args.max_seq_len, task='wellness'
)

# Convert to Tensors and build dataset
all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)
all_attention_mask = torch.tensor([f.attention_mask for f in features], dtype=torch.long)
all_token_type_ids = torch.tensor([f.token_type_ids for f in features], dtype=torch.long)

# Tensor order must match how get_score unpacks a batch: ids, mask, token types.
test_dataset = TensorDataset(all_input_ids, all_attention_mask, all_token_type_ids)
test_dataset

<torch.utils.data.dataset.TensorDataset at 0x7f3c59e7cb90>

In [21]:
# Score the demo sentences with the three label-specific regressors.
y_pred = get_score(args, electra_model0, electra_model1, electra_model2, test_dataset)

In [22]:
# Show the converted scores: one [label 0, label 1, label 2] row per sentence.
y_pred

[[21.0, 10.0, 69.0], [12.0, 78.0, 10.0], [10.0, 11.0, 79.0]]

In [32]:
# Display names of the three labels, in the same order as the per-sentence scores.
label_0, label_1, label_2 = (
    '현실에 대한 불만과 욕구',
    '부정적인 자기상',
    '무기력',
)

In [33]:
def _object_particle(word):
    """Return the Korean object particle that agrees with the end of ``word``.

    '을' follows a syllable that ends in a final consonant and '를' follows one
    that does not. If ``word`` is empty or does not end in a precomposed
    Hangul syllable, the neutral form '을(를)' is returned.
    """
    if not word:
        return '을(를)'
    offset = ord(word[-1]) - 0xAC00
    if 0 <= offset <= 0xD7A3 - 0xAC00:
        # Each precomposed syllable has 28 final-consonant slots; slot 0 means none.
        return '를' if offset % 28 == 0 else '을'
    return '을(를)'


def print_sent(score_list, sentences=None, labels=None):
    """Print one summary sentence per utterance from its label scores.

    Args:
        score_list: Score rows, one per utterance, ordered like ``labels``.
        sentences: Utterances matching ``score_list`` by index. Defaults to the
            notebook-level ``comments`` list.
        labels: Label display names. Defaults to the notebook-level
            ``(label_0, label_1, label_2)``.

    The object particle after each label is chosen from the label's final
    syllable; the previously hard-coded particles produced "욕구을" and "무기력를".
    """
    if sentences is None:
        sentences = comments
    if labels is None:
        labels = (label_0, label_1, label_2)

    for idx, score in enumerate(score_list):
        print(f'사용자 발화: {sentences[idx]}')
        parts = [
            f'{label}{_object_particle(label)} {value}%'
            for label, value in zip(labels, score)
        ]
        joined = ', '.join(parts)
        print(f'당신은 {joined} 느끼고 있습니다.')
        print(' ')

In [34]:
# Print the per-sentence summary for the demo predictions.
print_sent(y_pred)

사용자 발화: 왜 나한테만 이러는거냐고
당신은 현실에 대한 불만과 욕구을 21.0%, 부정적인 자기상을 10.0%, 무기력를 69.0% 느끼고 있습니다.
 
사용자 발화: 내가 너무 한심하게 느껴져
당신은 현실에 대한 불만과 욕구을 12.0%, 부정적인 자기상을 78.0%, 무기력를 10.0% 느끼고 있습니다.
 
사용자 발화: 왜 이렇게 집중 안되고 슬프지
당신은 현실에 대한 불만과 욕구을 10.0%, 부정적인 자기상을 11.0%, 무기력를 79.0% 느끼고 있습니다.
 
