In [44]:
import os 
import random
import torch
import numpy as np
import copy 
import json
import argparse 
import glob 
import argparse
import json

from torch import nn
from torch.utils.data import TensorDataset
from attrdict import AttrDict
from scipy.stats import pearsonr, spearmanr
from seqeval import metrics as seqeval_metrics
from sklearn import metrics as sklearn_metrics
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler
# from fastprogress.fastprogress import master_bar, progress_bar
from transformers import (
    AdamW,
    get_linear_schedule_with_warmup
)
from src import (
    CONFIG_CLASSES,
    TOKENIZER_CLASSES,
    MODEL_FOR_SEQUENCE_CLASSIFICATION,
    set_seed,
    compute_metrics
)

from processor import seq_cls_load_and_cache_examples as load_and_cache_examples  
from processor import seq_cls_tasks_num_labels as tasks_num_labels
from processor import seq_cls_processors as processors
from processor import seq_cls_output_modes as output_modes

In [45]:
task = 'wellness' 
config_dir = '/home/ubuntu/chatbot/code/config/'
config_file = 'koelectra-base.json'

with open(os.path.join(config_dir, task, config_file)) as f:
    args = AttrDict(json.load(f))

args.output_dir = os.path.join(args.ckpt_dir, args.output_dir)
set_seed(args) 

In [46]:
processor = processors[args.task](args) 
labels = processor.get_labels()

len(labels)   # labels 

19

In [47]:
file_to_read = '/home/ubuntu/chatbot/dataset/Wellness_Conversation_intent_train.tsv'

with open(file_to_read, "r", encoding="utf-8") as f:
    lines = []
    for line in f:
        tmp = line.strip(',') 
        lines.append(tmp[:-1]) 

examples = [] 
set_type = 'train'

for (i, line) in enumerate(lines[1:]):
    line = line.split("\t")
    guid = "%s-%s" % (set_type, i)
    text_a = line[2]
    label = line[3]

text_a, label

('몸이 힘들어 죽겠는데 희망 같은게 있을 수가 없지', '16')

In [48]:
processor.get_examples('train')

[{
   "guid": "train-0",
   "label": "10",
   "text_a": "\ub0b4 \ubbf8\ub798\ub97c \uc0dd\uac01\ud558\ub2c8\uae4c \ub108\ubb34 \uac11\uac11\ud574",
   "text_b": null
 },
 {
   "guid": "train-1",
   "label": "2",
   "text_a": "\uc6b8\uc801\ud574\uc9c0\uc9c0 \uc54a\uc73c\ub824\uace0 \ud574\ub3c4 \uc65c \uc774\ub807\uac8c \uc6b8\uc801\ud55c\uc9c0",
   "text_b": null
 },
 {
   "guid": "train-2",
   "label": "1",
   "text_a": "\ubcfc\ub54c\ub9c8\ub2e4 \uc548\uc4f0\ub7fd\uace0 \uc2ac\ud504\uace0.",
   "text_b": null
 },
 {
   "guid": "train-3",
   "label": "14",
   "text_a": "\ub0b4\uac00 \ub9ce\uc774 \uc704\ucd95\ub41c \uac70 \uac19\uc544",
   "text_b": null
 },
 {
   "guid": "train-4",
   "label": "1",
   "text_a": "\uc720\ub9ac\ubcd1 \uae68\uc11c \uc190\ubaa9 \uae0b\uace0 \ubc1c\ub85c \ubc1f\uace0 \ud53c \ud758\ub9ac\uba74\uc11c \uc624\uc5f4\uc744 \ud588\uc5b4",
   "text_b": null
 },
 {
   "guid": "train-5",
   "label": "9",
   "text_a": "\uc790\uafb8 \uc2e0\uacbd\uc4f0\uc5ec\uc11c \uc870

In [49]:
config = CONFIG_CLASSES[args.model_type].from_pretrained(   # ElectraConfig.from_pretrained
        args.model_name_or_path,
        num_labels=tasks_num_labels[args.task],   # args.task: wellness, num_labels = 19 
        id2label={str(i): label for i, label in enumerate(labels)},   # labels: ['0', '1', '2', ... ,'18']
        label2id={label: i for i, label in enumerate(labels)},
)

config

ElectraConfig {
  "architectures": [
    "ElectraForPreTraining"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "embedding_size": 768,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "0",
    "1": "1",
    "10": "10",
    "11": "11",
    "12": "12",
    "13": "13",
    "14": "14",
    "15": "15",
    "16": "16",
    "17": "17",
    "18": "18",
    "2": "2",
    "3": "3",
    "4": "4",
    "5": "5",
    "6": "6",
    "7": "7",
    "8": "8",
    "9": "9"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "0": 0,
    "1": 1,
    "10": 10,
    "11": 11,
    "12": 12,
    "13": 13,
    "14": 14,
    "15": 15,
    "16": 16,
    "17": 17,
    "18": 18,
    "2": 2,
    "3": 3,
    "4": 4,
    "5": 5,
    "6": 6,
    "7": 7,
    "8": 8,
    "9": 9
  },
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "electra",
  "num_attention_heads": 12,
  "num_hidden_l

In [50]:
tokenizer = TOKENIZER_CLASSES[args.model_type].from_pretrained(   # model_type: koelectra-base, ElectraTokenizer 
    args.model_name_or_path,
    do_lower_case=args.do_lower_case,
    
)

model = MODEL_FOR_SEQUENCE_CLASSIFICATION[args.model_type].from_pretrained(
    args.model_name_or_path,
    config=config
)

Some weights of the model checkpoint at monologg/koelectra-base-discriminator were not used when initializing ElectraForSequenceClassification: ['discriminator_predictions.dense_prediction.weight', 'discriminator_predictions.dense.weight', 'discriminator_predictions.dense_prediction.bias', 'discriminator_predictions.dense.bias']
- This IS expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at monologg/koelectra-base-discriminator and are newly initialized: ['clas

In [51]:
args.device = "cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu"
model.to(args.device)

ElectraForSequenceClassification(
  (electra): ElectraModel(
    (embeddings): ElectraEmbeddings(
      (word_embeddings): Embedding(32200, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): ElectraEncoder(
      (layer): ModuleList(
        (0): ElectraLayer(
          (attention): ElectraAttention(
            (self): ElectraSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): ElectraSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm

In [52]:
model_path = '/home/ubuntu/chatbot/code/model/'
model_name = 'KoELECTRA_intent.pt'

In [53]:
model.load_state_dict(torch.load(os.path.join(model_path, model_name)))

<All keys matched successfully>

In [54]:
model

ElectraForSequenceClassification(
  (electra): ElectraModel(
    (embeddings): ElectraEmbeddings(
      (word_embeddings): Embedding(32200, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): ElectraEncoder(
      (layer): ModuleList(
        (0): ElectraLayer(
          (attention): ElectraAttention(
            (self): ElectraSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): ElectraSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm

In [55]:
class InputExample(object):
    """
    A single training/test example for simple sequence classification.
    """
    def __init__(self, guid, text_a, text_b):
        self.guid = guid
        self.text_a = text_a
        self.text_b = text_b

    def __repr__(self):
        return str(self.to_json_string())

    def to_dict(self):
        """Serializes this instance to a Python dictionary."""
        output = copy.deepcopy(self.__dict__)
        return output

    def to_json_string(self):
        """Serializes this instance to a JSON string."""
        return json.dumps(self.to_dict(), indent=2, sort_keys=True) + "\n"

In [56]:
class InputFeatures(object):
    """A single set of features of data."""
    def __init__(self, input_ids, attention_mask, token_type_ids):
        self.input_ids = input_ids
        self.attention_mask = attention_mask
        self.token_type_ids = token_type_ids

    def __repr__(self):
        return str(self.to_json_string())

    def to_dict(self):
        """Serializes this instance to a Python dictionary."""
        output = copy.deepcopy(self.__dict__)
        return output

    def to_json_string(self):
        """Serializes this instance to a JSON string."""
        return json.dumps(self.to_dict(), indent=2, sort_keys=True) + "\n"

In [57]:
def convert_examples_to_features(args, examples, tokenizer, max_length, task):
    processor = processors[task](args)
        
    batch_encoding = tokenizer.batch_encode_plus(
        [(example.text_a, example.text_b) for example in examples],
        max_length=max_length,
        padding="max_length",
        add_special_tokens=True,
        truncation=True,
    )

    features = []
    for i in range(len(examples)):
        inputs = {k: batch_encoding[k][i] for k in batch_encoding}
        if "token_type_ids" not in inputs:
            inputs["token_type_ids"] = [0] * len(inputs["input_ids"])  # For xlm-roberta
            
        feature = InputFeatures(**inputs)
        features.append(feature)
    return features

In [58]:
def get_idx(logits):
    '''
    총합이 1인 Softmax 배열을 입력으로 받아 임계값을 넘는 label idx를 반환하는 함수 
    '''
    threshold = 0.1
    idx_list = [] 
    for idx, prob in enumerate(list(logits)): 
        if prob > threshold: 
            idx_list.append(idx)
  
    return idx_list

In [97]:
def get_multi_label(args, model, test_dataset):
    results = {}
    test_sampler = SequentialSampler(test_dataset)
    test_dataloader = DataLoader(test_dataset, sampler=test_sampler, batch_size=args.eval_batch_size)

    preds = None
    out_label_ids = None

    for batch in test_dataloader:
        model.eval()
        batch = tuple(t.to(args.device) for t in batch)   # args.device: cuda 

        with torch.no_grad():
            inputs = {
                "input_ids": batch[0],
                "attention_mask": batch[1],
                "token_type_ids": batch[2], 
                # "labels": batch[3]
            }
            outputs = model(**inputs)
            # tmp_eval_loss, logits = outputs[:2]   # label이 있는 test dataset 
            logits = outputs[:1]
        
            m = torch.nn.functional.softmax(logits[0], dim=-1)
            # print(m)  #  [[19개의 확률 값], [19개의 확률 값], ... [19개의 확률 값]]

        if preds is None:   # 초기 
            preds = m.detach().cpu().numpy()   # 예측 확률 
        else:
            preds = np.append(preds, m.detach().cpu().numpy(), axis=0)  
    # print(get_idx(preds[0]), get_idx(preds[1]), get_idx(preds[2]))
    # preds = np.argmax(preds, axis=1)
    
    for idx, comment in enumerate(comments):
        print(f'사용자 발화: {comment}')
        sent = []
        idx_list = get_idx(preds[idx])
        # print(idx_list)
        for idx2 in idx_list:
            sent.append(intent_list[idx2])
        
        print(f'입력받은 사용자 발화에서 {sent}이 감지되었습니다.')
        print('')
    return preds

In [98]:
test_dataset = load_and_cache_examples(args, tokenizer, mode="test") if args.test_file else None

In [99]:
y_pred = get_multi_label(args, model, test_dataset)

사용자 발화: 요즘 너무 불안해서 그런가 잠도 잘 안 오고 우울해
입력받은 사용자 발화에서 ['우울감']이 감지되었습니다.

사용자 발화: 오늘 학교에서 엄청 안 좋은 일이 있었어, 그리고 지금은 좀 초조해
입력받은 사용자 발화에서 ['죄책감']이 감지되었습니다.

사용자 발화: 왜 이렇게 집중 안되고 슬프지
입력받은 사용자 발화에서 ['불안']이 감지되었습니다.



In [100]:
for pred in y_pred:
    print(get_idx(pred))

[0]
[12]
[18]
[10]
[4]
[15]
[1, 18]
[0]
[4]
[11]
[17]
[3]
[7]
[1, 4, 10, 16]
[4]
[3]
[3]
[1]
[1]
[3]
[16]
[10]
[7]
[0]
[15]
[12]
[16]
[15]
[9]
[10]
[6]
[0]
[0]
[15]
[4]
[10]
[12]
[9]
[8]
[11]
[10]
[10]
[7]
[16]
[7, 9]
[12]
[11]
[8]
[15]
[9]
[9]
[9]
[11]
[3]
[9]
[9]
[2]
[3]
[10]
[0, 10, 18]
[1, 12, 17]
[9]
[8]
[7]
[8]
[10]
[10]
[10]
[16]
[4]
[10]
[17]
[7]
[3]
[1]
[0]
[3]
[2]
[2]
[11]
[6]
[15]
[8]
[3]
[9]
[1]
[9]
[8]
[11]
[9]
[9]
[18]
[15]
[1]
[12]
[1]
[4]
[15]
[16]
[14, 15]
[0]
[1]
[11]
[10]
[4]
[12, 15]
[9]
[3]
[3]
[1]
[9]
[10]
[10, 15]
[1]
[8]
[6]
[7]
[14]
[6]
[10]
[4]
[10]
[0]
[1]
[18]
[10]
[11]
[3]
[13]
[18]
[1]
[7]
[7]
[11]
[7]
[9]
[15]
[0]
[9]
[7]
[10]
[7]
[2]
[13]
[17]
[11]
[3, 5]
[3, 5]
[13]
[0]
[8]
[17]
[15]
[10]
[16]
[1]
[7]
[10]
[0]
[2]
[11]
[17]
[7]
[1]
[16]
[0]
[4]
[11]
[15]
[0]
[10]
[10]
[7]
[2]
[2]
[11]
[0]
[16]
[7]
[1, 4]
[4]
[0, 4, 11]
[9]
[12]
[4]
[1]
[16]
[12]
[3]
[14]
[7]
[16]
[6]
[10]
[3]
[0]
[18]
[0]
[4]
[16]
[18]
[10]
[10]
[10]
[16]
[8]
[18]
[2]
[3]
[9]
[12]
[4]
[

In [101]:
comments = ['요즘 너무 불안해서 그런가 잠도 잘 안 오고 우울해', '오늘 학교에서 엄청 안 좋은 일이 있었어, 그리고 지금은 좀 초조해',\
            '왜 이렇게 집중 안되고 슬프지']

In [102]:
examples = []
for (i, line) in enumerate(comments):
    guid = "%s-%s" % ("test", i)
    text_a = comments[i]
    examples.append(InputExample(guid=guid, text_a=text_a, text_b=None))
    
examples

[{
   "guid": "test-0",
   "text_a": "\uc694\uc998 \ub108\ubb34 \ubd88\uc548\ud574\uc11c \uadf8\ub7f0\uac00 \uc7a0\ub3c4 \uc798 \uc548 \uc624\uace0 \uc6b0\uc6b8\ud574",
   "text_b": null
 },
 {
   "guid": "test-1",
   "text_a": "\uc624\ub298 \ud559\uad50\uc5d0\uc11c \uc5c4\uccad \uc548 \uc88b\uc740 \uc77c\uc774 \uc788\uc5c8\uc5b4, \uadf8\ub9ac\uace0 \uc9c0\uae08\uc740 \uc880 \ucd08\uc870\ud574",
   "text_b": null
 },
 {
   "guid": "test-2",
   "text_a": "\uc65c \uc774\ub807\uac8c \uc9d1\uc911 \uc548\ub418\uace0 \uc2ac\ud504\uc9c0",
   "text_b": null
 }]

In [103]:
features = convert_examples_to_features(
            config, examples, tokenizer, max_length=args.max_seq_len, task='wellness'
      )

# Convert to Tensors and build dataset
all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)
all_attention_mask = torch.tensor([f.attention_mask for f in features], dtype=torch.long)
all_token_type_ids = torch.tensor([f.token_type_ids for f in features], dtype=torch.long)

test_dataset = TensorDataset(all_input_ids, all_attention_mask, all_token_type_ids)
test_dataset

<torch.utils.data.dataset.TensorDataset at 0x7f8931991550>

In [104]:
y_pred = get_multi_label(args, model, test_dataset)

사용자 발화: 요즘 너무 불안해서 그런가 잠도 잘 안 오고 우울해
입력받은 사용자 발화에서 ['우울감', '불안']이 감지되었습니다.

사용자 발화: 오늘 학교에서 엄청 안 좋은 일이 있었어, 그리고 지금은 좀 초조해
입력받은 사용자 발화에서 ['초조함']이 감지되었습니다.

사용자 발화: 왜 이렇게 집중 안되고 슬프지
입력받은 사용자 발화에서 ['우울감', '슬픔']이 감지되었습니다.



In [105]:
import pandas as pd

data = pd.read_csv('/home/ubuntu/chatbot/dataset/Wellness_Conversation_intent.csv')
data

Unnamed: 0,id,intent,context,context_morphs,label
0,0,우울감,임신해서 우울해,"['임신', '해서', '우울해']",0
1,1,우울감,아이 가지고 나서 우울해,"['아이', '가지', '고', '나', '서', '우울해']",0
2,2,우울감,아이 가졌는데 기분 하나도 안 좋고 울적해,"['아이', '가졌', '는데', '기분', '하나', '도', '안', '좋', ...",0
3,3,우울감,임신했는데 남편이 하나도 안 챙겨줘서 우울하다,"['임신', '했', '는데', '남편', '이', '하나', '도', '안', '...",0
4,4,우울감,진단 결과 안 좋게 나올 것 같아서 우울해,"['진단', '결과', '안', '좋', '게', '나올', '것', '같', '아...",0
...,...,...,...,...,...
19672,19764,불안,그래도 잠못자고 불안한건 여전해요.,"['그래도', '잠', '못', '자', '고', '불안', '한', '건', '여...",18
19673,19765,불안,불안함에 항상 시달리니까 잠도 못잤어요.,"['불안', '함', '에', '항상', '시달리', '니까', '잠', '도', ...",18
19674,19766,불안,불안하고 초조해서 잠이 안 와.,"['불안', '하', '고', '초조', '해서', '잠', '이', '안', '와...",18
19675,19767,불안,너무 불안하니까 밤만 되면 잠이 안 오고 너무 초조해.,"['너무', '불안', '하', '니까', '밤', '만', '되', '면', '잠...",18


In [25]:
data = data[['intent', 'context']]
data

Unnamed: 0,intent,context
0,우울감,임신해서 우울해
1,우울감,아이 가지고 나서 우울해
2,우울감,아이 가졌는데 기분 하나도 안 좋고 울적해
3,우울감,임신했는데 남편이 하나도 안 챙겨줘서 우울하다
4,우울감,진단 결과 안 좋게 나올 것 같아서 우울해
...,...,...
19672,불안,그래도 잠못자고 불안한건 여전해요.
19673,불안,불안함에 항상 시달리니까 잠도 못잤어요.
19674,불안,불안하고 초조해서 잠이 안 와.
19675,불안,너무 불안하니까 밤만 되면 잠이 안 오고 너무 초조해.


In [26]:
len(data.intent.unique())

19

In [27]:
intent_list = data.intent.unique()
intent_list[0], intent_list[18], intent_list[10], intent_list[1]

('우울감', '불안', '초조함', '슬픔')