In [1]:
import argparse
import json
import os
import glob
import re
import numpy as np
import torch
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler
from fastprogress.fastprogress import master_bar, progress_bar
from attrdict import AttrDict
from transformers import (
    AdamW,
    get_linear_schedule_with_warmup
)

from src import (
    CONFIG_CLASSES,
    TOKENIZER_CLASSES,
    MODEL_FOR_SEQUENCE_CLASSIFICATION,
    init_logger,
    set_seed,
    compute_metrics
)

from processor import seq_cls_load_and_cache_examples as load_and_cache_examples
from processor import seq_cls_tasks_num_labels as tasks_num_labels   
from processor import seq_cls_processors as processors
from processor import seq_cls_output_modes as output_modes

2022-09-05 05:35:11.088301: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcudart.so.11.0


In [2]:
task = 'wellness' 
config_dir = '/home/ubuntu/chatbot/code/config/'
config_file = 'koelectra-base.json'

with open(os.path.join(config_dir, task, config_file)) as f:
    args = AttrDict(json.load(f))

args.output_dir = os.path.join(args.ckpt_dir, args.output_dir)
set_seed(args)   # seed 값 설정 

In [3]:
processor = processors[args.task](args) 
processor

<processor.seq_cls.WellnessProcessor at 0x7fda45b73790>

In [4]:
processor = processors[args.task](args) 
labels = processor.get_labels()

# labels
len(labels)

19

In [5]:
config = CONFIG_CLASSES[args.model_type].from_pretrained(   # ElectraConfig.from_pretrained
        args.model_name_or_path,
        num_labels=tasks_num_labels[args.task],   # args.task: wellness, num_labels = 19 
        id2label={str(i): label for i, label in enumerate(labels)},   # labels: ['0', '1', '2', ... ,'18']
        label2id={label: i for i, label in enumerate(labels)},
)

config

ElectraConfig {
  "architectures": [
    "ElectraForPreTraining"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "embedding_size": 768,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "0",
    "1": "1",
    "10": "10",
    "11": "11",
    "12": "12",
    "13": "13",
    "14": "14",
    "15": "15",
    "16": "16",
    "17": "17",
    "18": "18",
    "2": "2",
    "3": "3",
    "4": "4",
    "5": "5",
    "6": "6",
    "7": "7",
    "8": "8",
    "9": "9"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "0": 0,
    "1": 1,
    "10": 10,
    "11": 11,
    "12": 12,
    "13": 13,
    "14": 14,
    "15": 15,
    "16": 16,
    "17": 17,
    "18": 18,
    "2": 2,
    "3": 3,
    "4": 4,
    "5": 5,
    "6": 6,
    "7": 7,
    "8": 8,
    "9": 9
  },
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "electra",
  "num_attention_heads": 12,
  "num_hidden_l

In [6]:
tokenizer = TOKENIZER_CLASSES[args.model_type].from_pretrained(   # model_type: koelectra-base, ElectraTokenizer 
    args.model_name_or_path,
    do_lower_case=args.do_lower_case,
    
)

model = MODEL_FOR_SEQUENCE_CLASSIFICATION[args.model_type].from_pretrained(
    args.model_name_or_path,
    config=config
)

Some weights of the model checkpoint at monologg/koelectra-base-discriminator were not used when initializing ElectraForSequenceClassification: ['discriminator_predictions.dense.weight', 'discriminator_predictions.dense_prediction.weight', 'discriminator_predictions.dense_prediction.bias', 'discriminator_predictions.dense.bias']
- This IS expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at monologg/koelectra-base-discriminator and are newly initialized: ['clas

In [8]:
class WellnessProcessor(object):
    """Processor for the Wellness data set """
    def __init__(self, args):
        self.args = args

    def get_labels(self):
        label_list = list(range(19))
        label_list = list(map(str, label_list)) 
        return label_list 

    @classmethod
    def _read_file(cls, input_file):
        """
        Reads a tab separated value file (csv).
        f: data/wellness/Wellness_Conversation_intent_train.tsv 
        """
        with open(input_file, "r", encoding="utf-8") as f:
            lines = []
            for line in f:
                tmp = line.strip(',') 
                lines.append(tmp[:-1]) 
            return lines

    def _create_examples(self, lines, set_type):
        """Creates examples for the training and dev sets."""
        examples = []
        for (i, line) in enumerate(lines[1:]):
            line = line.split("\t")
            guid = "%s-%s" % (set_type, i)
            text_a = line[1]
            label = line[2]
            examples.append(InputExample(guid=guid, text_a=text_a, text_b=None, label=label))
        return examples

    def get_examples(self, mode):
        """
        Args:
            mode: train, dev, test
        """
        file_to_read = None
        if mode == "train":
            file_to_read = self.args.train_file
        elif mode == "dev":
            file_to_read = self.args.dev_file
        elif mode == "test":
            file_to_read = self.args.test_file

        return self._create_examples(
            self._read_file(os.path.join(self.args.data_dir, self.args.task, file_to_read)), mode
        )

In [9]:
processor = WellnessProcessor(args)   # WellnessProcessor(args)
output_mode = 'classification'   

NameError: name 'seq_cls_processors' is not defined

In [11]:
file_to_read = args.test_file
file_to_read

'Wellness_Conversation_intent_test.tsv'

In [13]:
examples = processor.get_examples("test")
examples

[{
   "guid": "test-0",
   "label": "0",
   "text_a": "\uae09 \uae30\uc5b5\ub825 \uac10\ud1f4 \uc640\uc11c \uc9c4\uc2ec \uc6b0\uc6b8\ud574",
   "text_b": null
 },
 {
   "guid": "test-1",
   "label": "17",
   "text_a": "\uc790\ucc45\uac10\uc73c\ub85c \ud3c9\uc0dd\uc744 \uc0b4\ub77c\uace0 \uc720\uc11c\ub3c4 \uc368\ub193\uace0 \uc606\uc5d0 \ub480\uc5b4.",
   "text_b": null
 },
 {
   "guid": "test-2",
   "label": "18",
   "text_a": "\uae34\uc7a5\uac10 \ub54c\ubb38\uc5d0 \ud798\ub4e4\ub2e4",
   "text_b": null
 },
 {
   "guid": "test-3",
   "label": "10",
   "text_a": "\uc2e0\uacbd\uc9c8\uc774 \ub098\ub294\ub370 \uadf8 \uc0ac\ub78c \uc637\uc744 \ubc97\uaca8\uc8fc\uace0\ub294 \uce68\ub300\uc5d0\uc11c \uc7ac\uc6e0\uc5b4\uc694.",
   "text_b": null
 },
 {
   "guid": "test-4",
   "label": "4",
   "text_a": "\uc544\ubb34 \uac83\ub3c4 \ud558\uae30 \uc2eb\uc5b4\uc11c \uadf8\ub0e5 \ub204\uc6cc\ub9cc \uc788\uace0\u2026",
   "text_b": null
 },
 {
   "guid": "test-5",
   "label": "15",
   "text_a": "\ub

In [17]:
lines = processor._read_file(os.path.join(args.data_dir, args.task, file_to_read))   # lines 
lines

['id\tcontext\tlabel',
 '0000776\t급 기억력 감퇴 와서 진심 우울해\t0',
 '0018775\t자책감으로 평생을 살라고 유서도 써놓고 옆에 뒀어.\t17',
 '0019081\t긴장감 때문에 힘들다\t18',
 '0013023\t신경질이 나는데 그 사람 옷을 벗겨주고는 침대에서 재웠어요.\t10',
 '0006285\t아무 것도 하기 싫어서 그냥 누워만 있고…\t4',
 '0017358\t뭔가 이렇게 생각하는 거 보면 내가 너무 엄살 부리는 것 같아서 한심하다.\t15',
 '0019425\t안정이 안 되는 것 같아\t18',
 '0000981\t그냥 하루종일 제 감정이 우울한 거 하나인 거 같아요.\t0',
 '0006614\t말도 안통하는데 싸우는 것도 이제 지쳐.\t5',
 '0013581\t어제 늦게 잤더니 몸이 처져\t11',
 '0018348\t남편때문에 인생 끝내버리고 싶다\t17',
 '0004479\t사람들이 미워요\t3',
 '0007156\t입맛이 없네요.\t7',
 '0005895\t지금 그때 생각만 해도 끔찍하다.\t4',
 '0006480\t그래서 계속 누워있게되고 움직이기도 싫었어요.\t4',
 '0004997\t도망 간 사장 생각하면 진짜 죽여 버리고 싶어.\t3',
 '0004268\t막 화낸 게 미안해서 또 울어\t3',
 '0004196\t하소연 할데도 없고..\t2',
 '0001680\t사는 것 같지 않고 슬퍼요\t1',
 '0004243\t한번 화가 나면 속에서 온갖 욕이란 욕은 다 하는 것 같아\t3',
 '0017559\t인생이 무너져내리고 있다는 생각이 들었어\t16',
 '0010711\t요즘 싸움이 잦아서 신경이 더 날카로워졌어\t10',
 '0007221\t식욕저하 때문에 식사를 못해서 움직일 힘도 없어요.\t7',
 '0000880\t머리도 계속 멍하고, 계속 생각에 잠겨있어.\t0',
 '0016720\t신랑 집은 잘 사는데 나는 능력도 없고 집도 별로니까 너무 주눅들게 되고 힘