## import modules

In [1]:
from tqdm import tqdm
import re
import torch
import os
import random
import json
import numpy as np

import torch.nn.functional as F
from torch.utils.data import DataLoader
from sklearn.metrics import accuracy_score
from transformers import TrainingArguments, Trainer
from transformers import AutoModelForSequenceClassification, AutoConfig, AutoTokenizer

In [2]:
# os.environ['CUDA_LAUNCH_BLOCKING'] = "1"  # 디버깅 위한 세팅
# os.environ["CUDA_VISIBLE_DEVICES"] = "0"

print('Device:', torch.cuda.device)  # 출력결과: cuda 
print('Count of using GPUs:', torch.cuda.device_count())   
print('Current cuda device:', torch.cuda.current_device()) 

Device: <class 'torch.cuda.device'>
Count of using GPUs: 2
Current cuda device: 0


### Load Data
Data가 위치한 PATH에서 Data를 불러오기

In [3]:
def jsonload(fname, encoding="utf-8"):
    with open(fname, encoding=encoding) as f:
        j = json.load(f)

    return j

# json 개체를 파일이름으로 깔끔하게 저장
def jsondump(j, fname):
    with open(fname, "w", encoding="UTF8") as f:
        json.dump(j, f, ensure_ascii=False)

# jsonl 파일 읽어서 list에 저장
def jsonlload(fname, encoding="utf-8"):
    json_list = []
    with open(fname, encoding=encoding) as f:
        for line in f.readlines():
            json_list.append(json.loads(line))
    return json_list

In [4]:
train_data = jsonlload('nikluge-sa-2022-train.jsonl')
dev_data = jsonlload('nikluge-sa-2022-dev.jsonl')

## Preprocessing method

In [5]:
def preprocess(text: str, only_kor: bool = True):
    """한국어 문장을 옵션에 맞게 전처리"""
    # 한국어 모음과 특수 문자, 숫자 및 영어 제거
    if only_kor:
        text = re.sub(f"[^가-힣| |]+", "", text)
    else:
        text = re.sub(f"[^가-힣|ㄱ-ㅎ|0-9|]+", "", text)

    # 연속 공백 제거
    text = re.sub(" +", " ", text)

    # 좌우 불필요한 공백 제거
    return text.strip()

## 재현성 위한 seed 고정

In [6]:
def seed_everything(seed:int = 1004):
    random.seed(seed)
    np.random.seed(seed)
    os.environ["PYTHONHASHSEED"] = str(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)  # type: ignore
    torch.backends.cudnn.deterministic = True  # type: ignore
    torch.backends.cudnn.benchmark = True  # type: ignore
    
seed_everything(42)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)

cuda


## 태그셋 정의

In [7]:
#개체#속성 태그셋 정의
entity_property_pair = [
    '제품 전체#품질', '제품 전체#편의성', '제품 전체#디자인', '제품 전체#일반', '제품 전체#가격', 
    '제품 전체#인지도',  '제품 전체#다양성',
    '패키지/구성품#디자인', '패키지/구성품#가격', '패키지/구성품#다양성', '패키지/구성품#일반',
    '패키지/구성품#편의성', '패키지/구성품#품질',
    '본품#일반', '본품#다양성', '본품#품질', '본품#편의성', '본품#디자인', '본품#가격',
    '브랜드#인지도', '브랜드#일반', '브랜드#디자인', '브랜드#품질', '브랜드#가격']

rep_entity_property_pair = [
    '제품전체#품질', '제품전체#편의성', '제품전체#디자인', '제품전체#일반', '제품전체#가격', 
    '제품전체#인지도',
    '패키지/구성품#디자인', '패키지/구성품#일반',
    '패키지/구성품#편의성', '패키지/구성품#품질',
    '본품#일반', '본품#다양성', '본품#품질', '본품#편의성', '본품#디자인',
    '브랜드#인지도', '브랜드#일반', '브랜드#품질']


"""
Train데이터에 등장 안하거나 희소하게 등장한 태그들 ==> rep에서 모두 지워주었음.

제품 전체#다양성  0

패키지/구성품#가격 0 
패키지/구성품#다양성 1

본품#인지도 1
본품#가격  2
본품#인지도 1

브랜드#디자인  0
브랜드#가격  3
"""

len(rep_entity_property_pair)

18

## Load Tokenizer

In [8]:
MODEL_NAME = 'klue/roberta-large'

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

tokenizer

PreTrainedTokenizerFast(name_or_path='klue/roberta-large', vocab_size=32000, model_max_len=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '[CLS]', 'eos_token': '[SEP]', 'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'})

In [9]:
special_tokens_dict = {
    'additional_special_tokens': rep_entity_property_pair
}

print(tokenizer.vocab_size)

# 토크나이저는 각 모델 별로 만들지 않아도 된다. [임베딩 layer에만 영향 주므로]
num_added_toks = tokenizer.add_special_tokens(special_tokens_dict)

print(num_added_toks)

32000
18


In [10]:
print(tokenizer.tokenize("야 아이패드 가격 왜 이래 본품#일반"))
print(tokenizer("야 아이패드 가격 왜 이래 본품#가격"))
print(tokenizer.decode(tokenizer.encode("야 아이패드 가격 왜 이래 본품#가격")))

['야', '아이패드', '가격', '왜', '이래', '본품#일반']
{'input_ids': [0, 1396, 15641, 3852, 1460, 5625, 1163, 2425, 7, 3852, 2], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}
[CLS] 야 아이패드 가격 왜 이래 본품 # 가격 [SEP]


## Load  ACD Model

In [11]:
ACD_model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, 
                        num_labels=2)  #True/False 
ACD_model.resize_token_embeddings(tokenizer.vocab_size + num_added_toks)

Some weights of the model checkpoint at klue/roberta-large were not used when initializing RobertaForSequenceClassification: ['lm_head.layer_norm.bias', 'lm_head.decoder.bias', 'lm_head.decoder.weight', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at klue/roberta-large and are newly initialized: ['classifier.dense.weight', 'classifier.out_proj.weight', 'cla

Embedding(32018, 1024)

## 데이터 준비

In [12]:
polarity_id_to_name = ['positive', 'negative', 'neutral']  # 차례대로 0, 1, 2
polarity_name_to_id = {polarity_id_to_name[i]: i for i in range(len(polarity_id_to_name))}

In [13]:
ACD_Datas = []  
ACD_labels = []

POL_Datas = []
POL_labels = []

### sentence들 먼저 모으고 한꺼번에 tokenize 하자


for datas in train_data:
    sen = preprocess(datas['sentence_form'])
    annos = datas['annotation']  # 여러 개 일 수도 있음. (이중 리스트)
    
    for annotation in annos:
        for pair in rep_entity_property_pair:
            entity_property = annotation[0]  # raw_data의 annotation 추출
            entity_property = entity_property.replace(" ", "")  # 띄어쓰기 제거
            polarity = annotation[2]

            if entity_property == pair:
                ACD_Datas.append(sen + " " + pair) 
                ACD_labels.append(1)
                POL_Datas.append(sen + " " + pair)
                POL_labels.append(polarity_name_to_id[polarity])

            else:
                ACD_Datas.append(sen + " " + pair) 
                ACD_labels.append(0)

In [14]:
len(ACD_Datas), len(ACD_labels), len(POL_Datas), len(POL_labels)

(58590, 58590, 3248, 3248)

In [15]:
devs = dev_data[:838]
tests = dev_data[838:]

In [16]:
################### Dev set #######################
dev_ACD_Datas = []  
dev_ACD_labels = []

dev_POL_Datas = []
dev_POL_labels = []

### sentence들 먼저 모으고 한꺼번에 tokenize 하자


for datas in devs:
    sen = preprocess(datas['sentence_form'])
    annos = datas['annotation']  # 여러 개 일 수도 있음. (이중 리스트)
    
    for annotation in annos:
        for pair in rep_entity_property_pair:
            entity_property = annotation[0]  # raw_data의 annotation 추출
            entity_property = entity_property.replace(" ", "")  # 띄어쓰기 제거
            polarity = annotation[2]

            if entity_property == pair:
                dev_ACD_Datas.append(sen + " " + pair) 
                dev_ACD_labels.append(1)
                dev_POL_Datas.append(sen + " " + pair)
                dev_POL_labels.append(polarity_name_to_id[polarity])

            else:
                dev_ACD_Datas.append(sen + " " + pair) 
                dev_ACD_labels.append(0)

In [17]:
len(dev_ACD_Datas), len(dev_ACD_labels), len(dev_POL_Datas), len(dev_POL_labels)

(16884, 16884, 934, 934)

In [18]:
################### Test set #######################
test_ACD_Datas = []  
test_ACD_labels = []

test_POL_Datas = []
test_POL_labels = []

### sentence들 먼저 모으고 한꺼번에 tokenize 하자


for datas in tests:
    sen = preprocess(datas['sentence_form'])
    annos = datas['annotation']  # 여러 개 일 수도 있음. (이중 리스트)
    
    for annotation in annos:
        for pair in rep_entity_property_pair:
            entity_property = annotation[0]  # raw_data의 annotation 추출
            entity_property = entity_property.replace(" ", "")  # 띄어쓰기 제거
            polarity = annotation[2]

            if entity_property == pair:
                test_ACD_Datas.append(sen + " " + pair) 
                test_ACD_labels.append(1)
                test_POL_Datas.append(sen + " " + pair)
                test_POL_labels.append(polarity_name_to_id[polarity])

            else:
                test_ACD_Datas.append(sen + " " + pair) 
                test_ACD_labels.append(0)

In [19]:
len(test_ACD_Datas), len(test_ACD_labels), len(test_POL_Datas), len(test_POL_labels)

(38538, 38538, 2138, 2138)

In [20]:
class klue_Dataset(torch.utils.data.Dataset):
    def __init__(self, dataset, label):  # 전처리된 데이터 셋이 들어옴
        self.dataset = dataset
        self.label = label

    def __getitem__(self, idx):
        # getitem이므로 gradient 계산에 영향을 주지 않게 clone().detach() 실행
        item = {key: val[idx].clone().detach() for key, val in self.dataset.items()}
        item['label'] = torch.tensor(self.label[idx])
        
        return item

    def __len__(self):  # 샘플 수
        return len(self.label)

In [21]:
ACD_tok_sen = tokenizer(ACD_Datas, padding='max_length', return_tensors="pt",
                max_length=256, truncation=True, add_special_tokens=True)  

ACD_klue_sets = klue_Dataset(ACD_tok_sen, ACD_labels)  # klue_Dataset에서 1차원 텐서로 바뀜.

In [22]:
ACD_klue_sets.__getitem__(758)

{'input_ids': tensor([    0,  5560,  7036,  2116, 27316,  4015,  2460,  5153, 13679,  1785,
          2235,  2119,   856,  2126,  2496,  2051,  1513,  2051,  2112,  3737,
          8286,  2069,  1552,  2210,  1295,  1513,  2062, 32002,     2,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,   

In [23]:
print(ACD_klue_sets.__len__())

58590


In [24]:
dev_ACD_tok_sen = tokenizer(dev_ACD_Datas, padding='max_length', return_tensors="pt",
                max_length=256, truncation=True, add_special_tokens=True)  

dev_ACD_klue_sets = klue_Dataset(dev_ACD_tok_sen, dev_ACD_labels)  # klue_Dataset에서 1차원 텐서로 바뀜.
    
print(dev_ACD_klue_sets.__getitem__(758))
print(dev_ACD_klue_sets.__len__())

{'input_ids': tensor([    0, 11398,  2079,  6548,  2116,  7847,  3954,  5763, 27208, 13964,
          848,  4442,  3853,  2259,  5009,  6548,  3681,  2052,  2209,  4181,
         2190,  2995,  4671,  2085,  2154,  6159,  5370,   887, 25219,  2118,
         1899,  2357,  2052,  3614,  3707,  3904, 11187,   831,  6758,  2318,
        12110, 32002,     2,     1,     1,     1,     1,     1,     1,     1,
            1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
            1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
            1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
            1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
            1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
            1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
            1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
            1,     1,     1,     1,     1,     1, 

In [25]:
test_ACD_tok_sen = tokenizer(test_ACD_Datas, padding='max_length', return_tensors="pt",
                max_length=256, truncation=True, add_special_tokens=True)  

test_ACD_klue_sets = klue_Dataset(test_ACD_tok_sen, test_ACD_labels)  # klue_Dataset에서 1차원 텐서로 바뀜.
    
print(test_ACD_klue_sets.__getitem__(758))
print(test_ACD_klue_sets.__len__())

{'input_ids': tensor([    0,  5204, 31302,  2716,  2073,  9366,  2079,  9471, 32002,     2,
            1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
            1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
            1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
            1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
            1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
            1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
            1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
            1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
            1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
            1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
            1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
            1,     1,     1,     1,     1,     1, 

### Polarity datasets

In [26]:
pol_tok_sen = tokenizer(POL_Datas, padding='max_length', return_tensors="pt",
                    max_length=256, truncation=True, add_special_tokens=True)  

POL_klue_sets = klue_Dataset(pol_tok_sen, POL_labels)

POL_klue_sets.__getitem__(758)

{'input_ids': tensor([    0, 22869,  2145, 13408, 14360,  1560,  2073,  2532,  2073,  4458,
          4019, 21820,  2047,  2377,  2052, 23548,  1540,  2052,  1141,  2052,
          2170,  2182, 32012,     2,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,   

In [27]:
pol_tok_sen = tokenizer(dev_POL_Datas, padding='max_length', return_tensors="pt",
                    max_length=256, truncation=True, add_special_tokens=True)  

dev_POL_klue_sets = klue_Dataset(pol_tok_sen, dev_POL_labels)

dev_POL_klue_sets.__getitem__(758)

{'input_ids': tensor([    0,  3776,  2170,  1904,  2119,  5723,  2112,  4514, 31302,  6509,
           623,  2535,  4168, 32010,     2,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,   

In [28]:
pol_tok_sen = tokenizer(test_POL_Datas, padding='max_length', return_tensors="pt",
                    max_length=256, truncation=True, add_special_tokens=True)  

test_POL_klue_sets = klue_Dataset(pol_tok_sen, test_POL_labels)

test_POL_klue_sets.__getitem__(758)

{'input_ids': tensor([    0, 18391,  2069,  1523,  2259,  1284,  2119,  6978,  2088, 22045,
          2205,  2307,  4975,  2118,  2259,  8146, 19630,  2119,  1560,  2088,
         32001,     2,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,   

## Trainer 정의

In [29]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='binary')
    acc = accuracy_score(labels, preds)
    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall,
    }

In [None]:
odir = '/mnt/HDD4T/egg2018037024/ABSA_ACD'

training_ars = TrainingArguments(
    output_dir=odir,
    num_train_epochs=5,
 #   max_steps=5000,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    save_total_limit=5,
    save_strategy = "epoch",
 #   save_steps=1000,
    learning_rate=5e-5,
    weight_decay=0.01,
    evaluation_strategy='epoch',
    load_best_model_at_end = True,
)

trainer = Trainer(
    model=ACD_model,
    args=training_ars,
    train_dataset=ACD_klue_sets,
    eval_dataset=dev_ACD_klue_sets,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

trainer.train()
ACD_mpodel.save_pretrained(odir + "_best")

***** Running training *****
  Num examples = 58590
  Num Epochs = 5
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 1
  Total optimization steps = 9155


Epoch,Training Loss,Validation Loss
