## 데이터

In [2]:
import os
my_hf_token = ''
my_hf_id = 'HueyVault'
my_hf_repo_name = 'sentiment_analysis_klue_sts'
my_hf_repo = f'{my_hf_id}/{my_hf_repo_name}'
my_hf_dir = f'./{my_hf_repo_name}'
os.environ['HF_TOKEN'] = my_hf_token

In [4]:
from datasets import load_dataset

In [6]:
# https://huggingface.co/datasets/klue/klue/viewer/ynat
klue_sts_train = load_dataset('klue','sts',split='train')
klue_sts_validation = load_dataset('klue','sts',split='validation')

In [8]:
type(klue_sts_train)

datasets.arrow_dataset.Dataset

In [9]:
klue_sts_train[0]

{'guid': 'klue-sts-v1_train_00000',
 'source': 'airbnb-rtt',
 'sentence1': '숙소 위치는 찾기 쉽고 일반적인 한국의 반지하 숙소입니다.',
 'sentence2': '숙박시설의 위치는 쉽게 찾을 수 있고 한국의 대표적인 반지하 숙박시설입니다.',
 'labels': {'label': 3.7, 'real-label': 3.714285714285714, 'binary-label': 1}}

In [10]:
vars(klue_sts_train)['_info'] # 데이터 셋이 가진 여러 정보 HF 는 다 이렇게 되어 있다.

DatasetInfo(description='', citation='', homepage='', license='', features={'guid': Value(dtype='string', id=None), 'source': Value(dtype='string', id=None), 'sentence1': Value(dtype='string', id=None), 'sentence2': Value(dtype='string', id=None), 'labels': {'label': Value(dtype='float64', id=None), 'real-label': Value(dtype='float64', id=None), 'binary-label': ClassLabel(names=['negative', 'positive'], id=None)}}, post_processed=None, supervised_keys=None, builder_name='parquet', dataset_name='klue', config_name='sts', version=0.0.0, splits={'train': SplitInfo(name='train', num_bytes=2837266, num_examples=11668, shard_lengths=None, dataset_name='klue'), 'validation': SplitInfo(name='validation', num_bytes=122836, num_examples=519, shard_lengths=None, dataset_name='klue')}, download_checksums={'hf://datasets/klue@349481ec73fff722f88e0453ca05c77a447d967c/sts/train-00000-of-00001.parquet': {'num_bytes': 1519075, 'checksum': None}, 'hf://datasets/klue@349481ec73fff722f88e0453ca05c77a447d9

In [17]:
klue_sts_train.features['labels']['binary-label'].names # 설명 카테고리 

['negative', 'positive']

In [45]:
klue_sts_train.features['labels']['binary-label']

ClassLabel(names=['negative', 'positive'], id=None)

In [50]:
from datasets import ClassLabel, Features, Value

# 새로운 feature 구조 정의
new_features = Features({
    'sentence1': Value('string'),
    'label': ClassLabel(names=['negative', 'positive'])
})

# 데이터셋 변환
klue_sts_train_data = klue_sts_train.map(
    lambda x: {
        'sentence1': x['sentence1'],
        'label': x['labels']['binary-label']
    },
    remove_columns=klue_sts_train.column_names,
    features=new_features
)

# 확인
print(klue_sts_train_data.features)  # 데이터셋 구조 확인
print(klue_sts_train_data[0])        # 첫 번째 샘플 확인

{'sentence1': Value(dtype='string', id=None), 'label': ClassLabel(names=['negative', 'positive'], id=None)}
{'sentence1': '숙소 위치는 찾기 쉽고 일반적인 한국의 반지하 숙소입니다.', 'label': 1}


In [51]:
# 데이터셋 변환
klue_sts_validation_data = klue_sts_validation.map(
    lambda x: {
        'sentence1': x['sentence1'],
        'label': x['labels']['binary-label']
    },
    remove_columns=klue_sts_validation.column_names,
    features=new_features
)

print(klue_sts_validation_data.features)  # 데이터셋 구조 확인
print(klue_sts_validation_data[0])        # 첫 번째 샘플 확인

{'sentence1': Value(dtype='string', id=None), 'label': ClassLabel(names=['negative', 'positive'], id=None)}
{'sentence1': '무엇보다도 호스트분들이 너무 친절하셨습니다.', 'label': 1}


In [52]:
klue_sts_train_data, klue_sts_validation_data

(Dataset({
     features: ['sentence1', 'label'],
     num_rows: 11668
 }),
 Dataset({
     features: ['sentence1', 'label'],
     num_rows: 519
 }))

In [53]:
## 데이터 분할
klue_sts_train_data_split = klue_sts_train_data.train_test_split(test_size=0.3, shuffle=True, seed=24)
klue_sts_train_data_split

DatasetDict({
    train: Dataset({
        features: ['sentence1', 'label'],
        num_rows: 8167
    })
    test: Dataset({
        features: ['sentence1', 'label'],
        num_rows: 3501
    })
})

In [54]:
train_dataset_before = klue_sts_train_data_split['train']
test_dataset_before = klue_sts_train_data_split['test']
train_dataset_before, test_dataset_before

(Dataset({
     features: ['sentence1', 'label'],
     num_rows: 8167
 }),
 Dataset({
     features: ['sentence1', 'label'],
     num_rows: 3501
 }))

## 모델

In [55]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

In [57]:
model_id = "klue/roberta-base"
num_labels = len(train_dataset_before.features['label'].names ) # 'negative' : 0, 'positive' : 1
model_ynat = AutoModelForSequenceClassification.from_pretrained(model_id, num_labels=num_labels)
# model_ynat.state_dict() # pretrained weight 
model_ynat

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at klue/roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(32000, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
         

In [58]:
# Load model directly
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained(model_id)

In [59]:
def tokenize_function(examples):
    return tokenizer(examples["sentence1"], padding="max_length", truncation=True)

In [60]:
train_dataset = train_dataset_before.map(tokenize_function, batched=True)
train_dataset

Map:   0%|          | 0/8167 [00:00<?, ? examples/s]

Dataset({
    features: ['sentence1', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 8167
})

In [61]:
test_dataset = test_dataset_before.map(tokenize_function, batched=True)
test_dataset

Map:   0%|          | 0/3501 [00:00<?, ? examples/s]

Dataset({
    features: ['sentence1', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 3501
})

In [62]:
train_dataset[0].keys()

dict_keys(['sentence1', 'label', 'input_ids', 'token_type_ids', 'attention_mask'])

In [63]:
validation_dataset = klue_sts_validation_data.map(tokenize_function, batched=True)
validation_dataset

Map:   0%|          | 0/519 [00:00<?, ? examples/s]

Dataset({
    features: ['sentence1', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 519
})

In [64]:
# 모델의 예측 아이디와 문자열 레이블을 연결할 데이터를 모델 config에 저장
id2label = {i: label for i, label in enumerate(train_dataset.features['label'].names)}
label2id = {label: i for i, label in id2label.items()}
model_ynat.config.id2label = id2label
model_ynat.config.label2id = label2id

In [67]:
from transformers import Trainer, TrainingArguments # Trainer 학습 도구, TrainingArguments 학습에 필요한 셋팅
training_args = TrainingArguments(output_dir=my_hf_dir,
                 num_train_epochs=1,
                 per_device_train_batch_size=8,
                 per_device_eval_batch_size=8,
                 eval_strategy='epoch',
                 learning_rate=0.00001, # fine tune에서는 작게 잡음. 더 작게 해야함.
                 push_to_hub=False, 
                 logging_steps=1, 
                 report_to="none"  # WandB, TensorBoard 등 모두 비활성화
                 )

In [68]:
import numpy as np
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return {"accuracy": (predictions == labels).mean()}

In [69]:
trainer = Trainer(model=model_ynat,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=validation_dataset,
        tokenizer=tokenizer,
        compute_metrics=compute_metrics
       )

  trainer = Trainer(model=model_ynat,


In [70]:
# fine tunning 시작
trainer.train()



Epoch,Training Loss,Validation Loss,Accuracy
1,0.6989,0.772502,0.439306




TrainOutput(global_step=511, training_loss=0.6799547163707637, metrics={'train_runtime': 475.3513, 'train_samples_per_second': 17.181, 'train_steps_per_second': 1.075, 'total_flos': 2148827989125120.0, 'train_loss': 0.6799547163707637, 'epoch': 1.0})

## 모델 평가

In [71]:
# 정확도율 확인
trainer.evaluate(validation_dataset)

{'eval_loss': 0.7725024223327637,
 'eval_accuracy': 0.4393063583815029,
 'eval_runtime': 9.4062,
 'eval_samples_per_second': 55.176,
 'eval_steps_per_second': 3.508,
 'epoch': 1.0}

## 모델 서비스

In [72]:
# model upload to huggingface
from huggingface_hub import login

login(token=my_hf_token)
repo_id = my_hf_repo
trainer.push_to_hub(repo_id)


Note: Environment variable`HF_TOKEN` is set and is the current active token independently from the token you've just configured.


training_args.bin:   0%|          | 0.00/5.30k [00:00<?, ?B/s]

Upload 2 LFS files:   0%|          | 0/2 [00:00<?, ?it/s]

model.safetensors:   0%|          | 0.00/443M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/HueyVault/sentiment_analysis_klue_sts/commit/bf5f00ffbd4ceaf37045e13fa928712056d7dea1', commit_message='HueyVault/sentiment_analysis_klue_sts', commit_description='', oid='bf5f00ffbd4ceaf37045e13fa928712056d7dea1', pr_url=None, repo_url=RepoUrl('https://huggingface.co/HueyVault/sentiment_analysis_klue_sts', endpoint='https://huggingface.co', repo_type='model', repo_id='HueyVault/sentiment_analysis_klue_sts'), pr_revision=None, pr_num=None)

In [73]:
from transformers import pipeline

# model_id = 'otter35/roberta-base-klue-ynat-classification'
# model_pipeline = pipeline('text-classification', model=model_id)
model_pipeline = pipeline('text-classification', model=repo_id)

config.json:   0%|          | 0.00/933 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/443M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.31k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/248k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/752k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/971 [00:00<?, ?B/s]

Device set to use cuda:0


In [78]:
test_dataset[4:10]['sentence1'], test_dataset[4:10]['label']

(['메일 관리를 할 때에는 리눅스 말고 윈도우를 설치하여 해주세요.',
  '키패드로 운영되는 곳이라서 편리합니다.',
  '아울러 국토부는 전국 철도역 전광판, 도로 VMS, 역사 및 차량 내 안내방송 등을 통해 감염병 예방수칙 등 국민 행동요령을 홍보한다.',
  '아주 가까이 25번 트램 정거장이 있습니다.',
  '위치 숙소청결 그리고 옥상뷰가 너무 좋습니다!',
  '지원금액은 현재 수소연료 구입 단가와 사업자가 손익분기점을 달성할 수 있는 수준의 기준단가간 차액의 70%로 산정된다.'],
 [0, 0, 1, 1, 1, 1])

In [76]:
model_pipeline(test_dataset[4:10]['sentence1'])

[{'label': 'negative', 'score': 0.5109409093856812},
 {'label': 'positive', 'score': 0.5634434819221497},
 {'label': 'negative', 'score': 0.55405193567276},
 {'label': 'positive', 'score': 0.7216688990592957},
 {'label': 'positive', 'score': 0.679872453212738},
 {'label': 'negative', 'score': 0.5153777003288269}]