In [1]:
import os
my_hf_token = ''
my_hf_id = 'Jo-2j'
my_hf_repo_name = 'sentiment_analysis_klue_sts'
my_hf_repo = f'{my_hf_id}/{my_hf_repo_name}'
my_hf_dir = f'./{my_hf_repo_name}'
os.environ['HF_TOKEN'] = my_hf_token

## Data Refining

In [2]:
from datasets import load_dataset

In [3]:
# https://huggingface.co/datasets/klue/klue/viewer/ynat
klue_sts_train = load_dataset('klue', 'sts', split='train')
klue_sts_validation = load_dataset('klue', 'sts', split='validation')
# klue_ynat_train

README.md:   0%|          | 0.00/22.5k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/1.52M [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/68.8k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/11668 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/519 [00:00<?, ? examples/s]

In [4]:
klue_sts_train[0]

{'guid': 'klue-sts-v1_train_00000',
 'source': 'airbnb-rtt',
 'sentence1': '숙소 위치는 찾기 쉽고 일반적인 한국의 반지하 숙소입니다.',
 'sentence2': '숙박시설의 위치는 쉽게 찾을 수 있고 한국의 대표적인 반지하 숙박시설입니다.',
 'labels': {'label': 3.7, 'real-label': 3.714285714285714, 'binary-label': 1}}

In [5]:
vars(klue_sts_train).keys()

dict_keys(['_info', '_split', '_indexes', '_data', '_indices', '_format_type', '_format_kwargs', '_format_columns', '_output_all_columns', '_fingerprint'])

In [6]:
# klue_sts_train['sentence1']

In [7]:
klue_sts_train_data = klue_sts_train.remove_columns(['guid', 'source', 'sentence2'])
klue_sts_validation_data = klue_sts_validation.remove_columns(['guid', 'source', 'sentence2'])

In [8]:
klue_sts_train_data, klue_sts_validation_data

(Dataset({
     features: ['sentence1', 'labels'],
     num_rows: 11668
 }),
 Dataset({
     features: ['sentence1', 'labels'],
     num_rows: 519
 }))

In [9]:
# klue_sts_train_data.features['sentence1'].int2str(1) int2str 이 정확하게 뭘까? 여기는 그냥 다 문장(글자)이어서 
# klue_sts_train_data.features['labels'].int2str(1) ???

In [10]:
# # type(klue_sts_train['labels'])
# # klue_sts_train['labels']

# sentiCol = klue_sts_train['labels']

# # # # sentiments = list(map(lambda x: x['label'], sentiCol))
# # # # print(sentiments)

# for sentiment in sentiCol:
#      print(sentiment['binary-label'])  # 각 딕셔너리의 'name' 값 출력


In [11]:
from datasets import ClassLabel, Features, Value

# 새로운 feature 구조 정의
new_features = Features({
    'sentence1': Value('string'),
    'label': ClassLabel(names=['negative', 'positive'])
})

# 데이터셋 변환
klue_sts_train_data = klue_sts_train.map(
    lambda x: {
        'sentence1': x['sentence1'],
        'label': x['labels']['binary-label']
    },
    remove_columns=klue_sts_train.column_names,
    features=new_features
)

# 확인
print(klue_sts_train_data.features)  # 데이터셋 구조 확인
print(klue_sts_train_data[0])        # 첫 번째 샘플 확인

# 데이터셋 변환
klue_sts_validation_data = klue_sts_validation.map(
    lambda x: {
        'sentence1': x['sentence1'],
        'label': x['labels']['binary-label']
    },
    remove_columns=klue_sts_validation.column_names,
    features=new_features
)

print(klue_sts_validation_data.features)  # 데이터셋 구조 확인
print(klue_sts_validation_data[0])        # 첫 번째 샘플 확인

# 참조 N/I

Map:   0%|          | 0/11668 [00:00<?, ? examples/s]

{'sentence1': Value(dtype='string', id=None), 'label': ClassLabel(names=['negative', 'positive'], id=None)}
{'sentence1': '숙소 위치는 찾기 쉽고 일반적인 한국의 반지하 숙소입니다.', 'label': 1}


Map:   0%|          | 0/519 [00:00<?, ? examples/s]

{'sentence1': Value(dtype='string', id=None), 'label': ClassLabel(names=['negative', 'positive'], id=None)}
{'sentence1': '무엇보다도 호스트분들이 너무 친절하셨습니다.', 'label': 1}


In [12]:
klue_sts_train_data, klue_sts_validation_data

(Dataset({
     features: ['sentence1', 'label'],
     num_rows: 11668
 }),
 Dataset({
     features: ['sentence1', 'label'],
     num_rows: 519
 }))

In [13]:
klue_sts_train_data_split = klue_sts_train_data.train_test_split(test_size=0.3
                                                                   , shuffle=True, seed=24)
# klue_ynat_train_data_split['test']
klue_sts_train_data_split['test'] # 데이터 스플릿을 하는 이유?

Dataset({
    features: ['sentence1', 'label'],
    num_rows: 3501
})

In [14]:
train_dataset_before = klue_sts_train_data_split['train']
test_dataset_before = klue_sts_train_data_split['test']
train_dataset_before, test_dataset_before

# 문제지(split) train은 공부, test 모의고사, validation 마지막 배포(수능) 

(Dataset({
     features: ['sentence1', 'label'],
     num_rows: 8167
 }),
 Dataset({
     features: ['sentence1', 'label'],
     num_rows: 3501
 }))

## Modelling

In [15]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
model_id = 'klue/roberta-base'
out_features = len(klue_sts_train.features['labels']['binary-label'].names)
out_features

The cache for model files in Transformers v4.22.0 has been updated. Migrating your old cache. This is a one-time only operation. You can interrupt this and resume the migration later on by calling `transformers.utils.move_cache()`.


0it [00:00, ?it/s]

2

In [16]:
# AutoModelForSequenceClassification.from_pretrained(model_id)  # out_feature 렌덤하게 부여됨

model_id = "klue/roberta-base"
num_labels = len(train_dataset_before.features['label'].names ) # 'negative' : 0, 'positive' : 1
model_sts = AutoModelForSequenceClassification.from_pretrained(model_id, num_labels=out_features)

config.json:   0%|          | 0.00/546 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/443M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at klue/roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [17]:

tokenizer = AutoTokenizer.from_pretrained(model_id)
def tokenize_function(examples):
    return tokenizer(examples["sentence1"], padding="max_length", truncation=True)

tokenizer_config.json:   0%|          | 0.00/375 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/248k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/752k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/173 [00:00<?, ?B/s]

In [18]:
train_dataset = train_dataset_before.map(tokenize_function, batched=True)
test_dataset  = test_dataset_before.map(tokenize_function, batched=True)

Map:   0%|          | 0/8167 [00:00<?, ? examples/s]

Map:   0%|          | 0/3501 [00:00<?, ? examples/s]

In [19]:
validation_dataset = klue_sts_validation_data.map(tokenize_function, batched=True)
validation_dataset

Map:   0%|          | 0/519 [00:00<?, ? examples/s]

Dataset({
    features: ['sentence1', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 519
})

In [20]:
# 모델의 예측 아이디와 문자열 레이블을 연결할 데이터를 모델 config에 저장
id2label = {i: label for i, label in enumerate(train_dataset.features['label'].names)}
label2id = {label: i for i, label in id2label.items()}
model_sts.config.id2label = id2label
model_sts.config.label2id = label2id

In [21]:
from transformers import Trainer, TrainingArguments # Trainer 학습 도구, TrainingArguments 학습에 필요한 셋팅
training_args = TrainingArguments(output_dir=my_hf_dir,
                 num_train_epochs=1,
                 per_device_train_batch_size=8,
                 per_device_eval_batch_size=8,
                 eval_strategy='epoch',
                 learning_rate=0.00001, # fine tune에서는 작게 잡음. 더 작게 해야함.
                 push_to_hub=False, 
                 logging_steps=1, 
                 report_to="none"  # WandB, TensorBoard 등 모두 비활성화
                 )

In [22]:
import numpy as np
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return {"accuracy": (predictions == labels).mean()}

In [23]:
trainer = Trainer(model=model_sts,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=validation_dataset,
        tokenizer=tokenizer,
        compute_metrics=compute_metrics
       )

  trainer = Trainer(model=model_sts,


In [24]:
# 파인 튜닝

trainer.train()



Epoch,Training Loss,Validation Loss,Accuracy
1,0.7206,0.773744,0.454721




TrainOutput(global_step=511, training_loss=0.6804792903174159, metrics={'train_runtime': 460.2537, 'train_samples_per_second': 17.745, 'train_steps_per_second': 1.11, 'total_flos': 2148827989125120.0, 'train_loss': 0.6804792903174159, 'epoch': 1.0})

## Model Eval

In [25]:
trainer.evaluate(validation_dataset)

{'eval_loss': 0.7737435102462769,
 'eval_accuracy': 0.45472061657032753,
 'eval_runtime': 9.2112,
 'eval_samples_per_second': 56.345,
 'eval_steps_per_second': 3.583,
 'epoch': 1.0}

## Model Service

In [26]:
# model upload to huggingface
from huggingface_hub import login

login(token=my_hf_token)
repo_id = my_hf_repo
trainer.push_to_hub(repo_id)

Note: Environment variable`HF_TOKEN` is set and is the current active token independently from the token you've just configured.


Upload 2 LFS files:   0%|          | 0/2 [00:00<?, ?it/s]

model.safetensors:   0%|          | 0.00/443M [00:00<?, ?B/s]

training_args.bin:   0%|          | 0.00/5.30k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/Jo-2j/sentiment_analysis_klue_sts/commit/68e61948f541437d7ab4c340f55dc72036b8e8d8', commit_message='Jo-2j/sentiment_analysis_klue_sts', commit_description='', oid='68e61948f541437d7ab4c340f55dc72036b8e8d8', pr_url=None, repo_url=RepoUrl('https://huggingface.co/Jo-2j/sentiment_analysis_klue_sts', endpoint='https://huggingface.co', repo_type='model', repo_id='Jo-2j/sentiment_analysis_klue_sts'), pr_revision=None, pr_num=None)

In [27]:
from transformers import pipeline

# model_id = 'otter35/roberta-base-klue-ynat-classification'
# model_pipeline = pipeline('text-classification', model=model_id)
model_pipeline = pipeline('text-classification', model=repo_id)

config.json:   0%|          | 0.00/933 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/443M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.31k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/248k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/752k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/971 [00:00<?, ?B/s]

Device set to use cuda:0


In [28]:
test_dataset[4:10]['sentence1'], test_dataset[4:10]['label']


(['메일 관리를 할 때에는 리눅스 말고 윈도우를 설치하여 해주세요.',
  '키패드로 운영되는 곳이라서 편리합니다.',
  '아울러 국토부는 전국 철도역 전광판, 도로 VMS, 역사 및 차량 내 안내방송 등을 통해 감염병 예방수칙 등 국민 행동요령을 홍보한다.',
  '아주 가까이 25번 트램 정거장이 있습니다.',
  '위치 숙소청결 그리고 옥상뷰가 너무 좋습니다!',
  '지원금액은 현재 수소연료 구입 단가와 사업자가 손익분기점을 달성할 수 있는 수준의 기준단가간 차액의 70%로 산정된다.'],
 [0, 0, 1, 1, 1, 1])

In [29]:
model_pipeline(test_dataset[4:10]['sentence1'])


[{'label': 'negative', 'score': 0.5342332720756531},
 {'label': 'positive', 'score': 0.554225504398346},
 {'label': 'negative', 'score': 0.5369669198989868},
 {'label': 'positive', 'score': 0.782511830329895},
 {'label': 'positive', 'score': 0.5807557702064514},
 {'label': 'negative', 'score': 0.5401906371116638}]