## 데이터

In [68]:
from datasets import load_dataset

In [69]:
# https://huggingface.co/datasets/klue/klue/viewer/ynat
klue_ynat_train = load_dataset('klue','ynat',split='train')
klue_ynat_validation = load_dataset('klue','ynat',split='validation')

In [70]:
type(klue_ynat_train)

datasets.arrow_dataset.Dataset

In [71]:
klue_ynat_train[0]

{'guid': 'ynat-v1_train_00000',
 'title': '유튜브 내달 2일까지 크리에이터 지원 공간 운영',
 'label': 3,
 'url': 'https://news.naver.com/main/read.nhn?mode=LS2D&mid=shm&sid1=105&sid2=227&oid=001&aid=0008508947',
 'date': '2016.06.30. 오전 10:36'}

In [72]:
vars(klue_ynat_train)['_info'] # 데이터 셋이 가진 여러 정보 HF 는 다 이렇게 되어 있다.

DatasetInfo(description='', citation='', homepage='', license='', features={'guid': Value(dtype='string', id=None), 'title': Value(dtype='string', id=None), 'label': ClassLabel(names=['IT과학', '경제', '사회', '생활문화', '세계', '스포츠', '정치'], id=None), 'url': Value(dtype='string', id=None), 'date': Value(dtype='string', id=None)}, post_processed=None, supervised_keys=None, builder_name='parquet', dataset_name='klue', config_name='ynat', version=0.0.0, splits={'train': SplitInfo(name='train', num_bytes=10115294, num_examples=45678, shard_lengths=None, dataset_name='klue'), 'validation': SplitInfo(name='validation', num_bytes=2040320, num_examples=9107, shard_lengths=None, dataset_name='klue')}, download_checksums={'hf://datasets/klue@349481ec73fff722f88e0453ca05c77a447d967c/ynat/train-00000-of-00001.parquet': {'num_bytes': 4165783, 'checksum': None}, 'hf://datasets/klue@349481ec73fff722f88e0453ca05c77a447d967c/ynat/validation-00000-of-00001.parquet': {'num_bytes': 846520, 'checksum': None}}, downl

In [73]:
klue_ynat_train.features['label'].names # 설명 카테고리 

['IT과학', '경제', '사회', '생활문화', '세계', '스포츠', '정치']

In [74]:
klue_ynat_train_data = klue_ynat_train.remove_columns(['guid','url','date'])
klue_ynat_validation_data = klue_ynat_validation.remove_columns(['guid','url','date'])

In [75]:
klue_ynat_train_data

Dataset({
    features: ['title', 'label'],
    num_rows: 45678
})

In [76]:
# label 카테고리와 index 매칭
klue_ynat_train_data.features['label'].int2str(1), klue_ynat_train_data.features['label'].int2str(2)

('경제', '사회')

In [77]:
## 데이터 분할
klue_ynat_train_data_split = klue_ynat_train_data.train_test_split(test_size=0.8, shuffle=True, seed=24)
klue_ynat_train_data_split

DatasetDict({
    train: Dataset({
        features: ['title', 'label'],
        num_rows: 9135
    })
    test: Dataset({
        features: ['title', 'label'],
        num_rows: 36543
    })
})

In [78]:
klue_ynat_train_data = klue_ynat_train_data_split['train']

## 모델

In [79]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

In [80]:
model_id = "klue/roberta-base"
num_labels = len(klue_ynat_train_data.features['label'].names) # out classification 의 class 숫자
model_ynat = AutoModelForSequenceClassification.from_pretrained(model_id, num_labels=num_labels)
# model_ynat.state_dict() # pretrained weight 
model_ynat

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at klue/roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(32000, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
         

In [81]:
# Load model directly
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained(model_id)

In [82]:
def tokenize_function(examples):
    return tokenizer(examples["title"], padding="max_length", truncation=True)

In [83]:
train_dataset = klue_ynat_train_data.map(tokenize_function, batched=True)
train_dataset

Dataset({
    features: ['title', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 9135
})

In [84]:
train_dataset[0].keys()

dict_keys(['title', 'label', 'input_ids', 'token_type_ids', 'attention_mask'])

In [85]:
validation_dataset = klue_ynat_validation_data.map(tokenize_function, batched=True)
validation_dataset

Dataset({
    features: ['title', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 9107
})

In [86]:
# 모델의 예측 아이디와 문자열 레이블을 연결할 데이터를 모델 config에 저장
id2label = {i: label for i, label in enumerate(train_dataset.features['label'].names)}
label2id = {label: i for i, label in id2label.items()}
model_ynat.config.id2label = id2label
model_ynat.config.label2id = label2id

In [87]:
from transformers import Trainer, TrainingArguments # Trainer 학습 도구, TrainingArguments 학습에 필요한 셋팅
training_args = TrainingArguments(output_dir='./results_ynat',
                 num_train_epochs=1,
                 per_device_train_batch_size=8,
                 per_device_eval_batch_size=8,
                 eval_strategy='epoch',
                 learning_rate=0.00001, # fine tune에서는 작게 잡음. 더 작게 해야함.
                 push_to_hub=False, 
                 logging_steps=1, 
                 report_to="none"  # WandB, TensorBoard 등 모두 비활성화
                 )

In [88]:
import numpy as np
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return {"accuracy": (predictions == labels).mean()}

In [89]:
trainer = Trainer(model=model_ynat,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=validation_dataset,
        tokenizer=tokenizer,
        compute_metrics=compute_metrics
       )

  trainer = Trainer(model=model_ynat,


In [90]:
# fine tunning 시작
trainer.train()



Epoch,Training Loss,Validation Loss,Accuracy
1,0.8108,0.51138,0.839903




TrainOutput(global_step=571, training_loss=0.7055956882843829, metrics={'train_runtime': 660.9771, 'train_samples_per_second': 13.82, 'train_steps_per_second': 0.864, 'total_flos': 2403627391872000.0, 'train_loss': 0.7055956882843829, 'epoch': 1.0})

## 모델 평가

In [91]:
# 정확도율 확인
trainer.evaluate(validation_dataset)

{'eval_loss': 0.5113795399665833,
 'eval_accuracy': 0.8399033710332711,
 'eval_runtime': 153.5277,
 'eval_samples_per_second': 59.318,
 'eval_steps_per_second': 3.713,
 'epoch': 1.0}

## 모델 서비스

In [104]:
# model upload to huggingface
from huggingface_hub import login

login(token=)
repo_id = 'HueyVault/results_ynat'
trainer.push_to_hub(repo_id)


Note: Environment variable`HF_TOKEN` is set and is the current active token independently from the token you've just configured.
No files have been modified since last commit. Skipping to prevent empty commit.


CommitInfo(commit_url='https://huggingface.co/HueyVault/results_ynat/commit/dc13a15fe8e2dd8dcb329c2f442e734069884c80', commit_message='HueyVault/results_ynat', commit_description='', oid='dc13a15fe8e2dd8dcb329c2f442e734069884c80', pr_url=None, repo_url=RepoUrl('https://huggingface.co/HueyVault/results_ynat', endpoint='https://huggingface.co', repo_type='model', repo_id='HueyVault/results_ynat'), pr_revision=None, pr_num=None)

In [105]:
from transformers import pipeline

# model_id = 'otter35/roberta-base-klue-ynat-classification'
# model_pipeline = pipeline('text-classification', model=model_id)
model_pipeline = pipeline('text-classification', model=repo_id)

Device set to use cuda:0


In [106]:
model_pipeline(train_dataset[4:10]['title'])

[{'label': '세계', 'score': 0.9489774703979492},
 {'label': '세계', 'score': 0.9467717409133911},
 {'label': '세계', 'score': 0.9476671814918518},
 {'label': '사회', 'score': 0.7193306088447571},
 {'label': '세계', 'score': 0.8354392647743225},
 {'label': '경제', 'score': 0.587710440158844}]