## 데이터

In [5]:
from datasets import load_dataset


In [6]:
# Dataset card: https://huggingface.co/datasets/klue/klue/viewer/ynat
# Load the train and validation splits of the KLUE "ynat" configuration.
klue_ynat_train, klue_ynat_validation = (
    load_dataset('klue', 'ynat', split=split_name)
    for split_name in ('train', 'validation')
)

In [9]:
# Check which class `load_dataset` returned for a single split.
type(klue_ynat_train)

datasets.arrow_dataset.Dataset

In [8]:
# Peek at the first training example to see which fields each record carries.
klue_ynat_train[0]

{'guid': 'ynat-v1_train_00000',
 'title': '유튜브 내달 2일까지 크리에이터 지원 공간 운영',
 'label': 3,
 'url': 'https://news.naver.com/main/read.nhn?mode=LS2D&mid=shm&sid1=105&sid2=227&oid=001&aid=0008508947',
 'date': '2016.06.30. 오전 10:36'}

In [11]:
# Metadata attached to the dataset (features, splits, checksums, ...).
# `.info` is the public accessor for the DatasetInfo object, so there is no
# need to reach into the private `_info` attribute through `vars()`.
klue_ynat_train.info

DatasetInfo(description='', citation='', homepage='', license='', features={'guid': Value(dtype='string', id=None), 'title': Value(dtype='string', id=None), 'label': ClassLabel(names=['IT과학', '경제', '사회', '생활문화', '세계', '스포츠', '정치'], id=None), 'url': Value(dtype='string', id=None), 'date': Value(dtype='string', id=None)}, post_processed=None, supervised_keys=None, builder_name='parquet', dataset_name='klue', config_name='ynat', version=0.0.0, splits={'train': SplitInfo(name='train', num_bytes=10115294, num_examples=45678, shard_lengths=None, dataset_name='klue'), 'validation': SplitInfo(name='validation', num_bytes=2040320, num_examples=9107, shard_lengths=None, dataset_name='klue')}, download_checksums={'hf://datasets/klue@349481ec73fff722f88e0453ca05c77a447d967c/ynat/train-00000-of-00001.parquet': {'num_bytes': 4165783, 'checksum': None}, 'hf://datasets/klue@349481ec73fff722f88e0453ca05c77a447d967c/ynat/validation-00000-of-00001.parquet': {'num_bytes': 846520, 'checksum': None}}, downl

In [13]:
klue_ynat_train.features['label'].names # category names of the label feature

['IT과학', '경제', '사회', '생활문화', '세계', '스포츠', '정치']

In [15]:
# Drop the metadata columns from both splits, keeping the remaining fields.
klue_ynat_train_data, klue_ynat_validation_data = (
    split.remove_columns(['guid', 'url', 'date'])
    for split in (klue_ynat_train, klue_ynat_validation)
)

In [16]:
# Confirm which columns remain after the column removal.
klue_ynat_train_data

Dataset({
    features: ['title', 'label'],
    num_rows: 45678
})

In [21]:
# Map label indices back to their category names (shown here for 1 and 2).
tuple(
    klue_ynat_train_data.features['label'].int2str(label_idx)
    for label_idx in (1, 2)
)

('경제', '사회')

In [24]:
# Split the training data: shuffle with a fixed seed and hold out 10,000
# examples as the "test" part of the resulting DatasetDict.
klue_ynat_train_data_split = klue_ynat_train_data.train_test_split(
    test_size=10000,
    seed=24,
    shuffle=True,
)
klue_ynat_train_data_split

DatasetDict({
    train: Dataset({
        features: ['title', 'label'],
        num_rows: 35678
    })
    test: Dataset({
        features: ['title', 'label'],
        num_rows: 10000
    })
})

## 모델

In [29]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

In [34]:
# Checkpoint that is fine-tuned for topic classification.
model_id = "klue/roberta-base"

# Number of output classes = number of category names on the label feature.
num_labels = len(klue_ynat_train_data.features['label'].names)

model_ynat = AutoModelForSequenceClassification.from_pretrained(
    model_id,
    num_labels=num_labels,
)
# Tip: model_ynat.state_dict() lists the loaded (pretrained) weights.
model_ynat

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at klue/roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(32000, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
         

In [36]:
# Load the tokenizer published under the same checkpoint id as the model (`model_id`).
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained(model_id)

In [37]:
def tokenize_function(examples):
    """Tokenize the ``title`` field of a batch of examples.

    Passes ``examples["title"]`` to the notebook-level ``tokenizer`` with
    truncation enabled and ``padding="max_length"``, and returns whatever
    the tokenizer returns.
    """
    # NOTE(review): "max_length" padding presumably pads every short title to
    # the tokenizer's maximum length; dynamic padding in the collator may be
    # cheaper — confirm against downstream cells before changing.
    titles = examples["title"]
    return tokenizer(titles, truncation=True, padding="max_length")

In [38]:
# Tokenize the training titles batch-wise.
# NOTE(review): this uses the full `klue_ynat_train_data`, not
# `klue_ynat_train_data_split['train']` — confirm the held-out split is not
# meant to be excluded from training.
train_dataset = klue_ynat_train_data.map(
    function=tokenize_function,
    batched=True,
)
train_dataset

Map:   0%|          | 0/45678 [00:00<?, ? examples/s]

Dataset({
    features: ['title', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 45678
})

In [41]:
# Inspect the field names of a single tokenized training example.
train_dataset[0].keys()

dict_keys(['title', 'label', 'input_ids', 'token_type_ids', 'attention_mask'])

In [47]:
# Tokenize the validation titles with the same function used for training.
validation_dataset = klue_ynat_validation_data.map(
    function=tokenize_function,
    batched=True,
)
validation_dataset

Map:   0%|          | 0/9107 [00:00<?, ? examples/s]

Dataset({
    features: ['title', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 9107
})

In [46]:
from transformers import Trainer, TrainingArguments # Trainer 학습 도구, TrainingArguments 학습에 필요한 셋팅
# Settings for the fine-tuning run.
training_args = TrainingArguments(
    output_dir='./results_ynat',
    report_to="none",  # disable all reporting integrations (WandB, TensorBoard, ...)
    push_to_hub=False,
    num_train_epochs=1,
    learning_rate=1e-5,  # kept small for fine-tuning; may need to be smaller still
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    eval_strategy='epoch',
    logging_steps=1,
)

In [48]:
def compute_metrics(eval_pred):
    """Compute classification accuracy from an evaluation prediction.

    Args:
        eval_pred: A pair ``(logits, labels)``. The predicted class of each
            row is the argmax of ``logits`` over its last axis.

    Returns:
        A dict ``{"accuracy": value}`` where ``value`` is the fraction of
        predictions equal to ``labels``.
    """
    # Imported locally so the function does not depend on a notebook-level
    # `np` name, which no earlier cell defines.
    import numpy as np

    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return {"accuracy": (predictions == labels).mean()}

In [53]:
# Bundle the model, training settings, datasets and metric function.
# `processing_class` is the current name for what used to be passed as
# `tokenizer`; the old keyword is deprecated and emits a warning.
trainer = Trainer(
    model=model_ynat,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=validation_dataset,
    processing_class=tokenizer,
    compute_metrics=compute_metrics,
)

  trainer = Trainer(model=model_ynat,


In [54]:
# Start fine-tuning
trainer.train()

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

KeyboardInterrupt: 

## 모델 평가