# KLUE-TC Classification Task

이 Notebook은 KLUE-TC (YNAT) Task를 수행하는 예시 코드입니다. Kaggle Leaderboard의 Benchmark 점수는 이 코드를 그대로 실행해서 얻은 결과입니다.

이 코드와 실습 코드, 이전 과목 코드 등을 참고하여 여러분의 AI를 작성해 보세요.

하단의 링크를 통해 과제를 위한 Kaggle Competition에 참가할 수 있습니다: 
https://www.kaggle.com/t/3e0bdda08ecd645d9b2fef4a90b283c3 

In [1]:
# ! pip install datasets transformers evaluate

In [2]:
import torch
import torch.nn as nn
import evaluate
import numpy as np

In [3]:
# pip install evaluate

## 산출물 생성 함수 정의
아래의 코드블럭을 실행하면 됩니다.

In [4]:
# Evaluates the given model on the KLUE-YNAT validation split and saves the
# predictions to result.csv.
def export_result(model, tokenizer = None):
    """Run `model` over the KLUE-YNAT validation split and write ./result.csv.

    Args:
        model: When `tokenizer` is given, a callable invoked as
            ``model(**tokenized)`` whose first output element holds the class
            scores. Otherwise a callable invoked with the raw title string
            that returns a single label number, a tensor of class scores, or
            a list whose first element is one of those.
        tokenizer: Optional tokenizer, called as
            ``tokenizer(title, return_tensors='pt', truncation=True)``.

    Side effects:
        Prints progress every 100 examples and writes a CSV with the columns
        ``guid`` and ``Category`` to ``./result.csv``. If `model` is a
        ``torch.nn.Module`` it is switched to eval mode for the duration of
        the call and put back into training mode afterwards if it was in it.
    """
    # Imported locally so the function does not depend on which notebook
    # cells were executed before it.
    from pandas import DataFrame
    from datasets import load_dataset

    test_dataset = load_dataset("klue", 'ynat')['validation']
    guid_list = []
    pred_label_list = []
    is_warned = False

    # Dropout has to be disabled for inference, and the inputs must live on
    # the same device as the model weights (e.g. a model taken straight from
    # a GPU training run).
    is_module = isinstance(model, torch.nn.Module)
    was_training = is_module and model.training
    device = None
    if is_module:
        model.eval()
        first_param = next(model.parameters(), None)
        if first_param is not None:
            device = first_param.device

    try:
        with torch.no_grad():
            for idx, datum in enumerate(test_dataset):
                if idx % 100 == 0:
                    print("test {}th data".format(idx))
                guid_list.append(datum['guid'])

                if tokenizer is not None:
                    # truncation=True mirrors the preprocessing used for training.
                    tokenized = tokenizer(datum['title'], return_tensors='pt', truncation=True)
                    if device is not None:
                        tokenized = {key: value.to(device) for key, value in tokenized.items()}
                    predicted_label = model(**tokenized)[0]
                else:
                    # Without a tokenizer the model receives the raw title and
                    # must return a single number or a tensor (or adapt the
                    # code below so that exactly one number per example ends
                    # up in pred_label_list).
                    predicted_label = model(datum['title'])

                if isinstance(predicted_label, list):
                    predicted_label = predicted_label[0]
                if isinstance(predicted_label, torch.Tensor):
                    if predicted_label.dim() == 0:
                        # A scalar tensor already is the label.
                        predicted_label = int(predicted_label.item())
                    elif predicted_label.dim() == 1:
                        # Un-batched class scores.
                        predicted_label = torch.argmax(predicted_label, dim=0).item()
                    else:
                        # Batched class scores of shape (1, num_classes).
                        predicted_label = torch.argmax(predicted_label, dim=1).item()

                if predicted_label > 6 and not is_warned:
                    print("predicted_label의 값이 6보다 큽니다. 출력된 값을 한번 점검해주세요.")
                    is_warned = True
                pred_label_list.append(predicted_label)
    finally:
        # Leave the caller's model in the mode it was handed over in.
        if was_training:
            model.train()

    df = DataFrame({"guid": guid_list,
                    "Category": pred_label_list})
    # Save the predictions to result.csv.
    df.to_csv("./result.csv", index=False)

## 데이터셋 불러오기

#### 주의사항: 본 과제에서는 validation set에 대한 성능을 평가하기 때문에 load_dataset("klue","ynat")['validation'] 등을 통해 validation set을 모델에 학습시킨 경우 0점 처리 됩니다.


In [5]:
from datasets import load_dataset

# This assignment is scored on the validation split, so feeding
# load_dataset("klue","ynat")['validation'] (or an equivalent) to the model
# as training data results in a score of zero.
dataset = load_dataset(path="klue", name="ynat")

Reusing dataset klue (/home/piai/.cache/huggingface/datasets/klue/ynat/1.0.0/e0fc3bc3de3eb03be2c92d72fd04a60ecc71903f821619cb28ca0e1e29e4233e)


  0%|          | 0/2 [00:00<?, ?it/s]

In [6]:
# Bare expression at the end of the cell: the notebook renders the repr of
# the loaded DatasetDict (its splits, features and row counts).
dataset

DatasetDict({
    train: Dataset({
        features: ['guid', 'title', 'label', 'url', 'date'],
        num_rows: 45678
    })
    validation: Dataset({
        features: ['guid', 'title', 'label', 'url', 'date'],
        num_rows: 9107
    })
})

## 모델 설계

#### 자연어처리 과목 혹은 이전 실습시간에 배운 내용을 바탕으로, KLUE-TC task를 위한 AI모델을 만들어보세요.


In [7]:
# Define Your AI
# 아래의 코드 블럭은 예시 AI입니다. 이를 활용해도 되고, 무시해도 됩니다.

In [8]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Single source of truth for the pretrained checkpoint: the tokenizer and the
# model are both created from it, so changing this one line can never leave
# them mismatched.
model_checkpoint = "klue/bert-base"
some_tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
# Use the first GPU when one is available, otherwise fall back to the CPU.
device = "cuda:0" if torch.cuda.is_available() else "cpu"
# The classification head gets 7 outputs (labels 0-6).
model = AutoModelForSequenceClassification.from_pretrained(model_checkpoint, num_labels=7).to(device)

Downloading:   0%|          | 0.00/289 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/425 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/248k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/495k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/125 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/445M [00:00<?, ?B/s]

Some weights of the model checkpoint at klue/bert-base were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.bias', 'cls.predictions.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized

In [9]:
# pip install transformers

## 모델 학습 과정 설계

#### 자연어처리 과목 혹은 이전 실습시간에 배운 내용을 바탕으로, 앞서 정의한 AI를 학습할 수 있는 코드를 작성해보세요. KLUE-TC Data말고 다른 Data를 추가로 사용해도 됩니다.

In [10]:
# 여러분의 AI를 학습해보세요.
# 아래의 코드 블럭 3개는 실습 시간의 코드를 그대로 가져왔습니다. 이를 활용해도 되고, 무시해도 됩니다.

In [11]:
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer
# Accuracy metric loaded through the `evaluate` library; compute_metrics
# calls its compute() on the predictions.
metric = evaluate.load("accuracy")
def preprocess_function(examples):
    """Tokenize the ``title`` field of `examples` with truncation enabled."""
    titles = examples['title']
    return some_tokenizer(titles, truncation=True)

# Encode every split of the dataset, tokenizing in batches.
encoded_dataset = dataset.map(function=preprocess_function, batched=True)
print(encoded_dataset["train"])

  0%|          | 0/46 [00:00<?, ?ba/s]

  0%|          | 0/10 [00:00<?, ?ba/s]

Dataset({
    features: ['guid', 'title', 'label', 'url', 'date', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 45678
})


In [12]:
# Same settings as in the lab session.
metric_name = "accuracy"
model_name = model_checkpoint.rsplit("/", 1)[-1]
batch_size = 16

# Evaluate and save once per epoch so the best epoch (by `metric_name`) can be
# restored when training ends.
args = TrainingArguments(
    output_dir="checkpoint",
    do_train=True,
    do_eval=True,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model=metric_name,
    num_train_epochs=5,
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    push_to_hub=False,
)
def compute_metrics(eval_pred):
    """Turn the (predictions, labels) pair into the accuracy metric result.

    The predictions are reduced to class indices with an argmax over axis 1
    before being compared with the reference labels.
    """
    scores, references = eval_pred
    predicted_classes = np.argmax(scores, axis=1)
    return metric.compute(predictions=predicted_classes, references=references)

In [13]:
# Passing encoded_dataset['validation'] as the Trainer's eval_dataset is not
# cheating: that split is used only to measure performance, never to train.
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=encoded_dataset["train"],
    eval_dataset=encoded_dataset["validation"],
    tokenizer=some_tokenizer,
    compute_metrics=compute_metrics,
)
trainer.train()

The following columns in the training set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: date, title, url, guid. If date, title, url, guid are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 45678
  Num Epochs = 5
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 1
  Total optimization steps = 7140
You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Accuracy
1,0.3654,0.378129,0.868123
2,0.2697,0.368399,0.871637
3,0.1908,0.405749,0.868782
4,0.1295,0.460033,0.865708
5,0.102,0.523857,0.866367


The following columns in the evaluation set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: date, title, url, guid. If date, title, url, guid are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 9107
  Batch size = 32
Saving model checkpoint to checkpoint/checkpoint-1428
Configuration saved in checkpoint/checkpoint-1428/config.json
Model weights saved in checkpoint/checkpoint-1428/pytorch_model.bin
tokenizer config file saved in checkpoint/checkpoint-1428/tokenizer_config.json
Special tokens file saved in checkpoint/checkpoint-1428/special_tokens_map.json
The following columns in the evaluation set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: date, title, url, guid. If date, title, url, guid are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this messag

TrainOutput(global_step=7140, training_loss=0.2246139805523955, metrics={'train_runtime': 1546.0572, 'train_samples_per_second': 147.724, 'train_steps_per_second': 4.618, 'total_flos': 2554421462546400.0, 'train_loss': 0.2246139805523955, 'epoch': 5.0})

## AI 산출물 생성

#### 여러분이 학습한 AI를 export_result 함수에 인자로 넘겨 산출물(result.csv)을 생성하세요.

In [16]:
# Reload the best checkpoint recorded by the Trainer rather than hard-coding a
# step number: the number of steps per epoch (and therefore the checkpoint
# directory name) depends on the batch size and on how many devices were used
# for training. The literal path is kept only as a fallback for the case where
# the Trainer did not record a best checkpoint. Requires the training cell to
# have been run in this session.
best_checkpoint = trainer.state.best_model_checkpoint or "./checkpoint/checkpoint-2856"
model = AutoModelForSequenceClassification.from_pretrained(best_checkpoint)
test_dataset = load_dataset("klue",'ynat')['validation']
export_result(model,some_tokenizer)

loading configuration file ./checkpoint/checkpoint-2856/config.json
Model config BertConfig {
  "_name_or_path": "./checkpoint/checkpoint-2856",
  "architectures": [
    "BertForSequenceClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2",
    "3": "LABEL_3",
    "4": "LABEL_4",
    "5": "LABEL_5",
    "6": "LABEL_6"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1,
    "LABEL_2": 2,
    "LABEL_3": 3,
    "LABEL_4": 4,
    "LABEL_5": 5,
    "LABEL_6": 6
  },
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "problem_type": "single_label_classification",
  "torch_dtype": "float32",
  "t

  0%|          | 0/2 [00:00<?, ?it/s]

Reusing dataset klue (/home/piai/.cache/huggingface/datasets/klue/ynat/1.0.0/e0fc3bc3de3eb03be2c92d72fd04a60ecc71903f821619cb28ca0e1e29e4233e)


  0%|          | 0/2 [00:00<?, ?it/s]

test 0th data
test 100th data
test 200th data
test 300th data
test 400th data
test 500th data
test 600th data
test 700th data
test 800th data
test 900th data
test 1000th data
test 1100th data
test 1200th data
test 1300th data
test 1400th data
test 1500th data
test 1600th data
test 1700th data
test 1800th data
test 1900th data
test 2000th data
test 2100th data
test 2200th data
test 2300th data
test 2400th data
test 2500th data
test 2600th data
test 2700th data
test 2800th data
test 2900th data
test 3000th data
test 3100th data
test 3200th data
test 3300th data
test 3400th data
test 3500th data
test 3600th data
test 3700th data
test 3800th data
test 3900th data
test 4000th data
test 4100th data
test 4200th data
test 4300th data
test 4400th data
test 4500th data
test 4600th data
test 4700th data
test 4800th data
test 4900th data
test 5000th data
test 5100th data
test 5200th data
test 5300th data
test 5400th data
test 5500th data
test 5600th data
test 5700th data
test 5800th data
test 5900