In [None]:
!pip install transformers==4.50.0 datasets==3.5.0 huggingface_hub==0.29.0 -qqq

In [None]:
import torch

# Report whether a CUDA device is visible to PyTorch and, if so, the name of device 0.
cuda_ready = torch.cuda.is_available()
if not cuda_ready:
    print("GPU 사용 불가 (현재 CPU 모드)")
else:
    print("GPU 사용 가능 (CUDA)")
    print(f"→ 사용 중인 GPU: {torch.cuda.get_device_name(0)}")

GPU 사용 가능 (CUDA)
→ 사용 중인 GPU: Tesla T4


In [None]:
from datasets import config
# Show the directory that `datasets` uses as its on-disk cache.
hf_cache_dir = config.HF_DATASETS_CACHE
print(hf_cache_dir)

/root/.cache/huggingface/datasets


In [None]:
# NOTE(review): this re-pins `datasets` to 2.16.0 although the first cell installs
# 3.5.0, so the two install cells disagree — TODO confirm which version is intended
# and pin it in a single place. `datasets` has already been imported above, so the
# running kernel presumably needs a restart before the downgrade takes effect.
!pip install datasets==2.16.0
!pip install fsspec==2023.6.0

Collecting datasets==2.16.0
  Using cached datasets-2.16.0-py3-none-any.whl.metadata (20 kB)
Using cached datasets-2.16.0-py3-none-any.whl (507 kB)
Installing collected packages: datasets
  Attempting uninstall: datasets
    Found existing installation: datasets 3.5.0
    Uninstalling datasets-3.5.0:
      Successfully uninstalled datasets-3.5.0
Successfully installed datasets-2.16.0


[31mERROR: Operation cancelled by user[0m[31m
[0m^C


In [None]:
from datasets import load_dataset

# Load the train and validation splits of the KLUE `ynat` configuration
# (train first, then validation), then display the train split summary.
klue_tc_train, klue_tc_eval = (
    load_dataset('klue', 'ynat', split=split_name)
    for split_name in ('train', 'validation')
)
klue_tc_train

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Dataset({
    features: ['guid', 'title', 'label', 'url', 'date'],
    num_rows: 45678
})

In [None]:
# Inspect the first training example to see the raw record layout.
klue_tc_train[0]

{'guid': 'ynat-v1_train_00000',
 'title': '유튜브 내달 2일까지 크리에이터 지원 공간 운영',
 'label': 3,
 'url': 'https://news.naver.com/main/read.nhn?mode=LS2D&mid=shm&sid1=105&sid2=227&oid=001&aid=0008508947',
 'date': '2016.06.30. 오전 10:36'}

In [None]:
# List the class names behind the integer values of the `label` column.
klue_tc_train.features['label'].names

['IT과학', '경제', '사회', '생활문화', '세계', '스포츠', '정치']

In [None]:
# Drop the metadata columns from both splits; only `title` and `label` are kept.
unused_columns = ['guid', 'url', 'date']
klue_tc_train = klue_tc_train.remove_columns(unused_columns)
klue_tc_eval = klue_tc_eval.remove_columns(unused_columns)
klue_tc_train

Dataset({
    features: ['title', 'label'],
    num_rows: 45678
})

In [None]:
# Grab the `label` feature once; it converts integer label ids to their names.
klue_tc_label = klue_tc_train.features['label']

klue_tc_label
# -> ClassLabel(names=['IT과학', '경제', '사회', '생활문화', '세계', '스포츠', '정치'], id=None)

klue_tc_label.int2str(1)
# -> '경제'

def make_str_label(batch):
  """Add a `label_str` column holding the class name for each integer label.

  Args:
    batch: A batch of examples (as passed by `Dataset.map(..., batched=True)`)
      containing a `label` entry.

  Returns:
    The same batch with the extra `label_str` entry.
  """
  label_ids = batch['label']
  batch['label_str'] = klue_tc_label.int2str(label_ids)
  return batch

# Convert the labels of the training split in batches of 1000 examples.
map_options = {'batched': True, 'batch_size': 1000}
klue_tc_train = klue_tc_train.map(make_str_label, **map_options)

klue_tc_train[0]

{'title': '유튜브 내달 2일까지 크리에이터 지원 공간 운영', 'label': 3, 'label_str': '생활문화'}

In [None]:
# Every split below is shuffled with the same fixed seed.
split_opts = {'shuffle': True, 'seed': 42}

# Training set: the 10,000 examples held out from the KLUE train split.
sampled_train = klue_tc_train.train_test_split(test_size=10000, **split_opts)
train_dataset = sampled_train['test']

# Test set: 1,000 examples held out from the KLUE validation split.
dataset = klue_tc_eval.train_test_split(test_size=1000, **split_opts)
test_dataset = dataset['test']

# Validation set: 1,000 examples held out from what remains after the test split.
remaining_eval = dataset['train']
valid_dataset = remaining_eval.train_test_split(test_size=1000, **split_opts)['test']

In [None]:
import torch
import numpy as np
from transformers import (
    Trainer,
    TrainingArguments,
    AutoModelForSequenceClassification,
    AutoTokenizer
)

def tokenize_function(examples):
    """Tokenize the `title` field of a batch of examples.

    Uses the module-level `tokenizer`, padding with `padding="max_length"` and
    truncating over-long inputs.

    Args:
        examples: A batch of examples containing a `title` entry.

    Returns:
        The tokenizer output for the titles.
    """
    titles = examples["title"]
    return tokenizer(titles, padding="max_length", truncation=True)

model_id = "monologg/kobert"

# The classification head gets one output per class of the `label` feature.
num_labels = len(train_dataset.features['label'].names)
model = AutoModelForSequenceClassification.from_pretrained(model_id, num_labels=num_labels)

# The monologg/kobert repository ships custom code. Opting in explicitly keeps the
# load from stopping at an interactive "[y/N]" prompt, which would block an
# unattended "Run All". This executes code downloaded from the Hub repository, so
# inspect it (https://hf.co/monologg/kobert) before trusting it.
tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)

# Tokenize every split up front.
# NOTE(review): `tokenize_function` pads with padding="max_length"; for short
# headlines dynamic padding would presumably be much cheaper — TODO confirm.
train_dataset = train_dataset.map(tokenize_function, batched=True)
valid_dataset = valid_dataset.map(tokenize_function, batched=True)
test_dataset = test_dataset.map(tokenize_function, batched=True)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at monologg/kobert and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


The repository for monologg/kobert contains custom code which must be executed to correctly load the model. You can inspect the repository content at https://hf.co/monologg/kobert.
You can avoid this prompt in future by passing the argument `trust_remote_code=True`.

Do you wish to run the custom code? [y/N] y


In [None]:
# Training configuration: a single epoch with batch size 8, evaluating once per epoch.
training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=1,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    # `eval_strategy` is the current name of this option; `evaluation_strategy`
    # is its deprecated spelling.
    eval_strategy="epoch",
    learning_rate=5e-5,
    push_to_hub=False,
    report_to="none"
)

def compute_metrics(eval_pred):
    """Compute classification accuracy from logits and reference labels.

    Args:
        eval_pred: A `(logits, labels)` pair. The predicted class of each row is
            the argmax over the last axis of `logits`.

    Returns:
        A dict with a single `"accuracy"` entry: the fraction of predictions
        that equal their label.
    """
    scores, gold = eval_pred
    predicted = np.argmax(scores, axis=-1)
    hits = predicted == gold
    return {"accuracy": hits.mean()}



In [None]:
import time
import os

# Clear the CUDA peak-memory counter so that step 3 reports the peak of this run
# only, not a peak left over from earlier cells or a previous execution of this cell.
if torch.cuda.is_available():
    torch.cuda.reset_peak_memory_stats()

# 1. Record the training start time.
start_time = time.time()

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=valid_dataset,
    compute_metrics=compute_metrics,
)

trainer.train()

# 2. Record the end time and print the elapsed training time in seconds.
end_time = time.time()
training_time = end_time - start_time
print(f"학습 시간: {training_time:.2f} 초")

# 3. Peak GPU memory allocated by PyTorch, in MB.
if torch.cuda.is_available():
    max_memory = torch.cuda.max_memory_allocated() / (1024 ** 2)
    print(f"최대 GPU 메모리 사용량: {max_memory:.2f} MB")
else:
    print("GPU 사용 안 함")

# 4. Accuracy on the validation and test sets.
valid_results = trainer.evaluate(valid_dataset)
test_results = trainer.evaluate(test_dataset)

print(f"Validation 정확도: {valid_results.get('eval_accuracy', 'N/A')}")
print(f"Test 정확도: {test_results.get('eval_accuracy', 'N/A')}")

# 5. Size of the saved model on disk, in MB.
# The Trainer also writes its intermediate checkpoints (which by default include
# optimizer state) under `output_dir`, so walking that directory overstates the
# model size. Save the final model into a directory of its own and measure only that.
model_save_path = os.path.join(training_args.output_dir, "final_model")
trainer.save_model(model_save_path)
total_size = sum(
    os.path.getsize(os.path.join(dirpath, filename))
    for dirpath, _dirnames, filenames in os.walk(model_save_path)
    for filename in filenames
)
model_size_mb = total_size / (1024 ** 2)
print(f"저장된 모델 크기: {model_size_mb:.2f} MB")

Epoch,Training Loss,Validation Loss


Epoch,Training Loss,Validation Loss,Accuracy
1,0.5154,0.493,0.853


학습 시간: 968.68 초
최대 GPU 메모리 사용량: 3609.06 MB


Validation 정확도: 0.853
Test 정확도: 0.862
저장된 모델 크기: 3165.78 MB
