In [1]:
# 🧩 분류(문장 → 라벨) 파인튜닝
!pip -q install -U transformers datasets accelerate evaluate scikit-learn sentencepiece

from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
import numpy as np, evaluate, torch, random, os
from sklearn.metrics import f1_score, accuracy_score

# Reproducibility: seed Python's `random`, NumPy and PyTorch with the same
# value so repeated runs start from the same random state.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

# 1) Load the data: KLUE YNAT (Yonhap News topic classification).
ds = load_dataset(path="klue", name="ynat")

# 2) Label mappings
# `labels` is the ordered list of class names from the dataset's "label"
# feature; a name's position in that list is its integer class id.
labels = ds["train"].features["label"].names
id2label = dict(enumerate(labels))
# enumerate() yields (index, name) pairs, so unpack in that order and key the
# dict by name to get a true name -> id mapping (the inverse of id2label).
label2id = {name: idx for idx, name in enumerate(labels)}

# 3) Tokenizer / model
# Both come from the same checkpoint. The model is asked for one output per
# label and is given the id <-> name mappings built above.
model_name = "klue/roberta-small"
tok = AutoTokenizer.from_pretrained(model_name, use_fast=True)
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=len(labels),
    id2label=id2label,
    label2id=label2id,
)

# 4) Preprocessing
def preprocess(batch):
    """Tokenize the "title" field of a batch.

    Inputs are truncated to at most 128 tokens and are not padded here.
    """
    titles = batch["title"]
    return tok(titles, max_length=128, truncation=True, padding=False)


# Apply the tokenizer to every split, one batch of rows at a time.
encoded = ds.map(preprocess, batched=True)

# 5) Metrics
def compute_metrics(eval_pred):
    """Compute accuracy and macro-averaged F1 for one evaluation pass.

    Args:
        eval_pred: A pair that unpacks into ``(logits, true_labels)``. The
            predicted class is the argmax of the logits over the last axis.

    Returns:
        A dict with the keys ``"accuracy"`` and ``"f1_macro"``.
    """
    scores, gold = eval_pred
    predicted = np.argmax(scores, axis=-1)
    return {
        "accuracy": accuracy_score(gold, predicted),
        "f1_macro": f1_score(gold, predicted, average="macro"),
    }

# 6) Training configuration (kept short: a single epoch, no checkpoints)
args = TrainingArguments(
    output_dir="./ynat-roberta-small",
    learning_rate=2e-5,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    num_train_epochs=1,
    eval_strategy="epoch",  # evaluate once at the end of each epoch
    save_strategy="no",  # do not write checkpoints
    logging_steps=50,
    load_best_model_at_end=False,
    # Mixed precision only when a CUDA device is actually present.
    fp16=torch.cuda.is_available(),
    report_to="none",  # no external experiment trackers
    # Pass the script-wide seed explicitly so the Trainer does not fall back
    # to its own default if SEED is ever changed.
    seed=SEED,
)

# Train on a 5,000-example random subset so the run finishes quickly;
# evaluation still uses the full validation split.
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=encoded["train"].shuffle(seed=SEED).select(range(5000)),
    eval_dataset=encoded["validation"],
    # `processing_class` replaces the deprecated `tokenizer` argument in
    # recent transformers releases.
    processing_class=tok,
    compute_metrics=compute_metrics,
)

trainer.train()
print(trainer.evaluate())

# 7) Inference example: classify a single headline with the fine-tuned model.
test_txt = "애플, 차세대 아이폰 공개…프리미엄 전략 강화"
inputs = tok(test_txt, return_tensors="pt", truncation=True, max_length=128).to(model.device)
# Put the model in eval mode explicitly so dropout is off no matter which
# cells/steps ran before this one.
model.eval()
with torch.no_grad():
    logits = model(**inputs).logits
    probabilities = torch.softmax(logits, dim=-1).cpu().numpy()[0]
# Convert to rounded Python floats so the printed ranking is readable instead
# of a list of np.float32(...) reprs.
top3 = sorted(
    ((label, round(float(prob), 4)) for label, prob in zip(labels, probabilities)),
    key=lambda pair: pair[1],
    reverse=True,
)[:3]
print("📌 예측:", labels[int(probabilities.argmax())], "| 확률 상위3:", top3)

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.2/42.2 kB[0m [31m3.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m11.6/11.6 MB[0m [31m54.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m4.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.5/9.5 MB[0m [31m64.1 MB/s[0m eta [36m0:00:00[0m
[?25h

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md: 0.00B [00:00, ?B/s]

ynat/train-00000-of-00001.parquet:   0%|          | 0.00/4.17M [00:00<?, ?B/s]

ynat/validation-00000-of-00001.parquet:   0%|          | 0.00/847k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/45678 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/9107 [00:00<?, ? examples/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/971 [00:00<?, ?B/s]

config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/273M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at klue/roberta-small and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/45678 [00:00<?, ? examples/s]

Map:   0%|          | 0/9107 [00:00<?, ? examples/s]

  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Accuracy,F1 Macro
1,0.5982,0.73785,0.76842,0.781104


{'eval_loss': 0.7378503084182739, 'eval_accuracy': 0.7684198967826946, 'eval_f1_macro': 0.7811043719926192, 'eval_runtime': 2.6273, 'eval_samples_per_second': 3466.307, 'eval_steps_per_second': 108.477, 'epoch': 1.0}
📌 예측: IT과학 | 확률 상위3: [('IT과학', np.float32(0.7324)), ('경제', np.float32(0.10971536)), ('사회', np.float32(0.0505543))]
