In [1]:
# Colab에 필요한 라이브러리를 설치합니다.
# 'accelerate'는 Trainer가 GPU/TPU를 쉽게 사용하도록 도와줍니다.
!pip install transformers datasets accelerate evaluate

Collecting evaluate
  Downloading evaluate-0.4.6-py3-none-any.whl.metadata (9.5 kB)
Downloading evaluate-0.4.6-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m2.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: evaluate
Successfully installed evaluate-0.4.6


In [49]:
import torch
import numpy as np
from datasets import load_dataset#, load_metric # Removed load_metric from datasets
from evaluate import load as load_metric # Imported load_metric from evaluate

from transformers import (
    BertTokenizer,
    AutoModelForSequenceClassification, # ⭐️ 'AutoModel'이 아닌 'AutoModelForSequenceClassification'을 사용
    Trainer,
    TrainingArguments,
    set_seed
)

# 1. 데이터셋 로드
full_datasets = load_dataset("dair-ai/emotion", "unsplit") # 전체 데이터셋 로드
print(full_datasets)

print("---")

# 2. 라벨 정보 확인 (총 6개)
label_names = full_datasets["train"].features["label"].names
num_labels = len(label_names)

# [+] 라벨별 개수까지 출력
label_counts = np.bincount(full_datasets["train"]["label"], minlength=num_labels)
print(f"총 라벨 수: {num_labels}")
for name, count in zip(label_names, label_counts):
    print(f"{name}: {count}개")

print("---")

# 3. GPU/CPU 설정
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"사용할 장치: {DEVICE}")

# - 실험 재현을 위해 시드를 고정한다
set_seed(42)


DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 416809
    })
})
---
총 라벨 수: 6
sadness: 121187개
joy: 141067개
love: 34554개
anger: 57317개
fear: 47712개
surprise: 14972개
---
사용할 장치: cuda


In [50]:
# 균등하게 총 20,000개를 라벨별로 추출 후 8:1:1 분할

from datasets import DatasetDict

# 전체 데이터셋
data = full_datasets["train"]
labels = np.array(data["label"])

# 라벨별 목표 개수 계산 (총합이 정확히 20,000이 되도록)
target_total = 20000
base_per_label = target_total // num_labels
extra = target_total % num_labels  # 나머지 1~5개를 앞쪽 라벨에 분배

per_label_counts = [base_per_label + (1 if i < extra else 0) for i in range(num_labels)]

# 라벨별 균등 샘플링
selected_indices = []
for i, count in enumerate(per_label_counts):
    idx = np.where(labels == i)[0]
    chosen = np.random.choice(idx, count, replace=False)
    selected_indices.extend(chosen)

balanced_data = data.select(selected_indices)

balanced_data = balanced_data.shuffle(seed=42)
ds_all = DatasetDict({"all": balanced_data})

tmp = ds_all["all"].train_test_split(test_size=0.2, seed=42, stratify_by_column="label")  # [추가] 80/20
val_test = tmp["test"].train_test_split(test_size=0.5, seed=42, stratify_by_column="label")  # [추가] 10/10

datasets = {  # [추가]
    "train": tmp["train"],
    "validation": val_test["train"],
    "test": val_test["test"],
}

print(f"[데이터셋 생성 완료]")
print(f"train: {len(datasets['train'])}, validation: {len(datasets['validation'])}, test: {len(datasets['test'])}")


[데이터셋 생성 완료]
train: 16000, validation: 2000, test: 2000


In [51]:
# - 학습 관련 핵심 값을 한 곳에서 정의한다
NUM_TRAIN_EPOCHS = 3
PER_DEVICE_TRAIN_BATCH_SIZE = 32
PER_DEVICE_EVAL_BATCH_SIZE = 32
LEARNING_RATE = 5e-5
LOGGING_STEPS = 100
OUTPUT_DIR = "./results"

# - 현재 설정을 요약해 확인한다
def describe_run():
    print("=== Trainer 설정 ===")
    print(f"epochs: {NUM_TRAIN_EPOCHS}")
    print(f"train_batch_size: {PER_DEVICE_TRAIN_BATCH_SIZE}")
    print(f"eval_batch_size: {PER_DEVICE_EVAL_BATCH_SIZE}")
    print(f"learning_rate: {LEARNING_RATE}")
    print(f"logging_steps: {LOGGING_STEPS}")
    print(f"output_dir: {OUTPUT_DIR}")
    print(f"num_labels: {num_labels}")
    print(f"device: {DEVICE}")

describe_run()


=== Trainer 설정 ===
epochs: 3
train_batch_size: 32
eval_batch_size: 32
learning_rate: 5e-05
logging_steps: 100
output_dir: ./results
num_labels: 6
device: cuda


In [52]:
datasets = DatasetDict(datasets)

MODEL_DIRECTORY = "./mini_bert_11k_hf"
tokenizer = BertTokenizer.from_pretrained(MODEL_DIRECTORY, do_lower_case=True)
def preprocess_function(examples):
    return tokenizer(examples["text"], padding="max_length", truncation=True, max_length=128)

print("커스텀 7k 토크나이저 로드 완료")
tokenized_datasets = datasets.map(preprocess_function, batched=True)
tokenized_datasets.set_format("torch", columns=["input_ids", "attention_mask", "label"])
print("전처리 후 데이터 샘플:")
print(tokenized_datasets["train"][0])


커스텀 7k 토크나이저 로드 완료


Map:   0%|          | 0/16000 [00:00<?, ? examples/s]

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

전처리 후 데이터 샘플:
{'label': tensor(4), 'input_ids': tensor([11003, 11000, 11000, 11000, 11000, 11000, 11000, 11000, 11000, 11000,
        11000, 11000, 11000, 11000, 11000, 11000, 11001, 11002, 11002, 11002,
        11002, 11002, 11002, 11002, 11002, 11002, 11002, 11002, 11002, 11002,
        11002, 11002, 11002, 11002, 11002, 11002, 11002, 11002, 11002, 11002,
        11002, 11002, 11002, 11002, 11002, 11002, 11002, 11002, 11002, 11002,
        11002, 11002, 11002, 11002, 11002, 11002, 11002, 11002, 11002, 11002,
        11002, 11002, 11002, 11002, 11002, 11002, 11002, 11002, 11002, 11002,
        11002, 11002, 11002, 11002, 11002, 11002, 11002, 11002, 11002, 11002,
        11002, 11002, 11002, 11002, 11002, 11002, 11002, 11002, 11002, 11002,
        11002, 11002, 11002, 11002, 11002, 11002, 11002, 11002, 11002, 11002,
        11002, 11002, 11002, 11002, 11002, 11002, 11002, 11002, 11002, 11002,
        11002, 11002, 11002, 11002, 11002, 11002, 11002, 11002, 11002, 11002,
        11002, 1

In [53]:
from transformers import AutoModelForSequenceClassification

MODEL_PATH = "./mini_bert_11k_hf"

# 1. 모델을 로드합니다. (config= 인자를 사용하지 않습니다)
# num_labels만 지정하여 분류(classifier) 헤드를 6개로 새로 초기화합니다.
# 이 단계에서 vocab_size가 맞지 않는다는 경고가 뜰 수 있으나 정상입니다.
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_PATH,
    num_labels=num_labels
)

# 2. 모델 로드 *완료 후*, 토크나이저 크기에 맞게 임베딩 크기를 조정합니다.
# 이 함수가 기존 임베딩 가중치는 유지하고, 추가된 토큰(단어)에 대한
# 임베딩 가중치만 새로(랜덤하게) 초기화합니다.
model.resize_token_embeddings(len(tokenizer))

# 3. (권장) 모델의 config 객체에도 변경된 vocab_size와 pad_token_id를
#    명시적으로 업데이트해줍니다.
model.config.vocab_size = len(tokenizer)
model.config.pad_token_id = tokenizer.pad_token_id

# 4. 모든 설정이 완료된 모델을 DEVICE로 이동시킵니다.
model.to(DEVICE)

print("커스텀 Hugging Face 체크포인트 로드 및 임베딩 리사이징 완료.")

# config를 출력하여 vocab_size와 num_labels가 올바른지 확인합니다.
print(model.config)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at ./mini_bert_11k_hf and are newly initialized: ['bert.embeddings.LayerNorm.bias', 'bert.embeddings.LayerNorm.weight', 'bert.embeddings.position_embeddings.weight', 'bert.embeddings.token_type_embeddings.weight', 'bert.embeddings.word_embeddings.weight', 'bert.encoder.layer.0.attention.output.LayerNorm.bias', 'bert.encoder.layer.0.attention.output.LayerNorm.weight', 'bert.encoder.layer.0.attention.output.dense.bias', 'bert.encoder.layer.0.attention.output.dense.weight', 'bert.encoder.layer.0.attention.self.key.bias', 'bert.encoder.layer.0.attention.self.key.weight', 'bert.encoder.layer.0.attention.self.query.bias', 'bert.encoder.layer.0.attention.self.query.weight', 'bert.encoder.layer.0.attention.self.value.bias', 'bert.encoder.layer.0.attention.self.value.weight', 'bert.encoder.layer.0.intermediate.dense.bias', 'bert.encoder.layer.0.intermediate.dense.weight', 'bert.encoder.layer.0.output.La

커스텀 Hugging Face 체크포인트 로드 및 임베딩 리사이징 완료.
BertConfig {
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "dtype": "float32",
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 256,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2",
    "3": "LABEL_3",
    "4": "LABEL_4",
    "5": "LABEL_5"
  },
  "initializer_range": 0.02,
  "intermediate_size": 1024,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1,
    "LABEL_2": 2,
    "LABEL_3": 3,
    "LABEL_4": 4,
    "LABEL_5": 5
  },
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 256,
  "model_type": "bert",
  "num_attention_heads": 4,
  "num_hidden_layers": 6,
  "pad_token_id": 11002,
  "position_embedding_type": "absolute",
  "transformers_version": "4.57.1",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 11005
}



In [54]:
# 1. 사용할 평가 지표 로드 (여기서는 'accuracy')
metric = load_metric("accuracy")

# 2. 'Trainer'가 사용할 평가 함수 정의
def compute_metrics(eval_pred):
    """
    Trainer가 평가 시 호출하는 함수입니다.
    logits과 labels을 받아 accuracy를 계산합니다.
    """
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)

    return metric.compute(predictions=predictions, references=labels)

print("평가 지표(accuracy) 계산 함수 준비 완료.")

평가 지표(accuracy) 계산 함수 준비 완료.


In [55]:
# - Hugging Face Trainer에 전달할 학습/로깅 옵션을 정의한다
training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    # --- 학습 관련 설정 ---
    num_train_epochs=NUM_TRAIN_EPOCHS,
    per_device_train_batch_size=PER_DEVICE_TRAIN_BATCH_SIZE,
    per_device_eval_batch_size=PER_DEVICE_EVAL_BATCH_SIZE,
    learning_rate=LEARNING_RATE,
    # --- 평가 및 저장 설정 ---
    eval_strategy="epoch",
    save_strategy="epoch",
    # --- 기타 설정 ---
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    logging_steps=LOGGING_STEPS,
    report_to="none"
)

print(f"'{training_args.output_dir}' 폴더에 결과가 저장됩니다.")


'./results' 폴더에 결과가 저장됩니다.


In [56]:
# 1. Trainer 객체 생성
# 학습에 필요한 모든 재료(모델, 설정, 데이터, 토크나이저, 평가함수)를 넣어줍니다.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

# 2. ⭐️ 학습 시작! ⭐️
# 이 한 줄이 Native PyTorch의 긴 '셀 6' 전체를 대체합니다.
print("--- Trainer로 학습 시작 ---")
trainer.train()
print("--- 학습 완료! ---")

--- Trainer로 학습 시작 ---


  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Accuracy
1,1.7937,1.791232,0.168
2,1.7897,1.78855,0.1845
3,1.7884,1.787577,0.1765


--- 학습 완료! ---


In [57]:
from transformers import pipeline

print("--- 학습된 모델로 Softmax 확률 값 예측 ---")

# 1. 'text-classification' 파이프라인 생성
#    trainer.model은 'load_best_model_at_end=True'에 의해
#    가장 accuracy가 높았던 모델입니다.
classifier_pipeline = pipeline(
    "text-classification",
    model=trainer.model,     # ⭐️ 학습 완료된 베스트 모델
    tokenizer=tokenizer,
    device=0 if torch.cuda.is_available() else -1 # 0: GPU, -1: CPU
)

# 2. 테스트할 문장
test_text_1 = "it is so scary"
test_text_2 = "This is so frustrating and makes me angry."

# 3. 예측 실행 (return_all_scores=True로 모든 라벨 확률 받기)
results_1 = classifier_pipeline(test_text_1, return_all_scores=True)
results_2 = classifier_pipeline(test_text_2, return_all_scores=True)

# 4. 결과 출력
def print_results(text, results):
    print(f"\n입력 문장: \"{text}\"")
    print("--- 6개 라벨 Softmax 확률 값 ---")

    # 라벨 이름을 매칭시켜서 보기 좋게 출력
    for res in results[0]:
        label_name = label_names[int(res['label'].split('_')[-1])]
        print(f"{label_name:10}: {res['score']:.4f} ( {res['score']*100:6.2f} % )")

print_results(test_text_1, results_1)
print_results(test_text_2, results_2)

Device set to use cuda:0


--- 학습된 모델로 Softmax 확률 값 예측 ---

입력 문장: "it is so scary"
--- 6개 라벨 Softmax 확률 값 ---
sadness   : 0.2001 (  20.01 % )
joy       : 0.1591 (  15.91 % )
love      : 0.1119 (  11.19 % )
anger     : 0.1800 (  18.00 % )
fear      : 0.1896 (  18.96 % )
surprise  : 0.1593 (  15.93 % )

입력 문장: "This is so frustrating and makes me angry."
--- 6개 라벨 Softmax 확률 값 ---
sadness   : 0.1846 (  18.46 % )
joy       : 0.1657 (  16.57 % )
love      : 0.1316 (  13.16 % )
anger     : 0.1773 (  17.73 % )
fear      : 0.1757 (  17.57 % )
surprise  : 0.1651 (  16.51 % )




In [58]:
# =========================
# 진단 원샷 블록 (토크나이저/모델/파이프라인)
# 전제: tokenizer, model, datasets, tokenized_datasets, DEVICE, num_labels 가 이미 존재
# =========================
import numpy as np
import torch
from transformers import Trainer, TrainingArguments, AutoTokenizer, AutoModelForSequenceClassification
from evaluate import load as load_metric

SEED = 42
RUN_BASELINE = True          # bert-base-uncased 기준선 체크 실행 여부
MINI_N = 512                 # 미니 오버핏 샘플 수
BS = 32                      # 미니 배치

rng = np.random.default_rng(SEED)

# 1) OOV 비율
sample_texts = datasets["train"]["text"][:2000]
batch_tok = tokenizer(sample_texts, padding=True, truncation=True, max_length=128)
unk_id = tokenizer.unk_token_id
ids = np.array(batch_tok["input_ids"])
oov_ratio_token = (ids == unk_id).mean()                    # 토큰 기준 OOV 비율
oov_ratio_seq   = ((ids == unk_id).any(axis=1)).mean()      # 시퀀스 기준 OOV 포함 비율
print(f"[OOV] token-level={oov_ratio_token:.3f}, sequence-level={oov_ratio_seq:.3f}")

# 2) 스페셜 토큰/설정
print("[SPECIAL TOKENS]", tokenizer.cls_token, tokenizer.sep_token, tokenizer.pad_token, tokenizer.unk_token)
print("[SPECIAL IDS]   ", tokenizer.cls_token_id, tokenizer.sep_token_id, tokenizer.pad_token_id, tokenizer.unk_token_id)
print("[CONFIG] vocab_size=", model.config.vocab_size,
      " type_vocab_size=", getattr(model.config, "type_vocab_size", None),
      " pad_token_id=", getattr(model.config, "pad_token_id", None))

# 3) 학습 가능 파라미터 수
p_train = sum(p.numel() for p in model.parameters() if p.requires_grad)
p_all   = sum(p.numel() for p in model.parameters())
print(f"[PARAM] trainable/all = {p_train:,} / {p_all:,}")

# 4) 미니 오버핏 테스트 (동일 데이터로 학습·평가)
metric_f1 = load_metric("f1")
def compute_metrics_diag(eval_pred):
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=-1)
    return metric_f1.compute(predictions=preds, references=labels, average="macro")

mini_idx = list(range(min(MINI_N, len(datasets["train"]))))
mini_raw = {"text": [datasets["train"]["text"][i] for i in mini_idx],
            "label": [datasets["train"]["label"][i] for i in mini_idx]}

mini_tok = tokenizer(mini_raw["text"], padding="max_length", truncation=True, max_length=128)
mini_ds = {
    "input_ids": torch.tensor(mini_tok["input_ids"]),
    "attention_mask": torch.tensor(mini_tok["attention_mask"]),
    "labels": torch.tensor(mini_raw["label"])
}
# torch Dataset 래퍼
class TDS(torch.utils.data.Dataset):
    def __init__(self, d): self.d = d
    def __len__(self): return len(self.d["labels"])
    def __getitem__(self, i):
        return {k: v[i] for k, v in self.d.items()}

mini_dataset = TDS(mini_ds)

# TrainingArguments (구버전 호환: eval_strategy)
mini_args = TrainingArguments(
    output_dir="./diag_out",
    num_train_epochs=5,
    per_device_train_batch_size=BS,
    per_device_eval_batch_size=BS,
    learning_rate=5e-5,
    warmup_ratio=0.06,
    weight_decay=0.01,
    eval_strategy="epoch",
    save_strategy="no",
    logging_steps=50,
    report_to="none",
    seed=SEED
)

diag_trainer = Trainer(
    model=model,
    args=mini_args,
    train_dataset=mini_dataset,
    eval_dataset=mini_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics_diag
)

print("[MINI-OVERFIT] start")
diag_trainer.train()
diag_res = diag_trainer.evaluate()
print("[MINI-OVERFIT] eval_f1=", float(diag_res.get("eval_f1", 0.0)), "  eval_loss=", float(diag_res.get("eval_loss", 0.0)))

# 5) 기준선 컨트롤 테스트(옵션): bert-base-uncased로 같은 미니 세트 비교
if RUN_BASELINE:
    print("[BASELINE] bert-base-uncased")
    base_tok = AutoTokenizer.from_pretrained("bert-base-uncased")
    base_model = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=num_labels).to(DEVICE)

    base_mini_tok = base_tok(mini_raw["text"], padding="max_length", truncation=True, max_length=128)
    base_ds = {
        "input_ids": torch.tensor(base_mini_tok["input_ids"]),
        "attention_mask": torch.tensor(base_mini_tok["attention_mask"]),
        "labels": torch.tensor(mini_raw["label"])
    }
    base_dataset = TDS(base_ds)

    base_args = TrainingArguments(
        output_dir="./diag_out_baseline",
        num_train_epochs=3,
        per_device_train_batch_size=BS,
        per_device_eval_batch_size=BS,
        learning_rate=5e-5,
        warmup_ratio=0.06,
        weight_decay=0.01,
        eval_strategy="epoch",
        save_strategy="no",
        logging_steps=50,
        report_to="none",
        seed=SEED
    )
    base_trainer = Trainer(
        model=base_model,
        args=base_args,
        train_dataset=base_dataset,
        eval_dataset=base_dataset,
        tokenizer=base_tok,
        compute_metrics=compute_metrics_diag
    )
    base_trainer.train()
    base_res = base_trainer.evaluate()
    print("[BASELINE] eval_f1=", float(base_res.get("eval_f1", 0.0)), "  eval_loss=", float(base_res.get("eval_loss", 0.0)))

# 해석 가이드:
# - OOV 비율이 높거나 special token/ID가 None이면 VOCAB/토크나이저 설정 문제.
# - mini overfit에서 F1이 0.4 이상으로 안 오르면 파이프라인/모델 설정 문제.
# - baseline이 정상인데 커스텀이 비정상이면 VOCAB/커스텀 BERT 쪽 문제 확정.


[OOV] token-level=0.301, sequence-level=1.000
[SPECIAL TOKENS] [CLS] [SEP] [PAD] [UNK]
[SPECIAL IDS]    11003 11001 11002 11000
[CONFIG] vocab_size= 11005  type_vocab_size= 2  pad_token_id= 11002
[PARAM] trainable/all = 7,689,734 / 7,689,734


  diag_trainer = Trainer(


[MINI-OVERFIT] start


Epoch,Training Loss,Validation Loss,F1
1,No log,1.78541,0.103439
2,No log,1.784618,0.104708
3,No log,1.784448,0.101946
4,1.787900,1.784281,0.101946
5,1.787900,1.783981,0.103439


[MINI-OVERFIT] eval_f1= 0.10343915343915344   eval_loss= 1.783981204032898
[BASELINE] bert-base-uncased


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  base_trainer = Trainer(


Epoch,Training Loss,Validation Loss,F1
1,No log,1.747022,0.146147
2,No log,1.558455,0.432054
3,No log,1.432651,0.485065


[BASELINE] eval_f1= 0.48506511244146927   eval_loss= 1.432651400566101


In [59]:
unk_id = tokenizer.unk_token_id
batch = tokenizer(datasets["train"]["text"][:2000], padding=True, truncation=True, max_length=128)
unk_count = (np.array(batch["input_ids"]) == unk_id).sum()
total_tokens = np.array(batch["input_ids"]).size
print("UNK count:", unk_count)
print("Total tokens:", total_tokens)
print("UNK ratio:", unk_count / total_tokens)


UNK count: 39782
Total tokens: 132000
UNK ratio: 0.30137878787878786


## 🔍 진단 요약

| 구분 | 커스텀 BERT (7k vocab) | 기준선 BERT (bert-base-uncased) |
|------|-------------------------|----------------------------------|
| OOV 비율 | token 30.1%, seq 100% | 정상 수준 |
| F1 (mini-overfit) | **0.10** | **0.48** |
| Loss | 1.78 | 1.43 |
| 파이프라인 | 정상 작동 | 정상 작동 |
| 문제 원인 | 토크나이저 / vocab 불일치 | - |

---

## ⚠️ 결론
- 데이터·코드는 정상.  
- 커스텀 **vocab·토크나이저가 지나치게 작고 OOV 과다**로 학습 불능 상태.  
- 모델/하이퍼탐색 중단 권장.

---

## ✅ 다음 단계
### A. 실용 경로 (즉시 결과)
- `bert-base-uncased` 모델·토크나이저로 교체  
- 전체 데이터 재토크나이즈 후 1단계 재실행  

### B. 커스텀 유지 (재설계)
- vocab 30k 이상으로 재학습  
- MLM 50k~100k step 적응학습  
- OOV ≤ 5% 도달 후 분류 파인튜닝  

## 🧪 1단계: Macro-F1 스크리닝
- **목표:** 빠른 하이퍼 탐색으로 유망한 조합 선별  
- **조절 변수:** learning_rate, dropout, warmup_ratio, batch_size  
- **고정값:** epoch=1, weight_decay=0.01, max_length=128


In [45]:
# 1단계에서 사용할 평가 지표 : Macro-F1 Metric Definition
def compute_metrics_step1(eval_pred):
    metric_step1 = load_metric("f1")
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=-1)
    return metric_step1.compute(predictions=preds, references=labels, average="macro")

In [44]:
# 사용할 model variation : (learning rate, dropout, warmup_ratio, batch_size)
HYPER_CANDIDATES_STEP1 = [
    (5e-5, 0.0, 0.00, 16),
    (5e-5, 0.0, 0.06, 32),
    (5e-5, 0.1, 0.10, 16),
    (5e-5, 0.1, 0.00, 32),
    (5e-5, 0.2, 0.06, 16),
    (5e-5, 0.2, 0.10, 32),

    (3e-5, 0.0, 0.10, 16),
    (3e-5, 0.0, 0.00, 32),
    (3e-5, 0.1, 0.06, 16),
    (3e-5, 0.1, 0.10, 32),
    (3e-5, 0.2, 0.00, 16),
    (3e-5, 0.2, 0.06, 32),

    (2e-5, 0.0, 0.06, 16),
    (2e-5, 0.0, 0.10, 32),
    (2e-5, 0.1, 0.00, 16),
    (2e-5, 0.1, 0.06, 32),
    (2e-5, 0.2, 0.10, 16),
    (2e-5, 0.2, 0.00, 32),

    (1e-5, 0.0, 0.10, 16),
    (1e-5, 0.0, 0.00, 32),
    (1e-5, 0.1, 0.06, 16),
    (1e-5, 0.1, 0.10, 32),
    (1e-5, 0.2, 0.00, 16),
    (1e-5, 0.2, 0.06, 32),
]

# 모델을 자동으로 만들어 평가 결과 반환
def level1_train(hyper_tuple):
  lr, dr, wp, bs = hyper_tuple

  # 드롭아웃 적용
  model.config.hidden_dropout_prob = dr
  model.config.attention_probs_dropout_prob = dr

  # TrainingArguments 생성
  training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    num_train_epochs=NUM_TRAIN_EPOCHS,
    per_device_train_batch_size=bs,
    per_device_eval_batch_size=PER_DEVICE_EVAL_BATCH_SIZE,
    learning_rate=lr,
    warmup_ratio=wp,
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    logging_steps=LOGGING_STEPS,
    report_to="none"
  )

  # Trainer 구성
  trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics_step1,
  )

  # 학습 및 평가
  trainer.train()
  result = trainer.evaluate()

  # eval_f1 반환
  return result.get("eval_f1", 0.0)

In [46]:
results_list = []

for i, hyper in enumerate(HYPER_CANDIDATES_STEP1):
    print(f"\n[{i+1}/{len(HYPER_CANDIDATES_STEP1)}] 조합 실행 중: {hyper}")

    try:
        f1_score = level1_train(hyper)
        results_list.append({
            "index": i + 1,
            "params": hyper,
            "eval_f1": f1_score
        })
        print(f"→ 완료: eval_f1 = {f1_score:.4f}")

    except Exception as e:
        print(f"→ 오류 발생: {e}")
        results_list.append({
            "index": i + 1,
            "params": hyper,
            "eval_f1": None,
            "error": str(e)
        })

print("\n=== 전체 완료 ===")
print(f"총 {len(results_list)}개 결과 수집됨.")



[1/24] 조합 실행 중: (5e-05, 0.0, 0.0, 16)


  trainer = Trainer(


Epoch,Training Loss,Validation Loss,F1
1,1.7883,1.790467,0.082601
2,1.7901,1.78812,0.153868
3,1.7867,1.787473,0.110183


Downloading builder script: 0.00B [00:00, ?B/s]

→ 완료: eval_f1 = 0.1539

[2/24] 조합 실행 중: (5e-05, 0.0, 0.06, 32)


  trainer = Trainer(


Epoch,Training Loss,Validation Loss,F1
1,1.7887,1.789393,0.093476
2,1.7891,1.788021,0.139369
3,1.7887,1.787542,0.104853


→ 완료: eval_f1 = 0.1394

[3/24] 조합 실행 중: (5e-05, 0.1, 0.1, 16)


  trainer = Trainer(


Epoch,Training Loss,Validation Loss,F1
1,1.785,1.789821,0.108311
2,1.7892,1.788349,0.126856
3,1.7861,1.787643,0.110443


→ 완료: eval_f1 = 0.1269

[4/24] 조합 실행 중: (5e-05, 0.1, 0.0, 32)


  trainer = Trainer(


Epoch,Training Loss,Validation Loss,F1
1,1.7863,1.789215,0.103628
2,1.7885,1.78816,0.137562
3,1.7886,1.787757,0.110658


→ 완료: eval_f1 = 0.1376

[5/24] 조합 실행 중: (5e-05, 0.2, 0.06, 16)


  trainer = Trainer(


Epoch,Training Loss,Validation Loss,F1
1,1.7796,1.790083,0.131209
2,1.7872,1.788702,0.125545


KeyboardInterrupt: 

In [None]:
import matplotlib.pyplot as plt

f1_scores = [r["eval_f1"] for r in results_list]  # 24개 결과
labels = [f"#{i+1}" for i in range(len(f1_scores))]

plt.figure(figsize=(10,4))
plt.bar(labels, f1_scores)
plt.ylabel("Macro-F1")
plt.xlabel("Model index (1–24)")
plt.title("Step 1 – Macro-F1 across 24 hyperparameter combinations")
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()