
# 1. 환경 설정 및 라이브러리 설치



In [6]:
!pip install transformers datasets accelerate evaluate

import os
import torch
import numpy as np
from transformers import BertTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from datasets import load_dataset
from evaluate import load as load_metric
from google.colab import drive



## 1.1 GPU 설정

In [7]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"사용 장치: {device}")

사용 장치: cuda


## 1.2 구글 드라이브 마운트

In [5]:
drive.mount('/content/drive')

MessageError: Error: credential propagation was unsuccessful

# 2. 하이브리드 토크나이저 로드

In [8]:
VOCAB_FILE = "/content/drive/MyDrive/bert/hybrid_bert_vocab.txt"

if not os.path.exists(VOCAB_FILE):
    raise FileNotFoundError(f" 오류: {VOCAB_FILE} 경로에 보캡 파일이 없습니다. 파일을 업로드하거나 경로를 수정하세요.")

# .txt 파일로 BertTokenizer 생성
tokenizer = BertTokenizer(vocab_file=VOCAB_FILE, do_lower_case=True)
print(f" 하이브리드 Vocab 로드 완료! 크기: {len(tokenizer)} (15,477개 예상)")


 하이브리드 Vocab 로드 완료! 크기: 15482 (15,477개 예상)


# 3. 데이터셋 로드 및 전처리


In [9]:
datasets = load_dataset("dair-ai/emotion")

def preprocess_function(examples):
    return tokenizer(examples["text"], padding="max_length", truncation=True, max_length=128)

# 토큰화 수행
tokenized_datasets = datasets.map(preprocess_function, batched=True)

# 불필요한 컬럼 제거 및 PyTorch 포맷 설정
columns_to_keep = ['input_ids', 'attention_mask', 'label']
columns_to_remove = [col for col in tokenized_datasets['train'].column_names if col not in columns_to_keep]
tokenized_datasets = tokenized_datasets.remove_columns(columns_to_remove)
tokenized_datasets.set_format("torch")

print(" 데이터 전처리 완료.")

 데이터 전처리 완료.


# 4. 모델 로드 및 임베딩 사이즈 조정

In [10]:
# MODEL_NAME = "bert-base-uncased"
# num_labels = 6

# model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=num_labels)

# print(f" 임베딩 사이즈 조정: {model.config.vocab_size} -> {len(tokenizer)}")
# model.resize_token_embeddings(len(tokenizer))
# model.to(device)

## 4.1 vocab 크기에 맞춘 BERT config 생성 (구조: bert-base-uncased)

In [11]:
from transformers import BertConfig, BertForMaskedLM, DataCollatorForLanguageModeling, AutoModelForSequenceClassification

base_config = BertConfig.from_pretrained("bert-base-uncased")
base_config.vocab_size = len(tokenizer)      # vocab_size를 우리 hybrid vocab 크기로 변경

mlm_model = BertForMaskedLM(config=base_config)
mlm_model.to(device)

BertForMaskedLM(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(15482, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwi

## 4-2. MLM용 데이터 전처리

In [12]:
def mlm_preprocess(examples):
    return tokenizer(
        examples["text"],
        padding="max_length",
        truncation=True,
        max_length=128,
    )

mlm_dataset = datasets["train"].map(
    mlm_preprocess,
    batched=True,
    remove_columns=datasets["train"].column_names,  # text/label 제거, 토큰화 결과만 남김
)

data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=True,
    mlm_probability=0.15,
)

## 4-3. MLM pre-training 설정

In [13]:
mlm_training_args = TrainingArguments(
    output_dir="./mini_bert_pretrain",
    num_train_epochs=8,
    per_device_train_batch_size=32,
    learning_rate=5e-5,
    weight_decay=0.01,
    logging_steps=500,
    save_steps=500,
    save_total_limit=1,
    prediction_loss_only=True,
    fp16=True,
    report_to="none"
)

mlm_trainer = Trainer(
    model=mlm_model,
    args=mlm_training_args,
    train_dataset=mlm_dataset,
    data_collator=data_collator,
)

## 4-4. MLM pre-training 실행

In [14]:
mlm_trainer.train()

save_dir = "./mini_bert_pretrain_final"
mlm_trainer.save_model(save_dir)

# 방금 pretrain한 체크포인트로부터 분류용 BERT 로드
pretrained_dir = "./mini_bert_pretrain"
model = AutoModelForSequenceClassification.from_pretrained(
    pretrained_dir,
    num_labels=6,
)

model.to(device)

Step,Training Loss
500,6.4774
1000,5.9131
1500,5.7768
2000,5.7007
2500,5.6412
3000,5.523
3500,5.4813
4000,5.4046


OSError: Error no file named pytorch_model.bin, model.safetensors, tf_model.h5, model.ckpt.index or flax_model.msgpack found in directory ./mini_bert_pretrain.

In [15]:
pretrained_dir = "./mini_bert_pretrain/checkpoint-4000"  # 실제 가장 큰 번호로 수정
model = AutoModelForSequenceClassification.from_pretrained(
    pretrained_dir,
    num_labels=6,
)
model.to(device)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at ./mini_bert_pretrain/checkpoint-4000 and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(15482, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

# 5. 학습 설정 및 시작 (Trainer)

In [16]:
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    acc = accuracy_score(labels, preds)
    macro_f1 = f1_score(labels, preds, average="macro")
    return {
        "accuracy": acc,
        "macro_f1": macro_f1,
    }

# 학습 파라미터
training_args = TrainingArguments(
    output_dir="./results_hybrid_finetuning",
    num_train_epochs=5,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    learning_rate=2e-5,
    fp16=True,                      # GPU 가속 및 메모리 절약
    gradient_accumulation_steps=2,
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    logging_steps=100,
    report_to="none"
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

trainer.train()


  trainer = Trainer(
The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'pad_token_id': 15479}.


Epoch,Training Loss,Validation Loss,Accuracy
1,1.216,0.614369,0.8135
2,0.4019,0.309556,0.8905
3,0.2541,0.25343,0.909
4,0.1907,0.238426,0.906
5,0.1512,0.235127,0.9085


Downloading builder script: 0.00B [00:00, ?B/s]

TrainOutput(global_step=1250, training_loss=0.46433181648254396, metrics={'train_runtime': 549.8058, 'train_samples_per_second': 145.506, 'train_steps_per_second': 2.274, 'total_flos': 5262410096640000.0, 'train_loss': 0.46433181648254396, 'epoch': 5.0})

# 6. 최적화

## 6.1 hyper parameter 조합 만들기

In [23]:
import itertools
import random

LEARNING_RATES = [3e-5, 5e-5, 7e-5]
DROPOUTS = [0.0, 0.05, 0.1]
WARMUP_RATIOS = [0.0, 0.02, 0.04, 0.06]
BATCH_SIZES = [16, 32]

"""
72개 전체 조합 중 균등 랜덤 샘플링
반환 형식: [{learning_rate, dropout, warmup_ratio, batch_size}, ...]
"""
def sample_hyper_candidates(n_samples=24, seed=42):
    # 전체 그리드 생성
    full_grid = list(itertools.product(
        LEARNING_RATES,
        DROPOUTS,
        WARMUP_RATIOS,
        BATCH_SIZES,
    ))

    if n_samples > len(full_grid):
        raise ValueError(f"요청한 샘플 수 {n_samples}가 전체 조합 수 {len(full_grid)}보다 큼")

    rng = random.Random(seed)
    rng.shuffle(full_grid)

    selected = full_grid[:n_samples]

    # HYPER_CANDIDATES 형태로 변환
    hyper_candidates = [
        {
            "learning_rate": lr,
            "dropout": dp,
            "warmup_ratio": wr,
            "batch_size": bs,
        }
        for (lr, dp, wr, bs) in selected
    ]

    return hyper_candidates



## 6.2 1개의 모델 조합에 대한 1epoch씩의 24개 모델 training

In [24]:
def train_macro(hyper_cfg):
  lr = hyper_cfg["learning_rate"]
  dr = hyper_cfg["dropout"]
  wp = hyper_cfg["warmup_ratio"]
  bs = hyper_cfg["batch_size"]

  model = AutoModelForSequenceClassification.from_pretrained(
    pretrained_dir,
    num_labels=6,
  )
  model.config.hidden_dropout_prob = dr
  model.config.attention_probs_dropout_prob = dr
  model.config.classifier_dropout = dr

  training_args = TrainingArguments(
      output_dir="./results_hybrid_finetuning",
      num_train_epochs=1,                         # epochs : 1
      per_device_train_batch_size=bs,             # batch_size
      per_device_eval_batch_size=bs,
      learning_rate=lr,                           # learning_rate
      warmup_ratio=wp,                            # warmup_ratio

      fp16=True,                                  # GPU 가속 및 메모리 절약
      gradient_accumulation_steps=2,
      eval_strategy="epoch",
      save_strategy="epoch",
      load_best_model_at_end=True,
      metric_for_best_model="macro_f1",
      logging_steps=100,
      report_to="none"
  )

  trainer = Trainer(
      model=model,
      args=training_args,
      train_dataset=tokenized_datasets["train"],
      eval_dataset=tokenized_datasets["validation"],
      tokenizer=tokenizer,
      compute_metrics=compute_metrics,
  )

  trainer.train()
  result = trainer.evaluate()
  return result.get("eval_macro_f1", 0.0)

In [25]:
results_list = []

HYPER_CANDIDATES = sample_hyper_candidates()
for i, hyper in enumerate(HYPER_CANDIDATES):
    print(f"\n[{i+1}/{len(HYPER_CANDIDATES)}] 조합 실행 중: {hyper}")

    try:
        f1_score = train_macro(hyper)
        results_list.append({
            "index": i + 1,
            "params": hyper,
            "eval_f1": f1_score
        })

    except Exception as e:
        print(f"→ 오류 발생: {e}")
        results_list.append({
            "index": i + 1,
            "params": hyper,
            "eval_f1": None,
            "error": str(e)
        })

print("\n=== 전체 완료 ===")
print(f"총 {len(results_list)}개 결과 수집됨.")


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at ./mini_bert_pretrain/checkpoint-4000 and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



[1/24] 조합 실행 중: {'learning_rate': 7e-05, 'dropout': 0.0, 'warmup_ratio': 0.0, 'batch_size': 32}


  trainer = Trainer(
The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'pad_token_id': 15479}.


Epoch,Training Loss,Validation Loss,Accuracy
1,0.7494,0.407719,0.864


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at ./mini_bert_pretrain/checkpoint-4000 and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



[2/24] 조합 실행 중: {'learning_rate': 3e-05, 'dropout': 0.1, 'warmup_ratio': 0.02, 'batch_size': 16}


  trainer = Trainer(
The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'pad_token_id': 15479}.


Epoch,Training Loss,Validation Loss,Accuracy
1,0.6072,0.478753,0.8435


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at ./mini_bert_pretrain/checkpoint-4000 and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



[3/24] 조합 실행 중: {'learning_rate': 5e-05, 'dropout': 0.0, 'warmup_ratio': 0.06, 'batch_size': 16}


  trainer = Trainer(
The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'pad_token_id': 15479}.


Epoch,Training Loss,Validation Loss,Accuracy
1,0.4083,0.314553,0.8905


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at ./mini_bert_pretrain/checkpoint-4000 and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



[4/24] 조합 실행 중: {'learning_rate': 5e-05, 'dropout': 0.1, 'warmup_ratio': 0.06, 'batch_size': 16}


  trainer = Trainer(
The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'pad_token_id': 15479}.


Epoch,Training Loss,Validation Loss,Accuracy
1,0.4083,0.314553,0.8905


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at ./mini_bert_pretrain/checkpoint-4000 and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



[5/24] 조합 실행 중: {'learning_rate': 3e-05, 'dropout': 0.0, 'warmup_ratio': 0.06, 'batch_size': 32}


  trainer = Trainer(
The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'pad_token_id': 15479}.


Epoch,Training Loss,Validation Loss,Accuracy
1,1.2492,0.821153,0.725


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at ./mini_bert_pretrain/checkpoint-4000 and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



[6/24] 조합 실행 중: {'learning_rate': 3e-05, 'dropout': 0.1, 'warmup_ratio': 0.04, 'batch_size': 16}


  trainer = Trainer(
The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'pad_token_id': 15479}.


Epoch,Training Loss,Validation Loss,Accuracy
1,0.5824,0.454963,0.8485


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at ./mini_bert_pretrain/checkpoint-4000 and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



[7/24] 조합 실행 중: {'learning_rate': 3e-05, 'dropout': 0.05, 'warmup_ratio': 0.06, 'batch_size': 32}


  trainer = Trainer(
The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'pad_token_id': 15479}.


Epoch,Training Loss,Validation Loss,Accuracy
1,1.2492,0.821153,0.725


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at ./mini_bert_pretrain/checkpoint-4000 and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



[8/24] 조합 실행 중: {'learning_rate': 5e-05, 'dropout': 0.05, 'warmup_ratio': 0.0, 'batch_size': 32}


  trainer = Trainer(
The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'pad_token_id': 15479}.


Epoch,Training Loss,Validation Loss,Accuracy
1,0.8341,0.467263,0.846


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at ./mini_bert_pretrain/checkpoint-4000 and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



[9/24] 조합 실행 중: {'learning_rate': 7e-05, 'dropout': 0.0, 'warmup_ratio': 0.04, 'batch_size': 16}


  trainer = Trainer(
The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'pad_token_id': 15479}.


Epoch,Training Loss,Validation Loss,Accuracy
1,0.4474,0.353715,0.8745


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at ./mini_bert_pretrain/checkpoint-4000 and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



[10/24] 조합 실행 중: {'learning_rate': 3e-05, 'dropout': 0.0, 'warmup_ratio': 0.04, 'batch_size': 16}


  trainer = Trainer(
The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'pad_token_id': 15479}.


Epoch,Training Loss,Validation Loss,Accuracy
1,0.5824,0.454963,0.8485


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at ./mini_bert_pretrain/checkpoint-4000 and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



[11/24] 조합 실행 중: {'learning_rate': 5e-05, 'dropout': 0.05, 'warmup_ratio': 0.04, 'batch_size': 16}


  trainer = Trainer(
The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'pad_token_id': 15479}.


Epoch,Training Loss,Validation Loss,Accuracy
1,0.4502,0.345822,0.8725


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at ./mini_bert_pretrain/checkpoint-4000 and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



[12/24] 조합 실행 중: {'learning_rate': 5e-05, 'dropout': 0.1, 'warmup_ratio': 0.02, 'batch_size': 16}


  trainer = Trainer(
The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'pad_token_id': 15479}.


Epoch,Training Loss,Validation Loss,Accuracy
1,0.4847,0.386177,0.851


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at ./mini_bert_pretrain/checkpoint-4000 and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



[13/24] 조합 실행 중: {'learning_rate': 5e-05, 'dropout': 0.1, 'warmup_ratio': 0.02, 'batch_size': 32}


  trainer = Trainer(
The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'pad_token_id': 15479}.


Epoch,Training Loss,Validation Loss,Accuracy
1,0.9356,0.505608,0.8265


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at ./mini_bert_pretrain/checkpoint-4000 and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



[14/24] 조합 실행 중: {'learning_rate': 7e-05, 'dropout': 0.0, 'warmup_ratio': 0.0, 'batch_size': 16}


  trainer = Trainer(
The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'pad_token_id': 15479}.


Epoch,Training Loss,Validation Loss,Accuracy
1,0.5545,0.500814,0.821


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at ./mini_bert_pretrain/checkpoint-4000 and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



[15/24] 조합 실행 중: {'learning_rate': 3e-05, 'dropout': 0.1, 'warmup_ratio': 0.0, 'batch_size': 16}


  trainer = Trainer(
The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'pad_token_id': 15479}.


Epoch,Training Loss,Validation Loss,Accuracy
1,0.5834,0.459975,0.84


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at ./mini_bert_pretrain/checkpoint-4000 and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



[16/24] 조합 실행 중: {'learning_rate': 7e-05, 'dropout': 0.05, 'warmup_ratio': 0.0, 'batch_size': 32}


  trainer = Trainer(
The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'pad_token_id': 15479}.


Epoch,Training Loss,Validation Loss,Accuracy
1,0.7724,0.41441,0.8655


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at ./mini_bert_pretrain/checkpoint-4000 and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



[17/24] 조합 실행 중: {'learning_rate': 5e-05, 'dropout': 0.0, 'warmup_ratio': 0.04, 'batch_size': 32}


  trainer = Trainer(
The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'pad_token_id': 15479}.


Epoch,Training Loss,Validation Loss,Accuracy
1,0.8728,0.462923,0.845


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at ./mini_bert_pretrain/checkpoint-4000 and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



[18/24] 조합 실행 중: {'learning_rate': 7e-05, 'dropout': 0.05, 'warmup_ratio': 0.0, 'batch_size': 16}


  trainer = Trainer(
The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'pad_token_id': 15479}.


Epoch,Training Loss,Validation Loss,Accuracy
1,0.5545,0.500814,0.821


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at ./mini_bert_pretrain/checkpoint-4000 and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



[19/24] 조합 실행 중: {'learning_rate': 7e-05, 'dropout': 0.1, 'warmup_ratio': 0.04, 'batch_size': 16}


  trainer = Trainer(
The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'pad_token_id': 15479}.


Epoch,Training Loss,Validation Loss,Accuracy
1,0.4474,0.353715,0.8745


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at ./mini_bert_pretrain/checkpoint-4000 and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



[20/24] 조합 실행 중: {'learning_rate': 7e-05, 'dropout': 0.05, 'warmup_ratio': 0.06, 'batch_size': 16}


  trainer = Trainer(
The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'pad_token_id': 15479}.


Epoch,Training Loss,Validation Loss,Accuracy
1,0.4188,0.330581,0.8735


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at ./mini_bert_pretrain/checkpoint-4000 and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



[21/24] 조합 실행 중: {'learning_rate': 7e-05, 'dropout': 0.0, 'warmup_ratio': 0.04, 'batch_size': 32}


  trainer = Trainer(
The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'pad_token_id': 15479}.


Epoch,Training Loss,Validation Loss,Accuracy
1,0.7458,0.359585,0.873


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at ./mini_bert_pretrain/checkpoint-4000 and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



[22/24] 조합 실행 중: {'learning_rate': 7e-05, 'dropout': 0.1, 'warmup_ratio': 0.06, 'batch_size': 16}


  trainer = Trainer(
The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'pad_token_id': 15479}.


Epoch,Training Loss,Validation Loss,Accuracy
1,0.4188,0.330581,0.8735


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at ./mini_bert_pretrain/checkpoint-4000 and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



[23/24] 조합 실행 중: {'learning_rate': 5e-05, 'dropout': 0.05, 'warmup_ratio': 0.06, 'batch_size': 32}


  trainer = Trainer(
The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'pad_token_id': 15479}.


Epoch,Training Loss,Validation Loss,Accuracy
1,0.9054,0.466409,0.8395


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at ./mini_bert_pretrain/checkpoint-4000 and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



[24/24] 조합 실행 중: {'learning_rate': 5e-05, 'dropout': 0.1, 'warmup_ratio': 0.06, 'batch_size': 32}


  trainer = Trainer(
The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'pad_token_id': 15479}.


Epoch,Training Loss,Validation Loss,Accuracy
1,0.9054,0.466409,0.8395



=== 전체 완료 ===
총 24개 결과 수집됨.


In [None]:
for i in valid_results
print valid_results

## 6.3 최적 모델을 찾아 최적 epoch 찾기

In [26]:
from collections import Counter

def select_best_hypers(results_list, top_k=5):
    # eval_f1이 None이 아닌 결과만 사용
    valid_results = [r for r in results_list if r.get("eval_f1") is not None]

    if not valid_results:
        raise ValueError("유효한 eval_f1 결과가 없습니다.")

    # top_k 개수 보정
    top_k = min(top_k, len(valid_results))

    # eval_f1 기준 내림차순 정렬 후 top_k 선택
    top_results = sorted(
        valid_results,
        key=lambda r: r["eval_f1"],
        reverse=True
    )[:top_k]

    # 하이퍼파라미터별 최빈값 계산
    keys = ["learning_rate", "dropout", "warmup_ratio", "batch_size"]
    best_params = {}

    for key in keys:
        values = [r["params"][key] for r in top_results]
        cnt = Counter(values)
        # (빈도, 값) 기준으로 최대값 선택 (동일 빈도면 값이 큰 쪽 선택)
        best_val = max(cnt.items(), key=lambda kv: (kv[1], kv[0]))[0]
        best_params[key] = best_val

    return best_params, top_results


# 사용 예시: 위에서 하이퍼 탐색을 끝낸 뒤
best_params, top5_results = select_best_hypers(results_list, top_k=5)

print("\n=== Top 5 결과 ===")
for r in top5_results:
    print(f"idx={r['index']}, f1={r['eval_f1']:.4f}, params={r['params']}")

print("\n=== 최종 선택된 하이퍼파라미터 ===")
print(best_params)


=== Top 5 결과 ===
idx=1, f1=0.0000, params={'learning_rate': 7e-05, 'dropout': 0.0, 'warmup_ratio': 0.0, 'batch_size': 32}
idx=2, f1=0.0000, params={'learning_rate': 3e-05, 'dropout': 0.1, 'warmup_ratio': 0.02, 'batch_size': 16}
idx=3, f1=0.0000, params={'learning_rate': 5e-05, 'dropout': 0.0, 'warmup_ratio': 0.06, 'batch_size': 16}
idx=4, f1=0.0000, params={'learning_rate': 5e-05, 'dropout': 0.1, 'warmup_ratio': 0.06, 'batch_size': 16}
idx=5, f1=0.0000, params={'learning_rate': 3e-05, 'dropout': 0.0, 'warmup_ratio': 0.06, 'batch_size': 32}

=== 최종 선택된 하이퍼파라미터 ===
{'learning_rate': 5e-05, 'dropout': 0.0, 'warmup_ratio': 0.06, 'batch_size': 16}


In [27]:
from transformers import EarlyStoppingCallback

# pretrained_dir 는 위에서 정의되어 있다고 가정
# compute_metrics 는 macro_f1 을 반환하고 있어야 함:
# return {"accuracy": acc, "macro_f1": macro_f1}

def train_final_model(params, output_dir):
    lr = params["learning_rate"]
    dr = params["dropout"]
    wp = params["warmup_ratio"]
    bs = params["batch_size"]

    # fresh model 로드
    model = AutoModelForSequenceClassification.from_pretrained(
        pretrained_dir,
        num_labels=6,
    )

    # dropout 적용
    model.config.hidden_dropout_prob = dr
    model.config.attention_probs_dropout_prob = dr
    model.config.classifier_dropout = dr

    training_args = TrainingArguments(
        output_dir=output_dir,
        num_train_epochs=10,                 # 최대 epoch 상한
        per_device_train_batch_size=bs,
        per_device_eval_batch_size=bs,
        learning_rate=lr,
        warmup_ratio=wp,

        fp16=True,
        gradient_accumulation_steps=2,
        evaluation_strategy="epoch",
        save_strategy="epoch",
        save_total_limit=1,
        load_best_model_at_end=True,
        metric_for_best_model="macro_f1",
        logging_steps=100,
        report_to="none",
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_datasets["train"],
        eval_dataset=tokenized_datasets["validation"],
        tokenizer=tokenizer,
        compute_metrics=compute_metrics
    )

    trainer.train()

    print("\n=== 최종 학습 완료 ===")
    print(f"output_dir: {output_dir}")
    print(f"best_model_checkpoint: {trainer.state.best_model_checkpoint}")
    print(f"best_metric (val macro_f1): {trainer.state.best_metric}")

    return trainer


In [None]:
# 파라미터 두 종류 준비
# best_params : top5에서 최빈값 기반
# top1_params : f1이 가장 높은 1개 조합 (top5 中 index 0)
top1_params = top5_results[0]["params"]

print("\n[선택된 파라미터]")
print("vote(best_params):", best_params)
print("top1_params      :", top1_params)

# 두 번 학습
trainer_vote = train_final_model(best_params, output_dir="./results_hybrid_final_vote")
trainer_top1 = train_final_model(top1_params, output_dir="./results_hybrid_final_top1")

# 성능 비교
best_vote = trainer_vote.state.best_metric
best_top1 = trainer_top1.state.best_metric

print("\n=== 최종 비교 (val macro_f1 기준) ===")
print(f"vote 기반 best macro_f1 : {best_vote}")
print(f"top1 기반 best macro_f1 : {best_top1}")

if best_vote is not None and best_top1 is not None:
    if best_vote > best_top1:
        print("→ vote 기반 하이퍼파라미터가 더 우위.")
    elif best_vote < best_top1:
        print("→ top1 기반 하이퍼파라미터가 더 우위.")
    else:
        print("→ 두 설정의 best macro_f1 이 동일.")