In [5]:
# Colab에 필요한 라이브러리를 설치합니다.
# 'accelerate'는 Trainer가 GPU/TPU를 쉽게 사용하도록 도와줍니다.
!pip install transformers datasets accelerate evaluate



In [6]:
import torch
import numpy as np
from datasets import load_dataset#, load_metric # Removed load_metric from datasets
from evaluate import load as load_metric # Imported load_metric from evaluate

from transformers import (
    BertTokenizer,
    AutoModelForSequenceClassification, # ⭐️ 'AutoModel'이 아닌 'AutoModelForSequenceClassification'을 사용
    Trainer,
    TrainingArguments,
    set_seed
)

# 1. 데이터셋 로드
datasets = load_dataset("dair-ai/emotion")
print(datasets)

# 2. 라벨 정보 확인 (총 6개)
label_names = datasets["train"].features["label"].names
num_labels = len(label_names)
print(f"라벨 수: {num_labels}, 라벨: {label_names}")

# 3. GPU/CPU 설정
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"사용할 장치: {DEVICE}")

# - 실험 재현을 위해 시드를 고정한다
set_seed(42)


DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 16000
    })
    validation: Dataset({
        features: ['text', 'label'],
        num_rows: 2000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 2000
    })
})
라벨 수: 6, 라벨: ['sadness', 'joy', 'love', 'anger', 'fear', 'surprise']
사용할 장치: cuda


In [7]:
# - 학습 관련 핵심 값을 한 곳에서 정의한다
NUM_TRAIN_EPOCHS = 3
PER_DEVICE_TRAIN_BATCH_SIZE = 32
PER_DEVICE_EVAL_BATCH_SIZE = 32
LEARNING_RATE = 5e-5
LOGGING_STEPS = 100
OUTPUT_DIR = "./results"

# - 현재 설정을 요약해 확인한다
def describe_run():
    print("=== Trainer 설정 ===")
    print(f"epochs: {NUM_TRAIN_EPOCHS}")
    print(f"train_batch_size: {PER_DEVICE_TRAIN_BATCH_SIZE}")
    print(f"eval_batch_size: {PER_DEVICE_EVAL_BATCH_SIZE}")
    print(f"learning_rate: {LEARNING_RATE}")
    print(f"logging_steps: {LOGGING_STEPS}")
    print(f"output_dir: {OUTPUT_DIR}")
    print(f"num_labels: {num_labels}")
    print(f"device: {DEVICE}")

describe_run()


=== Trainer 설정 ===
epochs: 3
train_batch_size: 32
eval_batch_size: 32
learning_rate: 5e-05
logging_steps: 100
output_dir: ./results
num_labels: 6
device: cuda


In [8]:
MODEL_DIRECTORY = "./mini_bert_7k_hf"
tokenizer = BertTokenizer.from_pretrained(MODEL_DIRECTORY, do_lower_case=True)
def preprocess_function(examples):
    return tokenizer(examples["text"], padding="max_length", truncation=True, max_length=128)

print("커스텀 7k 토크나이저 로드 완료")
tokenized_datasets = datasets.map(preprocess_function, batched=True)
tokenized_datasets.set_format("torch", columns=["input_ids", "attention_mask", "label"])
print("전처리 후 데이터 샘플:")
print(tokenized_datasets["train"][0])


커스텀 7k 토크나이저 로드 완료


Map:   0%|          | 0/16000 [00:00<?, ? examples/s]

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

전처리 후 데이터 샘플:
{'label': tensor(0), 'input_ids': tensor([7003, 7000, 7000, 7000, 7000, 7001, 7002, 7002, 7002, 7002, 7002, 7002,
        7002, 7002, 7002, 7002, 7002, 7002, 7002, 7002, 7002, 7002, 7002, 7002,
        7002, 7002, 7002, 7002, 7002, 7002, 7002, 7002, 7002, 7002, 7002, 7002,
        7002, 7002, 7002, 7002, 7002, 7002, 7002, 7002, 7002, 7002, 7002, 7002,
        7002, 7002, 7002, 7002, 7002, 7002, 7002, 7002, 7002, 7002, 7002, 7002,
        7002, 7002, 7002, 7002, 7002, 7002, 7002, 7002, 7002, 7002, 7002, 7002,
        7002, 7002, 7002, 7002, 7002, 7002, 7002, 7002, 7002, 7002, 7002, 7002,
        7002, 7002, 7002, 7002, 7002, 7002, 7002, 7002, 7002, 7002, 7002, 7002,
        7002, 7002, 7002, 7002, 7002, 7002, 7002, 7002, 7002, 7002, 7002, 7002,
        7002, 7002, 7002, 7002, 7002, 7002, 7002, 7002, 7002, 7002, 7002, 7002,
        7002, 7002, 7002, 7002, 7002, 7002, 7002, 7002]), 'attention_mask': tensor([1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

In [9]:
from transformers import AutoModelForSequenceClassification

MODEL_PATH = "./mini_bert_7k_hf"

# 1. 모델을 로드합니다. (config= 인자를 사용하지 않습니다)
# num_labels만 지정하여 분류(classifier) 헤드를 6개로 새로 초기화합니다.
# 이 단계에서 vocab_size가 맞지 않는다는 경고가 뜰 수 있으나 정상입니다.
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_PATH,
    num_labels=num_labels
)

# 2. 모델 로드 *완료 후*, 토크나이저 크기에 맞게 임베딩 크기를 조정합니다.
# 이 함수가 기존 임베딩 가중치는 유지하고, 추가된 토큰(단어)에 대한
# 임베딩 가중치만 새로(랜덤하게) 초기화합니다.
model.resize_token_embeddings(len(tokenizer))

# 3. (권장) 모델의 config 객체에도 변경된 vocab_size와 pad_token_id를
#    명시적으로 업데이트해줍니다.
model.config.vocab_size = len(tokenizer)
model.config.pad_token_id = tokenizer.pad_token_id

# 4. 모든 설정이 완료된 모델을 DEVICE로 이동시킵니다.
model.to(DEVICE)

print("커스텀 Hugging Face 체크포인트 로드 및 임베딩 리사이징 완료.")

# config를 출력하여 vocab_size와 num_labels가 올바른지 확인합니다.
print(model.config)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at ./mini_bert_7k_hf and are newly initialized: ['bert.embeddings.LayerNorm.bias', 'bert.embeddings.LayerNorm.weight', 'bert.embeddings.position_embeddings.weight', 'bert.embeddings.token_type_embeddings.weight', 'bert.embeddings.word_embeddings.weight', 'bert.encoder.layer.0.attention.output.LayerNorm.bias', 'bert.encoder.layer.0.attention.output.LayerNorm.weight', 'bert.encoder.layer.0.attention.output.dense.bias', 'bert.encoder.layer.0.attention.output.dense.weight', 'bert.encoder.layer.0.attention.self.key.bias', 'bert.encoder.layer.0.attention.self.key.weight', 'bert.encoder.layer.0.attention.self.query.bias', 'bert.encoder.layer.0.attention.self.query.weight', 'bert.encoder.layer.0.attention.self.value.bias', 'bert.encoder.layer.0.attention.self.value.weight', 'bert.encoder.layer.0.intermediate.dense.bias', 'bert.encoder.layer.0.intermediate.dense.weight', 'bert.encoder.layer.0.output.Lay

커스텀 Hugging Face 체크포인트 로드 및 임베딩 리사이징 완료.
BertConfig {
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "dtype": "float32",
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 256,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2",
    "3": "LABEL_3",
    "4": "LABEL_4",
    "5": "LABEL_5"
  },
  "initializer_range": 0.02,
  "intermediate_size": 1024,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1,
    "LABEL_2": 2,
    "LABEL_3": 3,
    "LABEL_4": 4,
    "LABEL_5": 5
  },
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 256,
  "model_type": "bert",
  "num_attention_heads": 4,
  "num_hidden_layers": 6,
  "pad_token_id": 7002,
  "position_embedding_type": "absolute",
  "transformers_version": "4.57.1",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 7005
}



In [10]:
# 1. 사용할 평가 지표 로드 (여기서는 'accuracy')
metric = load_metric("accuracy")

# 2. 'Trainer'가 사용할 평가 함수 정의
def compute_metrics(eval_pred):
    """
    Trainer가 평가 시 호출하는 함수입니다.
    logits과 labels을 받아 accuracy를 계산합니다.
    """
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)

    return metric.compute(predictions=predictions, references=labels)

print("평가 지표(accuracy) 계산 함수 준비 완료.")

Downloading builder script: 0.00B [00:00, ?B/s]

평가 지표(accuracy) 계산 함수 준비 완료.


In [11]:
# - Hugging Face Trainer에 전달할 학습/로깅 옵션을 정의한다
training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    # --- 학습 관련 설정 ---
    num_train_epochs=NUM_TRAIN_EPOCHS,
    per_device_train_batch_size=PER_DEVICE_TRAIN_BATCH_SIZE,
    per_device_eval_batch_size=PER_DEVICE_EVAL_BATCH_SIZE,
    learning_rate=LEARNING_RATE,
    # --- 평가 및 저장 설정 ---
    eval_strategy="epoch",
    save_strategy="epoch",
    # --- 기타 설정 ---
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    logging_steps=LOGGING_STEPS,
    report_to="none"
)

print(f"'{training_args.output_dir}' 폴더에 결과가 저장됩니다.")


'./results' 폴더에 결과가 저장됩니다.


In [12]:
# 1. Trainer 객체 생성
# 학습에 필요한 모든 재료(모델, 설정, 데이터, 토크나이저, 평가함수)를 넣어줍니다.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

# 2. ⭐️ 학습 시작! ⭐️
# 이 한 줄이 Native PyTorch의 긴 '셀 6' 전체를 대체합니다.
print("--- Trainer로 학습 시작 ---")
trainer.train()
print("--- 학습 완료! ---")

  trainer = Trainer(


--- Trainer로 학습 시작 ---


Epoch,Training Loss,Validation Loss,Accuracy
1,1.5817,1.583192,0.351
2,1.5805,1.579163,0.3505
3,1.5871,1.579064,0.3505


--- 학습 완료! ---


In [17]:
from transformers import pipeline

print("--- 학습된 모델로 Softmax 확률 값 예측 ---")

# 1. 'text-classification' 파이프라인 생성
#    trainer.model은 'load_best_model_at_end=True'에 의해
#    가장 accuracy가 높았던 모델입니다.
classifier_pipeline = pipeline(
    "text-classification",
    model=trainer.model,     # ⭐️ 학습 완료된 베스트 모델
    tokenizer=tokenizer,
    device=0 if torch.cuda.is_available() else -1 # 0: GPU, -1: CPU
)

# 2. 테스트할 문장
test_text_1 = "it is so scary"
test_text_2 = "This is so frustrating and makes me angry."

# 3. 예측 실행 (return_all_scores=True로 모든 라벨 확률 받기)
results_1 = classifier_pipeline(test_text_1, return_all_scores=True)
results_2 = classifier_pipeline(test_text_2, return_all_scores=True)

# 4. 결과 출력
def print_results(text, results):
    print(f"\n입력 문장: \"{text}\"")
    print("--- 6개 라벨 Softmax 확률 값 ---")

    # 라벨 이름을 매칭시켜서 보기 좋게 출력
    for res in results[0]:
        label_name = label_names[int(res['label'].split('_')[-1])]
        print(f"{label_name:10}: {res['score']:.4f} ( {res['score']*100:6.2f} % )")

print_results(test_text_1, results_1)
print_results(test_text_2, results_2)

Device set to use cuda:0


--- 학습된 모델로 Softmax 확률 값 예측 ---

입력 문장: "it is so scary"
--- 6개 라벨 Softmax 확률 값 ---
sadness   : 0.3162 (  31.62 % )
joy       : 0.3166 (  31.66 % )
love      : 0.0679 (   6.79 % )
anger     : 0.1278 (  12.78 % )
fear      : 0.1338 (  13.38 % )
surprise  : 0.0376 (   3.76 % )

입력 문장: "This is so frustrating and makes me angry."
--- 6개 라벨 Softmax 확률 값 ---
sadness   : 0.3034 (  30.34 % )
joy       : 0.3290 (  32.90 % )
love      : 0.0705 (   7.05 % )
anger     : 0.1266 (  12.66 % )
fear      : 0.1328 (  13.28 % )
surprise  : 0.0377 (   3.77 % )


# Task
To fix the `CUDA error: device-side assert triggered` during training, I will first modify cell `O7xT05NRb_KL` to set `config.type_vocab_size = 2` before model initialization. After this modification, I will re-run cell `O7xT05NRb_KL` to apply the updated configuration and then re-run cell `JXuxOgMkcHfV` to restart the training process.

## Update model configuration

### Subtask:
Modify cell `O7xT05NRb_KL` to explicitly set `config.type_vocab_size = 2` to ensure the `token_type_embeddings` layer is correctly initialized. This will prevent the `CUDA error: device-side assert triggered` during training.


**Reasoning**:
To fix the CUDA error, I need to update the model configuration in cell `O7xT05NRb_KL` by explicitly setting `config.type_vocab_size = 2` as instructed.



In [None]:
from transformers import BertConfig, AutoModelForSequenceClassification

MODEL_PATH = "./mini_bert_7k_hf"

# Load the base configuration
config = BertConfig.from_pretrained(MODEL_PATH)

# Update the vocab_size and pad_token_id in the config based on the tokenizer's actual values
# The tokenizer's length accounts for all special tokens, including the pad_token_id
config.vocab_size = len(tokenizer)
config.pad_token_id = tokenizer.pad_token_id
# Set num_labels in the config object
config.num_labels = num_labels
config.type_vocab_size = 2 # Added this line as instructed

model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_PATH,
    config=config # Pass the updated config
)

# Explicitly resize model embeddings to match tokenizer vocabulary size
# This is crucial if the tokenizer's vocab_size is different from the pre-trained model's original vocab_size
model.resize_token_embeddings(len(tokenizer))

# Move model to device after resizing embeddings
model.to(DEVICE)

print("커스텀 Hugging Face 체크포인트 로드 완료.")
print(model.config)

**Reasoning**:
The traceback indicates a CUDA error during the `forward` pass in `trainer.train()` when processing `token_type_embeddings`. This suggests that the `token_type_ids` are either missing or incorrectly handled. By explicitly adding `token_type_ids` to the preprocessed dataset, we can ensure they are correctly passed to the model during training.



In [None]:
VOCAB_FILE = "./mini_emotion_tokenizer_7k.txt"
tokenizer = BertTokenizer.from_pretrained(VOCAB_FILE, do_lower_case=True)

def preprocess_function(examples):
    # Ensure token_type_ids are explicitly returned by the tokenizer
    encoding = tokenizer(examples["text"], padding="max_length", truncation=True, max_length=128)
    # The tokenizer often generates token_type_ids automatically, but we ensure it's present.
    # For single-sentence classification, these are typically all zeros.
    if "token_type_ids" not in encoding:
        # Fallback if tokenizer does not provide them; generate all zeros
        encoding["token_type_ids"] = [[0] * len(ids) for ids in encoding["input_ids"]]
    return encoding

print("커스텀 7k 토크나이저 로드 완료")
tokenized_datasets = datasets.map(preprocess_function, batched=True)
tokenized_datasets.set_format("torch", columns=["input_ids", "attention_mask", "token_type_ids", "label"]) # Added token_type_ids
print("전처리 후 데이터 샘플:")
print(tokenized_datasets["train"][0])

**Reasoning**:
The 'CUDA error: device-side assert triggered' persists during `model.to(DEVICE)` even after setting `config.type_vocab_size`. This error often indicates a dimension mismatch, especially in embedding layers. Since `max_length` for tokenization is 128, the model's `max_position_embeddings` must be at least this value to prevent out-of-bounds indexing for position embeddings. Explicitly setting `config.max_position_embeddings = 128` will align the model's capacity with the tokenizer's output, potentially resolving the error.



In [None]:
from transformers import BertConfig, AutoModelForSequenceClassification

MODEL_PATH = "./mini_bert_7k_hf"

# Load the base configuration
config = BertConfig.from_pretrained(MODEL_PATH)

# Update the vocab_size and pad_token_id in the config based on the tokenizer's actual values
# The tokenizer's length accounts for all special tokens, including the pad_token_id
config.vocab_size = len(tokenizer)
config.pad_token_id = tokenizer.pad_token_id
# Set num_labels in the config object
config.num_labels = num_labels
config.type_vocab_size = 2 # Added this line as instructed
config.max_position_embeddings = 128 # Add this line to set max_position_embeddings

model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_PATH,
    config=config # Pass the updated config
)

# Explicitly resize model embeddings to match tokenizer vocabulary size
# This is crucial if the tokenizer's vocab_size is different from the pre-trained model's original vocab_size
model.resize_token_embeddings(len(tokenizer))

# Move model to device after resizing embeddings
model.to(DEVICE)

print("커스텀 Hugging Face 체크포인트 로드 완료.")
print(model.config)

**Reasoning**:
The CUDA error during `model.to(DEVICE)` persists despite previous adjustments to `config.type_vocab_size` and `config.max_position_embeddings`. This low-level error, often related to embedding layer initialization on the GPU, might be caused by an unexpected interaction if the base model was pre-trained with `type_vocab_size=1` (common for single-sentence tasks) while the current configuration attempts to set it to `2`. By explicitly setting `config.type_vocab_size = 1`, we align with the expectation for single-sentence inputs where `token_type_ids` are typically all zeros, potentially resolving the underlying tensor incompatibility on the GPU.



In [None]:
from transformers import BertConfig, AutoModelForSequenceClassification

MODEL_PATH = "./mini_bert_7k_hf"

# Load the base configuration
config = BertConfig.from_pretrained(MODEL_PATH)

# Update the vocab_size and pad_token_id in the config based on the tokenizer's actual values
# The tokenizer's length accounts for all special tokens, including the pad_token_id
config.vocab_size = len(tokenizer)
config.pad_token_id = tokenizer.pad_token_id
# Set num_labels in the config object
config.num_labels = num_labels
config.type_vocab_size = 1 # Changed from 2 to 1
config.max_position_embeddings = 128 # Add this line to set max_position_embeddings

model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_PATH,
    config=config # Pass the updated config
)

# Explicitly resize model embeddings to match tokenizer vocabulary size
# This is crucial if the tokenizer's vocab_size is different from the pre-trained model's original vocab_size
model.resize_token_embeddings(len(tokenizer))

# Move model to device after resizing embeddings
model.to(DEVICE)

print("커스텀 Hugging Face 체크포인트 로드 완료.")
print(model.config)