In [34]:
import tensorflow
import numpy as np
from transformers import AutoModelForSequenceClassification, AutoTokenizer, TrainingArguments, Trainer, EarlyStoppingCallback, DataCollatorWithPadding
from datasets import load_metric

## 1. NSMC 데이터 분석 및 Huggingface dataset 구성

In [3]:
# 데이터셋 불러오기
from datasets import load_dataset

# NSMC 데이터셋 로드
nsmc_dataset = load_dataset('nsmc')
print(nsmc_dataset)

Downloading:   0%|          | 0.00/1.36k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/807 [00:00<?, ?B/s]

Using custom data configuration default


Downloading and preparing dataset nsmc/default (download: 18.62 MiB, generated: 20.90 MiB, post-processed: Unknown size, total: 39.52 MiB) to /aiffel/.cache/huggingface/datasets/nsmc/default/1.1.0/bfd4729bf1a67114e5267e6916b9e4807010aeb238e4a3c2b95fbfa3a014b5f3...


  0%|          | 0/2 [00:00<?, ?it/s]

Downloading:   0%|          | 0.00/6.33M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/4.89M [00:00<?, ?B/s]

  0%|          | 0/2 [00:00<?, ?it/s]

0 examples [00:00, ? examples/s]

0 examples [00:00, ? examples/s]

Dataset nsmc downloaded and prepared to /aiffel/.cache/huggingface/datasets/nsmc/default/1.1.0/bfd4729bf1a67114e5267e6916b9e4807010aeb238e4a3c2b95fbfa3a014b5f3. Subsequent calls will reuse this data.


  0%|          | 0/2 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['id', 'document', 'label'],
        num_rows: 150000
    })
    test: Dataset({
        features: ['id', 'document', 'label'],
        num_rows: 50000
    })
})


In [4]:
print(nsmc_dataset['train'][0])

{'id': '9976970', 'label': 0, 'document': '아 더빙.. 진짜 짜증나네요 목소리'}


In [5]:
train = nsmc_dataset['train']
cols = train.column_names
cols

['id', 'document', 'label']

In [6]:
for i in range(5):
    for col in cols:
        print(col, ":", train[col][i])
    print('\n')

id : 9976970
document : 아 더빙.. 진짜 짜증나네요 목소리
label : 0


id : 3819312
document : 흠...포스터보고 초딩영화줄....오버연기조차 가볍지 않구나
label : 1


id : 10265843
document : 너무재밓었다그래서보는것을추천한다
label : 0


id : 9045019
document : 교도소 이야기구먼 ..솔직히 재미는 없다..평점 조정
label : 0


id : 6483659
document : 사이몬페그의 익살스런 연기가 돋보였던 영화!스파이더맨에서 늙어보이기만 했던 커스틴 던스트가 너무나도 이뻐보였다
label : 1




## 2. klue/bert-base model 및 tokenizer 불러오기

In [9]:
# 모델과 토크나이저 불러오기
tokenizer = AutoTokenizer.from_pretrained('klue/bert-base')
model = AutoModelForSequenceClassification.from_pretrained('klue/bert-base', num_labels=2)  # 데이터셋이 이진 분류이므로 2

print(tokenizer)
print(model)

Downloading:   0%|          | 0.00/289 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/425 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/243k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/483k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/125 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/424M [00:00<?, ?B/s]

Some weights of the model checkpoint at klue/bert-base were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.decoder.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized

PreTrainedTokenizerFast(name_or_path='klue/bert-base', vocab_size=32000, model_max_len=512, is_fast=True, padding_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'})
BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(32000, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, 

## 3. 위에서 불러온 tokenizer으로 데이터셋을 전처리하고, model 학습 진행해 보기

In [10]:
# 데이터 전처리 함수
def transform(data):
    return tokenizer(
        data['document'],  
        truncation=True,
        padding='max_length',
        max_length=128,  
        return_token_type_ids=False
    )

In [11]:
# 데이터셋에 전처리 함수 적용
nsmc_dataset = nsmc_dataset.map(transform, batched=True)

# 학습을 위해 'train' 및 'test' 데이터셋을 나누기
train_dataset = nsmc_dataset['train'].shuffle(seed=42).select(range(1000))  # 학습시간 줄이려고 샘플 작게 잡음
test_dataset = nsmc_dataset['test'].shuffle(seed=42).select(range(200))

  0%|          | 0/150 [00:00<?, ?ba/s]

  0%|          | 0/50 [00:00<?, ?ba/s]

In [12]:
# 매핑 시 오류를 확인하기 위해 try-except 사용
try:
    # train 데이터셋에 transform 함수 매핑
    tf_train_dataset = nsmc_dataset['train'].map(transform, batched=True)
    print("매핑 완료: 오류가 발생하지 않았습니다.")
except Exception as e:
    print(f"오류 발생: {e}")

  0%|          | 0/150 [00:00<?, ?ba/s]

매핑 완료: 오류가 발생하지 않았습니다.


In [20]:
# 훈련 인자 설정
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    evaluation_strategy='epoch',
    save_strategy='epoch',
    logging_dir='./logs',
    logging_steps=10
)

# 평가 지표 불러오기
metric = load_metric('accuracy')

def compute_metrics(eval_pred):    
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return metric.compute(predictions=predictions, references=labels)

# 트레이너 초기화
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics
)

# 학습 시작
trainer.train()

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


Downloading:   0%|          | 0.00/1.42k [00:00<?, ?B/s]

The following columns in the training set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: id, document.
***** Running training *****
  Num examples = 1000
  Num Epochs = 3
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 375


Epoch,Training Loss,Validation Loss,Accuracy
1,0.1207,1.074975,0.79
2,0.0043,1.20983,0.805
3,0.0005,1.285302,0.815


The following columns in the evaluation set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: id, document.
***** Running Evaluation *****
  Num examples = 200
  Batch size = 8
Saving model checkpoint to ./results/checkpoint-125
Configuration saved in ./results/checkpoint-125/config.json
Model weights saved in ./results/checkpoint-125/pytorch_model.bin
The following columns in the evaluation set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: id, document.
***** Running Evaluation *****
  Num examples = 200
  Batch size = 8
Saving model checkpoint to ./results/checkpoint-250
Configuration saved in ./results/checkpoint-250/config.json
Model weights saved in ./results/checkpoint-250/pytorch_model.bin
The following columns in the evaluation set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: id, document.
***** Running Evaluatio

TrainOutput(global_step=375, training_loss=0.09171615905314684, metrics={'train_runtime': 87.1884, 'train_samples_per_second': 34.408, 'train_steps_per_second': 4.301, 'total_flos': 197333291520000.0, 'train_loss': 0.09171615905314684, 'epoch': 3.0})

## 4. Fine-tuning을 통하여 모델 성능(accuarcy) 향상시키기

In [22]:
# TrainingArguments 수정
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=5,  # 에포크 수 증가
    per_device_train_batch_size=16,  # 배치 크기 증가
    per_device_eval_batch_size=16,
    evaluation_strategy='epoch',
    save_strategy='epoch',
    learning_rate=3e-5,  # 학습률 조정
    logging_dir='./logs',
    logging_steps=10
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics  
)

# 학습 시작
trainer.train()

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
The following columns in the training set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: id, document.
***** Running training *****
  Num examples = 1000
  Num Epochs = 5
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 315


Epoch,Training Loss,Validation Loss,Accuracy
1,0.0446,1.348969,0.82
2,0.0267,1.599494,0.81
3,0.0002,1.492635,0.82
4,0.0594,1.412945,0.83
5,0.0408,1.417326,0.84


The following columns in the evaluation set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: id, document.
***** Running Evaluation *****
  Num examples = 200
  Batch size = 16
Saving model checkpoint to ./results/checkpoint-63
Configuration saved in ./results/checkpoint-63/config.json
Model weights saved in ./results/checkpoint-63/pytorch_model.bin
The following columns in the evaluation set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: id, document.
***** Running Evaluation *****
  Num examples = 200
  Batch size = 16
Saving model checkpoint to ./results/checkpoint-126
Configuration saved in ./results/checkpoint-126/config.json
Model weights saved in ./results/checkpoint-126/pytorch_model.bin
The following columns in the evaluation set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: id, document.
***** Running Evaluation

TrainOutput(global_step=315, training_loss=0.013433332489766477, metrics={'train_runtime': 166.3325, 'train_samples_per_second': 30.06, 'train_steps_per_second': 1.894, 'total_flos': 328888819200000.0, 'train_loss': 0.013433332489766477, 'epoch': 5.0})

In [24]:
# 데이터셋 크기 조정...
train_dataset = nsmc_dataset['train'].shuffle(seed=42).select(range(10000))  
test_dataset = nsmc_dataset['test'].shuffle(seed=42).select(range(2000)) 

Loading cached shuffled indices for dataset at /aiffel/.cache/huggingface/datasets/nsmc/default/1.1.0/bfd4729bf1a67114e5267e6916b9e4807010aeb238e4a3c2b95fbfa3a014b5f3/cache-9c452e20bc4f5fcc.arrow
Loading cached shuffled indices for dataset at /aiffel/.cache/huggingface/datasets/nsmc/default/1.1.0/bfd4729bf1a67114e5267e6916b9e4807010aeb238e4a3c2b95fbfa3a014b5f3/cache-92f2a57afd4479ec.arrow


In [28]:
trainer.train_dataset = train_dataset
trainer.eval_dataset = test_dataset

trainer.train()

The following columns in the training set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: id, document.
***** Running training *****
  Num examples = 10000
  Num Epochs = 5
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 3125


Epoch,Training Loss,Validation Loss,Accuracy
1,1.6594,1.332021,0.8415
2,1.179,1.332021,0.8415
3,0.981,1.332021,0.8415
4,0.9544,1.332021,0.8415
5,1.1296,1.332021,0.8415


The following columns in the evaluation set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: id, document.
***** Running Evaluation *****
  Num examples = 2000
  Batch size = 16
Saving model checkpoint to ./results/checkpoint-625
Configuration saved in ./results/checkpoint-625/config.json
Model weights saved in ./results/checkpoint-625/pytorch_model.bin
The following columns in the evaluation set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: id, document.
***** Running Evaluation *****
  Num examples = 2000
  Batch size = 16
Saving model checkpoint to ./results/checkpoint-1250
Configuration saved in ./results/checkpoint-1250/config.json
Model weights saved in ./results/checkpoint-1250/pytorch_model.bin
The following columns in the evaluation set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: id, document.
***** Running Ev

TrainOutput(global_step=3125, training_loss=1.234233298034668, metrics={'train_runtime': 1157.7422, 'train_samples_per_second': 43.188, 'train_steps_per_second': 2.699, 'total_flos': 3293097968885760.0, 'train_loss': 1.234233298034668, 'epoch': 5.0})

In [33]:
# 2차 대공사
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=10,  # 에포크 수 증가
    per_device_train_batch_size=32,  # 배치 크기 증가
    per_device_eval_batch_size=32,
    evaluation_strategy='epoch',
    save_strategy='epoch',
    load_best_model_at_end=True,  # Best model 로드 옵션 추가
    metric_for_best_model='accuracy',  # accuracy 기준 stop
    learning_rate=1e-5,  # 더 낮은 학습률
    logging_dir='./logs',
    logging_steps=10,
)

# EarlyStoppingCallback 추가
early_stopping_callback = EarlyStoppingCallback(
    early_stopping_patience=3,  # 개선되지 않은 에포크가 3회 발생하면 조기 종료
    early_stopping_threshold=0.01  # 최소 개선 폭 설정 (선택 사항)
)

# Trainer 초기화 시 콜백 추가
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics,
    callbacks=[early_stopping_callback]  # Early Stopping 콜백 추가
)

# 학습 시작
trainer.train()

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
The following columns in the training set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: id, document.
***** Running training *****
  Num examples = 10000
  Num Epochs = 10
  Instantaneous batch size per device = 32
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 1
  Total optimization steps = 3130


Epoch,Training Loss,Validation Loss,Accuracy
1,0.3336,0.342181,0.8635
2,0.2436,0.364103,0.8655
3,0.1748,0.383504,0.872
4,0.1379,0.457265,0.876


The following columns in the evaluation set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: id, document.
***** Running Evaluation *****
  Num examples = 2000
  Batch size = 32
Saving model checkpoint to ./results/checkpoint-313
Configuration saved in ./results/checkpoint-313/config.json
Model weights saved in ./results/checkpoint-313/pytorch_model.bin
The following columns in the evaluation set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: id, document.
***** Running Evaluation *****
  Num examples = 2000
  Batch size = 32
Saving model checkpoint to ./results/checkpoint-626
Configuration saved in ./results/checkpoint-626/config.json
Model weights saved in ./results/checkpoint-626/pytorch_model.bin
The following columns in the evaluation set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: id, document.
***** Running Evalu

TrainOutput(global_step=1252, training_loss=0.2354513618369072, metrics={'train_runtime': 873.0536, 'train_samples_per_second': 114.541, 'train_steps_per_second': 3.585, 'total_flos': 2631110553600000.0, 'train_loss': 0.2354513618369072, 'epoch': 4.0})

## 5. Bucketing을 적용하여 학습시키고, 4의 결과와 비교

1. Data Collator는 배치 단위로 데이터를 처리할 때 샘플을 모델 입력으로 변환하고 패딩을 추가하는 클래스임. 고정 길이 패딩이나 동적 패딩을 지원함. 동적 패딩은 각 배치에서 가장 긴 입력 시퀀스에 맞게 패딩하여 불필요한 메모리 사용을 줄이는 방법. -> transformers 라이브러리의 DataCollatorWithPadding 클래스를 사용하여 적용
(참고 링크 : https://huggingface.co/docs/transformers/v4.30.0/en/main_classes/data_collator)

2. group_by_length는 모델 학습 시 비슷한 길이의 입력 시퀀스를 함께 배치로 묶는 Bucketing을 구현함.이 옵션을 사용하면 각 배치의 입력 길이 차이가 최소화되어, 각 배치에서 필요한 패딩의 양이 줄어들어 패딩 효율성을 높이고 학습 속도 개선, 메모리 절약 효과 있음
(참고 링크 : https://huggingface.co/docs/transformers/main_classes/trainer#transformers.TrainingArguments)

In [35]:
# Data Collator with dynamic padding 설정
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# TrainingArguments 수정 (bucketing 적용)
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=10,  # 에포크 수 증가
    per_device_train_batch_size=32,  # 배치 크기 증가
    per_device_eval_batch_size=32,
    evaluation_strategy='epoch',
    save_strategy='epoch',
    load_best_model_at_end=True,  # Best model 로드 옵션 추가
    metric_for_best_model='accuracy',  # EarlyStopping에서 사용할 지표
    learning_rate=1e-5,  # 더 낮은 학습률 사용
    logging_dir='./logs',
    logging_steps=10,
    group_by_length=True  # Bucketing 적용
)

# EarlyStoppingCallback 추가
early_stopping_callback = EarlyStoppingCallback(
    early_stopping_patience=3,  # 개선되지 않은 에포크가 3회 발생하면 조기 종료
)

# Trainer 초기화 시 data_collator 및 callback 추가
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics,
    data_collator=data_collator,  # 동적 패딩 적용
    callbacks=[early_stopping_callback]  # Early Stopping 콜백 추가
)

# 학습 시작
trainer.train()

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
The following columns in the training set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: id, document.
***** Running training *****
  Num examples = 10000
  Num Epochs = 10
  Instantaneous batch size per device = 32
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 1
  Total optimization steps = 3130


Epoch,Training Loss,Validation Loss,Accuracy
1,0.0585,0.640007,0.8665
2,0.0303,0.826058,0.859
3,0.0366,0.848997,0.8625
4,0.0076,0.97668,0.8675
5,0.0576,0.915006,0.871
6,0.0573,0.905343,0.8695
7,0.0374,0.977087,0.8655
8,0.0131,0.949328,0.873
9,0.0013,0.967539,0.87
10,0.0936,0.972168,0.87


The following columns in the evaluation set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: id, document.
***** Running Evaluation *****
  Num examples = 2000
  Batch size = 32
Saving model checkpoint to ./results/checkpoint-313
Configuration saved in ./results/checkpoint-313/config.json
Model weights saved in ./results/checkpoint-313/pytorch_model.bin
The following columns in the evaluation set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: id, document.
***** Running Evaluation *****
  Num examples = 2000
  Batch size = 32
Saving model checkpoint to ./results/checkpoint-626
Configuration saved in ./results/checkpoint-626/config.json
Model weights saved in ./results/checkpoint-626/pytorch_model.bin
The following columns in the evaluation set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: id, document.
***** Running Evalu

TrainOutput(global_step=3130, training_loss=0.028202080339514695, metrics={'train_runtime': 2224.9307, 'train_samples_per_second': 44.945, 'train_steps_per_second': 1.407, 'total_flos': 6577776384000000.0, 'train_loss': 0.028202080339514695, 'epoch': 10.0})

비교 : Bucketing 적용 여부에 따라 모델의 성능과 연산 효율성을 비교한 결과, Bucketing을 적용하지 않은 경우가 성능 면에서 더 효율적이었음. Bucketing을 적용하지 않았을 때 최종 Validation Accuracy는 0.876으로, Bucketing을 적용한 경우의 최종 Validation Accuracy인 0.870보다 약간 높았음. 또한, Validation Loss 역시 Bucketing을 적용하지 않은 경우 더 낮게 나타났음. 이는 Bucketing을 적용한 상태에서 모델이 학습 데이터에 과적합될 가능성을 시사함.

Training Loss를 비교해 보면, Bucketing을 적용한 경우 더 낮은 값을 기록했지만, 이는 모델이 학습 데이터에 더 잘 맞춰졌음을 의미하며 과적합의 위험을 높일 수 있음. 반면, Bucketing을 적용하지 않은 경우에는 Early Stopping이 4번째 Epoch에서 작동하여 빠르게 학습을 마쳤고, Validation Accuracy도 더 높은 수준에서 유지되었음. Bucketing을 적용하지 않은 학습이 더 적은 Epoch 내에 안정적인 성능을 달성했다는 점에서 연산 효율성 측면에서도 이점이 있다고 할 수 있음.

Bucketing의 주요 목적은 비슷한 길이의 시퀀스를 묶어 메모리 효율을 개선하고 학습 속도를 높이는 것임. 그러나 이번 실험에서는 학습 속도에서 눈에 띄는 개선이 없었으며, 이는 데이터셋의 시퀀스 길이가 이미 비슷해 Bucketing의 효과가 제한적이었을 가능성을 시사함.

결론적으로, Bucketing은 모델 학습의 안정성과 메모리 효율성을 개선할 수 있는 방법이지만, 모든 데이터셋에 대해 성능을 향상시키는 것은 아님. 오히려 Bucketing을 적용하지 않은 경우 더 적은 Epoch 내에 효율적인 학습과 높은 성능을 달성할 수 있었음. 향후 Bucketing의 효과를 극대화하기 위해서는 더 다양한 시퀀스 길이를 가진 데이터셋이나 메모리 제약이 큰 환경에서 실험을 시도해보는 것이 좋겠음.

## 회고

느낀점 : 샘플 수 적게 잡아서 시간 아끼려다가 파인튜닝에 고~대로 씀ㅎㅎ... 꾀부리지 말자....... 
배운점 : pre-trained 모델 사용법과 Dlthon 때 개념만 알고 사용하지 못했던 bucketing 사용해봐서 좋았음. 
아쉬운점 : accuracy 목표값 달성 못한 것. 파인튜닝을 잘 못하는듯... 성능을 개선시키려면 어떻게 해야하는지 아직 잘 모르겠다