* flash-attention은 죄가 없음
* fp32가 왜 나오는 걸까

In [1]:
from datasets import load_dataset
from dataclasses import dataclass, field, fields    ## For TrlParser

from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig,
    set_seed
)
from trl import SFTTrainer, SFTConfig, TrlParser, setup_chat_format
from peft import LoraConfig

from sklearn.model_selection import train_test_split

import logging
import torch

import os
import json
import random
import numpy as np

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def _load_json_split(file_name):
    """Load one JSON file from ./data through the "json" loader, requesting split "train"."""
    return load_dataset("json", data_files = os.path.join("./data", file_name), split = "train")


## SFT training and evaluation sets
train_ds = _load_json_split("sft_train_dataset.json")
test_ds = _load_json_split("sft_test_dataset.json")

## Load and configure the tokenizer
## NOTE(review): trust_remote_code = True permits running code shipped with the
## model repository; it may not be required for this checkpoint — confirm.
tokenizer = AutoTokenizer.from_pretrained(
    "meta-llama/Meta-Llama-3.1-8B-Instruct",
    trust_remote_code = True,
    use_fast = True,            ## request the fast tokenizer implementation
)
tokenizer.padding_side = "left"                 ## pad on the left side of each sequence
tokenizer.pad_token = tokenizer.eos_token       ## reuse the EOS token as the padding token

In [3]:
## Jinja chat template, assembled from per-role fragments.
## Fragments are joined with no separator, so the resulting string is exactly
## the concatenation of the pieces below.

## System turn: header, content, end token.
_SYSTEM_BRANCH = (
    "{% if message['role'] == 'system' %}"
    "{{ '<|start_header_id|>system<|end_header_id|>\n\n' + message['content'] + eos_token }}"
)

## User turn: header, content, end token.
_USER_BRANCH = (
    "{% elif message['role'] == 'user' %}"
    "{{ '<|start_header_id|>user<|end_header_id|>\n\n' + message['content'] +  eos_token }}"
)

## Assistant turn: the header stays outside the generation block; only the
## content and its end token are wrapped by {% generation %}.
## NOTE(review): presumably this is what drives the assistant token mask used
## for assistant-only loss — confirm against the TRL / transformers docs.
_ASSISTANT_BRANCH = (
    "{% elif message['role'] == 'assistant' %}"
    "{{ '<|start_header_id|>assistant<|end_header_id|>\n\n'}}"
    "{% generation %}"
    "{{ message['content'] +  eos_token }}"
    "{% endgeneration %}"
)

## Optional trailing assistant header, emitted when add_generation_prompt is set.
_GENERATION_PROMPT = (
    "{%- if add_generation_prompt %}"
    "{{- '<|start_header_id|>assistant<|end_header_id|>\n\n' }}"
    "{%- endif %}"
)

LLAMA_3_CHAT_TEMPLATE = "".join([
    "{{ bos_token }}",
    "{% for message in messages %}",
    _SYSTEM_BRANCH,
    _USER_BRANCH,
    _ASSISTANT_BRANCH,
    "{% endif %}",
    "{% endfor %}",
    _GENERATION_PROMPT,
])

## Replace the tokenizer's chat template with the custom template defined in this notebook.
tokenizer.chat_template = LLAMA_3_CHAT_TEMPLATE

In [4]:
## Check how the chat template renders: print two randomly chosen training
## conversations as plain text (tokenize = False).
print("======== Log a few random samples from the processed training set ========")
sample_indices = random.sample(range(len(train_ds)), 2)
for sample_idx in sample_indices:
    rendered_chat = tokenizer.apply_chat_template(train_ds[sample_idx]["messages"], tokenize = False)
    print(rendered_chat)

<|begin_of_text|><|start_header_id|>user<|end_header_id|>

Read the passage below and choose the right answer to the following question (choices are increases or decreases ):  More UV rays increase skin cancer rates.   Mona lives in a country that experiences high levels of sunlight. If she moves to a less sunny country, what happens to her chance of getting skin cancer increases or decreases?
decreases
(Question)
Read the passage below and choose the right answer to the following question (choices are expanding or contracting ):  When your brain senses that your body temperature is increasing, it sends messages to the blood vessels in the skin to increase in diameter.   Derek felt warm as he stood in the sun, and since his body was heating up, he knew that his blood vessels in his skin were expanding or contracting?
(Answer)
expanding
Ques: Read the passage below and choose the right answer to the following question (choices are directly in front of the speaker or two miles away? ):  

In [5]:
## Quantization settings (bitsandbytes, 4-bit)
bnb_config = BitsAndBytesConfig(
    load_in_4bit = True,                        ## load the weights in 4-bit
    bnb_4bit_quant_type = "nf4",                ## NF4 as the 4-bit data type
    bnb_4bit_use_double_quant = True,           ## enable the nested (double) quantization option
    bnb_4bit_compute_dtype = torch.bfloat16,    ## compute dtype is bfloat16 while storage stays 4-bit
)

## Model loading options, collected in one place and passed through unchanged.
model_load_kwargs = {
    "quantization_config": bnb_config,
    "dtype": torch.bfloat16,                        ## dtype requested for the loaded weights
    "attn_implementation": "flash_attention_2",     ## use the FlashAttention-2 attention backend
    "device_map": "cuda:0",                         ## place the whole model on the first CUDA device
    "low_cpu_mem_usage": True,
    "use_cache": False,                             ## KV cache off (gradient checkpointing is enabled for training)
}

## Load the model
model = AutoModelForCausalLM.from_pretrained(
    "meta-llama/Meta-Llama-3.1-8B-Instruct",
    **model_load_kwargs,
)

Loading checkpoint shards: 100%|██████████| 4/4 [00:09<00:00,  2.44s/it]


> 7800MB

In [6]:
## Turn on gradient checkpointing on the loaded model.
model.gradient_checkpointing_enable()

## Linear layers that receive LoRA adapters, grouped by where they live.
attention_projections = ["q_proj", "k_proj", "v_proj", "o_proj"]
mlp_projections = ["up_proj", "down_proj", "gate_proj"]

## LoRA settings: rank 32, alpha 16, dropout 0.05, no bias training.
peft_config = LoraConfig(
    task_type = "CAUSAL_LM",
    r = 32,
    lora_alpha = 16,
    lora_dropout = 0.05,
    bias = "none",
    target_modules = attention_projections + mlp_projections,
)

In [7]:
## SFT hyperparameters, grouped by topic.
training_args = SFTConfig(
    ## --- outputs / reporting ---
    output_dir = "./results/test",
    report_to = "wandb",
    logging_steps = 100,
    save_strategy = "epoch",
    ## NOTE(review): push_to_hub is enabled even though output_dir looks like a
    ## test run — confirm uploading is intended.
    push_to_hub = True,

    ## --- data / sequence handling ---
    max_length = 1024,
    packing = True,
    assistant_only_loss = True,
    dataloader_num_workers = 4,
    dataset_kwargs = {
        "add_special_tokens": False,
        "append_concat_token": False
    },

    ## --- batch sizes (8 per device x 2 accumulation steps per optimizer update) ---
    per_device_train_batch_size = 8,
    per_device_eval_batch_size = 8,
    gradient_accumulation_steps = 2,

    ## --- optimizer / schedule ---
    optim = "adamw_torch_fused",
    learning_rate = 1e-4,
    weight_decay = 0.01,
    max_grad_norm = 0.5,
    warmup_ratio = 0.06,
    lr_scheduler_type = "cosine_with_restarts",
    lr_scheduler_kwargs = {"num_cycles":3},
    num_train_epochs = 6,

    ## --- evaluation ---
    do_eval = True,
    eval_strategy = "steps",
    eval_steps = 500,

    ## --- precision / memory ---
    bf16 = True,
    tf32 = True,
    gradient_checkpointing = True,
)

## The LoRA config is passed in alongside the base model rather than
## wrapping the model beforehand.
trainer = SFTTrainer(
    model = model,
    processing_class = tokenizer,
    peft_config = peft_config,
    args = training_args,
    train_dataset = train_ds,
    eval_dataset = test_ds,
)

In [8]:
## When assistant-only loss is enabled, show the first 100 entries of the
## assistant mask of the first processed training example.
if training_args.assistant_only_loss:
    print("======== Log a first sample from the processed training set ========")
    first_sample = next(iter(trainer.train_dataset))
    ## Single quotes inside the replacement field: reusing the enclosing double
    ## quotes is only valid syntax from Python 3.12 onwards.
    print(f"masking area: {first_sample['assistant_masks'][:100]} ...")

## Allow an interrupted run to be resumed: stays None unless a checkpoint was
## configured on the training arguments.
checkpoint = training_args.resume_from_checkpoint

masking area: [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] ...


In [9]:
## List every parameter whose dtype is float32.
## NOTE(review): the LoRA adapter weights showing up here is likely expected
## (adapters are commonly kept in float32) — confirm against the PEFT docs.
for param_name, tensor in model.named_parameters():
    if tensor.dtype != torch.float32:
        continue
    print("FP32 parameter:", param_name)

FP32 parameter: model.layers.0.self_attn.q_proj.lora_A.default.weight
FP32 parameter: model.layers.0.self_attn.q_proj.lora_B.default.weight
FP32 parameter: model.layers.0.self_attn.k_proj.lora_A.default.weight
FP32 parameter: model.layers.0.self_attn.k_proj.lora_B.default.weight
FP32 parameter: model.layers.0.self_attn.v_proj.lora_A.default.weight
FP32 parameter: model.layers.0.self_attn.v_proj.lora_B.default.weight
FP32 parameter: model.layers.0.self_attn.o_proj.lora_A.default.weight
FP32 parameter: model.layers.0.self_attn.o_proj.lora_B.default.weight
FP32 parameter: model.layers.0.mlp.gate_proj.lora_A.default.weight
FP32 parameter: model.layers.0.mlp.gate_proj.lora_B.default.weight
FP32 parameter: model.layers.0.mlp.up_proj.lora_A.default.weight
FP32 parameter: model.layers.0.mlp.up_proj.lora_B.default.weight
FP32 parameter: model.layers.0.mlp.down_proj.lora_A.default.weight
FP32 parameter: model.layers.0.mlp.down_proj.lora_B.default.weight
FP32 parameter: model.layers.0.input_layer

In [9]:
## Start training; `checkpoint` is None unless a resume checkpoint was set on
## the training arguments, in which case training continues from it.
trainer.train(resume_from_checkpoint = checkpoint)

The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'eos_token_id': 128009, 'pad_token_id': 128009}.
[34m[1mwandb[0m: Currently logged in as: [33mhollyriver[0m ([33mhollyriver-jbnu[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


  return fn(*args, **kwargs)


Step,Training Loss,Validation Loss


KeyboardInterrupt: 

Error in callback <bound method _WandbInit._post_run_cell_hook of <wandb.sdk.wandb_init._WandbInit object at 0x70603b1a51c0>> (for post_run_cell), with arguments args (<ExecutionResult object at 70603b1d9e80, execution_count=9 error_before_exec=None error_in_exec= info=<ExecutionInfo object at 70603b1da060, raw_cell="trainer.train(resume_from_checkpoint = checkpoint)" transformed_cell="trainer.train(resume_from_checkpoint = checkpoint).." store_history=True silent=False shell_futures=True cell_id=vscode-notebook-cell://ssh-remote%2B113.198.65.146/root/HFRL/SFT_DPO/%EC%88%98%EB%8F%99%20%EB%94%94%EB%B2%84%EA%B9%85.ipynb#X16sdnNjb2RlLXJlbW90ZQ%3D%3D> result=None>,),kwargs {}:


ConnectionResetError: Connection lost

In [None]:
## Save the model through the trainer.
## NOTE(review): the recorded output of this cell is a NameError for
## `training_args`, which suggests it was last run with stale or missing kernel
## state — rerun the notebook top to bottom before relying on it.
trainer.save_model()

NameError: name 'training_args' is not defined