### 데이터 셋 준비
데이터셋 타입 json

In [1]:
from datasets import load_dataset

# System prompt used as the content of the "system" turn in every conversation
system_message = (
    "You are the AI assistant created by BC Card to be helpful and honest. "
    "Your knowledge spans a wide range of topics, allowing you to engage in "
    "substantive conversations and provide analysis on complex subjects."
)

# Load the "beomi/KoAlpaca-v1.1a" dataset by its hub id
dataset = load_dataset("beomi/KoAlpaca-v1.1a")


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Inspect the raw data: each row currently has instruction, output and url columns
dataset

DatasetDict({
    train: Dataset({
        features: ['instruction', 'output', 'url'],
        num_rows: 21155
    })
})

In [3]:
# Inspect the train split on its own
dataset["train"]

Dataset({
    features: ['instruction', 'output', 'url'],
    num_rows: 21155
})

In [4]:
# Capture the original column names only (these columns are dropped in a later cell)
columns_to_remove = list(dataset["train"].features)

In [5]:
# Convert every row into system / user / assistant chat messages.
# Single quotes and double quotes are removed and non-breaking spaces become
# regular spaces in the instruction and output text.
#
# NOTE: the non-breaking space escape is '\xa0'. The previous '\0xa0' was a NUL
# character followed by the literal text "xa0", so it never matched an NBSP.
_TEXT_CLEANUP_TABLE = str.maketrans({"'": None, '"': None, "\xa0": " "})

train_dataset = dataset.map(
    lambda sample: {
        "messages": [
            {"role": "system", "content": system_message},
            {"role": "user", "content": sample["instruction"].translate(_TEXT_CLEANUP_TABLE)},
            {"role": "assistant", "content": sample["output"].translate(_TEXT_CLEANUP_TABLE)},
        ]
    },
)

In [6]:
# Inspect the mapped dataset (a "messages" column has been added)
train_dataset

DatasetDict({
    train: Dataset({
        features: ['instruction', 'output', 'url', 'messages'],
        num_rows: 21155
    })
})

In [7]:
# Keep only the "messages" column by dropping the original columns.
# map() is called without a function here, purely for its remove_columns argument.
train_dataset = train_dataset.map(remove_columns=columns_to_remove, batched=False)

In [8]:
# Inspect the result: only the "messages" column should remain
train_dataset

DatasetDict({
    train: Dataset({
        features: ['messages'],
        num_rows: 21155
    })
})

In [9]:
# Write the train split to train_dataset.json as records, without ASCII-escaping
# non-ASCII text. The split cell that follows reads this file one JSON object per line.
train_dataset["train"].to_json("train_dataset.json", orient="records", force_ascii=False)

Creating json from Arrow format:   0%|          | 0/22 [00:00<?, ?ba/s]

Creating json from Arrow format: 100%|██████████| 22/22 [00:00<00:00, 24.01ba/s]


28286290

## test 데이터

In [41]:
import json
import pathlib
import random

# Split train_dataset.json (one JSON object per line) into a training file and
# a held-out test file.
#
# NOTE: train_dataset.json is overwritten in place with the reduced training
# set, so running this cell a second time splits the already-reduced data again.

root = pathlib.Path(".")           # must be the same directory as dataset_path
train_path = root / "train_dataset.json"
test_path = root / "test_dataset.json"

test_ratio = 0.1                   # hold out 10 % of the records for evaluation
seed = 42
random.seed(seed)

# The records contain non-ASCII text written with ensure_ascii=False, so the
# encoding is pinned to UTF-8 instead of relying on the platform default.
with train_path.open(encoding="utf-8") as f:
    # Blank lines (e.g. a trailing newline at the end of the file) are skipped.
    all_records = [json.loads(line) for line in f if line.strip()]

random.shuffle(all_records)
split_idx = int(len(all_records) * (1 - test_ratio))

train_records = all_records[:split_idx]
test_records = all_records[split_idx:]

for out_path, out_records in ((train_path, train_records), (test_path, test_records)):
    with out_path.open("w", encoding="utf-8") as f:
        f.writelines(json.dumps(r, ensure_ascii=False) + "\n" for r in out_records)

print(f"👉 train: {len(train_records)}개, test: {len(test_records)}개 저장 완료")

👉 train: 19039개, test: 2116개 저장 완료


## FSDP

In [28]:
import logging
from dataclasses import dataclass, field
import os
import random
import torch
from datasets import load_dataset
from transformers import AutoTokenizer, TrainingArguments
from trl.cli import  TrlParser # 커맨드라인 인자 파싱 유틸

from transformers import(
    AutoModelForCausalLM,
    BitsAndBytesConfig, # 양자화 설정
    set_seed, # 재현성 고정
)
from trl import setup_chat_format # 대화 포맷 템플릿 세팅 
from peft import LoraConfig

from trl import(
    SFTTrainer # Supervised Fine-Tuning 전용
)

# Comment in if you want to use the Llama 3 instruct template but make sure to add modules_to_save
# LLAMA_3_CHAT_TEMPLATE="{% set loop_messages = messages %}{% for message in loop_messages %}{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' %}{% if loop.index0 == 0 %}{% set content = bos_token + content %}{% endif %}{{ content }}{% endfor %}{% if add_generation_prompt %}{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}{% endif %}"


# LLAMA_3_CHAT_TEMPLATE = """
#     {% set loop_messages = messages %}
    
#     {% for message in loop_messages %}
#         {%- if loop.index0 == 0 -%}
#             {% set content = bos_token + '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n' + (message['content'] | trim) + '<|eot_id|>' %}
#         {%- else -%}
#             {% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n' + (message['content'] | trim) + '<|eot_id|>' %}
#         {%- endif -%}
#         {{ content }}
#     {% endfor %}
#     {%- if add_generation_prompt -%}
#         {{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}
#     {%- endif -%}
# """


# Anthropic/Vicuna-style conversation template.
# Iterates over the system, user and assistant messages and renders them into
# a single prompt string.
LLAMA_3_CHAT_TEMPLATE = "".join(
    [
        "{% for message in messages %}",
        # The system message is emitted as-is, without a role prefix or eos_token.
        "{% if message['role'] == 'system' %}",
        "{{ message['content'] }}",
        # User and assistant turns get a role prefix and are terminated by eos_token.
        "{% elif message['role'] == 'user' %}",
        "{{ '\n\nHuman: ' + message['content'] +  eos_token }}",
        "{% elif message['role'] == 'assistant' %}",
        "{{ '\n\nAssistant: '  + message['content'] +  eos_token  }}",
        "{% endif %}",
        "{% endfor %}",
        # Optionally open a new assistant turn for generation.
        "{% if add_generation_prompt %}",
        "{{ '\n\nAssistant: ' }}",
        "{% endif %}",
    ]
)

In [30]:
# 1) ScriptArguments definition
@dataclass
class ScriptArguments:
    """Script-level options that are parsed together with ``TrainingArguments``."""

    # Required: has no default value.
    model_id: str = field(
        metadata=dict(help="Hugging Face model id (예: Bllossom/llama-3-Korean-Bllossom-70B)"),
    )
    # Directory holding train_dataset.json and test_dataset.json.
    dataset_path: str = field(
        default=".",
        metadata=dict(help="path to dataset (train_dataset.json, test_dataset.json 위치)"),
    )
    # Upper bound on the sequence length, also used when packing the dataset.
    max_seq_len: int = field(
        default=2048,
        metadata=dict(help="max sequence length for model and packing of the dataset"),
    )

## Training

In [27]:

def training_function(script_args, training_args):
    """Fine-tune a 4-bit quantized causal LM with LoRA through ``SFTTrainer`` and save it.

    Steps: load the train/test JSON files, render each conversation to a single
    string with the chat template, load the quantized model, train, and save.

    Args:
        script_args: Parsed ``ScriptArguments``; ``model_id``, ``dataset_path``
            and ``max_seq_len`` are read.
        training_args: ``TrainingArguments`` forwarded to the trainer;
            ``gradient_checkpointing`` and ``resume_from_checkpoint`` are also
            read directly here.
    """

    # Dataset
    # >>>>>>>>>>>>>>>>>>>

    train_dataset = load_dataset(
        "json",
        data_files=os.path.join(script_args.dataset_path, "train_dataset.json"),
        split="train",
    )

    # A single data file is exposed under the "train" split name, hence
    # split="train" for the test file as well.
    test_dataset = load_dataset(
        "json",
        data_files=os.path.join(script_args.dataset_path, "test_dataset.json"),
        split="train",
    )

    # Tokenizer
    # >>>>>>>>>>>>>>>>>>>
    tokenizer = AutoTokenizer.from_pretrained(script_args.model_id, use_fast=True)
    tokenizer.pad_token = tokenizer.eos_token
    tokenizer.chat_template = LLAMA_3_CHAT_TEMPLATE

    # template dataset -> render the JSON "messages" list into one formatted string
    def template_dataset(examples):
        # tokenize=False returns the rendered text. The previous keyword was
        # misspelled as `tokenizer=False`, which left tokenization switched on
        # and stored token ids in the "text" column.
        return {"text": tokenizer.apply_chat_template(examples["messages"], tokenize=False)}

    train_dataset = train_dataset.map(template_dataset, remove_columns=["messages"])
    test_dataset = test_dataset.map(template_dataset, remove_columns=["messages"])

    # print random sample
    with training_args.main_process_first(desc="Log a few random samples from the processed training set"):
        for index in random.sample(range(len(train_dataset)), 2):
            print(train_dataset[index]["text"])

    # Model
    # >>>>>>>>>>>>>>>>>>>

    # Quantization dtypes
    torch_dtype = torch.bfloat16
    quant_storage_dtype = torch.bfloat16

    # 4-bit compressed parameters are restored to bfloat16 for computation
    quantization_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_use_double_quant=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch_dtype,
        bnb_4bit_quant_storage=quant_storage_dtype,
    )

    # Model loader
    model = AutoModelForCausalLM.from_pretrained(
        script_args.model_id,
        quantization_config=quantization_config,
        attn_implementation="sdpa",  # attention implementation (sdpa, flash_attention_2)
        torch_dtype=quant_storage_dtype,
        use_cache=not training_args.gradient_checkpointing,  # this is needed for gradient checkpointing
    )

    if training_args.gradient_checkpointing:
        model.gradient_checkpointing_enable()

    # PEFT
    # >>>>>>>>>>>>>>>>>>>
    # LoRA config based on QLoRA paper & Sebastian Raschka experiment
    peft_config = LoraConfig(
        lora_alpha=8,
        lora_dropout=0.05,
        r=16,
        bias="none",
        target_modules="all-linear",
        task_type="CAUSAL_LM",
        # modules_to_save = ["lm_head", "embed_tokens"] # add if you want to use the Llama 3 instruct template
    )

    # Training
    # >>>>>>>>>>>>>>>>>>>
    trainer = SFTTrainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        dataset_text_field="text",
        eval_dataset=test_dataset,
        peft_config=peft_config,
        # ScriptArguments declares the field as `max_seq_len`; the previous
        # `script_args.max_seq_length` raised AttributeError.
        max_seq_length=script_args.max_seq_len,
        tokenizer=tokenizer,
        packing=True,
        dataset_kwargs={
            "add_special_tokens": False,  # We template with special tokens
            "append_concat_token": False,  # No need to add additional separator token
        },
    )

    if trainer.accelerator.is_main_process:
        trainer.model.print_trainable_parameters()

    # Run the actual fine-tuning. This call was missing before, so the function
    # only saved the freshly initialised adapter.
    trainer.train(resume_from_checkpoint=training_args.resume_from_checkpoint)

    # Under FSDP, switch to a full (unsharded) state dict before saving.
    if trainer.is_fsdp_enabled:
        trainer.accelerator.state.fsdp_plugin.set_state_dict_type("FULL_STATE_DICT")
    trainer.save_model()

In [31]:
if __name__ == "__main__":
    # Build script_args / training_args with TrlParser
    parser = TrlParser((ScriptArguments, TrainingArguments))
    script_args, training_args = parser.parse_args_and_config()    
    
    # Configure gradient checkpointing with use_reentrant=True.
    # NOTE(review): this comment used to read "set use reentrant to False",
    # which contradicted the value assigned below; confirm True is intended.
    if training_args.gradient_checkpointing:
        training_args.gradient_checkpointing_kwargs = {"use_reentrant": True}
    # set seed
    set_seed(training_args.seed)

    # launch training
    training_function(script_args, training_args)

usage: ipykernel_launcher.py [-h] --model_id MODEL_ID
                             [--dataset_path DATASET_PATH]
                             [--max_seq_len MAX_SEQ_LEN]
                             [--output_dir OUTPUT_DIR]
                             [--overwrite_output_dir [OVERWRITE_OUTPUT_DIR]]
                             [--do_train [DO_TRAIN]] [--do_eval [DO_EVAL]]
                             [--do_predict [DO_PREDICT]]
                             [--eval_strategy {no,steps,epoch}]
                             [--prediction_loss_only [PREDICTION_LOSS_ONLY]]
                             [--per_device_train_batch_size PER_DEVICE_TRAIN_BATCH_SIZE]
                             [--per_device_eval_batch_size PER_DEVICE_EVAL_BATCH_SIZE]
                             [--per_gpu_train_batch_size PER_GPU_TRAIN_BATCH_SIZE]
                             [--per_gpu_eval_batch_size PER_GPU_EVAL_BATCH_SIZE]
                             [--gradient_accumulation_steps GRADIENT_ACCUMULATION_STE

SystemExit: 2

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)


In [35]:
import yaml

# Load the training configuration so it can be inspected in the notebook.
# safe_load only builds plain Python containers and scalars, which is all a
# config mapping like this needs; full_load can resolve more YAML tags than
# necessary.
with open("fsdp.yaml", encoding="utf-8") as f:
    file = yaml.safe_load(f)


In [36]:
# Show the parsed configuration
file

{'model_id': 'Bllossom/llama-3-Korean-Bllossom-70B',
 'dataset_path': '.',
 'max_seq_len': 2048,
 'output_dir': './llama-3-korean-70b-hf',
 'report_to': 'tensorboard',
 'learning_rate': 0.0002,
 'lr_scheduler_type': 'constant',
 'num_train_epochs': 1,
 'per_device_train_batch_size': 16,
 'per_device_eval_batch_size': 1,
 'gradient_accumulation_steps': 4,
 'optim': 'adamw_torch',
 'logging_steps': 10,
 'save_strategy': 'epoch',
 'max_grad_norm': 0.3,
 'warmup_ratio': 0.03,
 'bf16': True,
 'tf32': True,
 'gradient_checkpointing': True,
 'fsdp': 'full_shard auto_wrap',
 'fsdp_config': {'backward_prefetch': 'backward_pre',
  'forward_prefetch': 'false',
  'use_orig_params': 'false'}}