匯入所需的函式庫（library）

In [None]:
from pathlib import Path
import os
import sys
from llama_recipes.utils import get_preprocessed_dataset
from llama_recipes.configs.datasets import samsum_dataset

import torch
from transformers import LlamaForCausalLM, LlamaTokenizer
from transformers import TrainerCallback
from contextlib import nullcontext
from transformers import default_data_collator, Trainer, TrainingArguments
import datasets

from llama_recipes.datasets.utils import Concatenator
import argparse

Constant 參數設置

In [None]:
def parse_arg(argv=None):
    """Parse the command-line options of the fine-tuning script.

    Args:
        argv: Optional list of argument strings to parse. ``None`` (the
            default) makes argparse read ``sys.argv[1:]``. Pass an explicit
            list (e.g. ``[]`` to take every default) when the process's own
            argv is not meant for this parser.

    Returns:
        argparse.Namespace with ``base_mdl``, ``output_dir``,
        ``train_eval_split_num`` (always an ``int``) and ``datasetfile_path``.
    """
    parser = argparse.ArgumentParser(description="這是一個簡單的命令行程式")

    # Base checkpoint to start tuning from, and where the result is written.
    parser.add_argument('--base_mdl', help='從哪個model開始tune', default="/mnt/External/Seagate/FedASR/LLaMa2/7B_hf/")
    parser.add_argument('--output_dir', help='', default="tmp/Tuned-MeduAD")
    # type=int: argparse leaves command-line values as strings unless told
    # otherwise, which would make a user-supplied value differ in type from
    # the integer default.
    parser.add_argument('--train_eval_split_num', type=int, help='eval data從地幾個號碼開始缺', default=200)
    parser.add_argument('--datasetfile_path', help='', default="./train-00000-of-00001-4401d00b2bdd1863.parquet")

    return parser.parse_args(argv)

# Parse the command line once; the names below are used by the later cells.
args = parse_arg()

# Destination for the tuned model and the base checkpoint to start from.
output_dir = args.output_dir
model_id = args.base_mdl

讀入模型

可能要一兩分鐘

In [None]:
# Tokenizer and weights are both loaded from the checkpoint at `model_id`.
tokenizer = LlamaTokenizer.from_pretrained(model_id)

# 8-bit loading with automatic device placement; float16 is passed as the
# torch dtype.
model = LlamaForCausalLM.from_pretrained(
    model_id,
    load_in_8bit=True,
    device_map="auto",
    torch_dtype=torch.float16,
)

讀入資料庫

這個例子選用的是 MeQuAD 資料庫

In [None]:

import pandas as pd
from datasets import Dataset

# The dataset location comes from the --datasetfile_path option, so the
# path that is named here is the one that is actually read (no
# machine-specific path baked into the script).
parquet_file_path = args.datasetfile_path

# Read the Parquet file with pandas, then wrap the frame as a Dataset.
df = pd.read_parquet(parquet_file_path)
MeQuADdataset = Dataset.from_pandas(df)

處理MeQuAD資料庫
* 設定prompt template
* apply_prompt_template: 把每個sample都塞進prompt template
* 把prompt用tokenizer轉成數字
* chunk: 把過長的切成下一個sample

In [None]:
def process_MeQuAD_dataset(MeQuADdataset, tokenizer, split='train'):
    """Turn the raw question/answer table into tokenized training samples.

    Args:
        MeQuADdataset: Dataset whose rows expose "Questions" and "Answers".
        tokenizer: Callable tokenizer; its ``eos_token`` is appended to
            every rendered prompt.
        split: Accepted for interface compatibility; not used by the body.

    Returns:
        The dataset after prompt rendering, tokenization and a final batched
        pass through ``Concatenator()`` (presumably packs the token
        sequences into chunks -- confirm against llama_recipes).
    """
    template = (
        f"Answer this question:\n{{question}}\n---\nAnswer:\n{{answer}}{{eos_token}}"
    )

    def _render(row):
        # One row in, one fully formatted prompt string out.
        rendered = template.format(
            question=row["Questions"],
            answer=row["Answers"],
            eos_token=tokenizer.eos_token,
        )
        return {"text": rendered}

    def _tokenize(batch):
        return tokenizer(batch["text"])

    # Step 1: replace the original columns with the single "text" column.
    source_columns = list(MeQuADdataset.features)
    text_dataset = MeQuADdataset.map(_render, remove_columns=source_columns)

    # Step 2: tokenize in batches and drop the now-redundant "text" column.
    token_dataset = text_dataset.map(
        _tokenize,
        batched=True,
        remove_columns=list(text_dataset.features),
    )

    # Step 3: batched post-processing by Concatenator.
    return token_dataset.map(Concatenator(), batched=True)

# Build the tokenized corpus from the raw question/answer table.
MeQuAD_dataset = process_MeQuAD_dataset(
    MeQuADdataset,
    tokenizer,
    split='train',
)


把資料庫切成 Train 跟 test（目前的程式只取前 train_eval_split_num 筆作為 train_dataset，尚未建立 test/eval 資料集）

In [None]:
# Take the split point from the --train_eval_split_num option (default 200)
# instead of a second hard-coded copy that ignored it. int() keeps the slice
# valid even if the parsed value arrives as a string.
train_eval_split_num = int(args.train_eval_split_num)

# The first `train_eval_split_num` samples form the training set; slicing
# the range clamps automatically when the dataset is shorter than that.
train_dataset = MeQuAD_dataset.select(
    range(len(MeQuAD_dataset))[:train_eval_split_num]
)

Finetune 本身

Finetune一律都用這些固定參數就好了，先盡量不要動這一塊

In [None]:

# Switch the base model to training mode before the adapters are attached.
model.train()


def create_peft_config(model):
    """Wrap `model` with LoRA adapters for causal-LM fine-tuning.

    Args:
        model: The quantized base model to prepare and wrap.

    Returns:
        Tuple ``(peft_model, peft_config)``: the model returned by
        ``get_peft_model`` and the ``LoraConfig`` used to build it.

    Raises:
        ImportError: If peft provides neither
            ``prepare_model_for_kbit_training`` nor the legacy
            ``prepare_model_for_int8_training``.
    """
    from peft import (
        get_peft_model,
        LoraConfig,
        TaskType,
    )

    # `prepare_model_for_int8_training` was deprecated and later removed from
    # PEFT in favour of `prepare_model_for_kbit_training`; prefer the new
    # name and fall back to the old one on older installations.
    try:
        from peft import prepare_model_for_kbit_training as prepare_quantized_model
    except ImportError:
        from peft import prepare_model_for_int8_training as prepare_quantized_model

    peft_config = LoraConfig(
        task_type=TaskType.CAUSAL_LM,
        inference_mode=False,
        r=8,
        lora_alpha=32,
        lora_dropout=0.05,
        target_modules=["q_proj", "v_proj"],  # adapters on these projections only
    )

    # Prepare the quantized model for training, then attach the adapters.
    model = prepare_quantized_model(model)
    model = get_peft_model(model, peft_config)
    model.print_trainable_parameters()
    return model, peft_config


# From here on `model` is the PEFT-wrapped model.
model, lora_config = create_peft_config(model)


# Run settings. 'lora_config' is stored alongside the hyperparameters but is
# filtered out before the remaining entries are passed to TrainingArguments.
config = {
    'lora_config': lora_config,
    'learning_rate': 1e-4,
    'num_train_epochs': 1,
    'gradient_accumulation_steps': 2,
    'per_device_train_batch_size': 2,
    'gradient_checkpointing': False,
}

# Set to True to run training under the torch profiler.
enable_profiler = False

# Profiler wiring: a no-op context when profiling is disabled, otherwise a
# torch profiler plus a Trainer callback that advances it.
if not enable_profiler:
    profiler = nullcontext()
else:
    wait = 1
    warmup = 1
    active = 2
    repeat = 1
    # Also used as the step limit of the training run when profiling.
    total_steps = (wait + warmup + active) * (1 + repeat)
    schedule = torch.profiler.schedule(
        wait=wait,
        warmup=warmup,
        active=active,
        repeat=repeat,
    )
    profiler = torch.profiler.profile(
        schedule=schedule,
        on_trace_ready=torch.profiler.tensorboard_trace_handler(f"{output_dir}/logs/tensorboard"),
        record_shapes=True,
        profile_memory=True,
        with_stack=True,
    )

    class ProfilerCallback(TrainerCallback):
        """Trainer callback that steps the given profiler after each step."""

        def __init__(self, profiler):
            self.profiler = profiler

        def on_step_end(self, *args, **kwargs):
            self.profiler.step()

    profiler_callback = ProfilerCallback(profiler)

# Every entry of `config` except the LoRA settings is passed through as a
# TrainingArguments keyword.
extra_training_kwargs = {
    key: value for key, value in config.items() if key != 'lora_config'
}

# When profiling, stop after the profiler schedule; -1 means no step limit.
step_limit = total_steps if enable_profiler else -1

training_args = TrainingArguments(
    output_dir=output_dir,
    overwrite_output_dir=True,
    bf16=True,  # bfloat16 is requested unconditionally
    # Logging: every 10 steps, under <output_dir>/logs.
    logging_dir=f"{output_dir}/logs",
    logging_strategy="steps",
    logging_steps=10,
    save_strategy="no",
    optim="adamw_torch_fused",
    max_steps=step_limit,
    **extra_training_kwargs,
)

# The profiler hook is registered only when profiling was requested.
active_callbacks = [profiler_callback] if enable_profiler else []

with profiler:
    # Build the Trainer and run the fine-tuning loop inside the profiler
    # context (a no-op context when profiling is off).
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        data_collator=default_data_collator,
        callbacks=active_callbacks,
    )
    trainer.train()

# 

到這邊就已經 finetune 完成了，變數 model 就是 finetune 後的模型。

接下來要把模型儲存到 output_dir。

# Persist the fine-tuned model to `output_dir`.
# NOTE(review): `model` is the PEFT-wrapped model at this point, so this
# presumably writes only the LoRA adapter files -- confirm.
model.save_pretrained(output_dir)