In [1]:
# LLaMA 2 指令微调（Alpaca-Style on Dolly-15K Dataset）
from datasets import load_dataset
from random import randrange
 
# Load the "train" split of the Dolly-15K dataset from the hub.
dataset = load_dataset(
    path="databricks/databricks-dolly-15k",
    split="train",
)

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Display the dataset summary (feature names and row count); the recorded
# output reports 15011 rows.
dataset

Dataset({
    features: ['instruction', 'context', 'response', 'category'],
    num_rows: 15011
})

In [3]:
# Print one randomly selected sample; the index is drawn with randrange
# over the dataset length.
print(dataset[randrange(len(dataset))])

{'instruction': 'Who is Shoji Hashimoto?', 'context': 'Shoji Hashimoto (橋本 昌二, Hashimoto Shōji, April 18, 1935 – December 2, 2009) was a professional Go player.', 'response': 'Shoji Hashimoto (橋本 昌二, Hashimoto Shōji, April 18, 1935 – December 2, 2009) was a professional Go player.\nHashimoto turned pro in 1947 when he was just 12. It took him only 11 years to reach 9p. He learned Go from his father Hashimoto Kunisaburō and his disciples include Takahara Shūji, Moriyama Naoki, Oda Hiromitsu, Okahashi Hirotada, and Hayashi Kōzō. He was a member of the Kansai Ki-in.', 'category': 'information_extraction'}


In [4]:
# 以 Alpaca-Style 格式化指令数据
# Alpaca-style 格式：https://github.com/tatsu-lab/stanford_alpaca#data-release
def format_instruction(sample_data):
    """
    Build an Alpaca-style prompt string from a single dataset sample.

    The sample's 'response' text is placed under the "### Input:" heading and
    its 'instruction' text under the "### Response:" heading, matching the
    fixed task description asking for an instruction that could have
    produced the input.

    Parameters:
    sample_data (dict): A mapping containing 'response' and 'instruction' keys.
        Any other keys are ignored.

    Returns:
    str: The formatted prompt with Instruction, Input and Response sections.

    Raises:
    KeyError: If 'response' or 'instruction' is missing. Failing loudly keeps
        a placeholder error message from being returned as if it were a valid
        prompt (and, when used as a trainer formatting function, from ending
        up in the training text).
    """
    # Collect every missing key so the error message is complete in one pass.
    missing = [key for key in ("response", "instruction") if key not in sample_data]
    if missing:
        raise KeyError(
            f"sample_data is missing required key(s): {', '.join(missing)}"
        )

    return f"""### Instruction:
Use the Input below to create an instruction, which could have been used to generate the input using an LLM. 
 
### Input:
{sample_data['response']}
 
### Response:
{sample_data['instruction']}
"""

In [5]:
# Pick a random sample and print it after Alpaca-style formatting.
print(format_instruction(dataset[randrange(len(dataset))]))

### Instruction:
Use the Input below to create an instruction, which could have been used to generate the input using an LLM. 
 
### Input:
Hapalochlaena lunulata is an octopus, so it must have eight arms.
 
### Response:
How many arms does Hapalochlaena lunulata have?



In [6]:
# 使用快速注意力（Flash Attention）加速训练
# 加载模型
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig

# Set use_flash_attention to True only if the hardware supports it and
# flash-attn has been installed successfully.
use_flash_attention = False

# Uncomment to enable flash attention.
# if torch.cuda.get_device_capability()[0] >= 8:
#     from utils.llama_patch import replace_attn_with_flash_attn
#     print("Using flash attention")
#     replace_attn_with_flash_attn()
#     use_flash_attention = True


# LLaMA 2-7B model weights.
# Weights that do not require Meta AI approval:
model_id = "NousResearch/Llama-2-7b-hf"
# Model ID that can be used once Meta AI has approved access:
# model_id = "meta-llama/Llama-2-7b-hf"


# Quantization settings handed to from_pretrained: 4-bit NF4 weights with
# double quantization, computing in bfloat16.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)

# Load the model and the tokenizer.
# force_download is deliberately NOT passed: with force_download=True every
# run re-fetched all checkpoint shards (the recorded run spent about 14
# minutes downloading). Add force_download=True temporarily only when a
# fresh copy of the weights is actually required.
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    quantization_config=bnb_config,
    use_cache=False,
    device_map="auto",
)
model.config.pretraining_tp = 1

# Verify that the model uses flash attention by comparing the docstring of
# the first layer's attention forward with the patched forward's docstring.
if use_flash_attention:
    from utils.llama_patch import forward
    assert model.model.layers[0].self_attn.forward.__doc__ == forward.__doc__, "Model is not using flash attention"


# Reuse the EOS token as the padding token and pad on the right.
tokenizer = AutoTokenizer.from_pretrained(model_id)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

Downloading shards: 100%|██████████| 2/2 [14:27<00:00, 433.51s/it]
Loading checkpoint shards: 100%|██████████| 2/2 [00:04<00:00,  2.12s/it]


In [7]:
# 使用 QLoRA 配置加载 PEFT 模型
from peft import LoraConfig, prepare_model_for_kbit_training, get_peft_model
 
# LoRA adapter settings used for the QLoRA fine-tuning run.
peft_config = LoraConfig(
    task_type="CAUSAL_LM",
    r=16,
    lora_alpha=16,
    lora_dropout=0.1,
    bias="none",
)

# Prepare the loaded model for k-bit training, then wrap it with the LoRA
# adapters described by peft_config.
model = prepare_model_for_kbit_training(model)
qlora_model = get_peft_model(model, peft_config)

In [8]:
# Print the trainable vs. total parameter counts of the wrapped model.
qlora_model.print_trainable_parameters()

trainable params: 8,388,608 || all params: 6,746,804,224 || trainable%: 0.12433454005023165


In [9]:
# 训练超参数
import datetime

# Timestamp (YYYYMMDD_HHMMSS) used to tag this run's output directory.
timestamp = f"{datetime.datetime.now():%Y%m%d_%H%M%S}"

# Demo switch for the training settings; set to False for a real training run.
demo_train = False
output_dir = "models/llama-7-int4-dolly-" + timestamp

In [10]:
from transformers import TrainingArguments
 
# Training hyper-parameters; several values switch on demo_train.
args = TrainingArguments(
    # --- output, checkpointing and logging ---
    output_dir=output_dir,
    save_strategy="steps" if demo_train else "epoch",
    save_steps=10,  # NOTE(review): presumably only used when save_strategy is "steps" - confirm
    logging_steps=10,
    # --- training length ---
    # NOTE(review): a positive max_steps takes precedence over
    # num_train_epochs (the recorded run stopped at global_step=100, about
    # 1.04 epochs) - confirm this cap is intended for non-demo runs.
    num_train_epochs=1 if demo_train else 3,
    max_steps=100,
    # --- batch sizing and memory ---
    # Per the original author: largest batch size that fits in the 16 GB of
    # memory on an Nvidia T4.
    per_device_train_batch_size=3,
    gradient_accumulation_steps=1 if demo_train else 4,
    gradient_checkpointing=True,
    # --- optimizer and learning-rate schedule ---
    optim="paged_adamw_32bit",
    learning_rate=2e-4,
    lr_scheduler_type="constant",
    warmup_ratio=0.03,
    max_grad_norm=0.3,
    # --- numeric precision ---
    bf16=True,
)

In [11]:
# 实例化 SFTTrainer
from trl import SFTTrainer
 
# Maximum sequence length for the training sequences (the recorded output
# shows 1158 training examples generated with this setting and packing on).
max_seq_length = 2048

# NOTE(review): qlora_model is already wrapped with peft_config before this
# point; passing peft_config again looks redundant - confirm how the
# installed trl version treats an already-wrapped model.
trainer = SFTTrainer(
    model=qlora_model,
    args=args,
    train_dataset=dataset,
    formatting_func=format_instruction,
    tokenizer=tokenizer,
    max_seq_length=max_seq_length,
    packing=True,
    peft_config=peft_config,
)

Generating train split: 1158 examples [00:02, 400.95 examples/s]


In [12]:
# Train the model.
trainer.train()



Step,Training Loss
10,1.6143
20,1.3845
30,1.289
40,1.2593
50,1.2542
60,1.2336
70,1.2124
80,1.1938
90,1.198
100,1.2266




TrainOutput(global_step=100, training_loss=1.2865844917297364, metrics={'train_runtime': 47106.522, 'train_samples_per_second': 0.025, 'train_steps_per_second': 0.002, 'total_flos': 9.75529410822144e+16, 'train_loss': 1.2865844917297364, 'epoch': 1.04})

In [13]:
# Save the trained model through the trainer (no explicit output path is
# passed here).
trainer.save_model()