In [None]:
%%capture
!pip install -q accelerate==0.31.0 peft==0.11.1 bitsandbytes==0.43.1 transformers==4.41.2 trl==0.9.4 sentencepiece==0.2.0 triton==3.1.0

# Supervised Fine-Tuning (SFT)

## Data Preprocessing

In [None]:
from transformers import AutoTokenizer
from datasets import load_dataset

# Download the tokenizer of the TinyLlama chat model from the Hugging Face Hub.
# It is loaded here for the chat template it ships with, not for tokenizing.
template_tokenizer = AutoTokenizer.from_pretrained(
    "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
)


def format_prompt(example):
    """Render one conversation as a single string using the chat template.

    Args:
        example: A dataset row whose ``"messages"`` entry holds the
            conversation to render.

    Returns:
        A dict with one key, ``"text"``, holding the result of
        ``template_tokenizer.apply_chat_template`` called with
        ``tokenize=False`` (i.e. a string rather than token ids).
    """
    return {
        "text": template_tokenizer.apply_chat_template(
            example["messages"], tokenize=False
        )
    }

# Build a small demo subset in one pipeline:
#   1. download the "test_sft" split of UltraChat,
#   2. shuffle it with a fixed seed,
#   3. keep the first 3,000 shuffled rows,
#   4. pass every row through format_prompt, which returns {"text": prompt}.
dataset = (
    load_dataset("HuggingFaceH4/ultrachat_200k", split="test_sft")
    .shuffle(seed=42)
    .select(range(3_000))
    .map(format_prompt)
)

tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/551 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

data/train_sft-00000-of-00003-a3ecf92756(…):   0%|          | 0.00/244M [00:00<?, ?B/s]

data/train_sft-00001-of-00003-0a1804bcb6(…):   0%|          | 0.00/244M [00:00<?, ?B/s]

data/train_sft-00002-of-00003-ee46ed25cf(…):   0%|          | 0.00/244M [00:00<?, ?B/s]

data/test_sft-00000-of-00001-f7dfac4afe5(…):   0%|          | 0.00/81.2M [00:00<?, ?B/s]

data/train_gen-00000-of-00003-a6c9fb894b(…):   0%|          | 0.00/244M [00:00<?, ?B/s]

data/train_gen-00001-of-00003-d6a0402e41(…):   0%|          | 0.00/243M [00:00<?, ?B/s]

data/train_gen-00002-of-00003-c0db75b92a(…):   0%|          | 0.00/243M [00:00<?, ?B/s]

data/test_gen-00000-of-00001-3d4cd830914(…):   0%|          | 0.00/80.4M [00:00<?, ?B/s]

Generating train_sft split:   0%|          | 0/207865 [00:00<?, ? examples/s]

Generating test_sft split:   0%|          | 0/23110 [00:00<?, ? examples/s]

Generating train_gen split:   0%|          | 0/256032 [00:00<?, ? examples/s]

Generating test_gen split:   0%|          | 0/28304 [00:00<?, ? examples/s]

Map:   0%|          | 0/3000 [00:00<?, ? examples/s]

In [None]:
# Show the formatted prompt of one row (index 2576) as a sanity check
print(dataset[2576]["text"])

<|user|>
Given the text: Knock, knock. Who’s there? Hike.
Can you continue the joke based on the given text material "Knock, knock. Who’s there? Hike"?</s>
<|assistant|>
Sure! Knock, knock. Who's there? Hike. Hike who? Hike up your pants, it's cold outside!</s>
<|user|>
Can you tell me another knock-knock joke based on the same text material "Knock, knock. Who's there? Hike"?</s>
<|assistant|>
Of course! Knock, knock. Who's there? Hike. Hike who? Hike your way over here and let's go for a walk!</s>



## Models - Quantization

In [None]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

# Checkpoint to fine-tune: a 1.1B-parameter TinyLlama base model
# (per its name, the intermediate checkpoint at step 1431k / 3T tokens).
model_name = "TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T"

# 4-bit quantization settings - the "Q" in QLoRA
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,  # load weights in 4-bit precision to reduce memory use
    bnb_4bit_use_double_quant=True,  # nested quantization of the quantization constants
    bnb_4bit_quant_type="nf4",  # NF4 (Normal Float 4) quantization type
    bnb_4bit_compute_dtype="float16",  # dtype used for computation
)

# Load the model that will be trained on the GPU
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    # Apply the quantization settings defined above.
    # For regular SFT (no QLoRA) this argument can be left out.
    quantization_config=bnb_config,
    device_map="auto",  # let the library place the model on the available devices
)
model.config.pretraining_tp = 1  # tensor-parallel degree of 1 (no tensor parallelism)
model.config.use_cache = False  # turn off the key/value cache for training

# Load the LLaMA tokenizer that belongs to the base checkpoint
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=False)
tokenizer.padding_side = "left"  # pad on the left side of each sequence
tokenizer.pad_token = "<PAD>"  # token used for padding

config.json:   0%|          | 0.00/560 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/4.40G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/129 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/776 [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]

## Configuration

### LoRA Configuration

In [None]:
from peft import LoraConfig, prepare_model_for_kbit_training, get_peft_model

# LoRA adapter settings
peft_config = LoraConfig(
    r=64,  # rank of the low-rank update matrices
    lora_alpha=32,  # scaling applied to the LoRA update added to the original weights
    lora_dropout=0.1,  # dropout for the LoRA layers
    bias="none",  # bias parameters are not trained
    task_type="CAUSAL_LM",  # causal language modelling
    # Layers that receive an adapter
    target_modules=[
        "k_proj", "gate_proj", "v_proj", "up_proj", "q_proj", "o_proj", "down_proj",
    ],
)

# Prepare the quantized model for k-bit training, then wrap it as a PEFT model
model = get_peft_model(prepare_model_for_kbit_training(model), peft_config)

### Training Configuration

In [None]:
from transformers import TrainingArguments

output_dir = "./results"

# Hyperparameters for the supervised fine-tuning run
training_arguments = TrainingArguments(
    # Directory for training results and checkpoints ("results" under the
    # current working directory).
    output_dir=output_dir,
    num_train_epochs=1,
    per_device_train_batch_size=2,  # samples per device (e.g. per GPU) per step
    gradient_accumulation_steps=4,
    learning_rate=2e-4,  # i.e. 0.0002
    lr_scheduler_type="cosine",  # learning-rate schedule
    optim="paged_adamw_32bit",  # optimizer
    fp16=True,  # train with 16-bit (half precision) floats
    # Gradient checkpointing: do not keep every intermediate activation from
    # the forward pass; recompute part of them during the backward pass.
    # Saves memory at the cost of some extra compute time.
    gradient_checkpointing=True,
    logging_steps=10,  # log training information every 10 steps
)

## Training!

In [None]:
from trl import SFTTrainer

# Assemble the supervised fine-tuning trainer
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    args=training_arguments,
    train_dataset=dataset,
    dataset_text_field="text",  # column produced by format_prompt
    max_seq_length=512,
    # LoRA settings; leave this out for regular SFT
    peft_config=peft_config,
)


Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.


Map:   0%|          | 0/3000 [00:00<?, ? examples/s]

  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


In [None]:
# Train model
trainer.train()

# Save QLoRA weights
trainer.model.save_pretrained("TinyLlama-1.1B-qlora")
# What is saved here is the adapter weights, not a complete model;
# they are merged with the base model in a later cell.

  | |_| | '_ \/ _` / _` |  _/ -_)


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize?ref=models
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33m1449185730[0m ([33m1449185730-sun-yat-sen-university[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


  return fn(*args, **kwargs)


Step,Training Loss
10,1.6702
20,1.4763
30,1.4515
40,1.4885
50,1.4781
60,1.3907
70,1.495
80,1.4502
90,1.4275
100,1.4042




### Merge Adapter

In [None]:
from peft import AutoPeftModelForCausalLM

# AutoPeftModelForCausalLM.from_pretrained() is the PEFT loader for causal
# language models that were fine-tuned with a PEFT method (e.g. LoRA / QLoRA).
model = AutoPeftModelForCausalLM.from_pretrained(
    "TinyLlama-1.1B-qlora",  # directory the adapter was saved to
    device_map="auto",
    low_cpu_mem_usage=True,
)

# Merge LoRA and base model:
#   "merge"  folds the adapter weights into the base model weights,
#   "unload" removes the adapter structure so only the merged model remains.
merged_model = model.merge_and_unload()

### Inference

In [None]:
from transformers import pipeline

# Prompt written in our predefined <|user|> / <|assistant|> template
prompt = """<|user|>
Tell me something about Large Language Models.</s>
<|assistant|>
"""

# Generate with the merged, instruction-tuned model and print the text
pipe = pipeline(model=merged_model, tokenizer=tokenizer, task="text-generation")
generations = pipe(prompt)
print(generations[0]["generated_text"])

<|user|>
Tell me something about Large Language Models.</s>
<|assistant|>
Large Language Models (LLMs) are a type of artificial intelligence (AI) that can generate human-like language. They are trained on large amounts of data, including text, audio, and video, and are capable of generating complex and nuanced language.

LLMs are used in a variety of applications, including natural language processing (NLP), machine translation, and chatbots. They can be used to generate text, speech, or images, and can be trained to understand different languages and dialects.

One of the most significant applications of LLMs is in the field of natural language generation (NLG). LLMs can be used to generate text in a variety of languages, including English, French, and German. They can also be used to generate speech, such as in a chatbot or voice assistant.

LLMs have the potential to revolutionize the way we communicate and interact with each other. They can help us create more engaging and personal

# Preference Tuning (PPO/DPO)

## Data Preprocessing

In [None]:
from datasets import load_dataset

def format_prompt(example):
    """Build the prompt / chosen / rejected strings in the <|user|> template.

    Args:
        example: A row with string fields ``"system"``, ``"input"``,
            ``"chosen"`` (the accepted generation) and ``"rejected"``.

    Returns:
        A dict with three keys:
        ``"prompt"``   - system turn followed by the user turn and the opening
                         ``<|assistant|>`` tag,
        ``"chosen"``   - the chosen answer followed by ``"</s>\\n"``,
        ``"rejected"`` - the rejected answer followed by ``"</s>\\n"``.

    Raises:
        TypeError: If one of the fields is not a string.
    """
    # "\n" is the line break that separates a role tag from its content
    system_turn = "".join(("<|system|>\n", example["system"], "</s>\n"))
    user_turn = "".join(("<|user|>\n", example["input"], "</s>\n<|assistant|>\n"))

    return {
        "prompt": system_turn + user_turn,
        "chosen": "".join((example["chosen"], "</s>\n")),
        "rejected": "".join((example["rejected"], "</s>\n")),
    }

# Load the preference pairs
dpo_dataset = load_dataset("argilla/distilabel-intel-orca-dpo-pairs", split="train")

# Keep only rows that express a clear, high-quality preference, then format
# them. A row is kept when:
#   - it is not a tie (DPO needs an explicit preference),
#   - the chosen answer scored at least 8,
#   - it is not flagged as part of the GSM8K training set (avoids leaking
#     benchmark data into training).
# remove_columns drops every original column so only the columns returned by
# format_prompt remain.
dpo_dataset = dpo_dataset.filter(
    lambda row: (
        row["status"] != "tie"
        and row["chosen_score"] >= 8
        and not row["in_gsm8k_train"]
    )
).map(format_prompt, remove_columns=dpo_dataset.column_names)
dpo_dataset

Dataset({
    features: ['chosen', 'rejected', 'prompt'],
    num_rows: 5922
})

## Models - Quantization

In [None]:
from peft import AutoPeftModelForCausalLM
from transformers import BitsAndBytesConfig, AutoTokenizer

# 4-bit quantization settings - the "Q" in QLoRA
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,  # use 4-bit precision model loading
    bnb_4bit_use_double_quant=True,  # apply nested quantization
    bnb_4bit_quant_type="nf4",  # quantization type
    bnb_4bit_compute_dtype="float16",  # compute dtype
)

# Load the adapter trained in the SFT section (stored in this directory)
# with 4-bit quantization, then merge LoRA and base model.
model = AutoPeftModelForCausalLM.from_pretrained(
    "TinyLlama-1.1B-qlora",
    quantization_config=bnb_config,
    device_map="auto",
    low_cpu_mem_usage=True,
)
merged_model = model.merge_and_unload()

# Load the LLaMA tokenizer of the original base model
model_name = "TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T"
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=False)
tokenizer.padding_side = "left"  # left padding
tokenizer.pad_token = "<PAD>"  # padding token, so batched sequences can share one length



## Configuration

In [None]:
from peft import LoraConfig, prepare_model_for_kbit_training, get_peft_model

# LoRA adapter settings for the DPO stage
# (same values as the LoRA configuration in the SFT section)
peft_config = LoraConfig(
    r=64,  # rank
    lora_alpha=32,  # LoRA scaling
    lora_dropout=0.1,  # dropout for LoRA layers
    bias="none",
    task_type="CAUSAL_LM",
    # Layers to target:
    #   q_proj, k_proj, v_proj        - attention query / key / value projections
    #   o_proj                        - attention output projection
    #   gate_proj, up_proj, down_proj - feed-forward (FFN) projections
    target_modules=[
        "k_proj", "gate_proj", "v_proj", "up_proj", "q_proj", "o_proj", "down_proj",
    ],
)

# prepare model for training
# Attach the new DPO LoRA adapter to `merged_model` (the model returned by
# merge_and_unload() in the previous cell), not to `model`, which at this
# point is still the PEFT wrapper used to load the SFT adapter. Wrapping that
# leftover wrapper again would nest one PEFT model inside another.
# NOTE(review): with the nested wrapper the saved DPO adapter's parameter names
# presumably do not match the plain SFT model it is loaded onto later; the
# identical SFT and DPO generations in this notebook are consistent with the
# adapter not being applied - confirm after re-running.
model = prepare_model_for_kbit_training(merged_model)
# Freeze the base weights and add trainable LoRA adapter layers
model = get_peft_model(model, peft_config)

In [None]:
from trl import DPOConfig

output_dir = "./results"

# Hyperparameters for the DPO run.
# The SFT section also defines `training_arguments`; there the arguments class
# is TrainingArguments (paired with SFTTrainer), here it is DPOConfig.
training_arguments = DPOConfig(
    output_dir=output_dir,
    max_steps=200,
    per_device_train_batch_size=2,
    gradient_accumulation_steps=4,
    learning_rate=1e-5,
    lr_scheduler_type="cosine",
    warmup_ratio=0.1,
    optim="paged_adamw_32bit",
    fp16=True,
    gradient_checkpointing=True,
    logging_steps=10,
)

In [None]:
from trl import DPOTrainer

# Create the DPO trainer (the trainer class for DPO is DPOTrainer)
# NOTE(review): max_prompt_length equals max_length, which reserves no length
# budget for the completion when an over-long example is truncated - confirm
# this is intended.
dpo_trainer = DPOTrainer(
    model=model,
    tokenizer=tokenizer,
    args=training_arguments,
    train_dataset=dpo_dataset,
    peft_config=peft_config,
    beta=0.1,
    max_length=512,
    max_prompt_length=512,
)

# Fine-tune model with DPO
dpo_trainer.train()

# Save the DPO adapter
dpo_trainer.model.save_pretrained("TinyLlama-1.1B-dpo-qlora")


Deprecated positional argument(s) used in DPOTrainer, please use the DPOConfig to set these arguments instead.


Map:   0%|          | 0/5922 [00:00<?, ? examples/s]

  self.scaler = torch.cuda.amp.GradScaler(**kwargs)
max_steps is given, it will override any value given in num_train_epochs
  return fn(*args, **kwargs)
Could not estimate the number of tokens of the input, floating-point operations will not be computed


Step,Training Loss
10,0.692
20,0.6777
30,0.6451
40,0.6059
50,0.5947
60,0.6168
70,0.5932
80,0.5312
90,0.5588
100,0.6394


In [None]:
from peft import PeftModel

# Step 1 - rebuild the SFT model: load the LoRA adapter trained in the SFT
# stage and merge it into its base model.
model = AutoPeftModelForCausalLM.from_pretrained(
    "TinyLlama-1.1B-qlora",
    device_map="auto",
    low_cpu_mem_usage=True,
)
sft_model = model.merge_and_unload()

# Step 2 - load the LoRA adapter trained in the DPO stage on top of the SFT
# model (the new base) and perform the final merge.
dpo_model = PeftModel.from_pretrained(
    sft_model,
    "TinyLlama-1.1B-dpo-qlora",
    device_map="auto",
).merge_and_unload()

In [None]:
from transformers import pipeline

# Prompt written in our predefined <|user|> / <|assistant|> template
prompt = """<|user|>
Tell me something about Large Language Models.</s>
<|assistant|>
"""

# Generate with the merged SFT + DPO model and print the text
pipe = pipeline(model=dpo_model, tokenizer=tokenizer, task="text-generation")
generations = pipe(prompt)
print(generations[0]["generated_text"])

<|user|>
Tell me something about Large Language Models.</s>
<|assistant|>
Large Language Models (LLMs) are a type of artificial intelligence (AI) that can generate human-like language. They are trained on large amounts of data, including text, audio, and video, and are capable of generating complex and nuanced language.

LLMs are used in a variety of applications, including natural language processing (NLP), machine translation, and chatbots. They can be used to generate text, speech, or images, and can be trained to understand different languages and dialects.

One of the most significant applications of LLMs is in the field of natural language generation (NLG). LLMs can be used to generate text in a variety of languages, including English, French, and German. They can also be used to generate speech, such as in a chatbot or voice assistant.

LLMs have the potential to revolutionize the way we communicate and interact with each other. They can help us create more engaging and personal