In [13]:
import datasets
from utils.toolkit import set_seed, auto_device
import torch

In [2]:
# Local directory holding the Meta-Llama-3-8B-Instruct checkpoint.
model_path = 'D:/PycharmProjects/llama3_proj/models/Meta-Llama-3-8B-Instruct'

# Project helpers: set_seed presumably seeds the RNGs used below (confirm in
# utils.toolkit); auto_device picks the compute device.
set_seed(12345)
device = auto_device()

In [10]:
from transformers import AutoTokenizer

# Load the tokenizer that ships with the local checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_path)  # NOTE: this llama3 checkpoint was downloaded from modelscope and differs slightly from the HF one
# Reuse the EOS token as the padding token for batch padding.
tokenizer.pad_token = tokenizer.eos_token

def get_processor(tokenizer, shift=False):
    """Build a per-sample preprocessing function for instruction tuning.

    Args:
        tokenizer: object exposing ``encode(text, add_special_tokens=False)``
            that returns a list of token ids.
        shift: when True, drop the last input token and the first label so
            that ``labels[i]`` is the token following ``input_ids[i]``.

    Returns:
        A function mapping a sample with ``instruction``, ``input`` and
        ``output`` fields to a dict with ``input_ids``, ``attention_mask``
        and ``labels`` lists.
    """
    # Label value placed on every prompt position (presumably the ignore
    # index of the downstream loss -- confirm against the model).
    masked_label = -100

    def process_function(sample):
        # Llama-3 chat layout: a user turn followed by an open assistant header.
        user_turn = (
            "<|begin_of_text|><|start_header_id|>user<|end_header_id|>\n\n"
            f"{sample['instruction']}{sample['input']}"
            "<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n"
        )
        prompt_ids = tokenizer.encode(user_turn, add_special_tokens=False)
        answer_ids = tokenizer.encode(f"{sample['output']}<|eot_id|>", add_special_tokens=False)

        token_ids = prompt_ids + answer_ids
        # Only the answer tokens carry real labels; the prompt is masked out.
        targets = [masked_label] * len(prompt_ids) + answer_ids
        if shift:
            token_ids, targets = token_ids[:-1], targets[1:]

        return {
            "input_ids": token_ids,
            "attention_mask": [1] * len(token_ids),
            "labels": targets,
        }

    return process_function

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [11]:
# Per-sample tokenisation; shift=True pre-shifts inputs/labels by one token.
preprocess = get_processor(tokenizer, shift=True)

# Load the JSON file as the "train" split, then tokenise it and drop the
# original columns so only the fields produced by `preprocess` remain.
ds = datasets.load_dataset(
    'json',
    data_files={'train': 'D:/PycharmProjects/open_llm/dataset/huanhuan.json'},
)
raw_train = ds["train"]
train_dataset = raw_train.map(preprocess, remove_columns=raw_train.column_names)

Map:   0%|          | 0/3729 [00:00<?, ? examples/s]

In [12]:
# Inspect the first tokenised sample (input_ids / attention_mask / labels).
print(train_dataset[0])

{'input_ids': [128000, 128006, 882, 128007, 271, 111319, 3922, 64022, 9554, 106241, 58850, 72368, 19000, 32018, 16325, 31867, 3922, 113723, 19361, 100389, 109, 80578, 111319, 101067, 101307, 58843, 224, 104241, 45829, 3922, 86894, 102, 113715, 111419, 41914, 50928, 89151, 89151, 103203, 9554, 8713, 128009, 128006, 78191, 128007, 271, 103001, 246, 8713, 72368, 37687, 104894, 110767, 37687, 105150, 126957, 108298, 9554, 1811], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], 'labels': [-100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, 103001, 246, 8713, 72368, 37687, 104894, 110767, 37687, 105150, 126957, 108298, 9554, 1811, 1

In [14]:
# Inspect the second tokenised sample for comparison.
print(train_dataset[1])

{'input_ids': [128000, 128006, 882, 128007, 271, 103624, 104840, 101402, 103242, 102856, 104587, 21043, 102491, 107297, 3922, 112471, 124375, 101402, 103242, 115820, 105600, 51609, 54253, 18184, 105600, 71869, 116749, 104123, 15225, 101171, 231, 123594, 103429, 102981, 105987, 53901, 3922, 95598, 36827, 103268, 9953, 102924, 104198, 100389, 109, 80578, 102697, 70349, 110774, 1811, 128009, 128006, 78191, 128007, 271, 112022, 11743, 102, 58543, 101402, 43240, 35287, 101602, 76982, 34208, 104840, 101402, 103242, 31634, 15120, 116057, 107634, 3922, 53901, 53901, 71005, 71005, 112022, 1811], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], 'labels': [-100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, 

In [15]:
from transformers import DataCollatorForSeq2Seq
import numpy as np
import random


class LengthBasedBatchSampler(torch.utils.data.BatchSampler):
    """Batch sampler that groups samples of similar length together.

    All samples are sorted by length in ascending order and consecutive
    samples are chunked into batches of ``batch_size`` indices.

    Note: for ``batch_size > 1`` the samples inside a batch can still differ
    in length, so the ``collate_fn`` must still pad them.

    Args:
        data_source: iterable of samples. If the first sample is a dict, the
            length of each sample is the length of the value stored under the
            first key of that first sample; otherwise ``len(sample)`` is used.
            It is iterated more than once, so it must be re-iterable.
        batch_size: number of indices per batch (must be positive).
        drop_last: if True, drop the trailing incomplete batch. Because the
            indices are sorted by length, the dropped samples are always the
            longest ones.
        shuffle: if True, shuffle the order of the batches (not their
            contents) on every iteration, using the global ``random`` state.

    Raises:
        ValueError: if ``batch_size`` is not positive.
    """
    def __init__(self, data_source, batch_size: int, drop_last: bool, shuffle: bool=True) -> None:
        if batch_size <= 0:
            raise ValueError(f"batch_size must be a positive integer, got {batch_size}")

        # Peek at the first sample once; the sentinel lets an empty data
        # source produce an empty sampler instead of raising StopIteration.
        missing = object()
        first = next(iter(data_source), missing)
        if first is missing:
            self.lengths = []
        elif isinstance(first, dict):
            first_key = next(iter(first))
            self.lengths = [len(d[first_key]) for d in data_source]
        else:
            self.lengths = [len(d) for d in data_source]
        self.batch_size = batch_size
        self.drop_last = drop_last
        self.shuffle = shuffle

    def __iter__(self):
        # mergesort is stable: samples of equal length keep their dataset order.
        order = np.argsort(self.lengths, kind='mergesort')
        if self.drop_last:
            order = order[:len(order) // self.batch_size * self.batch_size]

        # .tolist() yields plain Python ints, i.e. each batch is a list of
        # int indices rather than a numpy array of np.int64.
        batches = [
            order[i:i + self.batch_size].tolist()
            for i in range(0, len(order), self.batch_size)
        ]

        if self.shuffle:
            random.shuffle(batches)

        yield from batches

    def __len__(self):
        full_batches, remainder = divmod(len(self.lengths), self.batch_size)
        if self.drop_last or remainder == 0:
            return full_batches
        return full_batches + 1
        
# Length-sorted batches of two samples; incomplete trailing batch is dropped.
batch_sampler = LengthBasedBatchSampler(
    train_dataset,
    batch_size=2,
    drop_last=True,
    shuffle=True,
)

# Collator that turns a list of samples into one padded batch.
collate_fn = DataCollatorForSeq2Seq(tokenizer)

In [16]:
from torch.utils.data import DataLoader

# Batching is delegated entirely to the custom batch sampler, so batch_size,
# shuffle and drop_last are not passed to the DataLoader itself.
train_dataloader = DataLoader(train_dataset, batch_sampler=batch_sampler, collate_fn=collate_fn)

In [32]:
# Pull one batch to inspect the padded tensors produced by the collator.
iterator = iter(train_dataloader)
first_batch = next(iterator)
print(first_batch)

{'input_ids': tensor([[128000, 128006,    882, 128007,    271, 113805,   1811, 128009, 128006,
          78191, 128007,    271, 113805,  11571, 128009],
        [128000, 128006,    882, 128007,    271, 102856,  11571, 128009, 128006,
          78191, 128007,    271, 103001,    246,   8713]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]), 'labels': tensor([[  -100,   -100,   -100,   -100,   -100,   -100,   -100,   -100,   -100,
           -100,   -100, 113805,  11571, 128009,   -100],
        [  -100,   -100,   -100,   -100,   -100,   -100,   -100,   -100,   -100,
           -100,   -100, 103001,    246,   8713, 128009]])}


In [ ]:
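# NOTE: disabled hand-written alternative to DataCollatorForSeq2Seq, kept for reference only.
# Caveat: it pads `labels` with eos_token_id rather than an ignore value such as -100,
# so a loss that does not mask padding would also be computed on the padded positions.
# from typing import List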
# 
# 
# def collate_fn(batch: List, eos_token_id):
#     max_len = max(len(item['input_ids']) for item in batch)
# 
#     input_ids = []
#     attention_mask = []
#     labels = []
# 
#     for item in batch:
#         input_id = item['input_ids']
#         attention_mask_item = item['attention_mask']
#         label = item['labels']
# 
#         # compute the padding length
#         pad_len = max_len - len(input_id)
# 
#         input_ids.append(input_id+[eos_token_id] * pad_len)
#         attention_mask.append(attention_mask_item+[0] * pad_len)
#         labels.append(label+[eos_token_id] * pad_len)
# 
#     # convert the lists to tensors: torch.tensor(input, dtype=torch.long)
#     input_ids = torch.LongTensor(input_ids)
#     attention_mask = torch.LongTensor(attention_mask)
#     labels = torch.LongTensor(labels)
# 
#     return {
#         'input_ids': input_ids,
#         'attention_mask': attention_mask,
#         'labels': labels,
#     }

In [26]:
import math


def get_lr(iterate, max_lr=6e-4, warmup_steps=10, max_steps=50):
    """Return the learning rate for optimisation step ``iterate``.

    The schedule has three phases:
      1. linear warm-up to ``max_lr`` over the first ``warmup_steps`` steps;
      2. cosine decay from ``max_lr`` down to ``min_lr`` between
         ``warmup_steps`` and ``max_steps``;
      3. constant ``min_lr`` from ``max_steps`` onwards.

    ``min_lr`` is fixed at 10% of ``max_lr``.

    Args:
        iterate: zero-based step index.
        max_lr: peak learning rate reached at the end of warm-up.
        warmup_steps: number of warm-up steps.
        max_steps: step at which the cosine decay reaches ``min_lr``.

    Returns:
        The learning rate (float) to use at this step.
    """
    min_lr = max_lr * 0.1
    if iterate < warmup_steps:
        return max_lr * (iterate + 1) / warmup_steps
    # `>=` instead of `>`: the decay reaches min_lr exactly at max_steps, and
    # this also avoids a 0/0 division when warmup_steps == max_steps.
    if iterate >= max_steps:
        return min_lr
    decay_ratio = (iterate - warmup_steps) / (max_steps - warmup_steps)
    assert 0 <= decay_ratio <= 1
    # coeff goes smoothly from 1 (start of decay) to 0 (end of decay).
    coeff = 0.5 * (1.0 + math.cos(math.pi * decay_ratio))
    # Interpolate between min_lr and max_lr; starting from max_lr here would
    # push the rate up to 1.9 * max_lr right after warm-up.
    return min_lr + (max_lr - min_lr) * coeff

In [30]:
import time
from model.llama import LlamaTransformer


def train_lora(model: LlamaTransformer, dataloader, max_lr=6e-4, warmup_steps=140, max_steps=200):
    """Run ``max_steps`` optimisation steps on ``model``, one batch per step.

    Each step: forward pass under bfloat16 autocast, backward pass, gradient
    clipping to a total norm of 1.0, learning-rate update from ``get_lr``,
    optimizer step, and a one-line log print.

    Args:
        model: must provide ``configure_optimizers(weight_decay,
            learning_rate, device)`` and be callable as ``model(x, y)``.
        dataloader: iterable yielding dicts with ``'input_ids'`` and
            ``'labels'`` tensors. It is restarted when exhausted, so
            ``max_steps`` may exceed the number of batches.
        max_lr: peak learning rate of the schedule.
        warmup_steps: number of linear warm-up steps.
        max_steps: total number of optimisation steps.

    Raises:
        StopIteration: if ``dataloader`` yields no batches at all.
    """
    device = auto_device()
    # torch.autocast expects a bare device type ('cuda', 'cpu', ...); going
    # through torch.device also accepts 'cuda:0' or a torch.device object.
    device_type = torch.device(device).type

    # Matmul precision (the default is 'highest'); author's note: about x1.25
    # faster on a 4090, provided data throughput can keep up.
    torch.set_float32_matmul_precision('high')

    # Optimizer with grouped parameters. The initial learning rate follows
    # max_lr; it is overwritten from the schedule before every step anyway.
    optimizer = model.configure_optimizers(weight_decay=0.1, learning_rate=max_lr, device=device)

    # Training mode affects mechanisms such as normalisation and dropout;
    # nothing in the loop leaves training mode, so set it once.
    model.train()

    # Training loop: every step updates the model with exactly one batch.
    iterator = iter(dataloader)
    for step in range(max_steps):
        t_start = time.time()

        optimizer.zero_grad()

        try:
            batch = next(iterator)
        except StopIteration:
            # Dataloader exhausted before max_steps: start another pass.
            iterator = iter(dataloader)
            batch = next(iterator)
        x = batch['input_ids'].to(device)
        y = batch['labels'].to(device)

        with torch.autocast(device_type=device_type, dtype=torch.bfloat16):
            # NOTE(review): the recorded run of this notebook ended with
            # "ValueError: too many values to unpack (expected 2)". The
            # traceback is not shown, so whether it comes from this unpack
            # (model returning more than two values) or from inside the model
            # is unconfirmed -- verify the return signature of model.forward.
            logits, loss = model(x, y)
        loss_value = loss.detach().item()

        # Backward pass runs outside the autocast context.
        loss.backward()

        norm = torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        # Learning rate is recomputed from the schedule at every step and
        # applied to every parameter group.
        lr = get_lr(step, max_lr=max_lr, warmup_steps=warmup_steps, max_steps=max_steps)
        for param_group in optimizer.param_groups:
            param_group['lr'] = lr

        optimizer.step()

        # Wait for the GPU to finish this step so the timing below is
        # meaningful; skipped on non-CUDA devices where the call would fail.
        if device_type == 'cuda':
            torch.cuda.synchronize()

        t_end = time.time()

        dt = t_end - t_start  # seconds
        print(
            f"step {step}, loss: {loss_value}, lr: {lr}, norm: {norm}, dt: {dt * 1000:.2f}ms")

In [28]:
from utils.toolkit import print_trainable_parameters, print_model_all_parameters
from train.lora import add_lora
from model.llama import LlamaTransformer

# Projection layers that receive LoRA adapters (attention and MLP).
lora_targets = ["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"]

# Load the pretrained weights in bfloat16 and move the model to the GPU.
model = LlamaTransformer.from_pretrained(
    'llama-3-8B',
    local_path=model_path,
    torch_type=torch.bfloat16,
)
model = model.to('cuda')

# Inject the adapters, then report which parameters are trainable.
add_lora(model, alpha=32, target=lora_targets, dropout_p=0.05)
print_trainable_parameters(model)
print_model_all_parameters(model)

loading weights from pretrained model: meta-llama/Meta-Llama-3-8B => {'n_layers': 32, 'n_heads': 32, 'dim': 4096, 'n_kv_heads': 8, 'vocab_size': 128256, 'max_seq_len': 2048}
parameters num: 291


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

100%|██████████| 291/291 [00:00<00:00, 382.56it/s]


trainable params: 20,971,520 || all params: 8,051,232,768 || trainable%: 0.2605
Layer Name & Parameters
----------------------------
model.embed_tokens.weight                          | dtype:torch.bfloat16 | Requires_grad: False
model.layers.0.self_attn.q_proj.base_layer.weight  | dtype:torch.bfloat16 | Requires_grad: False
model.layers.0.self_attn.q_proj.lora_A.weight      | dtype:torch.bfloat16 | Requires_grad: True
model.layers.0.self_attn.q_proj.lora_B.weight      | dtype:torch.bfloat16 | Requires_grad: True
model.layers.0.self_attn.k_proj.base_layer.weight  | dtype:torch.bfloat16 | Requires_grad: False
model.layers.0.self_attn.k_proj.lora_A.weight      | dtype:torch.bfloat16 | Requires_grad: True
model.layers.0.self_attn.k_proj.lora_B.weight      | dtype:torch.bfloat16 | Requires_grad: True
model.layers.0.self_attn.v_proj.base_layer.weight  | dtype:torch.bfloat16 | Requires_grad: False
model.layers.0.self_attn.v_proj.lora_A.weight      | dtype:torch.bfloat16 | Requires_grad: True

In [31]:
# Run LoRA fine-tuning with the default schedule of train_lora
# (max_lr=6e-4, warmup_steps=140, max_steps=200).
train_lora(model, train_dataloader)

num decayed parameter tensors: 448, with 20,971,520 parameters
num non-decayed parameter tensors: 0, with 0 parameters
using fused AdamW: True


ValueError: too many values to unpack (expected 2)