In [1]:
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, LlamaForCausalLM, DataCollatorForSeq2Seq, Trainer
import torch
from torch.utils.data import DataLoader

# Dataset

In [2]:
# Load the train split of the Chinese Alpaca instruction dataset.
# (A copy previously saved to disk could be loaded with `Dataset.load_from_disk` instead.)
ds = load_dataset(path="shibing624/alpaca-zh", split="train")
ds

Dataset({
    features: ['instruction', 'input', 'output'],
    num_rows: 48818
})

In [3]:
# Peek at the first raw record (fields: instruction / input / output).
ds[0]

{'instruction': '保持健康的三个提示。',
 'input': '',
 'output': '以下是保持健康的三个提示：\n\n1. 保持身体活动。每天做适当的身体运动，如散步、跑步或游泳，能促进心血管健康，增强肌肉力量，并有助于减少体重。\n\n2. 均衡饮食。每天食用新鲜的蔬菜、水果、全谷物和脂肪含量低的蛋白质食物，避免高糖、高脂肪和加工食品，以保持健康的饮食习惯。\n\n3. 睡眠充足。睡眠对人体健康至关重要，成年人每天应保证 7-8 小时的睡眠。良好的睡眠有助于减轻压力，促进身体恢复，并提高注意力和记忆力。'}

# Tokenizer

In [4]:
# Load the Llama-2-7B tokenizer from a local checkpoint directory.
# NOTE(review): this is an absolute, machine-specific path — adjust it for your environment.
tokenizer = AutoTokenizer.from_pretrained("/home/genghaozhe/workspace/huggingface-models/Llama-2-7b-hf")
tokenizer

LlamaTokenizerFast(name_or_path='/home/genghaozhe/workspace/huggingface-models/Llama-2-7b-hf', vocab_size=32000, model_max_length=1000000000000000019884624838656, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': AddedToken("<s>", rstrip=False, lstrip=False, single_word=False, normalized=False), 'eos_token': AddedToken("</s>", rstrip=False, lstrip=False, single_word=False, normalized=False), 'unk_token': AddedToken("<unk>", rstrip=False, lstrip=False, single_word=False, normalized=False)}, clean_up_tokenization_spaces=False)

In [5]:
# The tokenizer defines no pad token (only bos/eos/unk), so reuse EOS for padding.
# Referencing `eos_token_id` instead of the literal 2 avoids a magic number and
# stays correct if a different tokenizer is swapped in.
tokenizer.pad_token_id = tokenizer.eos_token_id
# padding_side must be "right"; otherwise training may fail to converge when batch size > 1.
tokenizer.padding_side = "right"

In [6]:
def process_func(example, max_length=384):
    """Convert one instruction record into causal-LM training features.

    The prompt is built from ``"Human: " + instruction``, a newline, and the
    optional ``input`` field (surrounding whitespace stripped), followed by a
    blank line and ``"Assistant: "``. The response tokens and one EOS token are
    appended after the prompt.

    Args:
        example: Mapping with string fields ``"instruction"``, ``"input"`` and
            ``"output"``.
        max_length: Maximum number of tokens kept per sample. The Llama
            tokenizer often splits a single Chinese character into several
            tokens, so the default is kept generous to avoid cutting samples
            short. ``None`` disables truncation.

    Returns:
        dict with three equally long lists:
        ``input_ids`` (prompt + response + EOS), ``attention_mask`` (all ones),
        and ``labels`` (``-100`` over the prompt so it is excluded from the
        loss, then the response ids and EOS).

    Note:
        Truncation is a plain cut from the right, so an over-long sample loses
        its trailing EOS token.
    """
    prompt_text = "\n".join(["Human: " + example["instruction"], example["input"]]).strip() + "\n\nAssistant: "
    # Special tokens are disabled so BOS/EOS placement is fully controlled here.
    instruction = tokenizer(prompt_text, add_special_tokens=False)
    response = tokenizer(example["output"], add_special_tokens=False)
    eos = [tokenizer.eos_token_id]

    input_ids = instruction["input_ids"] + response["input_ids"] + eos
    attention_mask = instruction["attention_mask"] + response["attention_mask"] + [1]
    # Mask the prompt positions with -100 so only the response contributes to the loss.
    labels = [-100] * len(instruction["input_ids"]) + response["input_ids"] + eos

    # Slicing is a no-op for samples that already fit within max_length.
    return {
        "input_ids": input_ids[:max_length],
        "attention_mask": attention_mask[:max_length],
        "labels": labels[:max_length],
    }

In [7]:
# Tokenize every record; the original text columns are removed so only
# input_ids / attention_mask / labels remain.
tokenized_ds = ds.map(process_func, remove_columns=ds.column_names)
tokenized_ds

Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 48818
})

In [8]:
# Spot-check the first tokenized sample.
tokenized_ds[0]

{'input_ids': [12968,
  29901,
  29871,
  30982,
  31695,
  31863,
  31577,
  30210,
  30457,
  30502,
  31302,
  30858,
  30267,
  13,
  13,
  7900,
  22137,
  29901,
  29871,
  29871,
  30651,
  30557,
  30392,
  30982,
  31695,
  31863,
  31577,
  30210,
  30457,
  30502,
  31302,
  30858,
  30383,
  13,
  13,
  29896,
  29889,
  29871,
  30982,
  31695,
  31687,
  30988,
  31704,
  30846,
  30267,
  31951,
  30408,
  232,
  132,
  157,
  236,
  131,
  133,
  30948,
  30210,
  31687,
  30988,
  31894,
  30846,
  30214,
  30847,
  233,
  152,
  166,
  233,
  176,
  168,
  30330,
  235,
  186,
  148,
  233,
  176,
  168,
  31391,
  233,
  187,
  187,
  233,
  182,
  182,
  30214,
  30815,
  231,
  194,
  134,
  31174,
  30869,
  235,
  164,
  131,
  31624,
  31863,
  31577,
  30214,
  232,
  165,
  161,
  232,
  191,
  189,
  235,
  133,
  143,
  235,
  133,
  140,
  31074,
  31180,
  30214,
  31666,
  30417,
  31931,
  30909,
  232,
  138,
  146,
  31022,
  30988,
  30908,
  30267,
 

# 模型结构

In [9]:
# Alternative kept for reference: load in half precision with device_map="auto".
# model = AutoModelForCausalLM.from_pretrained("/home/genghaozhe/workspace/huggingface-models/Llama-2-7b-hf", low_cpu_mem_usage=True, torch_dtype=torch.half, device_map="auto")
# Load the checkpoint with default settings; `model.dtype` in the next cell reports torch.float32.
# NOTE(review): absolute, machine-specific path — adjust it for your environment.
model = LlamaForCausalLM.from_pretrained("/home/genghaozhe/workspace/huggingface-models/Llama-2-7b-hf")

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [10]:
# Check which precision the weights were loaded in.
model.dtype

torch.float32

In [11]:
# Grab the stack of decoder layers and display the first one to see its sub-modules.
layers = model.model.layers
layers[0]

LlamaDecoderLayer(
  (self_attn): LlamaAttention(
    (q_proj): Linear(in_features=4096, out_features=4096, bias=False)
    (k_proj): Linear(in_features=4096, out_features=4096, bias=False)
    (v_proj): Linear(in_features=4096, out_features=4096, bias=False)
    (o_proj): Linear(in_features=4096, out_features=4096, bias=False)
    (rotary_emb): LlamaRotaryEmbedding()
  )
  (mlp): LlamaMLP(
    (gate_proj): Linear(in_features=4096, out_features=11008, bias=False)
    (up_proj): Linear(in_features=4096, out_features=11008, bias=False)
    (down_proj): Linear(in_features=11008, out_features=4096, bias=False)
    (act_fn): SiLUActivation()
  )
  (input_layernorm): LlamaRMSNorm()
  (post_attention_layernorm): LlamaRMSNorm()
)

In [12]:
# Pull the attention and MLP sub-modules out of the first decoder layer
# and print their concrete classes.
attn, mlp = layers[0].self_attn, layers[0].mlp
print(type(attn), type(mlp))

<class 'transformers.models.llama.modeling_llama.LlamaAttention'> <class 'transformers.models.llama.modeling_llama.LlamaMLP'>


## MLP 替换张量并行

## Attention 替换张量并行

# 数据加载

In [20]:
# Serve the tokenized samples in batches of 2, collated (with padding enabled)
# by DataCollatorForSeq2Seq.
dl = DataLoader(
    dataset=tokenized_ds,
    batch_size=2,
    collate_fn=DataCollatorForSeq2Seq(tokenizer=tokenizer, padding=True),
)


In [25]:
# Take the first collated batch to use as a sample model input.
# `next(iter(dl))` yields the batch directly, with no index to discard.
ipt = next(iter(dl))
ipt

{'input_ids': tensor([[12968, 29901, 29871, 30982, 31695, 31863, 31577, 30210, 30457, 30502,
         31302, 30858, 30267,    13,    13,  7900, 22137, 29901, 29871, 29871,
         30651, 30557, 30392, 30982, 31695, 31863, 31577, 30210, 30457, 30502,
         31302, 30858, 30383,    13,    13, 29896, 29889, 29871, 30982, 31695,
         31687, 30988, 31704, 30846, 30267, 31951, 30408,   232,   132,   157,
           236,   131,   133, 30948, 30210, 31687, 30988, 31894, 30846, 30214,
         30847,   233,   152,   166,   233,   176,   168, 30330,   235,   186,
           148,   233,   176,   168, 31391,   233,   187,   187,   233,   182,
           182, 30214, 30815,   231,   194,   134, 31174, 30869,   235,   164,
           131, 31624, 31863, 31577, 30214,   232,   165,   161,   232,   191,
           189,   235,   133,   143,   235,   133,   140, 31074, 31180, 30214,
         31666, 30417, 31931, 30909,   232,   138,   146, 31022, 30988, 30908,
         30267,    13,    13, 29906, 2

In [None]:
# Run a single forward pass on the sample batch as a smoke test.
# GPU variant kept for reference (the model would need to be on the same device):
# model(**ipt.to("cuda"))
# NOTE(review): the saved AttributeError output below does not match this code
# (`ipt` is not a tuple and `.to` is only used in the commented line) — it looks
# stale from an earlier run; re-execute the cell to refresh it.
model(**ipt)

AttributeError: 'tuple' object has no attribute 'to'

# LoRA 配置

In [None]:
from peft import LoraConfig, TaskType, get_peft_model

# Build a LoRA config for causal-LM fine-tuning using library defaults; the repr
# below shows the defaults in effect (r=8, lora_alpha=8, lora_dropout=0.0).
config = LoraConfig(task_type=TaskType.CAUSAL_LM)
config

LoraConfig(peft_type=<PeftType.LORA: 'LORA'>, auto_mapping=None, base_model_name_or_path=None, revision=None, task_type=<TaskType.CAUSAL_LM: 'CAUSAL_LM'>, inference_mode=False, r=8, target_modules=None, lora_alpha=8, lora_dropout=0.0, fan_in_fan_out=False, bias='none', use_rslora=False, modules_to_save=None, init_lora_weights=True, layers_to_transform=None, layers_pattern=None, rank_pattern={}, alpha_pattern={}, megatron_config=None, megatron_core='megatron.core', loftq_config={}, use_dora=False, layer_replication=None)

In [None]:
# Apply the LoRA config to the base model.
# NOTE(review): `model` is rebound to the result, so this cell is not idempotent —
# re-running it passes the already-wrapped model back into get_peft_model.
# Restart the kernel (or reload the base model) before re-running.
model = get_peft_model(model, config)

In [None]:
# Re-display the config: per the saved output, base_model_name_or_path and
# target_modules (q_proj, v_proj) are now filled in.
config

LoraConfig(peft_type=<PeftType.LORA: 'LORA'>, auto_mapping=None, base_model_name_or_path='/home/genghaozhe/workspace/huggingface-models/Llama-2-7b-hf', revision=None, task_type=<TaskType.CAUSAL_LM: 'CAUSAL_LM'>, inference_mode=False, r=8, target_modules={'v_proj', 'q_proj'}, lora_alpha=8, lora_dropout=0.0, fan_in_fan_out=False, bias='none', use_rslora=False, modules_to_save=None, init_lora_weights=True, layers_to_transform=None, layers_pattern=None, rank_pattern={}, alpha_pattern={}, megatron_config=None, megatron_core='megatron.core', loftq_config={}, use_dora=False, layer_replication=None)

In [None]:
# model.enable_input_require_grads()  # must be called when gradient checkpointing is enabled

In [None]:
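# model = model.half()  # when the whole model is in half precision, adam_epsilon must be increased
# torch.tensor(1e-8).half()  # shows why: the usual epsilon of 1e-8 underflows to 0 in fp16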

In [None]:
# Report the number of trainable parameters versus the total parameter count.
model.print_trainable_parameters()

trainable params: 4,194,304 || all params: 6,742,609,920 || trainable%: 0.06220594176090199


# 开始微调