配置 GPU 环境（指定可见的 CUDA 设备）

In [None]:
import os

# Expose only physical GPUs 0, 1, 5 and 6 to this process.
# NOTE(review): this is normally set before any CUDA-using library is
# initialised, otherwise it has no effect — keep this cell first.
os.environ["CUDA_VISIBLE_DEVICES"] = "0,1,5,6"

导入模型

In [None]:
from peft import LoraConfig, TaskType, get_peft_model
from transformers import AutoModel, HfArgumentParser, TrainingArguments
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch

In [None]:
# Load the base Qwen2-7B-Instruct checkpoint from local disk in bfloat16.
# device_map="auto" delegates device placement of the weights to the loader.
model = AutoModelForCausalLM.from_pretrained(
    "/jydata/qwen/Qwen2-7B-Instruct",
    torch_dtype=torch.bfloat16,
    device_map="auto",
)

In [None]:
# Display the parameter dtype of the loaded model (torch.bfloat16 was requested at load time).
model.dtype

In [None]:
# Check whether the input embedding matrix and the LM head hold the same values
# (i.e. whether the weights are effectively tied).
# The generic accessors are used because the model exposes no `word_embeddings`
# attribute; with device_map="auto" the two tensors may live on different
# devices, so the head is moved next to the embedding before comparing.
embed_weight = model.get_input_embeddings().weight.data
lm_head_weight = model.get_output_embeddings().weight.data
torch.allclose(embed_weight, lm_head_weight.to(embed_weight.device))


In [None]:
# Collect the input-embedding and LM-head weight tensors for inspection.
# Names are matched by suffix so the cell works both on the plain model
# ("model.embed_tokens.weight", "lm_head.weight") and after PEFT wrapping,
# which prefixes every parameter name with "base_model.model.".
output = []
for name, param in model.named_parameters():
    print(name)
    if name.endswith(("embed_tokens.weight", "lm_head.weight")):
        output.append(param)

分析加 LoRA 前的模型权重

In [None]:
# List every parameter of the model together with its tensor shape.
for name, param in model.named_parameters():
    print(f"{name} {param.shape}")

加lora后模型权重

In [None]:

# Linear layers that receive LoRA adapters (attention and MLP projections,
# judging by their names).
lora_target_modules = [
    "q_proj",
    "k_proj",
    "v_proj",
    "o_proj",
    "gate_proj",
    "up_proj",
    "down_proj",
]

config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    target_modules=lora_target_modules,
    inference_mode=False,  # training mode
    r=8,                   # LoRA rank
    lora_alpha=32,         # LoRA alpha (scaling parameter; see the LoRA paper)
    lora_dropout=0.1,      # dropout ratio applied inside the LoRA layers
)

# NOTE: this rebinds `model` to the PEFT-wrapped model, so re-running the cell
# without reloading the base model would wrap it a second time.
model = get_peft_model(model, config)

In [None]:
def cal_param(param):
    """Return the number of elements in ``param``.

    The count is the product of all entries of ``param.shape``; an empty
    shape (a scalar) yields 1.
    """
    total = 1
    for dim in param.shape:
        total *= dim
    return total

In [None]:
# Tally the total parameter count and the share that belongs to LoRA adapters
# (identified by "lora" appearing in the parameter name).
lora_parameters = 0
all_parameters = 0
for name, param in model.named_parameters():
    n_elements = cal_param(param)
    all_parameters += n_elements
    if "lora" in name:
        lora_parameters += n_elements

print(f"lora参数量：{round(lora_parameters/10**9,2)}B")
print(f"所有参数量：{round(all_parameters/10**9,2)}B")
print(f"lora参数占比：{round(lora_parameters/all_parameters*100,2)}%")

读取tokenizer

In [None]:
# Load the tokenizer that ships with the local checkpoint.
# `AutoTokenizer` is the one imported from `transformers` at the top of the
# notebook; the checkpoint is a local directory, so no ModelScope hub access
# is needed and the earlier `transformers` names are no longer shadowed.
tokenizer = AutoTokenizer.from_pretrained("/jydata/qwen/Qwen2-7B-Instruct")

查看模型中可训练参数

In [None]:

def print_trainable_parameters(model):
    """Print trainable vs. total parameter counts and the trainable percentage.

    Args:
        model: Object exposing ``named_parameters()``; each parameter must
            provide ``numel()`` and a ``requires_grad`` flag.
    """
    # Materialise (size, trainable?) pairs once, then aggregate.
    sizes = [(p.numel(), p.requires_grad) for _, p in model.named_parameters()]
    all_param = sum(size for size, _ in sizes)
    trainable_params = sum(size for size, trainable in sizes if trainable)
    print(
        f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {100 * trainable_params / all_param}"
    )
# Report how many parameters of the LoRA-wrapped model are trainable.
print_trainable_parameters(model)

数据预处理

In [None]:

import json

MAX_SAMPLES = 10  # number of records copied into the small debugging subset

# Copy the first element of the first MAX_SAMPLES JSON Lines records into a
# small subset file.
# Both files are opened once, and the output is opened in "w" mode so that
# re-running the cell overwrites the subset instead of appending duplicates.
with open("./v1/1.jsonl", 'r', encoding='utf-8') as file:
    with open("./v1/1_modify_small.jsonl", 'w', encoding='utf-8') as f:
        no = 0
        for line in file:
            if no == MAX_SAMPLES:
                break
            if not line.strip():
                # Tolerate blank lines (e.g. a trailing newline at end of file).
                continue
            # Each line is a JSON array; only its first element is kept.
            entry = json.loads(line)
            f.write(json.dumps(entry[0], ensure_ascii=False))
            f.write('\n')
            no = no + 1


In [None]:
from datasets import load_dataset,load_from_disk
# Load the small JSONL subset; the result is indexed by split name
# (the "train" split is used further down).
data = load_dataset(path='json', data_files="./v1/1_modify_small.jsonl")


In [None]:
def handler(data):
    """Flatten the nested ``response`` field of one record.

    Replaces ``data['response']`` with its ``[0][0]`` element in place and
    returns the same mapping.
    """
    nested_response = data['response']
    data['response'] = nested_response[0][0]
    return data

# Apply `handler` to every record so `response` becomes a single value.
datasetMap = data.map(function=handler)


In [None]:
# System message placed at the start of every training prompt.
system_prompt = '''你是一位小说创作专家，你需要根据给定的要求续写文章，必须满足字数要求.'''


def handler2(data):
    """Wrap the raw user prompt in the ``<|im_start|>``/``<|im_end|>`` chat markup.

    The prompt becomes: system turn (``system_prompt``), user turn (the
    original prompt), followed by an opened assistant turn. ``response`` is
    left untouched (the former no-op self-assignment was removed).

    Args:
        data: Mapping with string fields ``"prompt"`` and ``"response"``.

    Returns:
        The same mapping, with ``"prompt"`` rewritten in place.
    """
    chat_prefix = f"<|im_start|>system\n{system_prompt}<|im_end|>\n<|im_start|>user\n"
    chat_suffix = "<|im_end|>\n<|im_start|>assistant\n"
    data["prompt"] = chat_prefix + data["prompt"] + chat_suffix
    return data
# Add the chat markup to every record's prompt.
datasetMap2 = datasetMap.map(function=handler2)


In [None]:
def process_func2(example, max_length=8096):
    """Tokenise one prompt/response record into causal-LM training features.

    The prompt and the response are tokenised separately (no special tokens
    added), concatenated, and terminated with a single end token. In
    ``labels`` the prompt positions are set to -100 (the value conventionally
    ignored by the loss), so only the response and the end token are training
    targets. Uses the module-level ``tokenizer``.

    Args:
        example: Mapping with string fields ``'prompt'`` and ``'response'``.
        max_length: Maximum number of tokens kept per sequence. Longer
            sequences are cut from the right, which may drop the end token.
            Defaults to 8096, the value that used to be hard-coded.
            NOTE(review): 8096 may have been meant as 8192 — confirm.

    Returns:
        dict with ``input_ids``, ``attention_mask`` and ``labels`` lists of
        equal length.

    Raises:
        ValueError: If the tokenizer defines neither a pad nor an EOS token id.
    """
    instruction = tokenizer(example['prompt'], add_special_tokens=False)
    response = tokenizer(example['response'], add_special_tokens=False)

    # The pad token id terminates the response, as before; fall back to EOS
    # when the tokenizer has no pad token instead of appending None.
    # NOTE(review): the prompt markup closes turns with <|im_end|>; confirm
    # that the pad token is the intended end-of-response marker.
    end_token_id = tokenizer.pad_token_id
    if end_token_id is None:
        end_token_id = tokenizer.eos_token_id
    if end_token_id is None:
        raise ValueError("tokenizer defines neither pad_token_id nor eos_token_id")

    input_ids = instruction['input_ids'] + response['input_ids'] + [end_token_id]
    attention_mask = instruction['attention_mask'] + response['attention_mask'] + [1]
    labels = [-100] * len(instruction['input_ids']) + response['input_ids'] + [end_token_id]

    # Slicing is a no-op for sequences that already fit.
    return {
        'input_ids': input_ids[:max_length],
        'attention_mask': attention_mask[:max_length],
        'labels': labels[:max_length],
    }

# Tokenise every record; adds input_ids / attention_mask / labels columns.
datasetMap3 = datasetMap2.map(function=process_func2)



In [None]:
# Display the tokenised dataset object to check its contents.
datasetMap3

In [None]:
    
# Drop every column that existed before tokenisation so that only the
# features added by process_func2 (input_ids, attention_mask, labels) remain.
# remove_columns does this directly, without a function-less map() pass.
datasetMap4 = datasetMap3.remove_columns(datasetMap2["train"].column_names)


In [None]:
# Inspect the label ids of the first training example; prompt positions were
# set to -100 by process_func2.
datasetMap4["train"][0]["labels"]

In [None]:
from transformers import AutoModelForCausalLM,TrainingArguments,Trainer,DataCollatorForSeq2Seq
from transformers import DataCollatorForLanguageModeling

from datetime import datetime
# Print the wall-clock time at which the training setup is created.
now = datetime.now()
time_str = now.strftime('%Y-%m-%d %H:%M:%S')
print(time_str)

training_args = TrainingArguments(
    output_dir="./output/Qwen2_instruct_lora",
    per_device_train_batch_size=4,
    gradient_accumulation_steps=4,
    logging_steps=10,
    num_train_epochs=3,
    save_steps=100,  # write a checkpoint every 100 steps
    learning_rate=1e-4,
    save_on_each_node=True,
    gradient_checkpointing=True,
)

# Pads input_ids / attention_mask / labels of each batch to a common length.
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, padding=True)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=datasetMap4["train"],
    data_collator=data_collator,
)

model.config.use_cache = False  # silence the warnings. Please re-enable for inference!


In [None]:
# Make the input-embedding outputs require gradients.
# NOTE(review): presumably needed because gradient_checkpointing=True is used
# while the base weights are frozen (LoRA) — confirm against the Transformers docs.
model.enable_input_require_grads() 

In [None]:
# Run fine-tuning, then write the resulting model to the configured output directory.
trainer.train()

trainer.save_model(output_dir=trainer.args.output_dir)