In [3]:
from datasets import Dataset
import pandas as pd
from transformers import AutoTokenizer, AutoModelForCausalLM, DataCollatorForSeq2Seq, TrainingArguments, Trainer

import torch
from peft import PromptEncoderConfig, TaskType, get_peft_model, PromptEncoderReparameterizationType
import os

In [5]:
tokenizer = AutoTokenizer.from_pretrained('/root/autodl-tmp/qwen/Qwen-7B-Chat', 
                                          use_fast=False, 
                                          trust_remote_code=True)
tokenizer.pad_token_id = tokenizer.eod_id

In [7]:
# 将json转换为csv文件
df = pd.read_json("./huanhuan.json")
ds = Dataset.from_pandas(df)
ds[:3]

{'instruction': ['小姐，别的秀女都在求中选，唯有咱们小姐想被撂牌子，菩萨一定记得真真儿的——',
  '这个温太医啊，也是古怪，谁不知太医不得皇命不能为皇族以外的人请脉诊病，他倒好，十天半月便往咱们府里跑。',
  '嬛妹妹，刚刚我去府上请脉，听甄伯母说你来这里进香了。'],
 'input': ['', '', ''],
 'output': ['嘘——都说许愿说破是不灵的。', '你们俩话太多了，我该和温太医要一剂药，好好治治你们。', '出来走走，也是散心。']}

In [6]:
# 用于处理数据集的函数
def process_func(example):
    MAX_LENGTH = 384    # Llama分词器会将一个中文字切分为多个token，因此需要放开一些最大长度，保证数据的完整性
    input_ids, attention_mask, labels = [], [], []
    instruction = tokenizer("\n".join(["<|im_start|>system", "现在你要扮演皇帝身边的女人--甄嬛.<|im_end|>" + "\n<|im_start|>user\n" + example["instruction"] + example["input"] + "<|im_end|>\n"]).strip(), add_special_tokens=False)  # add_special_tokens 不在开头加 special_tokens
    response = tokenizer("<|im_start|>assistant\n" + example["output"] + "<|im_end|>\n", add_special_tokens=False)
    input_ids = instruction["input_ids"] + response["input_ids"] + [tokenizer.pad_token_id]
    attention_mask = instruction["attention_mask"] + response["attention_mask"] + [1]  # 因为eos token咱们也是要关注的所以 补充为1
    labels = [-100] * len(instruction["input_ids"]) + response["input_ids"] + [tokenizer.pad_token_id]  # Qwen的特殊构造就是这样的
    if len(input_ids) > MAX_LENGTH:  # 做一个截断
        input_ids = input_ids[:MAX_LENGTH]
        attention_mask = attention_mask[:MAX_LENGTH]
        labels = labels[:MAX_LENGTH]
    return {
        "input_ids": input_ids,
        "attention_mask": attention_mask,
        "labels": labels
    }

In [8]:
tokenized_id = ds.map(process_func, 
                      remove_columns=ds.column_names)
tokenized_id

Map: 100%|██████████| 3729/3729 [00:01<00:00, 3332.02 examples/s]


Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 3729
})

In [9]:
tokenizer.decode(tokenized_id[0]['input_ids'])

'<|im_start|>system\n现在你要扮演皇帝身边的女人--甄嬛.<|im_end|>\n<|im_start|>user\n小姐，别的秀女都在求中选，唯有咱们小姐想被撂牌子，菩萨一定记得真真儿的——<|im_end|><|im_start|>assistant\n嘘——都说许愿说破是不灵的。<|im_end|>\n<|endoftext|>'

In [10]:
import torch
model = AutoModelForCausalLM.from_pretrained('./qwen/Qwen-7B-Chat', 
                                             trust_remote_code=True, 
                                             torch_dtype=torch.half, 
                                             device_map="auto")
model

The model is automatically converting to bf16 for faster inference. If you want to disable the automatic precision, please manually add bf16/fp16/fp32=True to "AutoModelForCausalLM.from_pretrained".
Loading checkpoint shards: 100%|██████████| 8/8 [00:08<00:00,  1.04s/it]


QWenLMHeadModel(
  (transformer): QWenModel(
    (wte): Embedding(151936, 4096)
    (drop): Dropout(p=0.0, inplace=False)
    (rotary_emb): RotaryEmbedding()
    (h): ModuleList(
      (0-31): 32 x QWenBlock(
        (ln_1): RMSNorm()
        (attn): QWenAttention(
          (c_attn): Linear(in_features=4096, out_features=12288, bias=True)
          (c_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (core_attention_flash): FlashSelfAttention()
          (attn_dropout): Dropout(p=0.0, inplace=False)
        )
        (ln_2): RMSNorm()
        (mlp): QWenMLP(
          (w1): Linear(in_features=4096, out_features=11008, bias=False)
          (w2): Linear(in_features=4096, out_features=11008, bias=False)
          (c_proj): Linear(in_features=11008, out_features=4096, bias=False)
        )
      )
    )
    (ln_f): RMSNorm()
  )
  (lm_head): Linear(in_features=4096, out_features=151936, bias=False)
)

In [14]:
from peft import LoraConfig, TaskType, get_peft_model
#  loraConfig
config = PromptEncoderConfig(
    task_type=TaskType.CAUSAL_LM, 
    num_virtual_tokens=10,
    encoder_reparameterization_type=PromptEncoderReparameterizationType.MLP,
    encoder_dropout=0.1, 
    encoder_num_layers=5, 
    encoder_hidden_size=1024)

In [15]:
# 创建模型并以半精度形式加载
model = AutoModelForCausalLM.from_pretrained('/root/autodl-tmp/qwen/Qwen-7B-Chat', trust_remote_code=True, torch_dtype=torch.half, device_map="auto")
# model.enable_input_require_grads()  # 开启梯度检查点时，要执行该方法
# 加载lora参数
model = get_peft_model(model, config)

The model is automatically converting to bf16 for faster inference. If you want to disable the automatic precision, please manually add bf16/fp16/fp32=True to "AutoModelForCausalLM.from_pretrained".


Loading checkpoint shards: 100%|██████████| 8/8 [00:08<00:00,  1.05s/it]


In [16]:
# 配置训练参数
args = TrainingArguments(
    output_dir="./output/Qwen",
    per_device_train_batch_size=2,
    gradient_accumulation_steps=2,
    logging_steps=10,
    num_train_epochs=3,
    # gradient_checkpointing=True,
    save_steps=100,
    learning_rate=1e-4,
    save_on_each_node=True
)

In [17]:
 # 使用trainer训练
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=tokenized_id,
    data_collator=DataCollatorForSeq2Seq(tokenizer=tokenizer, padding=True),
    )
trainer.train() # 开始训练

Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


Step,Training Loss
10,4.3398
20,2.5461
30,2.4988
40,2.5492
50,2.4934
60,2.2641
70,2.2578
80,2.6025
90,2.5949
100,2.5387


Checkpoint destination directory ./output/Qwen/checkpoint-100 already exists and is non-empty.Saving will proceed but saved results may be invalid.
Checkpoint destination directory ./output/Qwen/checkpoint-200 already exists and is non-empty.Saving will proceed but saved results may be invalid.
Checkpoint destination directory ./output/Qwen/checkpoint-300 already exists and is non-empty.Saving will proceed but saved results may be invalid.
Checkpoint destination directory ./output/Qwen/checkpoint-400 already exists and is non-empty.Saving will proceed but saved results may be invalid.
Checkpoint destination directory ./output/Qwen/checkpoint-500 already exists and is non-empty.Saving will proceed but saved results may be invalid.
Checkpoint destination directory ./output/Qwen/checkpoint-600 already exists and is non-empty.Saving will proceed but saved results may be invalid.


TrainOutput(global_step=2796, training_loss=2.281816670822995, metrics={'train_runtime': 833.7828, 'train_samples_per_second': 13.417, 'train_steps_per_second': 3.353, 'total_flos': 3.937901800766669e+16, 'train_loss': 2.281816670822995, 'epoch': 3.0})

In [20]:
# model.eval()
response, history = model.chat(tokenizer, "你是谁", history=[], system="现在你要扮演皇帝身边的女人--甄嬛.")
print(response)

我是来自阿里云的大规模语言模型，我叫通义千问。


In [27]:
from datasets import Dataset
import pandas as pd
from transformers import AutoTokenizer, AutoModelForCausalLM, DataCollatorForSeq2Seq, TrainingArguments, Trainer

import torch
from peft import PromptEncoderConfig, TaskType, get_peft_model, PromptEncoderReparameterizationType
import os
os.environ['CUDA_VISIBLE_DEVICES'] = '1'

# 用于处理数据集的函数
def process_func(example):
    MAX_LENGTH = 384    # Llama分词器会将一个中文字切分为多个token，因此需要放开一些最大长度，保证数据的完整性
    input_ids, attention_mask, labels = [], [], []
    instruction = tokenizer("\n".join(["<|im_start|>system", "现在你要扮演皇帝身边的女人--甄嬛.<|im_end|>" + "\n<|im_start|>user\n" + example["instruction"] + example["input"] + "<|im_end|>\n"]).strip(), add_special_tokens=False)  # add_special_tokens 不在开头加 special_tokens
    response = tokenizer("<|im_start|>assistant\n" + example["output"] + "<|im_end|>\n", add_special_tokens=False)
    input_ids = instruction["input_ids"] + response["input_ids"] + [tokenizer.pad_token_id]
    attention_mask = instruction["attention_mask"] + response["attention_mask"] + [1]  # 因为eos token咱们也是要关注的所以 补充为1
    labels = [-100] * len(instruction["input_ids"]) + response["input_ids"] + [tokenizer.pad_token_id]  # Qwen的特殊构造就是这样的
    if len(input_ids) > MAX_LENGTH:  # 做一个截断
        input_ids = input_ids[:MAX_LENGTH]
        attention_mask = attention_mask[:MAX_LENGTH]
        labels = labels[:MAX_LENGTH]
    return {
        "input_ids": input_ids,
        "attention_mask": attention_mask,
        "labels": labels
    }

#  loraConfig
config = PromptEncoderConfig(
    task_type=TaskType.CAUSAL_LM, num_virtual_tokens=10,
    encoder_reparameterization_type=PromptEncoderReparameterizationType.MLP,
    encoder_dropout=0.1, encoder_num_layers=5, encoder_hidden_size=1024)

# 配置训练参数
args = TrainingArguments(
    output_dir="./output/Qwen",
    per_device_train_batch_size=2,
    gradient_accumulation_steps=2,
    logging_steps=10,
    num_train_epochs=3,
    # gradient_checkpointing=True,
    save_steps=100,
    learning_rate=1e-4,
    save_on_each_node=True
)


# if "__main__" == __name__:
# os.chdir('/root/self-llm')  # /root/self-llm 需改成自己 self-llm 项目的绝对路径
# 处理数据集
# 将JSON文件转换为CSV文件
df = pd.read_json('./huanhuan.json')
ds = Dataset.from_pandas(df)
# 加载tokenizer
tokenizer = AutoTokenizer.from_pretrained('/root/autodl-tmp/qwen/Qwen-7B-Chat', use_fast=False, trust_remote_code=True)
tokenizer.pad_token_id = tokenizer.eod_id
# 将数据集变化为token形式
tokenized_id = ds.map(process_func, remove_columns=ds.column_names)

# 创建模型并以半精度形式加载
model = AutoModelForCausalLM.from_pretrained('/root/autodl-tmp/qwen/Qwen-7B-Chat', trust_remote_code=True, torch_dtype=torch.half, device_map="auto")
# model.enable_input_require_grads()  # 开启梯度检查点时，要执行该方法
# 加载lora参数
model = get_peft_model(model, config)
# 使用trainer训练
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=tokenized_id,
    data_collator=DataCollatorForSeq2Seq(tokenizer=tokenizer, padding=True),
    )
trainer.train() # 开始训练


Map: 100%|██████████| 3729/3729 [00:01<00:00, 3346.76 examples/s]
The model is automatically converting to bf16 for faster inference. If you want to disable the automatic precision, please manually add bf16/fp16/fp32=True to "AutoModelForCausalLM.from_pretrained".
Loading checkpoint shards: 100%|██████████| 8/8 [00:07<00:00,  1.07it/s]
Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


Step,Training Loss
10,4.4789
20,2.6469
30,2.518
40,2.5203
50,2.484
60,2.2465
70,2.2512
80,2.5969
90,2.6039
100,2.5416


Checkpoint destination directory ./output/Qwen/checkpoint-100 already exists and is non-empty.Saving will proceed but saved results may be invalid.
Checkpoint destination directory ./output/Qwen/checkpoint-200 already exists and is non-empty.Saving will proceed but saved results may be invalid.
Checkpoint destination directory ./output/Qwen/checkpoint-300 already exists and is non-empty.Saving will proceed but saved results may be invalid.
Checkpoint destination directory ./output/Qwen/checkpoint-400 already exists and is non-empty.Saving will proceed but saved results may be invalid.
Checkpoint destination directory ./output/Qwen/checkpoint-500 already exists and is non-empty.Saving will proceed but saved results may be invalid.
Checkpoint destination directory ./output/Qwen/checkpoint-600 already exists and is non-empty.Saving will proceed but saved results may be invalid.
Checkpoint destination directory ./output/Qwen/checkpoint-700 already exists and is non-empty.Saving will procee

TrainOutput(global_step=2796, training_loss=2.2803970249596244, metrics={'train_runtime': 827.6156, 'train_samples_per_second': 13.517, 'train_steps_per_second': 3.378, 'total_flos': 3.937901800766669e+16, 'train_loss': 2.2803970249596244, 'epoch': 3.0})

In [28]:
response, history = model.chat(tokenizer, "你是谁", history=[], system="现在你要扮演皇帝身边的女人--甄嬛.")
print(response)

AssertionError: 