In [1]:
from transformers import AutoModelForSeq2SeqLM
from peft import get_peft_config, get_peft_model, LoraConfig, TaskType, prepare_model_for_kbit_training
from dataclasses import dataclass, field
from typing import Dict, Optional, List
import transformers
from transformers import Trainer, GPTQConfig, deepspeed, BitsAndBytesConfig, DataCollatorForSeq2Seq
import json
import os
import torch 
import pandas  as pd 
from datasets import Dataset

# from supervised_dataset import LazySupervisedDataset, SupervisedDataset
# from model_save import safe_save_model_for_hf_trainer




In [2]:
model_name_or_path = '/data/liucd/BigModel/Qwen1.5-1.8B-Chat'
data_path = 'huanhuan.json'

@dataclass
class ModelArguments:
    model_name_or_path: str = model_name_or_path


@dataclass
class DataArguments:
    data_path: str = field(default=data_path, metadata={"help": "Path to the training data."})
    eval_data_path: str = field(default=None, metadata={"help": "Path to the evaluation data."}
    )
    lazy_preprocess: bool = False


@dataclass
class LoraArguments:
    lora_r: int = 64
    lora_alpha: int = 16
    lora_dropout: float = 0.05
    lora_target_modules: List[str] = field(
        default_factory=lambda: ["q_proj", "k_proj", "v_proj", "o_proj",  "up_proj", "gate_proj","down_proj",]
    )
    lora_weight_path: str = ""
    lora_bias: str = "none"
    q_lora: bool = False

@dataclass
class TrainingArguments(transformers.TrainingArguments):
    cache_dir: Optional[str] = field(default=None)
    use_lora: bool = True
    # bf16: bool = False
    output_dir: str = 'qwen_output'
    model_max_length: int = field(
        default=512,
        metadata={
            "help": "Maximum sequence length. Sequences will be right padded (and possibly truncated)."
        },
    )  # 微调时最大序列长度
    gradient_checkpointing: bool = True
    report_to: str = 'none'
    num_train_epochs: int = 1
    per_device_train_batch_size: int = 8   # bs=1对应的训练集loss 更低 !
    gradient_accumulation_steps: int = 8
    learning_rate: float = 3e-4
    weight_decay: float = 0.1
    adam_beta2: float = 0.95
    warmup_ratio: float = 0.01
    lr_scheduler_type: str = 'cosine'
    logging_steps: int = 1  # 每隔10个打印一次日志

    # deepspeed: str = '/data/liucd/BigModel/qwen/Qwen/finetune/ds_config_zero2.json'


args_model = ModelArguments()
args_train = TrainingArguments()
args_lora = LoraArguments()
args_data = DataArguments()





In [3]:

# tokenizer
tokenizer = transformers.AutoTokenizer.from_pretrained(
    args_model.model_name_or_path,
    model_max_length=args_train.model_max_length,
    padding_side="right",
    use_fast=False,
)



Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [5]:
compute_dtype = torch.bfloat16 if args_train.bf16  else torch.float16

# Load model and tokenizer
config = transformers.AutoConfig.from_pretrained(
    args_model.model_name_or_path,
)

model = transformers.AutoModelForCausalLM.from_pretrained(
        args_model.model_name_or_path,
        # config=config,
        device_map='auto',
        quantization_config=BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_use_double_quant=True,
            bnb_4bit_quant_type="nf4",
            bnb_4bit_compute_dtype=compute_dtype,
        )
        if args_train.use_lora and args_lora.q_lora
        else None,
        torch_dtype=compute_dtype  # add by liucd 否则在4卡上会float32运行
    )




In [6]:
model.dtype

torch.float16

In [7]:

lora_config = LoraConfig(
            r=args_lora.lora_r,
            lora_alpha=args_lora.lora_alpha,
            target_modules=args_lora.lora_target_modules,
            lora_dropout=args_lora.lora_dropout,
            bias=args_lora.lora_bias,
            task_type="CAUSAL_LM",
        )


if args_lora.q_lora:
     model = prepare_model_for_kbit_training(
                model, use_gradient_checkpointing=args_train.gradient_checkpointing
            )  # 将某些的LN层等从FP16变成FP32


model = get_peft_model(model, peft_config=lora_config)
model.print_trainable_parameters()



trainable params: 59,965,440 || all params: 1,896,794,112 || trainable%: 3.1614100666293083


In [8]:
# 调用 model.enable_input_require_grads() 是为了确保在使用 grad_checkpoint 时，模型的输入能够被要求梯度，以便在检查点处能够正确地重新计>算梯度。
if args_train.gradient_checkpointing:
    model.enable_input_require_grads()




In [9]:
"""
# 将JSON文件转换为CSV文件
import json 
with open('data.json', 'r') as f:
    data = json.load(f)
instructions = [each_data['messages'][0]['content'] for each_data in data]
outputs = [each_data['messages'][1]['content'] for each_data in data]
print(len(instructions), len(outputs))

df = pd.DataFrame({'instruction': instructions,
                    'input': [''] * len(instructions),
                    'output': outputs            
            }
    ) 
df.to_json('weather.json',  orient='records', indent=4, force_ascii=False)
"""


"\n# 将JSON文件转换为CSV文件\nimport json \nwith open('data.json', 'r') as f:\n    data = json.load(f)\ninstructions = [each_data['messages'][0]['content'] for each_data in data]\noutputs = [each_data['messages'][1]['content'] for each_data in data]\nprint(len(instructions), len(outputs))\n\ndf = pd.DataFrame({'instruction': instructions,\n                    'input': [''] * len(instructions),\n                    'output': outputs            \n            }\n    ) \ndf.to_json('weather.json',  orient='records', indent=4, force_ascii=False)\n"

In [10]:
# 将JSON文件转换为CSV文件
df = pd.read_json('./weather.json')
ds = Dataset.from_pandas(df)

# sys = '现在你要扮演皇帝身边的女人--甄嬛'
sys = '你是一个人工智能助手'
def process_func(example):
    MAX_LENGTH = 384    # Llama分词器会将一个中文字切分为多个token，因此需要放开一些最大长度，保证数据的完整性
    input_ids, attention_mask, labels = [], [], []
    instruction = tokenizer(f"<|im_start|>system\n {sys} <|im_end|>\n<|im_start|>user\n{example['instruction'] + example['input']}<|im_end|>\n<|im_start|>assistant\n", add_special_tokens=False)  # add_special_tokens 不在开头加 special_tokens
    response = tokenizer(f"{example['output']}", add_special_tokens=False)
    input_ids = instruction["input_ids"] + response["input_ids"] + [tokenizer.pad_token_id]
    attention_mask = instruction["attention_mask"] + response["attention_mask"] + [1]  # 因为eos token咱们也是要关注的所以 补充为1
    labels = [-100] * len(instruction["input_ids"]) + response["input_ids"] + [tokenizer.pad_token_id]  
    if len(input_ids) > MAX_LENGTH:  # 做一个截断
        input_ids = input_ids[:MAX_LENGTH]
        attention_mask = attention_mask[:MAX_LENGTH]
        labels = labels[:MAX_LENGTH]
    return {
        "input_ids": input_ids,
        "attention_mask": attention_mask,
        "labels": labels
    }

tokenized_id = ds.map(process_func, remove_columns=ds.column_names)
tokenized_id

tokenizer.decode(tokenized_id[2]['input_ids'])

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

'<|im_start|>system\n 你是一个人工智能助手 <|im_end|>\n<|im_start|>user\n\n给定一句话：“克山县7月12号的天气”，请你按步骤要求工作。\n\n步骤1：识别这句话中的城市和日期共2个信息\n步骤2：根据城市和日期信息，生成JSON字符串，格式为{"city":城市,"date":日期}\n\n请问，这个JSON字符串是：\n<|im_end|>\n<|im_start|>assistant\n{"city": "克山县", "date": "07-12"}<|endoftext|>'

In [11]:
len(tokenized_id[0]['input_ids']), tokenizer.decode(tokenized_id[0]['input_ids'])

(104,
 '<|im_start|>system\n 你是一个人工智能助手 <|im_end|>\n<|im_start|>user\n\n给定一句话：“11月2日邵东县的天气”，请你按步骤要求工作。\n\n步骤1：识别这句话中的城市和日期共2个信息\n步骤2：根据城市和日期信息，生成JSON字符串，格式为{"city":城市,"date":日期}\n\n请问，这个JSON字符串是：\n<|im_end|>\n<|im_start|>assistant\n{"city": "邵东县", "date": "11-02"}<|endoftext|>')

In [12]:
tokenizer.decode(list(filter(lambda x: x != -100, tokenized_id[2]["labels"])))

'{"city": "克山县", "date": "07-12"}<|endoftext|>'

In [13]:
# from supervised_dataset import LazySupervisedDataset, SupervisedDataset

# train_dataset =  SupervisedDataset( json.load(open('data.json', 'r')), tokenizer=tokenizer, max_len=512)


In [14]:
# train_dataset[0]['input_ids'].shape, tokenizer.decode(train_dataset[0]['input_ids'])

In [15]:
trainer = Trainer(
    model=model,
    args=args_train,
    train_dataset=tokenized_id,
    tokenizer=tokenizer,
    # train_dataset=train_dataset,
    data_collator=DataCollatorForSeq2Seq(tokenizer=tokenizer, padding=True),
)

Detected kernel version 4.15.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


In [16]:
trainer.train()

`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


Step,Training Loss
1,0.8341
2,0.0
3,0.0
4,0.0
5,0.0
6,0.0
7,0.0
8,0.0
9,0.0
10,0.0


TrainOutput(global_step=15, training_loss=0.05560781955718994, metrics={'train_runtime': 92.1887, 'train_samples_per_second': 10.847, 'train_steps_per_second': 0.163, 'total_flos': 1067826117672960.0, 'train_loss': 0.05560781955718994, 'epoch': 0.96})

In [None]:
trainer.save_state()  # 保存状态
from model_save import safe_save_model_for_hf_trainer

safe_save_model_for_hf_trainer(trainer=trainer, output_dir=args_train.output_dir, bias=args_lora.lora_bias)


OSError: [Errno 12] Cannot allocate memory

In [None]:
trainer.args

