In [1]:
import os
import sys
import warnings; warnings.filterwarnings("ignore")
import numpy as np
import pandas as pd
import torch as th

from pprint import pp
from datasets import (load_dataset, load_from_disk, Dataset)
from transformers import (AutoTokenizer, 
                          BitsAndBytesConfig,
                          AutoModel, 
                          AutoModelForCausalLM, 
                          AutoModelForSequenceClassification,
                          DataCollatorWithPadding, 
                          DataCollatorForLanguageModeling,
                          DataCollatorForSeq2Seq, 
                          DataCollatorForTokenClassification,
                          TrainingArguments, Trainer)
from peft import (LoraConfig, get_peft_model, PeftModel, TaskType, get_peft_model_state_dict)
from trl import SFTTrainer
from vllm import (LLM, SamplingParams)



In [2]:
# Select the compute device: the GPU when CUDA is usable, otherwise the CPU.
if th.cuda.is_available():
    device = th.device("cuda")
else:
    device = th.device("cpu")

# The variable name `devive_cnt` (sic) is kept unchanged: it is a notebook-level name.
devive_cnt = th.cuda.device_count()
print(f"device = {device}; devive_cnt = {devive_cnt}")

# Print the PyTorch version, then torch's CUDA version string.
for version_info in (th.__version__, th.version.cuda):
    print(version_info)

device = cuda; devive_cnt = 1
2.5.1+cu121
12.1


In [3]:
# Filesystem layout used by the notebook.
# The two roots can be overridden with environment variables so the notebook is not tied to
# one machine; when a variable is unset (or empty) the original hard-coded location is used.
path_project = (
    os.environ.get("PT_PROJECT_DIR")
    or "C:/my_project/MyGit/Machine-Learning-Column/hugging_face"
)
# Root folder that holds the local model checkpoints.
path_model = os.environ.get("PT_MODEL_DIR") or "F:/LLM"

# The data and output folders are siblings of the project folder (children of its parent).
_path_parent = os.path.dirname(path_project)
path_data = os.path.join(_path_parent, "data")
path_output = os.path.join(_path_parent, "output")

## step-1: 数据源

In [4]:
# CSV file to load; the path is relative to `path_data`.
filename = "ht/pt_data.csv"

In [5]:
# Read the CSV into a DataFrame, then print its column / dtype summary.
path_csv = os.path.join(path_data, filename)
df_csv = pd.read_csv(path_csv)
df_csv.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9 entries, 0 to 8
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   title    9 non-null      object
 1   content  9 non-null      object
dtypes: object(2)
memory usage: 276.0+ bytes


## step-2: tokenizer

In [6]:
# Checkpoint identifier; it is joined onto `path_model` when the tokenizer and model are loaded.
# The other model sizes are kept as commented-out alternatives.
# checkpoint = "Qwen/Qwen2.5-0.5B-Instruct"
checkpoint = "Qwen/Qwen2.5-1.5B-Instruct"
# checkpoint = "Qwen/Qwen2.5-3B-Instruct"

In [7]:
# Load the tokenizer from the local checkpoint folder only (local_files_only=True).
path_checkpoint = os.path.join(path_model, checkpoint)
tokenizer_load_options = {
    "cache_dir": path_model,
    "force_download": False,
    "local_files_only": True,
}
tokenizer = AutoTokenizer.from_pretrained(
    pretrained_model_name_or_path=path_checkpoint,
    **tokenizer_load_options,
)

In [8]:
# Pretty-print the tokenizer's special-token mapping.
pp(tokenizer.special_tokens_map)

{'eos_token': '<|im_end|>',
 'pad_token': '<|endoftext|>',
 'additional_special_tokens': ['<|im_start|>',
                               '<|im_end|>',
                               '<|object_ref_start|>',
                               '<|object_ref_end|>',
                               '<|box_start|>',
                               '<|box_end|>',
                               '<|quad_start|>',
                               '<|quad_end|>',
                               '<|vision_start|>',
                               '<|vision_end|>',
                               '<|vision_pad|>',
                               '<|image_pad|>',
                               '<|video_pad|>']}


In [9]:
# Build the "text" column used for training. Per row it is:
#   "<|im_start|>" + title + "\n" + content + tokenizer EOS token + tokenizer PAD token
title_col = df_csv["title"]
content_col = df_csv["content"]
df_csv["text"] = (
    "<|im_start|>" + title_col + "\n" + content_col
    + tokenizer.eos_token
    + tokenizer.pad_token
)

In [10]:
# Wrap the DataFrame as a `datasets.Dataset`.
# The commented-out line is an alternative that additionally makes a shuffled 80/20 train/test split.
dataset = Dataset.from_pandas(df_csv)
# dataset = Dataset.from_pandas(df_csv).train_test_split(test_size=0.2, shuffle=True, seed=0)

In [11]:
# Pretty-print the dataset summary.
pp(dataset)

Dataset({
    features: ['title', 'content', 'text'],
    num_rows: 9
})


## step-3: 量化参数

In [14]:
# Disabled: 4-bit NF4 quantization settings (QLoRA).
# While this stays commented out, `config_bnb` is NOT defined anywhere in the notebook.
# config_bnb = BitsAndBytesConfig(
#     load_in_4bit=True,
#     bnb_4bit_quant_type="nf4",
#     bnb_4bit_compute_dtype=th.bfloat16,
#     bnb_4bit_use_double_quant=True
# )  # QLoRA

## step-4: 载入基模

In [12]:
# Load the causal-LM base model from the local checkpoint folder.
path_checkpoint = os.path.join(path_model, checkpoint)
base_model_load_options = dict(
    cache_dir=path_model,
    force_download=False,
    local_files_only=True,
    device_map="auto",
    torch_dtype=th.bfloat16,
    low_cpu_mem_usage=True,
    # quantization_config=(config_bnb if config_bnb else None),
)
base_model = AutoModelForCausalLM.from_pretrained(
    pretrained_model_name_or_path=path_checkpoint,
    **base_model_load_options,
)

In [22]:
# L0 test: ask the freshly loaded base model one question and print its answer.
# Disabled alternative system prompt (Chinese), kept verbatim:
#     "你叫小慧助手，是由BigData公司开发的差旅智能客服。"
#     "你能为用户提供差旅知识问答、酒店推荐等服务。"
#     "你要始终以差旅为背景回答用户的问题，或提供帮助建议。"
system_prompt = "You are a helpful assistant on business travel."
user_prompt = "华为云慧通差旅是什么？"

messages = [
    {"role": role, "content": content}
    for (role, content) in (("system", system_prompt), ("user", user_prompt))
]

# Render the chat template to tensors, then move them onto the selected device.
model_inputs = tokenizer.apply_chat_template(
    messages,
    tokenize=True,
    add_generation_prompt=True,
    return_tensors="pt",
    return_dict=True,
)
model_inputs = model_inputs.to(device)

# Sampling is enabled, so the generated answer can differ between runs.
gen_kwargs = dict(
    max_new_tokens=256,
    do_sample=True,
    num_beams=2,
    temperature=1.0,
    top_p=0.9,
)

base_model.eval()
input_ids = model_inputs.input_ids
with th.inference_mode():
    complete_ids = base_model.generate(
        input_ids=input_ids,
        attention_mask=model_inputs.attention_mask,
        **gen_kwargs
    )

# Each output row starts with its prompt tokens; slice them off to keep only the new tokens.
generated_ids = []
for prompt_row, full_row in zip(input_ids, complete_ids):
    generated_ids.append(full_row[len(prompt_row):])

response = tokenizer.batch_decode(sequences=generated_ids, skip_special_tokens=True)[0]
pp(response)

('华为云慧通差旅是华为云推出的一款企业级差旅管理解决方案，旨在帮助企业实现差旅管理的数字化和智能化。华为云慧通差旅可以帮助企业实现以下目标：\n'
 '\n'
 '1. 提高差旅管理效率：通过智能化的差旅管理平台，企业可以实现差旅申请、审批、报销等流程的自动化，提高差旅管理的效率。\n'
 '\n'
 '2. 降低差旅成本：通过智能化的差旅管理平台，企业可以实现对差旅费用的精细化管理，降低差旅成本。\n'
 '\n'
 '3. 提高差旅体验：通过智能化的差旅管理平台，企业可以实现对差旅人员的个性化服务，提高差旅体验。\n'
 '\n'
 '4. 提升企业形象：通过智能化的差旅管理平台，企业可以提升企业形象，树立良好的企业形象。\n'
 '\n'
 '华为云慧通差旅可以帮助企业实现差旅管理的数字化和智能化，提高差旅管理的效率，降低差旅成本，提高差旅体验，提升企业形象。')


In [15]:
# Prepare the base model for training: enable gradient checkpointing, make the
# inputs require gradients, and turn the generation cache off.
base_model.gradient_checkpointing_enable()
base_model.enable_input_require_grads()
base_model.config.use_cache = False

# With more than one visible CUDA device, set both parallelism flags on the model.
has_multiple_gpus = th.cuda.device_count() > 1
if has_multiple_gpus:
    for flag_name in ("is_parallelizable", "model_parallel"):
        setattr(base_model, flag_name, True)

In [16]:
# Query PyTorch's CUDA memory counters (in bytes) and print both in GiB.
allocated_memory, cached_memory = (
    th.cuda.memory_allocated(),
    th.cuda.memory_reserved(),
)
print(f"已分配的GPU内存：{allocated_memory / 1024**3:.2f}G, 已缓存的GPU内存：{cached_memory / 1024**3:.2f}G")

已分配的GPU内存：0.94G, 已缓存的GPU内存：1.00G


In [17]:
# Compare the tokenizer length with the number of rows in the model's input embedding.
tokenizer_size = len(tokenizer)
embedding_weight = base_model.get_input_embeddings().weight
embedding_size = embedding_weight.shape[0]
print(f"tokenizer_size = {tokenizer_size}; embedding_size = {embedding_size}")

# Resize only when the tokenizer has more entries than the embedding has rows.
needs_resize = embedding_size < tokenizer_size
if needs_resize:
    base_model.resize_token_embeddings(tokenizer_size)

tokenizer_size = 151665; embedding_size = 151936


## step-5: 模型参数

In [18]:
# Hyper-parameters gathered in one mapping; later cells read them with `config_model.get(...)`.
config_model = dict(
    # LoRA-related settings
    rank=8,
    lora_alpha=32,
    lora_dropout=0.1,
    use_rslora=True,
    # training schedule / optimizer settings
    epochs=5,
    batch_size=1,
    gradient_steps=1,
    learning_rate=1e-5,
    weight_decay=0.01,
    # sequence length setting
    max_seq_length=512,
)

## step-6: LoRA参数

## step-7: 模型训练

In [19]:
# Tokenization function (intended for `Dataset.map(..., batched=True)`)
def tokenize_function(sample, max_length=256):
    """Tokenize a batch of texts and attach causal-LM labels.

    Args:
        sample: Mapping with a "text" entry holding the strings to tokenize.
        max_length: Truncation length in tokens. Defaults to 256, the value that
            used to be hard-coded here; another value can be supplied with
            ``Dataset.map(fn_kwargs={"max_length": ...})``.

    Returns:
        The tokenizer output (PyTorch tensors, padded to the longest sequence of
        the batch) with an added "labels" entry: a copy of "input_ids" in which
        every padded position (attention mask == 0) is replaced by -100.
    """
    inputs = tokenizer(
        text=sample["text"],
        max_length=max_length,
        truncation=True,
        padding=True,
        return_tensors="pt",
    )
    # Clone so that labels never share storage with input_ids, and mark padding
    # with -100 so those positions are ignored by the loss instead of being
    # trained on as real targets. Masking uses the attention mask rather than the
    # pad token id, so a pad-token string deliberately written into the text is kept.
    labels = inputs["input_ids"].clone()
    labels[inputs["attention_mask"] == 0] = -100
    inputs["labels"] = labels
    return inputs

In [20]:
# Tokenize the dataset in batches; the original columns are removed so only
# the values returned by `tokenize_function` remain.
dataset_tokenized = dataset.map(tokenize_function, batched=True, remove_columns=dataset.column_names)

Map:   0%|          | 0/9 [00:00<?, ? examples/s]

In [23]:
# Disabled: split selection. Presumably only meaningful when `dataset` was built
# with `train_test_split` (so "train"/"test" keys exist) — confirm before enabling.
# dataset_train_tokenized = dataset_tokenized["train"]
# dataset_test_tokenized = dataset_tokenized["test"]

In [21]:
# Training arguments; numeric hyper-parameters are read from `config_model`.
# The same batch size is used for training and evaluation.
batch_size_per_device = config_model.get("batch_size")
args_train = TrainingArguments(
    output_dir=os.path.join(path_output, "model_pt"),
    # schedule and batching
    num_train_epochs=config_model.get("epochs"),
    per_device_train_batch_size=batch_size_per_device,
    per_device_eval_batch_size=batch_size_per_device,
    gradient_accumulation_steps=config_model.get("gradient_steps"),
    gradient_checkpointing=True,
    # optimizer
    optim="adamw_torch",
    learning_rate=config_model.get("learning_rate"),
    weight_decay=config_model.get("weight_decay"),
    # log, checkpoint and evaluate once per epoch
    logging_strategy="epoch",
    save_strategy="epoch",
    # NOTE(review): newer transformers releases name this argument `eval_strategy`;
    # confirm against the installed version before upgrading.
    evaluation_strategy="epoch",
    # save_total_limit=1,
    # metric_for_best_model="eval_loss",
    load_best_model_at_end=True,
)

In [22]:
# Batch collator for language modeling; mlm=False selects the non-masked (causal) mode.
collate_fn = DataCollatorForLanguageModeling(tokenizer, mlm=False) 

In [23]:
# Assemble the Trainer.
# NOTE: the same tokenized dataset is passed as both the training and the evaluation
# set, so the evaluation loss is measured on the training data.
trainer_options = dict(
    model=base_model,
    tokenizer=tokenizer,
    args=args_train,
    data_collator=collate_fn,
    train_dataset=dataset_tokenized,
    eval_dataset=dataset_tokenized,
)
trainer = Trainer(**trainer_options)

In [None]:
# Run training and keep the value returned by `trainer.train()`.
res_train = trainer.train()

## step-8: 模型评估

In [31]:
# Evaluate with the trainer and print the returned metrics dictionary.
res_eval = trainer.evaluate()
print(res_eval)

  0%|          | 0/2 [00:00<?, ?it/s]

{'eval_loss': 3.144468069076538, 'eval_runtime': 0.3662, 'eval_samples_per_second': 5.461, 'eval_steps_per_second': 5.461, 'epoch': 1.5714285714285714}


## step-9: 模型保存

In [32]:
# 1 - Save through the Trainer (the usual route when training with `Trainer`).
# NOTE(review): as far as the Trainer documentation describes, `save_model` writes the model
# so it can be reloaded with `from_pretrained`; it is not expected to include optimizer
# state — confirm before relying on this folder to resume training.
path_saved_by_trainer = os.path.join(path_output, "model_pt_1")
trainer.save_model(output_dir=path_saved_by_trainer)

# 2 - Save the model object directly (the usual route outside a Trainer workflow);
# weight files are sharded with a 4GB limit per shard.
path_saved_by_model = os.path.join(path_output, "model_pt_2")
base_model.save_pretrained(save_directory=path_saved_by_model, max_shard_size="4GB")