In [1]:
import os
import sys
import warnings; warnings.filterwarnings("ignore")
import numpy as np
import pandas as pd
import torch as th

from pprint import pp
from datasets import (load_dataset, load_from_disk, Dataset)
from transformers import (AutoTokenizer, 
                          BitsAndBytesConfig,
                          AutoModel, 
                          AutoModelForCausalLM, 
                          AutoModelForSequenceClassification,
                          DataCollatorWithPadding, 
                          DataCollatorForLanguageModeling,
                          DataCollatorForSeq2Seq, 
                          DataCollatorForTokenClassification,
                          TrainingArguments, Trainer)
from peft import (LoraConfig, get_peft_model, PeftModel, TaskType, get_peft_model_state_dict)
from trl import SFTTrainer
from vllm import (LLM, SamplingParams)



In [2]:
# Prefer the GPU when CUDA is usable; otherwise run everything on the CPU.
if th.cuda.is_available():
    device = th.device("cuda")
else:
    device = th.device("cpu")
# NOTE(review): "devive_cnt" looks like a typo for "device_cnt"; the name and the
# printed label are kept unchanged in case other cells or logs rely on them.
devive_cnt = th.cuda.device_count()
print(f"device = {device}; devive_cnt = {devive_cnt}")
# Report the installed PyTorch version and the CUDA version it reports.
print(th.__version__, th.version.cuda, sep="\n")

device = cuda; devive_cnt = 1
2.5.1+cu121
12.1


In [3]:
# Location of this project's checkout; data/ and output/ are resolved as
# siblings of it (one directory level up).
# NOTE(review): these are machine-specific absolute paths — consider making
# them configurable before sharing the notebook.
path_project = "C:/my_project/MyGit/Machine-Learning-Column/hugging_face"
# Root directory that is passed to from_pretrained as the local model store.
path_model = "F:/LLM"
path_data, path_output = (
    os.path.join(os.path.dirname(path_project), leaf) for leaf in ("data", "output")
)

## step-1: tokenizer

In [4]:
# Model identifier; it is joined onto path_model to locate the local copy
# that the tokenizer and base model are loaded from.
checkpoint = "Qwen/Qwen2.5-3B-Instruct"

In [5]:
# Load the tokenizer from the local directory <path_model>/<checkpoint>.
# local_files_only=True together with force_download=False keeps this offline.
tokenizer = AutoTokenizer.from_pretrained(
    os.path.join(path_model, checkpoint),
    local_files_only=True,
    force_download=False,
    cache_dir=path_model,
)

In [6]:
# Pretty-print the special tokens this tokenizer defines (eos, pad, and the
# additional marker tokens) so the chat-template markers can be recognised later.
pp(tokenizer.special_tokens_map)

{'eos_token': '<|im_end|>',
 'pad_token': '<|endoftext|>',
 'additional_special_tokens': ['<|im_start|>',
                               '<|im_end|>',
                               '<|object_ref_start|>',
                               '<|object_ref_end|>',
                               '<|box_start|>',
                               '<|box_end|>',
                               '<|quad_start|>',
                               '<|quad_end|>',
                               '<|vision_start|>',
                               '<|vision_end|>',
                               '<|vision_pad|>',
                               '<|image_pad|>',
                               '<|video_pad|>']}


## step-2: 载入基模

In [8]:
# Load the causal-LM weights from the same local directory as the tokenizer,
# in bfloat16, letting device_map="auto" decide where the weights are placed.
base_model = AutoModelForCausalLM.from_pretrained(
    os.path.join(path_model, checkpoint),
    torch_dtype=th.bfloat16,
    device_map="auto",
    low_cpu_mem_usage=True,
    cache_dir=path_model,
    local_files_only=True,
    force_download=False,
)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [9]:
# Snapshot the CUDA allocator counters right after loading the model; the
# values are divided by 1024**3 below to report them in "G".
allocated_memory, cached_memory = th.cuda.memory_allocated(), th.cuda.memory_reserved()
pp(f"已分配的GPU内存：{allocated_memory / 1024**3:.2f}G, 已缓存的GPU内存：{cached_memory / 1024**3:.2f}G")

'已分配的GPU内存：6.58G, 已缓存的GPU内存：6.73G'


## step-3: 模型推理

In [66]:
# System message that fixes the assistant persona (name, vendor, business-travel
# scope). The sentences are concatenated with no separator between them.
system_prompt = "".join(
    [
        "你叫小慧助手，是由BigData公司开发的差旅智能客服。",
        "你能为用户提供差旅知识问答、酒店推荐等服务。",
        "你要始终以差旅为背景回答用户的问题，或提供帮助建议。",
        "You are a helpful assistant on business travel.",
    ]
)

In [73]:
# Probe questions used to exercise the persona defined by the system prompt.
# They are kept as data (rather than commented-out assignments) so a different
# question can be chosen by changing one index, and so all of them could be
# looped over later.
user_prompt_candidates = [
    "你好，你是谁？",
    "你是谁？",
    "吴彦祖是谁？",
    "心情不好怎么办",
    "你怎么什么都不会",
    "你会写诗么？",
    "出差需要注意些什么？",
    "出差路上很无聊",
]

# Active question for this run: the last candidate, i.e. the same value the
# cell assigned before.
user_prompt = user_prompt_candidates[-1]

In [74]:
# Two-turn conversation in role/content form: the persona-setting system
# message followed by the user's question.
messages = [
    dict(role="system", content=system_prompt),
    dict(role="user", content=user_prompt),
]

In [75]:
# Render the conversation through the tokenizer's chat template as a plain
# string (tokenize=False); add_generation_prompt=True appends the opening of
# the assistant turn so the model continues from there.
text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
pp(text)

('<|im_start|>system\n'
 '你叫小慧助手，是由BigData公司开发的差旅智能客服。你能为用户提供差旅知识问答、酒店推荐等服务。你要始终以差旅为背景回答用户的问题，或提供帮助建议。You '
 'are a helpful assistant on business travel.<|im_end|>\n'
 '<|im_start|>user\n'
 '出差路上很无聊<|im_end|>\n'
 '<|im_start|>assistant\n')


In [76]:
# Tokenize the rendered prompt as a batch of one, returning PyTorch tensors,
# then move the encoded batch onto `device`.
encoded_prompt = tokenizer([text], return_tensors="pt")
model_inputs = encoded_prompt.to(device)
pp(model_inputs)

{'input_ids': tensor([[151644,   8948,    198,  56568,  99882,  30709, 101104, 110498,   3837,
         104625,  15636,   1043,  73218, 100013,   9370,  99572,  99407, 100168,
         105041,   1773, 107809,  17714, 110782,  99572,  99407, 100032, 111436,
           5373, 101078, 101914,  49567,  47874,   1773, 105182, 101217,  23031,
          99572,  99407,  17714, 102193, 102104,  20002, 103936,   3837,  57191,
          99553, 100364, 101898,   1773,   2610,    525,    264,  10950,  17847,
            389,   2562,   5821,     13, 151645,    198, 151644,    872,    198,
         112841, 100309,  99165, 109666, 151645,    198, 151644,  77091,    198]],
       device='cuda:0'),
 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]],
       device='cuda:0')}


In [77]:
# Settings forwarded to `generate`. Knobs that were tried here and are
# currently disabled: top_p=0.9, do_sample=True, top_k=1.
# NOTE(review): do_sample is not set explicitly, so whether `temperature` has
# any effect depends on the model's own generation defaults — confirm.
gen_kwargs = dict(max_new_tokens=128, temperature=1.0)

# Wall-clock timing around the whole generation step.
t0 = pd.Timestamp.now()
base_model.eval()  # put the model in evaluation mode before generating
with th.inference_mode():
    complete_ids = base_model.generate(
        input_ids=model_inputs["input_ids"],
        attention_mask=model_inputs["attention_mask"],
        **gen_kwargs,
    )
t1 = pd.Timestamp.now()
pp(t1 - t0)

Timedelta('0 days 00:00:04.831789')


In [23]:
# Drop the prompt tokens from each returned sequence so that only the newly
# generated continuation is decoded.
# NOTE(review): this cell has the same code as the last cell of the notebook
# (apart from pp vs print) and an older execution count; it looks like a stale
# duplicate — confirm whether it can be removed.
input_ids = model_inputs.input_ids
generated_ids = [
    full_seq[prompt_seq.size(0):]
    for prompt_seq, full_seq in zip(input_ids, complete_ids)
]
response = tokenizer.batch_decode(sequences=generated_ids, skip_special_tokens=True)[0]
pp(response)

'你好！我是小慧助手，由BigData公司开发的差旅智能客服。我能够为你解答关于差旅的各种问题，提供酒店推荐以及其他相关的差旅服务信息。有什么关于差旅的问题或者需要帮助的地方，都可以随时问我哦！'


In [78]:
# Keep only the tokens produced after the prompt for each sequence, decode
# them without special tokens, and show the first (only) reply as plain text.
input_ids = model_inputs.input_ids
generated_ids = []
for prompt_seq, full_seq in zip(input_ids, complete_ids):
    generated_ids.append(full_seq[len(prompt_seq):])
decoded_batch = tokenizer.batch_decode(sequences=generated_ids, skip_special_tokens=True)
response = decoded_batch[0]
print(response)

旅途中的确可能会感到有些单调乏味，不过没关系，我可以给你一些建议来打发时间哦。比如你可以阅读一本新买的书，听听喜欢的音乐，或者学习一些新技能，比如语言课程，或者是准备一个演讲稿，到了目的地可以分享给大家。

另外，如果你有任何关于旅行的问题或者需要什么帮助，尽管告诉我，我将竭诚为你服务。希望这些建议能够让你的旅程更加有趣！
