In [16]:
import os
import sys
import warnings; warnings.filterwarnings("ignore")
import json
import requests
import numpy as np
import pandas as pd
import torch as th

from pprint import pp
from IPython.display import (Markdown, display)
from dotenv import load_dotenv
from datasets import (load_dataset, load_from_disk, Dataset)
from transformers import (AutoTokenizer, 
                          BitsAndBytesConfig,
                          AutoModel, 
                          AutoModelForCausalLM, 
                          AutoModelForSequenceClassification,
                          DataCollatorWithPadding, 
                          DataCollatorForLanguageModeling,
                          DataCollatorForSeq2Seq, 
                          DataCollatorForTokenClassification,
                          TrainingArguments, Trainer,
                          pipeline)
from peft import (LoraConfig, get_peft_model, PeftModel, TaskType, get_peft_model_state_dict)
from trl import SFTTrainer
from openai import OpenAI
# from vllm import (LLM, SamplingParams)

In [3]:
if sys.platform == "darwin":
    device = th.device("mps")
else:
    device = th.device("cuda" if th.cuda.is_available() else "cpu")
    
devive_cnt = th.cuda.device_count()
print(f"device = {device}; devive_cnt = {devive_cnt}")
print(f"torch = {th.__version__}")
print(f"cuda = {th.version.cuda}")

device = cuda; devive_cnt = 1
torch = 2.5.1+cu121
cuda = 12.1


In [4]:
path_project = "C:/my_project/MyGit/Machine-Learning-Column/hugging_face"
path_data = os.path.join(os.path.dirname(path_project), "data")
path_output = os.path.join(os.path.dirname(path_project), "output")

if sys.platform == "darwin":
    path_model = "/Users/lukasi33/project/LLM"
else:
    path_model = "F:/LLM"

## step-1: 载入 API KEY

## step-2: 载入 token

In [6]:
# checkpoint = "Qwen/Qwen2.5-1.5B-Instruct"
checkpoint = "Qwen/Qwen2.5-3B-Instruct"
# checkpoint = "Qwen/Qwen2.5-7B-Instruct"
# checkpoint = "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B"

In [7]:
tokenizer = AutoTokenizer.from_pretrained(
    pretrained_model_name_or_path=os.path.join(path_model, checkpoint),
    cache_dir=path_model,
    force_download=False,
    local_files_only=True
)

In [8]:
pp(tokenizer.special_tokens_map)

{'eos_token': '<|im_end|>',
 'pad_token': '<|endoftext|>',
 'additional_special_tokens': ['<|im_start|>',
                               '<|im_end|>',
                               '<|object_ref_start|>',
                               '<|object_ref_end|>',
                               '<|box_start|>',
                               '<|box_end|>',
                               '<|quad_start|>',
                               '<|quad_end|>',
                               '<|vision_start|>',
                               '<|vision_end|>',
                               '<|vision_pad|>',
                               '<|image_pad|>',
                               '<|video_pad|>']}


## step-3: 载入基模

In [9]:
# transformers
config_bnb = BitsAndBytesConfig(
    # load_in_4bit=True,
    # bnb_4bit_quant_type="nf4",
    # bnb_4bit_compute_dtype=th.bfloat16,
    # bnb_4bit_use_double_quant=True,
    load_in_8bit=True,
) 

base_model = AutoModelForCausalLM.from_pretrained(
    pretrained_model_name_or_path=os.path.join(path_model, checkpoint),
    cache_dir=path_model,
    force_download=False,
    local_files_only=True,
    device_map="auto",
    low_cpu_mem_usage=True,
    torch_dtype=th.bfloat16,  # th.bfloat16, th.float16, th.float8
    # quantization_config=(config_bnb if config_bnb else None),
)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [None]:
# vLLM
# base_model = LLM(model=os.path.join(path_model, checkpoint), 
#                 task="generate",
#                 device=device)
'''
ValueError: Bfloat16 is only supported on GPUs with compute capability of at least 8.0. 
Your NVIDIA GeForce GTX 1080 Ti GPU has compute capability 6.1. 
You can use float16 instead by explicitly setting the`dtype` flag in CLI, for example: --dtype=half.
'''

In [15]:
allocated_memory = th.cuda.memory_allocated()
cached_memory = th.cuda.memory_reserved()
pp(f"已分配的GPU内存：{allocated_memory / 1024**3:.2f}G, 已缓存的GPU内存：{cached_memory / 1024**3:.2f}G")

'已分配的GPU内存：5.85G, 已缓存的GPU内存：6.00G'


In [12]:
print(base_model.dtype)

torch.bfloat16


In [None]:
for i, (name, parm) in enumerate(base_model.named_parameters()):
    print(f"{i}  name: {name};  shape: {parm.shape};  dtype: {parm.dtype};  device: {parm.device}")

In [13]:
print(base_model)

Qwen2ForCausalLM(
  (model): Qwen2Model(
    (embed_tokens): Embedding(151936, 2048)
    (layers): ModuleList(
      (0-35): 36 x Qwen2DecoderLayer(
        (self_attn): Qwen2SdpaAttention(
          (q_proj): Linear(in_features=2048, out_features=2048, bias=True)
          (k_proj): Linear(in_features=2048, out_features=256, bias=True)
          (v_proj): Linear(in_features=2048, out_features=256, bias=True)
          (o_proj): Linear(in_features=2048, out_features=2048, bias=False)
          (rotary_emb): Qwen2RotaryEmbedding()
        )
        (mlp): Qwen2MLP(
          (gate_proj): Linear(in_features=2048, out_features=11008, bias=False)
          (up_proj): Linear(in_features=2048, out_features=11008, bias=False)
          (down_proj): Linear(in_features=11008, out_features=2048, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): Qwen2RMSNorm((2048,), eps=1e-06)
        (post_attention_layernorm): Qwen2RMSNorm((2048,), eps=1e-06)
      )
    )
    (norm):

## step-4: 模型推理

In [18]:
system_prompt = (
    "你叫小慧助手，是由Lukas开发的差旅智能客服。"
    "你的身份是一名差旅秘书，"
    "你的任务是为用户提供基础对话、差旅知识问答、酒店推荐服务。"
    "当问及你的模型参数时，标准回答是属于公司保密信息，要强调模型设计的高效，能够提供高质量的服务。"
    "You are a helpful assistant on business travel."
)

In [15]:
user_prompt = "你好呀，新年好"
# user_prompt = "我今天心情不好"

In [16]:
messages = [
    {"role": "system", "content": system_prompt},
    {"role": "user", "content": user_prompt}
]

In [17]:
text = tokenizer.apply_chat_template(
    messages,
    tokenize=False,
    add_generation_prompt=True
)
display(Markdown(text))

<|im_start|>system
你叫小慧助手，是由Lukas开发的差旅智能客服。你的身份是一名差旅秘书，你的任务是为用户提供基础对话、差旅知识问答、酒店推荐服务。当问及你的模型参数时，标准回答是属于公司保密信息，要强调模型设计的高效，能够提供高质量的服务。You are a helpful assistant on business travel.<|im_end|>
<|im_start|>user
你好呀，新年好<|im_end|>
<|im_start|>assistant


In [18]:
model_inputs = tokenizer([text], return_tensors="pt").to(device)
pp(model_inputs)

{'input_ids': tensor([[151644,   8948,    198,  56568,  99882,  30709, 101104, 110498,   3837,
         104625,     43,   3101,    300, 100013,   9370,  99572,  99407, 100168,
         105041,   1773, 103929, 101294, 110124,  99572,  99407, 101628,   3837,
         103929,  88802,  20412,  17714, 110782,  99896, 105051,   5373,  99572,
          99407, 100032, 111436,   5373, 101078, 101914,  47874,   1773,  39165,
          56007,  81217, 103929, 104949,  32665,  13343,   3837, 100142, 102104,
          20412, 100409,  73218, 107534,  27369,   3837,  30534, 104046, 104949,
          70500,   9370, 102202,   3837, 100006,  99553, 104129, 105646,   1773,
           2610,    525,    264,  10950,  17847,    389,   2562,   5821,     13,
         151645,    198, 151644,    872,    198, 108386, 104256,   3837, 107924,
          52801, 151645,    198, 151644,  77091,    198]], device='cuda:0'),
 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,

In [19]:
gen_kwargs = {
    "max_new_tokens": 512,
    "do_sample": True,
    "num_beams": 2,
    "temperature": 1.5,
    "top_p": 0.9,
}

t0 = pd.Timestamp.now()
base_model.eval()
with th.inference_mode():
    complete_ids = base_model.generate(
        input_ids=model_inputs.input_ids,
        attention_mask=model_inputs.attention_mask,
        **gen_kwargs
    )
t1 = pd.Timestamp.now()
pp(t1 - t0)

Timedelta('0 days 00:00:05.617626')


In [21]:
# Qwen/Qwen2.5-1.5B-Instruct
input_ids = model_inputs.input_ids
generated_ids = [O[len(I): ] for (I, O) in zip(input_ids, complete_ids)]
response = tokenizer.batch_decode(sequences=generated_ids, skip_special_tokens=True)[0]
display(Markdown(response))

新年好！很高兴为您服务。请问有什么可以帮助您的吗？

In [14]:
# deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B
input_ids = model_inputs.input_ids
generated_ids = [O[len(I): ] for (I, O) in zip(input_ids, complete_ids)]
response = tokenizer.batch_decode(sequences=generated_ids, skip_special_tokens=False)[0]
display(Markdown(response))

<think>
好的，我现在需要帮助用户问一个关于Lukas开发团队的问题。用户刚刚说：“你好呀，你能介绍下你的开发团队么”。首先，我得理解用户的需求。用户想了解Lukas公司的开发团队成员，这可能是因为他们需要了解公司的内部结构，或者想确认是否有相关的培训或资源。

接下来，我应该分析用户的身份和可能的使用场景。用户是差旅秘书，所以他们可能在处理差旅相关的问题，可能需要专业的建议和信息。因此，了解Lukas的开发团队，有助于提供更有针对性的帮助。

然后，我需要考虑用户可能的深层需求。用户可能不仅想要团队成员的基本信息，还可能希望了解他们的经验和技能，以便更好地支持差旅客户。因此，除了团队成员，还可能想知道他们是否有相关的培训课程或者资源。

另外，用户可能想知道Lukas是否具备独立开发能力，这有助于用户评估其专业性。此外，了解他们的核心能力，比如智能客服、酒店推荐等，可以帮助用户更好地利用Lukas的优势。

现在，我需要构建一个友好的回答。我应该以礼貌和专业的方式回应，同时提供必要的信息。首先感谢用户的提问，然后简要介绍Lukas的开发团队，提到团队的职责和成员的贡献。最后，强调Lukas的核心能力，如智能客服和酒店推荐，让用户了解其优势。

需要注意的是，避免透露公司的商业秘密，保持信息的保密性。同时，语言要简洁明了，让用户容易理解和接受。

最后，检查回答是否符合用户的需求，是否涵盖了他们可能关心的各个方面，确保信息全面且准确。
</think>

当然可以！Lukas的开发团队由经验丰富的专业人员组成，致力于提供高效、专业的差旅智能客服服务。如果你有任何关于团队或产品需求的问题，随时可以告诉我！<｜end▁of▁sentence｜>

In [34]:
cfg = {
    "model": os.path.join(path_model, checkpoint),
    "model_server": "http://127.0.0.1:7905/v1",
    "generate_cfg": {
        "temperature": 1.5,
        "top_p": 0.9,
    }
}

model = get_chat_model(cfg)

In [10]:
cityName2districtId = {
    "南京": "320100",
    "深圳": "440300"
}

In [11]:
def get_weather(cityName):
    districtId = cityName2districtId.get(cityName)
    url = f"https://api.map.baidu.com/weather/v1/?district_id={districtId}&data_type=all&ak={baidu_key}"
    response = requests.get(url)
    data = response.json()
    return json.dumps(data)

In [12]:
# test tool
data = get_weather(cityName="南京")
data

'{"status": 0, "result": {"location": {"country": "\\u4e2d\\u56fd", "province": "\\u6c5f\\u82cf\\u7701", "city": "\\u5357\\u4eac\\u5e02", "name": "\\u5357\\u4eac", "id": "320100"}, "now": {"text": "\\u6674", "temp": 1, "feels_like": 0, "rh": 50, "wind_class": "1\\u7ea7", "wind_dir": "\\u4e1c\\u5357\\u98ce", "uptime": "20250129100500"}, "forecasts": [{"text_day": "\\u6674", "text_night": "\\u6674", "high": 9, "low": -2, "wc_day": "3~4\\u7ea7", "wd_day": "\\u4e1c\\u5357\\u98ce", "wc_night": "3~4\\u7ea7", "wd_night": "\\u4e1c\\u5357\\u98ce", "date": "2025-01-29", "week": "\\u661f\\u671f\\u4e09"}, {"text_day": "\\u591a\\u4e91", "text_night": "\\u591a\\u4e91", "high": 15, "low": 2, "wc_day": "3~4\\u7ea7", "wd_day": "\\u4e1c\\u98ce", "wc_night": "3~4\\u7ea7", "wd_night": "\\u4e1c\\u98ce", "date": "2025-01-30", "week": "\\u661f\\u671f\\u56db"}, {"text_day": "\\u5c0f\\u96e8", "text_night": "\\u4e2d\\u96e8", "high": 9, "low": 6, "wc_day": "3~4\\u7ea7", "wd_day": "\\u4e1c\\u5317\\u98ce", "wc_nig

In [24]:
get_weather_tool = {
    "name": "get_weather",
    "description": "根据输入的城市名称，查询天气",
    "parameters": {
        "type": "object",
        "properties": {
            "cityName": {
                "type": "string",
                "description": "城市名称"
            }
        },
        "required": ["cityName"]
    }
}

In [14]:
tools = [
    {
        "type": "function",
        "function": get_weather_tool
    }
]

In [15]:
tool_dict = {
    "get_weather": get_weather
}

In [19]:
user_prompt = "帮我查下南京明天的天气"

messages = [
    {"role": "system", "content": system_prompt},
    {"role": "user", "content": user_prompt}
]

In [36]:
response = model.chat(
    messages=messages,
    functions=[get_weather_tool],
    stream=False
)

ModelServiceError: Connection error.