https://huggingface.co/Qwen/Qwen2.5-3B-Instruct

1 - 问答能力  
1.1 - 角色扮演（L0 + System Prompt）  
1.2 - 常识对话 -> 特色对话（L0 + PT + SFT）  
1.3 - 知识问答（L0 + RAG）  
1.4 - 总结概括（L0 + User Prompt）  
2 - 思考能力  
2.1 - 意图识别（text-to-labels, L0 + PEFT）  
2.2 - 实体抽取（text-to-json, L0 + PEFT）  
2.3 - 实体映射（pair-to-labels, Embedding + Ranking + PEFT）  
2.4 - 代码生成（text-to-code, L1 + PEFT）

In [16]:
import os
import sys
import warnings; warnings.filterwarnings("ignore")
import numpy as np
import pandas as pd
import torch as th

from pprint import pp
from datasets import (load_dataset, load_from_disk, Dataset)
from transformers import (AutoTokenizer, AutoModel, AutoModelForCausalLM, BitsAndBytesConfig,
                          TrainingArguments, DataCollatorWithPadding, DataCollatorForLanguageModeling,
                          DataCollatorForSeq2Seq, DataCollatorForTokenClassification)
from peft import (LoraConfig, get_peft_model, PeftModel, TaskType, get_peft_model_state_dict)
from trl import SFTTrainer
from vllm import (LLM, SamplingParams)

In [2]:
# Select the compute device: use CUDA when torch reports it as available,
# otherwise fall back to the CPU.
if th.cuda.is_available():
    device = th.device("cuda")
else:
    device = th.device("cpu")

# Number of CUDA devices torch can see (variable name kept as originally spelled).
devive_cnt = th.cuda.device_count()

# Report the selected device plus the torch build and the CUDA version it was built against.
print(f"device = {device}; devive_cnt = {devive_cnt}")
print(th.__version__)
print(th.version.cuda)

device = cuda; devive_cnt = 1
2.5.1+cu121
12.1


In [3]:
# Filesystem layout used by the notebook.
#
# The two root locations can be overridden with environment variables so the
# notebook is not tied to a single machine:
#   LLM_PATH_PROJECT - project folder (data/ and output/ are siblings of it)
#   LLM_PATH_MODEL   - root folder that holds the local model checkpoints
# When a variable is unset or empty, the original hard-coded location is used,
# so existing setups keep working unchanged.
path_project = (
    os.environ.get("LLM_PATH_PROJECT")
    or "C:/my_project/MyGit/Machine-Learning-Column/hugging_face"
)
# Data folder lives next to the project folder (under the project's parent directory).
path_data = os.path.join(os.path.dirname(path_project), "data")
path_model = os.environ.get("LLM_PATH_MODEL") or "F:/LLM"
# Output folder also lives next to the project folder.
path_output = os.path.join(os.path.dirname(path_project), "output")

## step-1: 载入数据源

## step-2: tokenizer

In [20]:
# Model identifier; it is joined onto `path_model` to locate the local checkpoint directory.
checkpoint = "Qwen/Qwen2.5-3B-Instruct"

In [21]:
# Load the tokenizer from the local checkpoint directory `<path_model>/<checkpoint>`.
# `local_files_only=True` with `force_download=False` asks the loader to use files
# already on disk rather than fetching them.
tokenizer = AutoTokenizer.from_pretrained(
    pretrained_model_name_or_path=os.path.join(path_model, checkpoint),
    cache_dir=path_model,
    force_download=False,
    local_files_only=True
)

In [22]:
# Pretty-print the tokenizer's special-token mapping (EOS, PAD, additional special tokens).
pp(tokenizer.special_tokens_map)

{'eos_token': '<|im_end|>',
 'pad_token': '<|endoftext|>',
 'additional_special_tokens': ['<|im_start|>',
                               '<|im_end|>',
                               '<|object_ref_start|>',
                               '<|object_ref_end|>',
                               '<|box_start|>',
                               '<|box_end|>',
                               '<|quad_start|>',
                               '<|quad_end|>',
                               '<|vision_start|>',
                               '<|vision_end|>',
                               '<|vision_pad|>',
                               '<|image_pad|>',
                               '<|video_pad|>']}


## step-3: 配置量化

In [23]:
# 4-bit bitsandbytes quantization settings (QLoRA-style): NF4 quantization type,
# bfloat16 compute dtype, and double quantization enabled.
# NOTE(review): the model-loading cell has its `quantization_config` argument
# commented out, so this config is defined but not applied there.
config_bnb = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=th.bfloat16,
    bnb_4bit_use_double_quant=True
)  # QLoRA

## step-4: 载入基模

In [24]:
# Load the causal-LM weights from the local checkpoint directory `<path_model>/<checkpoint>`
# (files on disk only, no forced download), requesting bfloat16 weights and
# automatic device placement.
base_model = AutoModelForCausalLM.from_pretrained(
    pretrained_model_name_or_path=os.path.join(path_model, checkpoint),
    cache_dir=path_model,
    force_download=False,
    local_files_only=True,
    device_map="auto",
    torch_dtype=th.bfloat16,  # alternative value: "auto"
    # Quantization is currently switched off; uncomment the next line to load with `config_bnb`.
    # quantization_config=(config_bnb if config_bnb else None),
)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [28]:
# Pretty-print the loaded model to inspect its module structure.
pp(base_model)

Qwen2ForCausalLM(
  (model): Qwen2Model(
    (embed_tokens): Embedding(151936, 2048)
    (layers): ModuleList(
      (0-35): 36 x Qwen2DecoderLayer(
        (self_attn): Qwen2SdpaAttention(
          (q_proj): Linear(in_features=2048, out_features=2048, bias=True)
          (k_proj): Linear(in_features=2048, out_features=256, bias=True)
          (v_proj): Linear(in_features=2048, out_features=256, bias=True)
          (o_proj): Linear(in_features=2048, out_features=2048, bias=False)
          (rotary_emb): Qwen2RotaryEmbedding()
        )
        (mlp): Qwen2MLP(
          (gate_proj): Linear(in_features=2048, out_features=11008, bias=False)
          (up_proj): Linear(in_features=2048, out_features=11008, bias=False)
          (down_proj): Linear(in_features=11008, out_features=2048, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): Qwen2RMSNorm((2048,), eps=1e-06)
        (post_attention_layernorm): Qwen2RMSNorm((2048,), eps=1e-06)
      )
    )
    (norm):

## step-5: 配置模型

## step-6: 配置 LoRA

## step-7: 模型训练

## step-8: 模型评估

## step-9: 模型保存

## step-10: 模型推理

In [131]:
# System prompt defining the assistant persona (business-travel customer service).
# The sentences are concatenated with no separator into one string.
# NOTE(review): the recorded output of the chat-template cell shows a different
# wording of this prompt, so the saved outputs predate this text; re-run to refresh.
system_prompt = "".join([
    "你叫小哈助手，是由Lukas开发的差旅智能客服。",
    "你能为用户提供差旅知识问答和酒店推荐服务。",
    "你要以私人差旅秘书的身份为用户提供差旅方面的支持和关怀。",
    "You are a helpful assistant on business travel.",
])

In [132]:
# User utterance sent to the model. Exactly one line is active; the commented
# lines are alternative probes that can be swapped in.
# user_prompt = "你好"  # greeting: "Hello"
# user_prompt = "你是谁？"  # identity: "Who are you?"
# user_prompt = "你会什么？"  # capabilities: "What can you do?"
# user_prompt = "吴彦祖是谁？"  # general knowledge: "Who is Daniel Wu?"
# user_prompt = "我今天心情不太好"  # small talk: "I'm not in a good mood today"
user_prompt = "你怎么什么都不会"  # complaint: "How come you can't do anything?"

In [133]:
# Conversation in role/content form: the system persona first, then the user turn.
messages = [
    dict(role="system", content=system_prompt),
    dict(role="user", content=user_prompt),
]

In [134]:
# Render the conversation through the tokenizer's chat template into a single
# prompt string (`tokenize=False` returns text, not ids). `add_generation_prompt=True`
# requests the trailing assistant header so the model continues as the assistant.
text = tokenizer.apply_chat_template(
    messages,
    tokenize=False,
    add_generation_prompt=True
)
pp(text)

('<|im_start|>system\n'
 '你叫小爱同学，是由工程师Lukas开发的差旅智能客服。你能为用户提供差旅知识问答和酒店推荐服务。你要以私人差旅秘书的身份为用户提供差旅方面的支撑和关怀。You '
 'are a helpful assistant on business travel.<|im_end|>\n'
 '<|im_start|>user\n'
 '你怎么什么都不会<|im_end|>\n'
 '<|im_start|>assistant\n')


In [135]:
# Tokenize the rendered prompt as a batch of one, return PyTorch tensors, and
# move them to the selected device.
model_inputs = tokenizer([text], return_tensors="pt").to(device)
pp(model_inputs)

{'input_ids': tensor([[151644,   8948,    198,  56568,  99882,  30709,  99242, 101181,   3837,
         104625, 105503,     43,   3101,    300, 100013,   9370,  99572,  99407,
         100168, 105041,   1773, 107809,  17714, 110782,  99572,  99407, 100032,
         111436,  33108, 101078, 101914,  47874,   1773, 105182,  23031, 105815,
          99572,  99407, 101628, 106613,  17714, 110782,  99572,  99407, 104481,
         104069,  33108, 107045,   1773,   2610,    525,    264,  10950,  17847,
            389,   2562,   5821,     13, 151645,    198, 151644,    872,    198,
         109111,  99245, 107859, 151645,    198, 151644,  77091,    198]],
       device='cuda:0'),
 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]],
       device='cuda:0')}


In [128]:
# Keyword arguments forwarded to `generate`. Only the cap on newly generated
# tokens is active; the sampling options below are kept disabled for reference.
gen_kwargs = dict(
    max_new_tokens=512,
    # top_p=0.9,
    # temperature=0.1,
    # do_sample=True,
    # top_k=1,
)

In [136]:
# Run one generation pass and measure its wall-clock duration with pandas timestamps.
t0 = pd.Timestamp.now()
base_model.eval()  # put the module in evaluation mode before generating
with th.inference_mode():  # inference only: no autograd tracking needed
    # The returned sequences are sliced past the prompt length in the decoding
    # cell, i.e. they are expected to begin with the prompt tokens.
    complete_ids = base_model.generate(
        input_ids=model_inputs.input_ids,
        attention_mask=model_inputs.attention_mask,
        **gen_kwargs
    )
t1 = pd.Timestamp.now()
pp(t1 - t0)  # elapsed time as a Timedelta

Timedelta('0 days 00:00:05.061318')


In [137]:
# Strip the prompt tokens from each generated sequence so only the newly
# produced continuation is decoded.
input_ids = model_inputs.input_ids
generated_ids = [
    full_ids[len(prompt_ids):]
    for prompt_ids, full_ids in zip(input_ids, complete_ids)
]

# Decode the continuations to text (special tokens dropped) and keep the first
# item of the batch.
response = tokenizer.batch_decode(
    sequences=generated_ids,
    skip_special_tokens=True,
)[0]
pp(response)

'你好！作为你的私人差旅秘书，我主要的功能是提供差旅方面的知识问答和酒店推荐服务。对于一些特定的问题，比如预订机票、火车票或者酒店的具体操作，可能需要你提供更多的信息，例如目的地、出行日期等。此外，我也能够帮助解答关于签证、交通、餐饮等方面的问题。如果你有任何关于出差的问题，都可以随时问我哦！有什么我可以帮助你的吗？'
