# https://qwen.readthedocs.io/zh-cn/latest/framework/Langchain.html

In [6]:
# from the official website
from transformers import AutoModelForCausalLM, AutoTokenizer

# Now you do not need to add "trust_remote_code=True"
model = AutoModelForCausalLM.from_pretrained(
    "Qwen/Qwen2-7B-Instruct",
    torch_dtype="auto",
    device_map="auto"
)
tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2-7B-Instruct")

# Instead of using model.chat(), we directly use model.generate()
# But you need to use tokenizer.apply_chat_template() to format your inputs as shown below
prompt = "Give me a short introduction to large language model."
messages = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": prompt}
]
text = tokenizer.apply_chat_template(
    messages,
    tokenize=False,
    add_generation_prompt=True,
)
model_inputs = tokenizer([text], return_tensors="pt").to(model.device)

# Directly use generate() and tokenizer.decode() to get the output.
# Use `max_new_tokens` to control the maximum output length.
generated_ids = model.generate(
    **model_inputs,
    max_new_tokens=512,
)
generated_ids = [
    output_ids[len(input_ids):] for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)
]

response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]

Downloading shards:   0%|          | 0/4 [00:59<?, ?it/s]


KeyboardInterrupt: 

In [None]:
# 原始实现 prompt自己写的
import argparse
from dataclasses import dataclass, field
from typing import Optional, List, Dict
import sys
import torch
from transformers import TrainingArguments, HfArgumentParser, Trainer, AutoTokenizer, AutoModelForCausalLM

# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# 加载模型和分词器
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model_path = "/kaggle/input/qwen2.5/transformers/0.5b/1"
max_position_embeddings = 2048  # 模型支持的最大长度
tokenizer = AutoTokenizer.from_pretrained(model_path,device_map="auto" )
model = AutoModelForCausalLM.from_pretrained(pretrained_model_name_or_path=model_path, 
                                            torch_dtype="float32",
                                            trust_remote_code=True,  # Qwen模型需要这个参数
                                            device_map="auto" ).to(device)



# 初始化对话历史记录
conversation_history = []

# 定义格式化对话的函数
def format_conversation(history):
    """
    格式化对话历史，明确标注对话轮次和说话人。②
    """
    formatted_history = ""
    for i, (speaker, text) in enumerate(history):
        if i==len(history)-1:
            break
        formatted_history += f"[Round {i+1}] {speaker}: {text}\n"
    formatted_history += f"above is the history of bot's talk with user. The response to {history[-1][1]} is "
    return formatted_history

# 定义生成回复的函数
def generate_response(conversation_history, user_input):
    """
    根据用户输入和对话历史生成模型回复。
    """
    # 更新对话历史②
    conversation_history.append(("User", user_input))
    formatted_conversation = format_conversation(conversation_history)
    # 对话长度裁剪逻辑③
    while len(tokenizer(formatted_conversation)["input_ids"]) > max_position_embeddings:
        # 如果超出最大长度，移除最旧的一轮对话
        conversation_history.pop(0)
        formatted_conversation = format_conversation(conversation_history)
    
    # 编码输入并生成回复
    messages=[{"role": "system", "content": "You are a helpful assistant."},
              {"role": "user", "content": user_input}
              ]
    

    
    inputs = tokenizer(formatted_conversation, return_tensors="pt", truncation=True, max_length=max_position_embeddings).to(device)
    # generate 文档
    outputs = model.generate(**inputs,pad_token_id=tokenizer.eos_token_id, #在生成时用eos填充序列
                            max_new_tokens=50, #新生成文本长度
                            # num_beams=5,
                            # temperature=0.7,
                            # top_k=50,
                            # top_p=0.95,
                            # repetition_penalty=1.2
                            )
    # print(outputs)
    response = tokenizer.decode(outputs[:, inputs['input_ids'].shape[-1]:][0], skip_special_tokens=True) #在解码过程中跳过特殊符号如eos pad
    
    # 添加对话历史②
    conversation_history.append(("Bot", response.strip()))
    return response.strip()

# 定义主聊天逻辑
def chat():
    """
    主聊天逻辑，支持 quit、newsession 等指令。
    """
    global conversation_history
    print("Let's chat!")
    while True:
        # 获取用户输入
        user_input = input("User: ").strip()
        # 处理不同指令 ③
        if user_input.lower() == "\quit":
            print("Session ended. Bye!")
            break
        elif user_input.lower() == "\\newsession":
            conversation_history = []
            print("Conversation history cleaned.")
        else:
            response = generate_response(conversation_history, user_input)
            print(f"Bot: {response}")


In [29]:
# 修改实现 参考官方prompt格式
import argparse
from dataclasses import dataclass, field
from typing import Optional, List, Dict
import sys
import torch
from transformers import TrainingArguments, HfArgumentParser, Trainer, AutoTokenizer, AutoModelForCausalLM
import re
import os
# 设置可见的 GPU 设备为 cuda:0
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

def load_single_model(model_path,torch_dtype,trust_remote_code,device_map,use_cache):
    return AutoModelForCausalLM.from_pretrained(
            pretrained_model_name_or_path=model_path, 
            torch_dtype=torch_dtype,
            trust_remote_code=trust_remote_code,  # Qwen模型需要这个参数
            device_map=device_map,  # 可选，用于自动处理模型加载到设备
            use_cache=use_cache
        )

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
model_path = "/ssd/xiaxinyuan/code/CS3602_NLP_Final_Project/output/peft_3b/checkpoint-30000"
max_position_embeddings = 4096 # 模型支持的最大长度
tokenizer = AutoTokenizer.from_pretrained(model_path,device_map="auto" )
model = load_single_model(model_path,"bfloat16",True,"auto",False)
model = model.to(device)







Loading checkpoint shards: 100%|██████████| 2/2 [00:05<00:00,  2.75s/it]
You shouldn't move a model that is dispatched using accelerate hooks.


In [30]:
# 定义格式化对话的函数
def format_conversation(history):
    """
    格式化对话历史，明确标注对话轮次和说话人。②
    """
    formatted_history = ""
    for i, (speaker, text) in enumerate(history):
        if i==len(history)-1:
            break
        formatted_history += f"[Round {i+1}] {speaker}: {text}\n"
    formatted_history += f"above is the history of bot's talk with user. The response to {history[-1][1]} is "
    return formatted_history

# 定义生成回复的函数
def generate_response(conversation_history, user_input):
    """
    根据用户输入和对话历史生成模型回复。
    """
    # 更新对话历史②
    conversation_history.append({"role": "user", "content": user_input})
    
    
    text=tokenizer.apply_chat_template(conversation_history,tokenize=False,add_generation_prompt=True)
    inputs=tokenizer([text],return_tensors="pt").to(model.device)
    if len(inputs["input_ids"][0])>max_position_embeddings:
        # 不移除system 移除一轮对话
        conversation_history.pop(1) 
        conversation_history.pop(2)
        text=tokenizer.apply_chat_template(conversation_history,tokenize=False,add_generation_prompt=True)
        inputs=tokenizer([text],return_tensors="pt").to(model.device)

    
    # generate 文档
    print(inputs)
    print(inputs["input_ids"].device,model.device)
    outputs = model.generate(**inputs,pad_token_id=tokenizer.eos_token_id, #在生成时用eos填充序列
                            max_new_tokens=50, #新生成文本长度
                            # num_beams=5,
                            # temperature=0.7,
                            # top_k=50,
                            # top_p=0.95,
                            # repetition_penalty=1.2
                            )
    # print(outputs)
    response = tokenizer.decode(outputs[:, inputs['input_ids'].shape[-1]:][0], skip_special_tokens=True) #在解码过程中跳过特殊符号如eos pad
    
    # 添加对话历史②
    conversation_history.pop()
    last_round_content=conversation_history[-1]["content"]
    match = re.search(r'\[Round (\d+)\]', last_round_content)
    if match:
        last_round = int(match.group(1))
    else:
        last_round = 0
    conversation_history.append({"role": "user", "content": f"[Round {last_round+1}]:{user_input}"})
    conversation_history.append({"role": "assistant", "content": f"[Round {last_round+1}]:{response.strip()}"})
    return response.strip()

# 定义主聊天逻辑
def chat():
    """
    主聊天逻辑，支持 quit、newsession 等指令。
    """
    global conversation_history
    conversation_history=[{"role": "system", "content": "You are a helpful assistant."}]
    print("chat start")
    while True:
        # 获取用户输入
        user_input = input("User: ").strip()
        # 处理不同指令 ③
        if user_input.lower() == "\quit":
            print("Session ended. Bye!")
            break
        elif user_input.lower() == "\\newsession":
            conversation_history = [{"role": "system", "content": "You are a helpful assistant."}]
            print("Conversation history cleaned.")
        else:
            response = generate_response(conversation_history, user_input)
            print(f"Bot: {response}")

In [31]:
chat()

chat start
{'input_ids': tensor([[151644,   8948,    198,   2610,    525,    264,  10950,  17847,     13,
         151645,    198, 151644,    872,    198,   6023, 151645,    198, 151644,
          77091,    198]], device='cuda:0'), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]],
       device='cuda:0')}
cuda:0 cuda:0


RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cuda:0 and cuda:1! (when checking argument for argument index in method wrapper_CUDA__index_select)

In [28]:
for name, param in model.named_parameters():
    if param.device!=device:
        print(f"{name}: {param.device}")
