In [None]:
import json
import requests
import mindspore as ms 
from mindnlp import core 
from mindspore import nn 
from peft import PeftModel, LoraConfig 
import numpy as np 
from transformers import AutoTokenizer, AutoModelForCausalLM

class ModelServer:
    """Retrieval-augmented chat client for a vLLM-MindSpore chat-completions API.

    Loads a JSON knowledge base, retrieves the entries that best overlap with
    the user query, and sends them together with the question to the API.
    """

    def __init__(self, knowledge_base_path, api_url="http://localhost:8000/v1/chat/completions"):
        """Load the knowledge base and remember the API endpoint.

        Args:
            knowledge_base_path: Path to a UTF-8 JSON file. Retrieval iterates
                the loaded value and reads ``item["content"]``, so it is
                expected to be a list of objects with a string "content" field.
            api_url: URL of the vLLM-MindSpore chat-completions endpoint.
        """
        self.api_url = api_url

        # Load the RAG knowledge base.
        with open(knowledge_base_path, 'r', encoding='utf-8') as f:
            self.knowledge_base = json.load(f)
        print("知识库加载完成！")

    @staticmethod
    def _tokenize(text):
        """Return the set of match tokens for *text*.

        Tokens are the lower-cased whitespace-separated words plus, for every
        run of CJK ideographs, its overlapping character bigrams (a lone
        ideograph is kept as-is). Chinese is written without spaces, so
        whitespace splitting alone would turn a whole sentence into a single
        token that never matches anything.
        """
        text = text.lower()
        tokens = set(text.split())

        run = ""
        for ch in text + " ":  # the trailing space flushes a run that ends the text
            if "\u4e00" <= ch <= "\u9fff":  # CJK Unified Ideographs
                run += ch
                continue
            if len(run) == 1:
                tokens.add(run)
            else:
                tokens.update(run[i:i + 2] for i in range(len(run) - 1))
            run = ""
        return tokens

    def retrieve_knowledge(self, query, top_k=3):
        """Return up to *top_k* knowledge items ranked by token overlap with *query*.

        This is a simple keyword retriever (it can be swapped for a vector
        retriever such as MindTinyRAG). Items that share no token with the
        query are dropped; ties keep their knowledge-base order.

        Args:
            query: User question.
            top_k: Maximum number of items to return.

        Returns:
            A list of knowledge-base items, best match first.
        """
        query_tokens = self._tokenize(query)
        results = []
        for item in self.knowledge_base:
            score = len(query_tokens & self._tokenize(item["content"]))
            if score > 0:
                results.append((score, item))
        # list.sort is stable, so equally scored items stay in file order.
        results.sort(key=lambda x: x[0], reverse=True)
        return [item for _, item in results[:top_k]]

    @staticmethod
    def _build_messages(query, knowledge, conversation_history=None):
        """Assemble the chat messages: system prompt, prior turns, then the grounded question."""
        system_prompt = (
            "你是一个专业的北京旅游咨询助手，请严格根据提供的知识回答问题。"
            "如果知识中没有提到，请回答“我在知识库中没有找到相关信息”。禁止编造。"
        )
        knowledge_text = "\n".join([f"[知识{i+1}] {item['content']}" for i, item in enumerate(knowledge)])

        messages = [{"role": "system", "content": system_prompt}]
        if conversation_history:
            # Copy the entries so the caller's list is never mutated.
            messages.extend(conversation_history)
        messages.append(
            {"role": "user", "content": f"相关知识:\n{knowledge_text}\n\n用户问题: {query}"}
        )
        return messages

    def generate_response(self, query, conversation_history=None, max_tokens=512, lora="sql-lora",
                          timeout=120):
        """Answer *query* through the vLLM-MindSpore API, grounded on retrieved knowledge.

        Args:
            query: User question.
            conversation_history: Optional earlier turns, as a list of
                ``{"role": ..., "content": ...}`` chat messages. They are placed
                between the system prompt and the current question.
            max_tokens: Generation length limit forwarded to the API.
            lora: Value sent as the request's "model" field; selects the LoRA
                module to use.
            timeout: Seconds to wait for the HTTP request before giving up
                (``None`` waits indefinitely).

        Returns:
            A tuple ``(answer, knowledge)``: the generated text and the list of
            knowledge items that were put into the prompt.

        Raises:
            requests.HTTPError: The API answered with an error status.
            requests.Timeout: The API did not answer within *timeout*.
        """
        # 1. Retrieve the relevant knowledge.
        knowledge = self.retrieve_knowledge(query)

        # 2-3. Build the system prompt and the conversation context.
        messages = self._build_messages(query, knowledge, conversation_history)

        # 4. Call the vLLM-MindSpore API.
        payload = {
            "model": lora,   # which LoRA module to use
            "messages": messages,
            "max_tokens": max_tokens,
            "temperature": 0.7,
            "top_p": 0.9
        }

        response = requests.post(self.api_url, json=payload, timeout=timeout)
        # Fail with the HTTP error itself instead of a KeyError on "choices".
        response.raise_for_status()
        result = response.json()

        # 5. Extract the answer from the response.
        answer = result["choices"][0]["message"]["content"]
        return answer, knowledge


In [None]:
def main():
    """Ask the server a few sample questions and print each answer with its sources."""
    sample_questions = (
        "你知道故宫的门票是多少钱吗？",
        "北京的恭王府好不好？",
        "恭王府的开放时间是什么时候？",
        "天安门广场需要门票吗？",
    )

    rag_server = ModelServer(knowledge_base_path="./knowledge_base.json")

    for question in sample_questions:
        print(f"\n问题: {question}")
        reply, references = rag_server.generate_response(question)
        print("回答:", reply)
        # Show only the first 50 characters of each retrieved entry.
        previews = [ref["content"][:50] for ref in references]
        print("参考知识:", previews)

# Run the demo only when this code is executed directly, not when it is imported.
if __name__ == "__main__":
    main()


# Shell command (not Python): start the vLLM-MindSpore API server on port 8000
# with LoRA enabled. The adapter is registered under the name "sql-lora", which
# is the default value the client sends in the request's "model" field.
# NOTE(review): the vLLM-MindSpore docs appear to invoke this as
# `python3 -m vllm_mindspore.entrypoints vllm.entrypoints.openai.api_server`
# (a space, not a dot, after `vllm_mindspore.entrypoints`) - confirm the module path.
python -m vllm_mindspore.entrypoints.vllm.entrypoints.openai.api_server \
    --model "/home/ma-user/work/Qwen2.5-1.5B-Instruct/" \
    --enable-lora \
    --lora-modules sql-lora="/home/ma-user/work/checkpoint-1300/" \
    --port 8000
