In [1]:
import os

# 从环境变量获取 DeepSeek API Key
api_key = os.getenv("DEEPSEEK_API_KEY")

In [2]:
with open("mfd.md", "r", encoding="utf-8") as file:
    file_text = file.read()

text_lines = file_text.split("**第")
print(len(text_lines))

388


In [3]:
from openai import OpenAI

deepseek_client = OpenAI(
    api_key=api_key,
    base_url="https://api.deepseek.com/v1",  # DeepSeek API 的基地址
)

In [4]:
from pymilvus import model as milvus_model

embedding_model = milvus_model.DefaultEmbeddingFunction()

In [8]:
test_embedding = embedding_model.encode_queries(["This is a test"])[0]
embedding_dim = len(test_embedding)
print(embedding_dim)
print(test_embedding[:10])

768
[-0.04836059  0.07163021 -0.01130063 -0.03789341 -0.03320651 -0.01318453
 -0.03041721 -0.02269495 -0.02317858 -0.00426026]


In [9]:
test_embedding_0 = embedding_model.encode_queries(["That is a test"])[0]
print(test_embedding_0[:10])

[-0.02752976  0.0608853   0.00388525 -0.00215193 -0.02774976 -0.0118618
 -0.04020916 -0.06023417 -0.03813156  0.0100272 ]


In [5]:
from pymilvus import MilvusClient

milvus_client = MilvusClient(uri="./milvus_mfd.db")

collection_name = "my_mfd_rag_collection"

In [6]:
if milvus_client.has_collection(collection_name):
    milvus_client.drop_collection(collection_name)

In [10]:
milvus_client.create_collection(
    collection_name=collection_name,
    dimension=embedding_dim,
    metric_type="IP",  # 内积距离
    consistency_level="Strong",  # 支持的值为 (`"Strong"`, `"Session"`, `"Bounded"`, `"Eventually"`)。更多详情请参见 https://milvus.io/docs/consistency.md#Consistency-Level。
)

In [11]:
from tqdm import tqdm

data = []

doc_embeddings = embedding_model.encode_documents(text_lines)

for i, line in enumerate(tqdm(text_lines, desc="Creating embeddings")):
    data.append({"id": i, "vector": doc_embeddings[i], "text": line})

milvus_client.insert(collection_name=collection_name, data=data)

Creating embeddings: 100%|██████████| 388/388 [00:00<00:00, 747194.65it/s]


{'insert_count': 388, 'ids': [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, 210, 211, 212, 213, 214, 215, 

In [12]:
question = "什么是不动产?"

In [13]:
search_res = milvus_client.search(
    collection_name=collection_name,
    data=embedding_model.encode_queries(
        [question]
    ),  # 将问题转换为嵌入向量
    limit=3,  # 返回前3个结果
    search_params={"metric_type": "IP", "params": {}},  # 内积距离
    output_fields=["text"],  # 返回 text 字段
)

In [14]:
import json

retrieved_lines_with_distances = [
    (res["entity"]["text"], res["distance"]) for res in search_res[0]
]
print(json.dumps(retrieved_lines_with_distances, indent=4))

[
    [
        "\u4e8c\u767e\u4e00\u5341\u4e5d\u6761** \u5229\u5bb3\u5173\u7cfb\u4eba\u53ef\u4ee5\u7533\u8bf7\u67e5\u8be2\u4e0d\u52a8\u4ea7\u767b\u8bb0\u8d44\u6599\u3002\u7533\u8bf7\u67e5\u8be2\u7684\uff0c\u767b\u8bb0\u673a\u6784\u5e94\u5f53\u63d0\u4f9b\u3002\n\n",
        0.6411126852035522
    ],
    [
        "\u4e8c\u767e\u4e00\u5341\u516b\u6761** \u6743\u5229\u4eba\u3001\u5229\u5bb3\u5173\u7cfb\u4eba\u53ef\u4ee5\u7533\u8bf7\u67e5\u8be2\u3001\u590d\u5236\u4e0d\u52a8\u4ea7\u767b\u8bb0\u8d44\u6599\uff0c\u767b\u8bb0\u673a\u6784\u5e94\u5f53\u63d0\u4f9b\u3002\n\n",
        0.6411126852035522
    ],
    [
        "\u4e8c\u767e\u96f6\u4e03\u6761** \u56fd\u5bb6\u3001\u96c6\u4f53\u3001\u79c1\u4eba\u7684\u7269\u6743\u548c\u5176\u4ed6\u6743\u5229\u4eba\u7684\u7269\u6743\u53d7\u6cd5\u5f8b\u5e73\u7b49\u4fdd\u62a4\uff0c\u4efb\u4f55\u7ec4\u7ec7\u6216\u8005\u4e2a\u4eba\u4e0d\u5f97\u4fb5\u72af\u3002\n\n",
        0.6411126852035522
    ]
]


In [15]:
context = "\n".join(
    [line_with_distance[0] for line_with_distance in retrieved_lines_with_distances]
)

In [16]:
context

'二百一十九条** 利害关系人可以申请查询不动产登记资料。申请查询的，登记机构应当提供。\n\n\n二百一十八条** 权利人、利害关系人可以申请查询、复制不动产登记资料，登记机构应当提供。\n\n\n二百零七条** 国家、集体、私人的物权和其他权利人的物权受法律平等保护，任何组织或者个人不得侵犯。\n\n'

In [17]:
SYSTEM_PROMPT = """
Human: 你是一个 AI 助手。你能够从提供的上下文段落片段中找到问题的答案。
"""
USER_PROMPT = f"""
请使用以下用 <context> 标签括起来的信息片段来回答用 <question> 标签括起来的问题。最后追加原始回答的中文翻译，并用 <translated>和</translated> 标签标注。
<context>
{context}
</context>
<question>
{question}
</question>
<translated>
</translated>
"""

In [18]:
USER_PROMPT

'\n请使用以下用 <context> 标签括起来的信息片段来回答用 <question> 标签括起来的问题。最后追加原始回答的中文翻译，并用 <translated>和</translated> 标签标注。\n<context>\n二百一十九条** 利害关系人可以申请查询不动产登记资料。申请查询的，登记机构应当提供。\n\n\n二百一十八条** 权利人、利害关系人可以申请查询、复制不动产登记资料，登记机构应当提供。\n\n\n二百零七条** 国家、集体、私人的物权和其他权利人的物权受法律平等保护，任何组织或者个人不得侵犯。\n\n\n</context>\n<question>\n什么是不动产?\n</question>\n<translated>\n</translated>\n'

In [19]:
response = deepseek_client.chat.completions.create(
    model="deepseek-chat",
    messages=[
        {"role": "system", "content": SYSTEM_PROMPT},
        {"role": "user", "content": USER_PROMPT},
    ],
)
print(response.choices[0].message.content)

根据提供的上下文，虽然未直接定义"不动产"，但通过相关条款可以推断：

不动产是指土地、房屋等不能移动或移动会损害其价值的财产，其物权（包括所有权、使用权等）受法律保护（依据第二百零七条），且相关登记资料可供权利人及利害关系人查询（依据第二百一十八条、第二百一十九条）。

<translated>
According to the provided context, although the term "real estate" is not directly defined, it can be inferred from the relevant clauses that:

Real estate refers to immovable property such as land and buildings that cannot be moved or whose value would be impaired if moved. The property rights (including ownership, usage rights, etc.) of such assets are protected by law (Article 207), and the registration information is available for inquiry by rights holders and interested parties (Articles 218 and 219).
</translated>
