### 提取数据

In [60]:
from neo4j import GraphDatabase

class Neo4jEntityFetcher:
    def __init__(self, uri, user, password):
        self.driver = GraphDatabase.driver(uri, auth=(user, password))

    # 获取所有实体（节点）
    def get_all_entities(self):
        with self.driver.session() as session:
            query = "MATCH (n) RETURN n"
            result = session.run(query)
            # 将实体的属性存入列表
            entities = [{"id": record["n"].element_id, "labels": list(record["n"].labels), "properties": dict(record["n"])} for record in result]
            return entities

    # 获取指定标签的实体
    def get_entities_by_label(self, label):
        with self.driver.session() as session:
            query = f"MATCH (n:{label}) RETURN n"
            result = session.run(query)
            # 将实体的属性存入列表
            entities = [{"id": record["n"].element_id, "labels": list(record["n"].labels), "properties": dict(record["n"])} for record in result]
            return entities

    # 获取指定属性的实体
    def get_entities_by_property(self, property_name, property_value):
        with self.driver.session() as session:
            query = f"MATCH (n {{{property_name}: '{property_value}'}}) RETURN n"
            result = session.run(query)
            # 将实体的属性存入列表
            entities = [{"id": record["n"].element_id, "labels": list(record["n"].labels), "properties": dict(record["n"])} for record in result]
            return entities
            
    # 根据 ID 获取实体
    def get_entity_by_id(self, element_id):
        with self.driver.session() as session:
            query = "MATCH (n) WHERE elementId(n) = $element_id RETURN n"
            result = session.run(query, element_id=element_id)
            # 获取实体的属性
            entity = [{"id": record["n"].element_id, "labels": list(record["n"].labels), "properties": dict(record["n"])} for record in result]
            return entity


    def get_entities_by_knowledge_id(self, element_id):
        with self.driver.session() as session:
            query = "MATCH (m)-[r]->(n) WHERE elementId(n) = $element_id RETURN m, r"
            result = session.run(query, element_id=element_id)
            entity = [{"id": record["m"].element_id, "labels": list(record["m"].labels), "properties": dict(record["m"])} for record in result]
            return entity

    
    def get_entities_by_entities_id(self, element_id):
        with self.driver.session() as session:
            query = "MATCH (n)-[r]->(m) WHERE elementId(n) = $element_id RETURN n, r, m"
            result = session.run(query, element_id=element_id)
            entity = [
                {
                    "key": dict(record["n"])['name'],
                    "id": record["m"].element_id[0],
                    "labels": list(record["m"].labels)[0],
                    "properties": dict(record["m"])['name'],
                    "relationship": {
                        "type": record["r"].type,  
                        "properties": dict(record["r"])
                    }
                } for record in result
            ]
            return entity

    
    
    # 关闭驱动
    def close(self):
        self.driver.close()


# 初始化 Neo4jEntityFetcher
uri = "bolt://localhost:7687"  # Neo4j 数据库地址
user = "neo4j"  # Neo4j 用户名
password = "password"  # Neo4j 密码
fetcher = Neo4jEntityFetcher(uri, user, password)

# 获取所有实体
# print("所有实体：")
all_entities = fetcher.get_all_entities()

# 根据标签获取实体
# print("\n标签为 'knowledge' 的实体：")
knowledge_entities = fetcher.get_entities_by_label("knowledge")

# 根据属性获取实体
# print("\n具有 name='糖尿病' 属性的实体：")
diabetes_entities = fetcher.get_entities_by_property("name", "糖尿病")
    
# 关闭连接
fetcher.close()

In [61]:
diabetes_entities

[{'id': '4:0cfb6f88-002e-4f58-87e3-6809e3bfecb2:59568',
  'labels': ['entity'],
  'properties': {'name': '糖尿病'}}]

In [62]:
knowledge_entities[0]

{'id': '4:0cfb6f88-002e-4f58-87e3-6809e3bfecb2:0',
 'labels': ['knowledge'],
 'properties': {'name': '支气管扩张症的病因包括囊性纤维化、巨大气管支气管症、肺叶内肺隔离症、免疫缺陷性疾病、感染后、机械性气道阻塞、原发性或继发性纤毛运动障碍以及变态反应性支气管肺曲霉病等。'}}

### 嵌入模型

In [22]:
import warnings
warnings.filterwarnings("ignore")


In [24]:
def LoadModel(model_path='../../model/bge-large-zh-v1.5'):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")  

    model = AutoModel.from_pretrained(model_path).to(device)  
    tokenizer = AutoTokenizer.from_pretrained(model_path)
   
    return model, tokenizer

In [26]:
def encode_text(model, tokenizer, text, max_length=512):
    device = model.device
    inputs = tokenizer(text, return_tensors='pt', padding=True, truncation=True, max_length=max_length).to(device)

    model.eval()

    with torch.no_grad():
        outputs = model(**inputs)
        embeddings = outputs.last_hidden_state.mean(dim=1).cpu().tolist() 
    return embeddings

# 使用示例
text = "输入文本"
model, tokenizer = LoadModel()
embeddings = encode_text(model, tokenizer, text)
print(embeddings)

[[-0.30637967586517334, -0.1549290269613266, -0.6859948635101318, -0.027580898255109787, 0.049760602414608, 0.5290124416351318, -0.163253515958786, -0.2601250410079956, 0.19280600547790527, -0.1002703309059143, -0.006979790981858969, 0.06892531365156174, 0.3265497088432312, 0.22649264335632324, 0.34332403540611267, 0.3212975561618805, 0.18323126435279846, 0.5016316175460815, 0.6707549095153809, -0.024341188371181488, 0.5434405207633972, 0.18788914382457733, -0.06305916607379913, -0.4033040702342987, -0.28754812479019165, -0.006271734833717346, -1.032211184501648, 0.47637176513671875, 0.916732668876648, -0.6076900959014893, 0.2788418233394623, 0.14446674287319183, 0.13608914613723755, 0.5974225401878357, 0.29096946120262146, -0.13862524926662445, -0.9447407722473145, -0.1302027404308319, -0.3205782175064087, -0.7723066210746765, 0.4702088534832001, 0.980893611907959, -0.04290768504142761, -1.0873252153396606, 0.5334612727165222, -0.44799327850341797, -0.5284252166748047, -0.679543435573

### 生成索引

In [3]:

from process.rag.Neo4jEntityFetcher import Neo4jEntityFetcher
from process.rag.Embedding import *
import faiss
import numpy as np

In [12]:
uri = "bolt://localhost:7687"  # Neo4j 数据库地址
user = "neo4j"  # Neo4j 用户名
password = "password"  # Neo4j 密码
fetcher = Neo4jEntityFetcher(uri, user, password)
knowledge_entities = fetcher.get_entities_by_label("knowledge")
knowledge_entities.extend(fetcher.get_entities_by_label("entity"))

texts = [str(i['properties']['name']) for i in knowledge_entities] 
ids = [i['id'] for i in knowledge_entities]

In [16]:
model, tokenizer = LoadModel()
def batch_encode_texts(model, tokenizer, texts, batch_size=32):
    embeddings = []
    for i in tqdm(range(0, len(texts), batch_size)):
        batch_texts = texts[i:i + batch_size]
        batch_embeddings = encode_text(model, tokenizer, batch_texts)
        embeddings.extend(batch_embeddings)
    return embeddings

embeddings = batch_encode_texts(model, tokenizer, texts, batch_size=64)

100%|█████████████████████████████████████████| 732/732 [04:56<00:00,  2.47it/s]


In [17]:
embeddings = np.array(embeddings,dtype=np.float32)
dim = embeddings.shape[1]
index = faiss.IndexFlatL2(dim)  
index.add(embeddings)

faiss.write_index(index, '../../data/faiss_index/faiss_index.index')

np.save('../../data/faiss_index/matedata.npy', ids)

### 文本匹配

In [59]:
import numpy as np
import faiss
from process.rag.Embedding import *
from process.rag.Neo4jEntityFetcher import Neo4jEntityFetcher

from langchain_core.output_parsers import JsonOutputParser,StrOutputParser
from langchain_community.llms import Ollama
from langchain_core.prompts import ChatPromptTemplate

In [61]:
loaded_index = faiss.read_index('../../data/faiss_index/faiss_index.index')
loaded_ids = np.load('../../data/faiss_index/matedata.npy')

In [63]:
query_text = '糖尿病'
model, tokenizer = LoadModel()
query_vector = encode_text(model, tokenizer, query_text)

In [64]:
# 进行查询
D, I = loaded_index.search(np.array(query_vector, dtype=np.float32), k=3) 

print("距离:", D)  # 距离
print("索引:", I)  # 对应的索引
print("ID:", [loaded_ids[i] for i in I[0]])

距离: [[ 85.83061 191.55424 200.29074]]
索引: [[17828 12679 27828]]
ID: ['4:0cfb6f88-002e-4f58-87e3-6809e3bfecb2:33251', '4:0cfb6f88-002e-4f58-87e3-6809e3bfecb2:9929', '4:0cfb6f88-002e-4f58-87e3-6809e3bfecb2:56899']


In [67]:
systemContent = """
    你是医学领域的专业大学教授，现在需要你根据我给你的数据描述出这段数据表达的知识点

    **输出要求:**
    
    - 我发给你的内容中包括我需要描述的知识点、以及与他有关的实体与实体的解释
    - 我给你的内容中的描述、类型、相关知识点、以及与他有关的实体可能有多种，你需要完整的描述
    - 你仅需要描述相关内容，不需要额外拓展
    - 尽量以严谨的科学口吻描述完整的描述
    - 返回内容为一段话即可，不需要复杂的格式
"""


prompt_template = ChatPromptTemplate.from_messages(
    [("system", systemContent), ("user", "{text}")]
)

model = Ollama(model="qwen2.5",temperature=0.0)
parser = StrOutputParser()
chain =  prompt_template | model | parser

In [69]:
fetcher.get_entities_by_entities_id('4:0cfb6f88-002e-4f58-87e3-6809e3bfecb2:33251')

[{'key': '糖尿病',
  'id': '4',
  'labels': 'description',
  'properties': '一种慢性病，特征为血糖水平异常升高',
  'relationship': {'type': 'description', 'properties': {}}},
 {'key': '糖尿病',
  'id': '4',
  'labels': 'description',
  'properties': '危险因素之一',
  'relationship': {'type': 'description', 'properties': {}}},
 {'key': '糖尿病',
  'id': '4',
  'labels': 'description',
  'properties': '一种代谢性疾病，影响身体对血糖的调节',
  'relationship': {'type': 'description', 'properties': {}}},
 {'key': '糖尿病',
  'id': '4',
  'labels': 'description',
  'properties': '一种慢性代谢性疾病，长期高血糖为主要特征',
  'relationship': {'type': 'description', 'properties': {}}},
 {'key': '糖尿病',
  'id': '4',
  'labels': 'description',
  'properties': '一种基础疾病',
  'relationship': {'type': 'description', 'properties': {}}},
 {'key': '糖尿病',
  'id': '4',
  'labels': 'description',
  'properties': '患者的既往病史之一',
  'relationship': {'type': 'description', 'properties': {}}},
 {'key': '糖尿病',
  'id': '4',
  'labels': 'description',
  'properties': '一种慢性病，影响身体对葡萄糖的使用',
  '

In [71]:
uri = "bolt://localhost:7687"  # Neo4j 数据库地址
user = "neo4j"  # Neo4j 用户名
password = "password"  # Neo4j 密码
fetcher = Neo4jEntityFetcher(uri, user, password)

knowledges = []
for i in I[0]:
    entity_id = loaded_ids[i]
    entity = fetcher.get_entity_by_id(entity_id)
    knowledge = ''
    knowledge += entity[0]['properties']['name'] + '\n\n'
    if entity[0]['labels'][0] == 'entity':
        ent = fetcher.get_entities_by_entities_id(entity[0]['id'])  # 实体相关信息
        type = ''
        description = ''
        key = ent[0]['key']
        know = []
        relation = []
        for e in ent:
            if e['relationship']['type'] == 'type' and type == '':
                type = e['properties']
            if e['relationship']['type'] == 'description' and description == '':
                description = e['properties']
            if e['relationship']['type'] == 'knowledge':
                know.append(e['properties'])
            if e['relationship']['type'] == 'relation':
                relation.append((e['key'],e['relationship']['properties']['relation'],e['properties']))
        knowledge += f'{key,type,description},\n相关知识点：{know},\n相关实体：{relation}'
    if entity[0]['labels'][0] == 'knowledge':
        entities = fetcher.get_entities_by_knowledge_id(entity[0]['id'])  # 获取知识点相关实体
        types = []
        description = []
        for i in entities:
            ent = fetcher.get_entities_by_entities_id(i['id'])  # 实体相关信息
            type = ''
            description = ''
            key = ent[0]['key']
            for e in ent:
                if e['relationship']['type'] == 'type' and type == '':
                    type = e['properties']
                if e['relationship']['type'] == 'description' and description == '':
                    print(e)
                    description = e['properties']
            knowledge += f'{(key,description,type)}'
    print(knowledge,end='\n\n\n\n')
    
    response = chain.invoke({"text": knowledge})
    print(response,end='\n\n\n\n')
    knowledges.append(response)

糖尿病

('糖尿病', '疾病', '一种慢性病，特征为血糖水平异常升高'),
相关知识点：['高血压合并糖尿病患者的降压治疗需特别注意保护肾脏功能。研究显示，血管紧张素转换酶抑制剂（ACEI）能够有效降低这类患者的尿蛋白水平，并且对肾功能有较好的保护作用。因此，在选项中选择ACEI制剂最为合适。', '泌尿系感染的危险因素包括妊娠、糖尿病、免疫缺陷以及医疗操作（如器械使用），而不包括直肠癌。', '肝脓肿好发于老年人或有糖尿病、脂肪肝的中青年等有基础疾病的患者；感染途径包括胆源性、肝动脉源性、门静脉源性和肝外伤性，其中以胆道源性最常见；病原菌可分为细菌性、结核性、真菌性和阿米巴性。CT表现可出现双环征或三环征，并需与胆管细胞癌鉴别。', '泌尿系感染的危险因素包括妊娠、糖尿病、免疫缺陷以及各种侵入性操作（如器械操作），而不直接包括消化系统疾病如直肠癌。这些因素影响了泌尿系统的防御机制或增加了细菌进入泌尿道的机会，从而提高了感染的风险。', '糖尿病足是糖尿病患者常见的并发症之一，由于长期高血糖导致下肢血液循环不良、神经病变等，容易引发感染和溃疡。对于伴有股动脉狭窄的患者，血管腔内成形术（如球囊扩张术或支架置入术）可以改善血流，促进伤口愈合。此外，治疗方案还需综合考虑患者的全身状况、并发症情况及手术风险等因素。', '肝脓肿好发于老年人或有糖尿病、脂肪肝的中青年等有基础疾病的患者；病原菌可分为：细菌性、结核性、真菌性和阿米巴性；感染途径多种，包括胆源性、肝动脉源性、门静脉源性和肝外伤性等。CT表现可出现双环征或三环征。早期不典型肝脓肿需与胆管细胞癌相鉴别。', '急性心肌梗死是指由于冠状动脉供血急剧减少或中断，使相应的心肌严重持久地缺血导致的心肌坏死。本题中患者74岁女性，有高血压、糖尿病病史，突发心前区疼痛且持续伴阵发加重，伴有出汗，硝酸甘油不缓解，结合典型临床表现和既往疾病背景，考虑急性心肌梗死的可能性最大。治疗上需要尽快开通闭塞的冠状动脉，必要时进行溶栓或急诊介入手术以恢复血流。', '糖尿病患者在治疗过程中出现意识障碍通常需要考虑低血糖的可能性。该患者有口服降糖药治疗但饮食欠规律，提示可能存在摄入不足的风险。因此，最可能的原因是低血糖导致的昏迷。', '术前准备对于确保手术安全和患者恢复至关重要。题目中的选项D表示糖尿病患者的术前血糖控制标准是不正确的，实

In [75]:
# memory = ConversationBufferMemory(return_messages=True)

knowledge = ''
for i in I[0]:
    entity_id = loaded_ids[i]
    entity = fetcher.get_entity_by_id(entity_id)
    know = entity[0]['properties']['name']

    # memory.save_context({"input": know},{"output": "以上消息已加入知识库"})
    knowledge += f'{know}\n\n'

# memory.load_memory_variables({})

systemContent = """
    你是医学领域的专业大学教授，现在需要你根据我传递给你的知识点构建一道选择题

    **输出要求:**
    
    - 我发给你的内容是相关需要生成的试题的知识点
    - 你需要从我发给你的知识库中选择部分作为这道题目的主要考点
    - 你需要确保你给的题目具有逻辑性且有唯一正确答案
    - 你需要返回题目、选项、答案、解析
    - 题目的表达形式可以有多种
    - 确保输出是紧凑格式的有效JSON格式，不包含任何其他解释、转义符、换行符或反斜杠
    
    **知识库内容:**
    {knowledge}

    **输出案例：**
    
    {{
      "topic": "往无任何神经系统症状，8小时前突发剧烈头痛，伴喷射状呕吐，肢体活动无障碍。应首选以下哪种检查",
      "options": {{
          "A": "头颅X线平片",
          "B": "穿颅多普勒",
          "C": "CT",
          "D": "MRI"
        }},
        "answer":"C",
        "parse":"血液溢出血管后形成血肿，大量的X线吸收系数明显高于脑实质的血红蛋白积聚在一起，CT图像上表现为高密度病灶，CT值多高于60Hu。"
    }}
    
"""


prompt_template = ChatPromptTemplate.from_messages(
    [("system", systemContent), ("user", "{text}")]
)

model = Ollama(model="qwen2.5",temperature=0.3)
parser = JsonOutputParser()
chain =  prompt_template | model | parser

for k in knowledges:
    response = chain.invoke({ "text": '','knowledge':k})
    print(response,end='\n\n')

{'topic': '糖尿病患者的主要并发症之一是肾小球损伤，请问下列哪种治疗方法最能有效预防和延缓这种并发症的发展？', 'options': {'A': '长期使用高剂量口服降糖药', 'B': '严格控制血糖水平', 'C': '定期进行静脉输注葡萄糖加胰岛素治疗', 'D': '不定期监测酮症酸中毒情况'}, 'answer': 'B', 'parse': '严格控制血糖水平是预防和延缓糖尿病肾小球损伤发展的关键措施。长期高血糖会损害肾脏微血管，导致肾功能逐渐下降。因此选项B为正确答案。'}

{'topic': '在膀胱嗜铬细胞瘤患者的诊断中，下列哪一项症状最能提示该疾病的存在', 'options': {'A': '持续性低血压', 'B': '慢性腹泻', 'C': '阵发性高血压伴尿液中含有糖分', 'D': '长期发热'}, 'answer': 'C', 'parse': '膀胱嗜铬细胞瘤患者可能会出现阵发性高血压、血尿和糖尿等症状，这些症状与肿瘤导致的激素分泌增加有关。因此，阵发性高血压伴尿液中含有糖分是该疾病的重要诊断依据之一。'}

{'topic': '糖尿病肾病的主要诊断依据是什么', 'options': {'A': '尿常规检查', 'B': '血肌酐检测', 'C': '尿白蛋白排泄率检测', 'D': '肾脏B超'}, 'answer': 'C', 'parse': '糖尿病肾病的诊断可通过检测尿白蛋白排泄率进行，这是其主要依据。'}

