<a href="https://colab.research.google.com/github/Huangjian2013/ai-demo/blob/main/19_schema_graphrag.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install llama-index llama-index-graph-stores-neo4j --quiet


In [12]:
from llama_index.graph_stores.neo4j import Neo4jPropertyGraphStore
from google.colab import userdata
from llama_index.core import SimpleDirectoryReader, Document
from llama_index.core import PropertyGraphIndex
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.llms.openai import OpenAI
from llama_index.core.indices.property_graph import SimpleLLMPathExtractor,SchemaLLMPathExtractor
import nest_asyncio
from typing import Literal
import os

import openai

# Read the OpenAI key from the Colab secret named REAL_OPENAI_KEY and set it
# on the openai module.
openai.api_key = userdata.get('REAL_OPENAI_KEY')

# Apply the nest_asyncio patch for this notebook session.
nest_asyncio.apply()

In [4]:
# Connect to Neo4j. Each constructor argument is paired with the name of the
# Colab secret that supplies its value, so no credential appears in the notebook.
graph_store = Neo4jPropertyGraphStore(
    **{
        argument: userdata.get(secret_name)
        for argument, secret_name in (
            ("username", "NEO_USER_NAME"),
            ("password", "NEO_PASSWORD"),
            ("url", "NEO_URL"),
        )
    }
)


In [6]:
# Read the whole story file as UTF-8 text (text-read mode is open()'s default).
file_path = './sample_data/story.txt'
with open(file_path, encoding="utf-8") as f:
    content = f.read()

# Wrap the file content in a Document object, the input type LlamaIndex supports.
documents = [
    Document(text=content),
]

In [13]:
# Entity labels the extractor may assign: person, place, occupation.
entities = Literal["人物", "场所", "职业"]

# Relation labels the extractor may assign between two entities.
relations = Literal[
    "男友", "女友", "单恋", "好感", "相亲", "老友",
    "青梅竹马", "上级", "儿子", "女儿",
    "职业是", "拥有", "属于"]

# Validation schema: entity label -> relations that entity may take part in.
#
# In strict mode the dict-form schema is checked against BOTH ends of a
# triplet: the relation must be listed for the subject's type AND for the
# object's type, otherwise the triplet is discarded. A relation therefore has
# to appear under every entity type it can connect:
#   * "职业是" links 人物 -> 职业, so it is listed under both 人物 and 职业
#     (an empty list for 职业 would drop every occupation triplet);
#   * "拥有" / "属于" link 人物 and 场所 in either direction, so both labels
#     are listed under both types.
schema = {
    "人物": ["男友", "女友", "单恋", "好感", "相亲", "老友",
           "青梅竹马", "上级", "儿子", "女儿", "职业是", "拥有", "属于"],
    "场所": ["拥有", "属于"],
    "职业": ["职业是"],
}


# Schema-guided path extractor backed by GPT-4 at temperature 0.
# It is given the entity labels, the relation labels and the validation schema
# defined earlier in this cell.
kg_extractor = SchemaLLMPathExtractor(
    llm=OpenAI(temperature=0.0, model="gpt-4"),
    strict=True,  # set to False to allow values outside of the spec
    kg_validation_schema=schema,
    possible_relations=relations,
    possible_entities=entities,
)

In [14]:
# Build the property graph index from the loaded documents:
#   - kg_extractor is the only extractor passed in,
#   - text-embedding-ada-002 is the embedding model,
#   - the Neo4j graph_store is the property graph store.
index = PropertyGraphIndex.from_documents(
    documents,
    property_graph_store=graph_store,
    kg_extractors=[kg_extractor],
    embed_model=OpenAIEmbedding(model_name="text-embedding-ada-002"),
    show_progress=True,
)

Parsing nodes:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting paths from text with schema: 100%|██████████| 1/1 [00:32<00:00, 32.46s/it]
Generating embeddings: 100%|██████████| 1/1 [00:00<00:00,  2.39it/s]
Generating embeddings: 100%|██████████| 1/1 [00:00<00:00,  6.00it/s]


In [15]:
# include_text controls whether the source text is included in the returned
# nodes (default True); it is switched off here.
retriever = index.as_retriever(include_text=False)

# Retrieve nodes for a question about the relationships between the
# townspeople, then print each node's text on its own line.
nodes = retriever.retrieve("小镇里面的人物关系")
for node in nodes:
    print(node.text)

小镇 -> 拥有 -> 林叔
林叔 -> 职业是 -> 咖啡馆老板
赵老板 -> 相亲 -> 吴昊
丽莎 -> 女儿 -> 赵老板
林叔 -> 儿子 -> 小林


In [16]:
# Query engine over the index, created with include_text=True (the retriever
# cell used include_text=False).
query_engine = index.as_query_engine(include_text=True)

# Ask who in the town is most likely to get married, and why.
response = query_engine.query("小镇里面谁最有可能要结婚了？为什么？")

# print() applies str() to its argument, so this prints the same text.
print(response)

丽莎最有可能要结婚了，因为她是镇上富商赵老板的女儿，赵老板一直在安排她和镇上另一位年轻有为的企业家吴昊相亲。
