## 1. GraphDB 연결하기

In [1]:
from neo4j import GraphDatabase

import os

from dotenv import load_dotenv

# Read the connection settings from the environment (populated from a local
# .env file) instead of hard-coding credentials in the notebook.
# NOTE(security): the password that used to be committed here must be treated
# as leaked and rotated in the Neo4j Aura console.
load_dotenv()

# URI examples: "neo4j://localhost", "neo4j+s://xxx.databases.neo4j.io"
URI = os.getenv("NEO4J_URI", "neo4j+s://ddadee7d.databases.neo4j.io")
# A missing NEO4J_PASSWORD raises KeyError here, before any connection attempt.
AUTH = (os.getenv("NEO4J_USERNAME", "neo4j"), os.environ["NEO4J_PASSWORD"])

# Open a short-lived driver only to confirm that the URI and credentials work;
# the driver is closed again when the `with` block exits.
with GraphDatabase.driver(URI, auth=AUTH) as driver:
    driver.verify_connectivity()

In [2]:
# Load the API key(s) from the local .env file into the process environment.
from dotenv import load_dotenv
load_dotenv()

True

## 2. 텍스트에서 Node, Relationships 추출하기 

### 1) Text Split

In [None]:
from neo4j_graphrag.experimental.components.types import TextChunks, TextChunk
# Sample document used for the extraction walkthrough below.
text = """
Marie Curie, 7 November 1867 – 4 July 1934, was a Polish and naturalised-French physicist and chemist who conducted pioneering research on radioactivity.
She was the first woman to win a Nobel Prize, the first person to win a Nobel Prize twice, and the only person to win a Nobel Prize in two scientific fields.
Her husband, Pierre Curie, was a co-winner of her first Nobel Prize, making them the first-ever married couple to win the Nobel Prize and launching the Curie family legacy of five Nobel Prizes.
She was, in 1906, the first woman to become a professor at the University of Paris.
Also, Robin Williams.
"""

In [None]:
from langchain_text_splitters import CharacterTextSplitter
from neo4j_graphrag.experimental.components.text_splitters.langchain import LangChainTextSplitterAdapter
# Split on "." with LangChain's character splitter, then wrap it in the
# adapter so it can be run like the other neo4j_graphrag components.
_sentence_splitter = CharacterTextSplitter(
    separator=".",
    chunk_size=4000,
    chunk_overlap=200,
)
splitter = LangChainTextSplitterAdapter(_sentence_splitter)


In [None]:
# Split the sample text into chunks; `run` is awaited directly at cell level.
chunks = await splitter.run(text=text)
chunks  # last expression: shown as the cell output

### 2) Text Chunk 임베딩 추가하기 
* 임베딩을 하는 이유 : 벡터 검색을 할 때, 벡터 임베딩을 기반으로 검색이 되기 때문

In [None]:
from neo4j_graphrag.experimental.components.embedder import TextChunkEmbedder
from neo4j_graphrag.embeddings.openai import OpenAIEmbeddings

# OpenAI embedder with library defaults (presumably needs the OpenAI API key loaded from .env — confirm).
embedder = OpenAIEmbeddings()
# Component that attaches an embedding to every text chunk using `embedder`.
chunk_embedder = TextChunkEmbedder(embedder)

In [None]:
# `run` has to be awaited; the earlier non-awaited call presumably returned a coroutine instead of the embedded chunks.
chunks_with_embeddings = await chunk_embedder.run(text_chunks=chunks)

In [None]:
chunks_with_embeddings  # inspect the chunks after embedding

### 3) LLM으로 Node, Relationships 추출하기 

In [None]:
from neo4j_graphrag.experimental.components.entity_relation_extractor import LLMEntityRelationExtractor
from neo4j_graphrag.llm import OpenAILLM

extractor = LLMEntityRelationExtractor(
    llm = OpenAILLM(
        model_name = "gpt-4o",
        model_params = {
            "max_tokens": 1000,
            "response_format": {"type": "json_object"}
        }
    )
)

extract_results = await extractor.run(chunks = chunks_with_embeddings)
# extract_results = await extractor.run(chunks = chunks)

In [None]:
extract_results  # inspect the full extraction result

In [None]:
extract_results.nodes  # inspect only the extracted nodes

In [None]:
extract_results.relationships  # inspect only the extracted relationships

### 4) GraphDB 적재하기

In [None]:
from neo4j_graphrag.experimental.components.kg_writer import Neo4jWriter
from neo4j_graphrag.experimental.components.types import Neo4jGraph

driver = GraphDatabase.driver(URI, auth=AUTH)

writer = Neo4jWriter(driver)
graph = Neo4jGraph(nodes = extract_results.nodes, relationships=extract_results.relationships)
await writer.run(graph)

## 3. 벡터 기반 RAG 구현하기

### 1) Vector Index 추가하기

In [None]:
from neo4j import GraphDatabase
from neo4j_graphrag.indexes import create_vector_index

INDEX_NAME = "vectorchunk"
# Must equal the length of the stored embedding vectors — presumably the OpenAIEmbeddings default size; confirm.
DIMENSION = 1536

# Create a cosine-similarity vector index over the `embedding` property of Chunk nodes.
create_vector_index(
    driver,
    INDEX_NAME,
    label = "Chunk",
    embedding_property="embedding", 
    dimensions=DIMENSION,
    similarity_fn = "cosine"
)

### 2) Vector Retriever 사용하기
* 관련있는 청크를 찾는 것 

In [None]:
import neo4j 
from neo4j_graphrag.retrievers import VectorRetriever
from neo4j_graphrag.embeddings.openai import OpenAIEmbeddings

# The index name must match the "vectorchunk" index created in the previous cell.
retriever = VectorRetriever(driver, "vectorchunk", embedder=OpenAIEmbeddings())

# Ask for the single best-matching chunk (top_k=1); the result is shown as the cell output.
retriever.search(query_text = "Who is Marie Curie?", top_k=1)


### 3) GraphRAG 파이프라인 모듈 사용하기 
* 벡터 리트리버로 검색된 텍스트를 기반으로 LLM이 답변을 생성하는 것 

In [None]:
from neo4j_graphrag.retrievers import VectorRetriever
from neo4j_graphrag.llm.openai_llm import OpenAILLM
from neo4j_graphrag.generation import GraphRAG

retriever = VectorRetriever(driver, "vectorchunk", embedder=OpenAIEmbeddings())
llm = OpenAILLM(model_name="gpt-5.2")
graph_rag = GraphRAG(retriever, llm)

response = graph_rag.search(
    # The query is embedded, similar chunks are looked up through the Neo4j
    # vector index, and the LLM answers from that retrieved context.
    query_text="Who is Marie Curie",
    # Fetch the 3 most similar chunks. A larger value gives the LLM more
    # context but increases token usage.
    retriever_config={"top_k": 3},
    # Keep the retrieved context in the response so the source chunks can be
    # inspected through response.retriever_result.
    return_context=True,
)

In [None]:
print(response.answer)  # the generated answer text

In [None]:
print(response.retriever_result)  # the retrieved context kept by return_context=True

## 4. 사전구축 파이프라인: SimpleKGPipeline
* 위에서는 각 모듈을 개별적으로 구현을 해놨는데, 이 전체과정을 사전구축해 둔 것이 있음   
    * 청킹, 임베딩, DB에 적재
* 예제: 논문 PDF에서 그래프 추출하고 GraphRAG 구현하기

In [3]:
import neo4j
from neo4j_graphrag.embeddings.openai import OpenAIEmbeddings

# Separate long-lived driver for the SimpleKGPipeline section; it is not closed in the cells shown here.
neo4j_driver = neo4j.GraphDatabase.driver(URI, auth=AUTH)

In [11]:
from neo4j_graphrag.llm.openai_llm import OpenAILLM
# Request JSON-object responses; this LLM is handed to the KG-building pipeline in the next cell.
llm = OpenAILLM(model_name = "gpt-5.2", model_params = {"response_format":{"type":"json_object"}})

In [12]:
# 그래프 적재
from neo4j_graphrag.experimental.pipeline.kg_builder import SimpleKGPipeline

kg_builder = SimpleKGPipeline(
    llm = llm,
    driver = neo4j_driver, 
    embedder = OpenAIEmbeddings(),
    from_pdf = True
)
await kg_builder.run_async(file_path = "GraphRAG.pdf")

PipelineResult(run_id='f54d3565-0397-48c6-8418-c00d6f6e5545', result={'resolver': {'number_of_nodes_to_resolve': 575, 'number_of_created_nodes': 497}})

In [13]:
# Create a vector index named "graphragchunk".
# Why a vector index is needed:
# 1) Fast search: similarity lookups no longer scan every vector (roughly O(n) -> O(log n)).
# 2) RAG support: an incoming question is embedded, then the index quickly finds similar Chunk nodes.
# 3) Cosine similarity: vectors are compared by direction.

# The vector index is not visible in the graph visualisation view (Neo4j Aura),
# because an index is database metadata, not a node or a relationship.

from neo4j import GraphDatabase
from neo4j_graphrag.indexes import create_vector_index

INDEX_NAME = "graphragchunk"
DIMENSION = 1536

# NOTE: `driver` is bound by this `with` statement and is closed when the block
# exits, so it should not be reused for queries by cells that run afterwards.
with GraphDatabase.driver(URI, auth=AUTH) as driver:
    create_vector_index(
        driver, # Neo4j database connection driver
        INDEX_NAME,  # name of the vector index to create
        label="Chunk", # node label the index applies to (Chunk nodes)
        embedding_property = "embedding", # name of the property that stores the vector
        dimensions= DIMENSION, # dimensionality of the embedding vectors
        similarity_fn = "cosine" 
    )

In [None]:
# Retrieve the chunks most similar to the query through the vector retriever.

import neo4j
from neo4j_graphrag.retrievers import VectorRetriever
from neo4j_graphrag.embeddings.openai import OpenAIEmbeddings

# Use the long-lived `neo4j_driver` rather than `driver`: the most recent
# `driver` was bound by the `with` block that created the index and is already
# closed, and querying through a closed driver is deprecated in neo4j 5.x and
# raises an error in 6.x.
retriever = VectorRetriever(neo4j_driver, "graphragchunk", embedder=OpenAIEmbeddings())

# Return the 3 chunks most similar to the query; shown as the cell output.
retriever.search(query_text="What is the Novelty of this paper?", top_k=3)

RetrieverResult(items=[RetrieverResultItem(content="{'embedding': None, 'index': 0, 'text': 'From Local to Global: A GraphRAG Approach to\\nQuery-Focused Summarization\\nDarren Edge1† Ha Trinh1† Newman Cheng2 Joshua Bradley2 Alex Chao3\\nApurva Mody3 Steven Truitt2 Dasha Metropolitansky1 Robert Osazuwa Ness1\\nJonathan Larson1\\n1Microsoft Research\\n2Microsoft Strategic Missions and Technologies\\n3Microsoft Office of the CTO\\n{daedge,trinhha,newmancheng,joshbradley,achao,moapurva,\\nsteventruitt,dasham,robertness,jolarso}@microsoft.com\\n†These authors contributed equally to this work\\nAbstract\\nThe use of retrieval-augmented generation (RAG) to retrieve relevant informa-\\ntion from an external knowledge source enables large language models (LLMs)\\nto answer questions over private and/or previously unseen document collections.\\nHowever, RAG fails on global questions directed at an entire text corpus, such\\nas “What are the main themes in the dataset?”, since this is inherently

In [None]:
# Generate the answer from the retrieved chunks.

from neo4j_graphrag.llm.openai_llm import OpenAILLM
from neo4j_graphrag.generation import GraphRAG

llm = OpenAILLM(model_name = "gpt-4o")
# Reuses the `retriever` built in the previous cell.
graph_rag = GraphRAG(retriever, llm)

query_text = "WHat is the Novelty of this paper?"
# top_k=3 chunks are retrieved; return_context=True keeps them on the response for inspection.
response = graph_rag.search(query_text=query_text, retriever_config={"top_k":3}, return_context=True)
print(response.answer)

The novelty of this paper lies in its proposal of an adaptive benchmarking procedure that utilizes persona generation to create queries representative of real-world Retrieval-Augmented Generation (RAG) system usage. It introduces the GraphRAG approach, which incorporates a graph index derived from an LLM (Large Language Model) to span nodes (entities), edges (relationships), and covariates (claims) for enhanced domain-tailored summarization and community detection. The evaluation criteria also stand out by leveraging LLMs for assessing the quality of generated answers, employing a comparative approach to measure criteria relevant to global sensemaking, such as comprehensiveness, diversity, and answer relevance, among others. The study applies these methods to different datasets and evaluates the effectiveness using statistical analyses.


In [24]:
# Check which retrieved information the answer above was based on.
print(response.retriever_result)


items=[RetrieverResultItem(content="{'embedding': None, 'index': 3, 'text': 'and authentic sets of personas (Kosinski,\\n2024; Salminen et al., 2024; Shin et al., 2024). Our adaptive benchmarking procedure uses persona\\ngeneration to create queries that are representative of real-world RAG system usage. Specifically,\\nour approach uses the LLM to infer the potential users would use the RAG system and their use\\ncases, which guide the generation of corpus-specific sensemaking queries.\\n2.4 RAG evaluation criteria\\nOur evaluation relies on the LLM to evaluate how well the RAG system answers the generated ques-\\ntions. Prior work has shown LLMs to be good evaluators of natural language generation, includ-\\ning work where LLMs evaluations were competitive with human evaluations (Wang et al., 2023a;\\nZheng et al., 2024). Some prior work proposes criteria for having LLMs quantify the quality of\\n3\\nSource Documents\\nText Chunks\\ntext extraction\\nand chunking\\nEntities & Relatio

In [27]:
# Print the first retrieved item's content, skipping its first 100 characters.
print(response.retriever_result.items[0].content[100:])

t al., 2024; Shin et al., 2024). Our adaptive benchmarking procedure uses persona\ngeneration to create queries that are representative of real-world RAG system usage. Specifically,\nour approach uses the LLM to infer the potential users would use the RAG system and their use\ncases, which guide the generation of corpus-specific sensemaking queries.\n2.4 RAG evaluation criteria\nOur evaluation relies on the LLM to evaluate how well the RAG system answers the generated ques-\ntions. Prior work has shown LLMs to be good evaluators of natural language generation, includ-\ning work where LLMs evaluations were competitive with human evaluations (Wang et al., 2023a;\nZheng et al., 2024). Some prior work proposes criteria for having LLMs quantify the quality of\n3\nSource Documents\nText Chunks\ntext extraction\nand chunking\nEntities & Relationships\ndomain-tailored\nsummarization\nKnowledge Graph\ndomain-tailored\nsummarization\nGraph Communities\ncommunity\ndetection\nCommunity Summaries\n

## 5. 임베딩 직접 추가하기 
* 도메인 그래프는 이미 준비되어 있지만(그래프 DB 적재 완료), 임베딩 프로퍼티가 없는 경우
* 임베딩 프로퍼티를 추가해야 벡터 리트리버를 사용할 수 있음
* 스택오버플로우 데이터셋 활용: https://sandbox.neo4j.com/?usecase=stackoverflow

In [28]:
# OpenAI could also produce the embeddings; this package is the option for avoiding API charges.
from neo4j_graphrag.embeddings.sentence_transformers import SentenceTransformerEmbeddings

In [29]:
# Rebinds `embedder` (previously the OpenAI embedder). This model's vectors are
# 384-dimensional, which is the size the "questionindex" index is created with.
embedder = SentenceTransformerEmbeddings(
    model = "all-MiniLM-L6-v2"
)

  from .autonotebook import tqdm as notebook_tqdm
To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


In [None]:
# Add embedding vectors to the Question nodes in Neo4j.
driver = GraphDatabase.driver(URI, auth=AUTH)

# Number of nodes updated per write query; batching avoids one network round
# trip per node.
BATCH_SIZE = 100

with driver.session() as session:
    # Select only nodes that still lack an embedding AND have text to embed;
    # a null body_markdown would otherwise be passed to the embedder.
    result = session.run(
        """
        MATCH (q:Question)
        WHERE q.embedding IS NULL AND q.body_markdown IS NOT NULL
        RETURN elementId(q) AS id, q.body_markdown AS text
        """
    )
    records = result.data()

    for start in range(0, len(records), BATCH_SIZE):
        # text -> vector (all-MiniLM-L6-v2 produces 384-dimensional vectors).
        # The comprehension keeps its loop variable local, so the notebook's
        # global `text` is no longer overwritten.
        rows = [
            {"id": record["id"], "embedding": embedder.embed_query(record["text"])}
            for record in records[start:start + BATCH_SIZE]
        ]

        # Look each node up by its element id and set the embedding property.
        # Values are passed as query parameters rather than interpolated into
        # the query text, which prevents Cypher injection.
        session.run(
            """
            UNWIND $rows AS row
            MATCH (q) WHERE elementId(q) = row.id
            SET q.embedding = row.embedding
            """,
            {"rows": rows},
        ).consume()



In [33]:
# Create a vector index over the embedding vectors added above.
from neo4j import GraphDatabase
from neo4j_graphrag.indexes import create_vector_index

INDEX_NAME = "questionindex"
DIMENSION = 384  # all-MiniLM-L6-v2 vectors are 384-dimensional

# NOTE: `driver` is rebound by this `with` statement and closed when the block exits.
with GraphDatabase.driver(URI, auth=AUTH) as driver:
    create_vector_index(
        driver, 
        INDEX_NAME, 
        label="Question", 
        embedding_property="embedding", 
        dimensions=DIMENSION, 
        similarity_fn="cosine"
    )

In [35]:
from neo4j_graphrag.retrievers import VectorRetriever
from neo4j_graphrag.llm.openai_llm import OpenAILLM
from neo4j_graphrag.generation import GraphRAG

# Use the long-lived `neo4j_driver` rather than `driver`: `driver` was last
# bound by the `with` block that created "questionindex" and is already closed,
# and querying through a closed driver is deprecated in neo4j 5.x and raises an
# error in 6.x.
# `embedder` is the SentenceTransformer model that produced the 384-dimensional
# node embeddings, so the query vector matches the index dimensions.
retriever = VectorRetriever(neo4j_driver, "questionindex", embedder=embedder)
llm = OpenAILLM(model_name="gpt-4o")
graph_rag = GraphRAG(retriever, llm)

# return_context=True keeps the retrieved Question nodes on the response.
response = graph_rag.search(
    query_text="What are some possible questions that could arise under the topic 'Cypher - get all associated relationships for a node'?",
    return_context=True,
)

In [37]:
print(response.answer)  # the generated answer text

1. How can I retrieve all relationships for a specific node in a Neo4j graph using Cypher?
2. What is the Cypher query to find all nodes connected to a particular node along with their relationship types?
3. How do you fetch all incoming and outgoing relationships for a node using Cypher?
4. Can I get a detailed list of all nodes and their relationships associated with a specified node in Cypher?
5. Is there a way to visualize all the connections and relationships of a specific node using a Cypher query?
6. What Cypher query should I use to find nodes with a specific property and all their related nodes?
7. How can I efficiently query and list multiple layers of relationships from a given node in Neo4j?
8. What are the best practices for fetching extensive relationship data for a node without performance issues in Neo4j?
9. How can I modify a Cypher query to include specific relationship properties when retrieving node relationships?
10. Are there any limitations or considerations I sh

In [38]:
print(response.retriever_result)  # the retrieved context kept by return_context=True

items=[RetrieverResultItem(content="{'embedding': None, 'body_markdown': 'I have a database containing millions of nodes and edge data and I want to get all the nodes and relationships data between two specified nodes.\\r\\nBelow is the sample data for the graph which has 7 nodes and 7 relationships.\\r\\n \\r\\nTo traverse from 1st node to 7th node I can use the variable length relationship approach and can get the nodes and relationships in between the first and 7th nodes (but in this approach we need to know the number of relationships and nodes between 1st and 7th node). \\r\\nFor using variable length relationship approach we have to specify the number where we will get the end node and it traverses in one direction. \\r\\nBut in my case I know the start and end node and don&#39;t know how many relationships and nodes are in between them. Please suggest how I can write a Cypher query for this case.\\r\\n\\r\\nI have used the APOC spanning tree procedure where it returns ‘path’ fro