## Graph RAG with Neo4j and LangChain

### Import libraries

In [1]:
import os
from dotenv import load_dotenv

In [2]:
from langchain_core.runnables import RunnablePassthrough
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.pydantic_v1 import BaseModel, Field
from langchain_core.output_parsers import StrOutputParser
from langchain_neo4j import Neo4jGraph, GraphCypherQAChain
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain_ollama import ChatOllama
from langchain_experimental.graph_transformers import LLMGraphTransformer
from neo4j import GraphDatabase
from yfiles_jupyter_graphs import GraphWidget
from langchain_community.vectorstores import Neo4jVector
from langchain_ollama import OllamaEmbeddings
from langchain_community.document_loaders import TextLoader
from langchain_community.vectorstores.neo4j_vector import remove_lucene_chars


For example, replace imports like: `from langchain_core.pydantic_v1 import BaseModel`
with: `from pydantic import BaseModel`
or the v1 compatibility namespace if you are working in a code base that has not been fully upgraded to pydantic 2 yet. 	from pydantic.v1 import BaseModel

  exec(code_obj, self.user_global_ns, self.user_ns)


### Load Environment Variables and Set Constants

In [3]:
# Load settings from a local .env file (if present) into the process environment.
load_dotenv()

# Constants
GEMINI_API_KEY = os.getenv("GEMINI_API_KEY")
NEO4J_URI = os.getenv("NEO4J_URI")
NEO4J_USERNAME = os.getenv("NEO4J_USERNAME")
NEO4J_PASSWORD = os.getenv("NEO4J_PASSWORD")
LLM_TYPE = os.getenv("LLM_TYPE", "ollama")  # Default to 'ollama' if not set
OLLAMA_MODEL = os.getenv("OLLAMA_MODEL", "llama3.1")  # Default to 'llama3.1' if not set

# Expose the Gemini key under GOOGLE_API_KEY. Assigning None to os.environ raises an
# opaque TypeError, so fail with an actionable message when no key is configured at all.
if GEMINI_API_KEY:
  os.environ["GOOGLE_API_KEY"] = GEMINI_API_KEY
elif not os.getenv("GOOGLE_API_KEY"):
  raise RuntimeError(
    "No Gemini API key found: set GEMINI_API_KEY (or GOOGLE_API_KEY) "
    "in the environment or in the .env file."
  )

In [4]:
# Report which LLM backend was selected through the LLM_TYPE setting.
print("Using LLM type:", LLM_TYPE)

Using LLM type: google


### Establish Connection to Neo4j

In [23]:
# Never hardcode or print credentials in a notebook: they are saved with the file and
# its outputs. Read the password from the environment (.env) instead, and rotate any
# password that was previously committed in this cell.
NEO4J_PASSWORD = os.getenv("NEO4J_PASSWORD")
print("NEO4J_PASSWORD is set:", bool(NEO4J_PASSWORD))

[output removed: it exposed the database password]


In [24]:
# Open the Neo4j connection using the URI and credentials defined in the constants.
graph = Neo4jGraph(url=NEO4J_URI, username=NEO4J_USERNAME, password=NEO4J_PASSWORD)

### Load Documents and Chunking

Set the path to the document to add to the Knowledge Graph

In [None]:
# Relative path of the text file that will be loaded into the knowledge graph.
file_path = "docs/83_84.txt"

Load the document to be added

In [None]:
# Read the source file as UTF-8 text; `docs` is the list of loaded documents.
loader = TextLoader(file_path=file_path, encoding="utf-8")
docs = loader.load()

We recommend using Gemini to clean the text before chunking.

In [None]:
# Prompt that limits the model to repairing Thai character/vowel placement and spacing
cleaning_prompt = ChatPromptTemplate.from_messages([
  (
    "system",
    "You are a text cleaning assistant. "
    "Your task is to fix misplaced Thai characters, vowels, and spaces in the text "
    "while preserving ALL original content. "
    "Do not add, remove, or change any words or meaning. "
    "Only fix the positioning of Thai characters and spacing issues.",
  ),
  (
    "human",
    "Please fix the misplaced Thai characters, vowels, and spaces in this text "
    "while keeping all content exactly the same:\n\n{text}",
  ),
])

# Gemini model (temperature 0) that performs the cleaning
cleaning_llm = ChatGoogleGenerativeAI(temperature=0, model="gemini-2.0-flash-lite")

# Pipeline: prompt -> model -> plain string
cleaning_chain = cleaning_prompt | cleaning_llm | StrOutputParser()

# Clean the first loaded document and write the result back onto it
cleaned_content = cleaning_chain.invoke({"text": docs[0].page_content})
docs[0].page_content = cleaned_content

print("Document content has been cleaned and updated.")

In [None]:
# Display the full text of the first loaded document for a visual check.
docs[0].page_content

In [None]:
# Split the loaded documents into chunks of up to 600 characters with a 128-character overlap.
text_splitter = RecursiveCharacterTextSplitter(chunk_overlap=128, chunk_size=600)
documents = text_splitter.split_documents(docs)

In [None]:
# Number of chunks produced by the splitter.
print(len(documents))

In [None]:
# Show the first chunk as a sample of the splitter's output.
print(documents[0])

### Convert Text Chunks into Graphs

In [None]:
# Choose the chat model used for graph extraction according to LLM_TYPE.
if LLM_TYPE == "ollama":
  print("Using Ollama LLM")
  llm = ChatOllama(
    model=OLLAMA_MODEL,  # use the configured model name instead of a hard-coded one
    temperature=0,
  )
else:
  print("Using Google Gemini LLM")
  llm = ChatGoogleGenerativeAI(
    model="gemini-2.0-flash-lite",
    temperature=0,
  )

# Node labels and relationship types the extraction is restricted to.
allowed_nodes = ["ข้อบังคับ", "มาตรฐาน", "คำจำกัดความ", "หมวด/มาตรา", "ข้อกำหนด", "หน่วยงาน", "กระบวนการ", "เอกสาร", "เกณฑ์/เกณฑ์ตัวเลข"]
allowed_relationships = ["ปฏิบัติตาม", "กำหนดไว้ใน", "บังคับโดย", "ต้องมี", "อ้างอิง", "มีคำนิยาม", "มีขั้นตอน", "มีเกณฑ์"]

# LLMGraphTransformer passes each chunk to the prompt as the `input` variable, so the
# template must contain the {input} placeholder; without it the model only receives the
# instructions and never sees the document text.
prompt = ChatPromptTemplate.from_template("""
คุณเป็นผู้เชี่ยวชาญด้านมาตรฐานการบัญชีของประเทศไทย โปรดแยกข้อมูลความรู้จากข้อความด้านล่างออกมาเป็นกราฟความรู้ (Knowledge Graph) โดยเน้นโหนดที่เกี่ยวกับข้อบังคับ มาตรฐานทางบัญชี ข้อกำหนด กฎเกณฑ์ และความสัมพันธ์ระหว่างแนวปฏิบัติเหล่านี้ ให้ใช้เฉพาะประเภทโหนดและความสัมพันธ์ที่กำหนดไว้เท่านั้น

ข้อความ:
{input}
""")
strict_mode = True
# Properties the model may attach to extracted nodes and relationships.
node_properties = ["ชื่อ", "คำอธิบาย", "เลขมาตรา", "วันที่มีผลบังคับใช้", "อ้างอิงเอกสาร"]
relationship_properties = ["เหตุผล", "บริบท", "คะแนนความมั่นใจ"]
ignore_tool_usage = False

In [None]:
# Build the graph transformer from the model, prompt, schema and options configured earlier.
llm_transformer = LLMGraphTransformer(
  llm=llm, prompt=prompt,
  allowed_nodes=allowed_nodes, allowed_relationships=allowed_relationships,
  node_properties=node_properties, relationship_properties=relationship_properties,
  strict_mode=strict_mode, ignore_tool_usage=ignore_tool_usage,
)

# Convert every text chunk into a graph document (nodes and relationships).
graph_documents = llm_transformer.convert_to_graph_documents(documents)

### Add Graphs into Neo4j

In [None]:
# Write the extracted graph documents to Neo4j, with source inclusion and the base
# entity label both enabled.
graph.add_graph_documents(graph_documents=graph_documents, baseEntityLabel=True, include_source=True)

### Compute Vector Embeddings from Graph

In [None]:
# Embedding model served by Ollama.
embeddings = OllamaEmbeddings(
  model="qllama/multilingual-e5-small"
)

# Build a hybrid (vector + keyword) index over the `text` property of Document nodes.
# The connection details are passed explicitly so the vector store uses the same
# credentials as `graph` instead of silently falling back to environment variables.
vector_index = Neo4jVector.from_existing_graph(
  embedding=embeddings,
  url=NEO4J_URI,
  username=NEO4J_USERNAME,
  password=NEO4J_PASSWORD,
  search_type="hybrid",
  node_label="Document",
  text_node_properties=["text"],
  embedding_node_property="embedding",
)

vector_retriever = vector_index.as_retriever()

### Chain for Extracting Entities

In [None]:
# Rebind `llm` to the Gemini model used from here on (entity extraction and answering).
llm = ChatGoogleGenerativeAI(temperature=0, model="gemini-2.0-flash")

In [None]:
class Entities(BaseModel):
  """Identifying information about entities."""

  # The class docstring and this field description are used as the structured-output
  # schema, so they are kept typo-free and cover the kinds of nodes stored in the graph.
  names: list[str] = Field(
    ...,
    description=(
      "All the persons, organizations, regulations, accounting standards, defined terms, "
      "sections or articles, requirements, processes, documents, and criteria "
      "that appear in the text"
    ),
  )

# Entity extraction prompt. The graph holds regulations, standards, definitions and
# similar nodes, so extraction is not limited to persons and organizations. Entities are
# returned in their original wording so they can be matched against node ids.
prompt = ChatPromptTemplate.from_messages(
  [
    (
      "system",
      "You are extracting entities from the text: persons, organizations, regulations, "
      "accounting standards, defined terms, sections or articles, requirements, "
      "processes, documents, and criteria. "
      "Return each entity exactly as it is written in the text, in the same language."
    ),
    (
      "human",
      "Use the given format to extract information from the following "
      "input: {question}",
    )
  ]
)

# Chain: prompt -> LLM constrained to return an `Entities` object.
entity_chain = prompt | llm.with_structured_output(Entities)

### Retrieval from Graph

#### Check Entities Extraction

In [None]:
# Sanity check: run the entity extraction chain on a sample Thai phrase and show the names.
entity_chain.invoke({"question": "ภาษีมูลค่าเพิ่ม"}).names

#### Query the Graph Database

First, create a full-text index named `entity` on the `id` property of `__Entity__` nodes by running this Cypher query once:

`CREATE FULLTEXT INDEX entity FOR (n:__Entity__) ON EACH [n.id]`

In [None]:
def generate_full_text_query(input: str) -> str:
  """
  Build a fuzzy full-text query string from free text.

  Lucene special characters are removed, the remainder is split on whitespace,
  each token gets a `~2` suffix and the tokens are combined with AND.
  Returns an empty string when no tokens remain.
  """
  tokens = remove_lucene_chars(input).split()
  if len(tokens) == 0:
    return ""
  full_text_query = " AND ".join(token + "~2" for token in tokens)
  print(f"Generated Query: {full_text_query}")
  return full_text_query

# Fulltext index query
def graph_retriever(question: str) -> str:
  """
  Collect the graph neighborhood of the entities mentioned in the question.

  Entities are extracted from the question with `entity_chain`. For each one the
  'entity' full-text index is searched and the incoming and outgoing relationships
  (excluding MENTIONS) of the matching nodes are rendered as
  "source - TYPE -> target" lines.

  Args:
    question: The user question in natural language.

  Returns:
    The relationship lines for all entities joined by newlines, or an empty string
    when nothing is extracted or matched.
  """
  lines = []
  entities = entity_chain.invoke({"question": question})
  # Guard against a missing structured result (NOTE(review): the chain may yield None
  # when the model returns no structured output - confirm).
  names = entities.names if entities else []
  for entity in names:
    # Strip Lucene special characters and build a fuzzy query; raw entity text containing
    # characters such as "/" or ":" would otherwise break the full-text query parser.
    full_text_query = generate_full_text_query(entity)
    if not full_text_query:
      continue
    response = graph.query(
      """CALL db.index.fulltext.queryNodes('entity', $query, {limit: 2})
      YIELD node, score
      CALL {
        WITH node
        MATCH (node)-[r:!MENTIONS]->(neighbor)
        RETURN node.id + ' - ' + type(r) + ' -> ' + neighbor.id AS output
        UNION ALL
        WITH node
        MATCH (node)<-[r:!MENTIONS]-(neighbor)
        RETURN neighbor.id + ' - ' + type(r) + ' -> ' + node.id AS output
      }
      RETURN output LIMIT 10
      """,
      {"query": full_text_query},
    )
    # Skip rows whose output is null (a node without an `id` makes the concatenation null).
    lines.extend(el["output"] for el in response if el["output"] is not None)
  # Join once at the end so results of different entities stay on separate lines.
  return "\n".join(lines)

In [None]:
# Sanity check: print the graph neighborhood retrieved for a sample Thai term.
print(graph_retriever("ราชอาณาจักร"))

#### Full Retriever Combining Graph and Vector Store Results

In [None]:
def full_retriever(question: str):
  """
  Build the combined context for a question from both retrievers.

  Args:
    question: The user question in natural language.

  Returns:
    A string with the graph relationships under "Graph data:" followed by the
    retrieved text chunks under "vector data:", one "#Document " entry per chunk.
  """
  graph_data = graph_retriever(question)
  vector_data = [el.page_content for el in vector_retriever.invoke(question)]
  # Label every chunk on its own line. Using "#Document " as a join separator left the
  # first chunk unlabeled and ran consecutive chunks together.
  vector_section = "\n".join(f"#Document {text}" for text in vector_data)
  return f"Graph data:\n{graph_data}\nvector data:\n{vector_section}\n"

### Final Chain

In [None]:
# Prompt for the answering step: the model must answer only from the retrieved context.
template = """Answer the question based only on the following context:
{context}

Question: {question}
Use natural language and be concise.
Answer:"""

prompt = ChatPromptTemplate.from_template(template)

# The input string is sent to full_retriever to build the context and, unchanged,
# to the prompt as the question.
chain = (
  {
    "context": full_retriever,
    "question": RunnablePassthrough()
  }
  | prompt
  | llm
  | StrOutputParser()
)

In [None]:
# Ask a sample question in Thai through the full chain.
chain.invoke(input="การจ่ายภาษีสินค้าต่างประเทศ")