<a href="https://colab.research.google.com/github/HirunaD/LangChain/blob/main/08_Knowledge_Graph_RAG.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**Setup and Installation**

In [14]:
!pip install langchain -q
!pip install langchain-community -q
!pip install langchain-openai -q
!pip install langchain-experimental -q
!pip install neo4j -q
!pip install -U :class:`~langchain-neo4j

/bin/bash: -c: line 1: unexpected EOF while looking for matching ``'
/bin/bash: -c: line 2: syntax error: unexpected end of file


In [15]:
from google.colab import userdata
import os

import warnings
warnings.filterwarnings('ignore')

**Initialize OpenAI LLM**

In [30]:
from langchain_openai import ChatOpenAI

# set OpenAI API key: read it from Colab's secret store (userdata) and export it
# through the environment instead of hard-coding it in the notebook
os.environ['OPENAI_API_KEY'] = userdata.get('OPENAI_API_KEY')

# initialize the ChatOpenAI model; this single `llm` instance is reused by the
# graph transformer, the entity-extraction chain and the final RAG chain
llm = ChatOpenAI(
    model="gpt-3.5-turbo",
    temperature=0  # 0 asks for the least random sampling, keeping runs as repeatable as possible
)

**Initialize Embedding Model**

In [31]:
from langchain_openai import OpenAIEmbeddings

# Embedding model; passed to Neo4jVector.from_existing_graph further down to
# build the index used for semantic search
embedding_model = OpenAIEmbeddings(model="text-embedding-3-small")

**Load Data**

In [32]:
from langchain.schema import Document

text = Document(page_content="""
Three students, A, B, and C, are tackling two subjects, X (Mathematics) and Y (Literature). Each has a unique perspective, weaving their experiences into a shared academic journey. A, gifted in Mathematics, thrives on solving equations but struggles with the abstract world of poetry and storytelling. On the other hand, B shines in Literature, captivating others with a flair for creative writing, yet finds numbers daunting and formulas perplexing.
C, a generalist, performs decently in both subjects but often bridges gaps between A and B. While A helps B understand mathematical concepts, B guides A through essay writing. Meanwhile, C organizes group study sessions, offering real-world examples to connect ideas from X and Y, making both subjects more relatable. Their collaboration not only enhances their learning but fosters a sense of camaraderie, demonstrating the power of teamwork in overcoming challenges.
""")

**Split Documents into Chunks**

In [33]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

# chunk_size / chunk_overlap are presumably measured in characters (the
# splitter's default length function) — TODO confirm
splitter = RecursiveCharacterTextSplitter(chunk_size=250,chunk_overlap=30)

# split_documents expects a list of Documents, hence the single-element list
chunks = splitter.split_documents([text])

**Graph Initialization and Transformation**

In [34]:
from langchain_community.graphs import Neo4jGraph

# Connection settings are exported as environment variables because
# Neo4jGraph() below is constructed without explicit arguments.
os.environ["NEO4J_URI"] = "neo4j+s://bfc55c61.databases.neo4j.io"
os.environ["NEO4J_USERNAME"] = "neo4j"
# Never commit the database password: read it from Colab's secret store, the
# same way the OpenAI key is handled. Add a secret named NEO4J_PASSWORD in the
# Colab "Secrets" panel before running this cell. Any password that was
# previously published in this notebook should be rotated.
os.environ["NEO4J_PASSWORD"] = userdata.get("NEO4J_PASSWORD")

# initialize Neo4j graph database
graph = Neo4jGraph()

In [35]:
from langchain_experimental.graph_transformers import LLMGraphTransformer

# Transformer that uses the chat model to turn text into graph documents
# (nodes + relationships); it is applied to the chunks in the next cell
graph_transformer = LLMGraphTransformer(llm = llm)

In [36]:
# convert text chunks to graph documents: each result carries the extracted
# nodes, their relationships and the source chunk (see the output of
# graph_documents[0]). Presumably one LLM call per chunk — TODO confirm.
graph_documents = graph_transformer.convert_to_graph_documents(chunks)

In [37]:
graph_documents[0]

GraphDocument(nodes=[Node(id='A', type='Student', properties={}), Node(id='B', type='Student', properties={}), Node(id='C', type='Student', properties={}), Node(id='X', type='Subject', properties={}), Node(id='Y', type='Subject', properties={}), Node(id='Mathematics', type='Academicsubject', properties={}), Node(id='Literature', type='Academicsubject', properties={})], relationships=[Relationship(source=Node(id='A', type='Student', properties={}), target=Node(id='X', type='Subject', properties={}), type='STUDIES', properties={}), Relationship(source=Node(id='B', type='Student', properties={}), target=Node(id='X', type='Subject', properties={}), type='STUDIES', properties={}), Relationship(source=Node(id='C', type='Student', properties={}), target=Node(id='X', type='Subject', properties={}), type='STUDIES', properties={}), Relationship(source=Node(id='A', type='Student', properties={}), target=Node(id='Y', type='Subject', properties={}), type='STUDIES', properties={})], source=Document(

In [38]:
for node in graph_documents[0].nodes:
  print(node)

id='A' type='Student' properties={}
id='B' type='Student' properties={}
id='C' type='Student' properties={}
id='X' type='Subject' properties={}
id='Y' type='Subject' properties={}
id='Mathematics' type='Academicsubject' properties={}
id='Literature' type='Academicsubject' properties={}


In [39]:
for relationship in graph_documents[0].relationships:
  print(relationship)

source=Node(id='A', type='Student', properties={}) target=Node(id='X', type='Subject', properties={}) type='STUDIES' properties={}
source=Node(id='B', type='Student', properties={}) target=Node(id='X', type='Subject', properties={}) type='STUDIES' properties={}
source=Node(id='C', type='Student', properties={}) target=Node(id='X', type='Subject', properties={}) type='STUDIES' properties={}
source=Node(id='A', type='Student', properties={}) target=Node(id='Y', type='Subject', properties={}) type='STUDIES' properties={}


In [40]:
# add nodes and relationships to graph
graph.add_graph_documents(
    graph_documents,
    # presumably adds the shared __Entity__ label to every node, which the
    # full-text index created later is declared on — TODO confirm in LangChain docs
    baseEntityLabel=True,
    # presumably also stores each source chunk and links it to the entities it
    # mentions (graph_retriever filters out MENTIONS relationships) — TODO confirm
    include_source=True
)

In [41]:
# indexing enables fast searches within text-based properties
def create_fulltext_index(g):
  """Create the `entity` full-text index on the `id` property of `__Entity__` nodes.

  The statement uses IF NOT EXISTS, so it can be issued repeatedly.

  Args:
    g: object exposing a ``query(cypher)`` method (here, the Neo4j graph).
  """
  g.query(
      "CREATE FULLTEXT INDEX entity IF NOT EXISTS FOR (e:__Entity__) ON EACH [e.id]"
  )

create_fulltext_index(graph)

**Querying the Graph and Entity Retrieval**

In [42]:
from langchain_core.prompts import ChatPromptTemplate

# Prompt asking the model to list the entities mentioned in a question.
# Template variable: {question}.
entity_prompt = ChatPromptTemplate.from_messages(
    [
        ("system", "You are extracting entities from the text."),
        (
            "human",
            # Adjacent string literals are concatenated verbatim, so the line
            # break must be explicit; without it the message read
            # "...extract entitiesinput: ...".
            "Use the following information to extract entities\n"
            "input: {question}",
        ),
    ]
)

In [43]:
from pydantic import BaseModel, Field
from typing import List

# Structured-output schema: handed to llm.with_structured_output(...) when the
# entity-extraction chain is built, and read back through `.names`.
class Entities(BaseModel):
    # Required field (`...` means no default): the entity names found in the text
    names: List[str] = Field(
        ...,
        description="All the entities that appear in the text",
    )


In [44]:
# Entity-extraction pipeline: fill the prompt, then have the model answer in
# the shape of the Entities schema.
entity_chain = entity_prompt | llm.with_structured_output(Entities)

In [45]:
entity_chain.invoke({"question": "who learn both X and Y subjects"}).names

['X', 'Y']

**Graph Retriever**

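`remove_lucene_chars` strips Lucene special characters from the search text before it is used in a full-text query. Source:
https://api.python.langchain.com/en/latest/_modules/langchain_community/vectorstores/neo4j_vector.html#remove_lucene_chars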

In [46]:
from langchain_community.vectorstores.neo4j_vector import remove_lucene_chars

def generate_full_text_query(input):
    """Build a fuzzy full-text query string from free text.

    Special Lucene characters are removed with ``remove_lucene_chars``, the
    text is split on whitespace, ``~2`` is appended to each word and the words
    are combined with ``AND``.

    Args:
        input: raw text, e.g. an entity name.

    Returns:
        A string such as ``"Alice~2 AND Bob~2"``. An empty string is returned
        when no searchable word remains; previously this case raised
        IndexError on ``words[-1]``.
    """
    # str.split() with no argument never yields empty strings, so no extra
    # filtering is needed.
    words = remove_lucene_chars(input).split()
    return " AND ".join(f"{word}~2" for word in words)

def graph_retriever(question: str) -> str:
    """Collect graph relationships around the entities mentioned in a question.

    Entities are extracted with ``entity_chain``. For each one, the ``entity``
    full-text index is searched (at most 2 matching nodes) and the matched
    nodes' incoming and outgoing relationships, excluding ``MENTIONS``, are
    rendered as ``source - TYPE -> target`` lines (at most 20 per entity).

    Args:
        question: the user's natural-language question.

    Returns:
        All relationship lines, one per line. Results of different entities
        are separated by a newline as well; previously the last line of one
        entity was fused with the first line of the next.
    """
    lines = []
    entities = entity_chain.invoke({"question": question})
    for entity in entities.names:
        full_text_query = generate_full_text_query(entity)
        if not full_text_query:
            # Nothing searchable for this entity; do not send an empty
            # query to the full-text index.
            continue
        response = graph.query(
            """CALL db.index.fulltext.queryNodes('entity', $query, {limit:2})
            YIELD node,score
            CALL {
              WITH node
              MATCH (node)-[r:!MENTIONS]->(neighbor)
              RETURN node.id + ' - ' + type(r) + ' -> ' + neighbor.id AS output
              UNION ALL
              WITH node
              MATCH (node)<-[r:!MENTIONS]-(neighbor)
              RETURN neighbor.id + ' - ' + type(r) + ' -> ' +  node.id AS output
            }
            RETURN output LIMIT 20
            """,
            {"query": full_text_query},
        )
        lines.extend(el['output'] for el in response)
    return "\n".join(lines)

In [47]:
print(graph_retriever("who learn both X and Y subjects"))



A - STUDIES -> X
B - STUDIES -> X
C - STUDIES -> X
B - STUDIES -> X
B - EXCELS_IN -> Literature
B - STRUGGLES_WITH -> Mathematics
B - GUIDE -> A
C - BRIDGE -> B
A - GUIDE -> BA - STUDIES -> Y
B - STUDIES -> X
B - EXCELS_IN -> Literature
B - STRUGGLES_WITH -> Mathematics
B - GUIDE -> A
C - BRIDGE -> B
A - GUIDE -> B


**Semantic Search Retriever**

In [48]:
from langchain_community.vectorstores import Neo4jVector

# Build the semantic-search index on top of nodes that already exist in Neo4j.
# No connection arguments are passed, so the connection details presumably
# come from the NEO4J_* environment variables set earlier — TODO confirm.
vector_index = Neo4jVector.from_existing_graph(
    embedding_model,
    search_type="hybrid",  # presumably vector + keyword search combined — TODO confirm
    node_label="Document",  # nodes to index
    text_node_properties=["text"],  # node properties whose text is embedded
    embedding_node_property="embedding"  # property that holds the vector
)

In [49]:
vector_index.similarity_search("who learn both X and Y subjects", k=2)



[Document(metadata={}, page_content='\ntext: Three students, A, B, and C, are tackling two subjects, X (Mathematics) and Y (Literature). Each has a unique perspective, weaving their experiences into a shared academic journey. A, gifted in Mathematics, thrives on solving equations but struggles'),
 Document(metadata={}, page_content='\ntext: offering real-world examples to connect ideas from X and Y, making both subjects more relatable. Their collaboration not only enhances their learning but fosters a sense of camaraderie, demonstrating the power of teamwork in overcoming challenges.')]

In [50]:
def retriever(question):
    """Build the RAG context for a question from both retrieval paths.

    Args:
        question: the user's natural-language question.

    Returns:
        A string of the form ``"Graph data:<relationships>\\nText data:<chunks>"``
        where the relationships come from ``graph_retriever`` and the chunks
        are the top 2 hits of ``vector_index``, joined by single spaces.
    """
    graph_part = graph_retriever(question)
    hits = vector_index.similarity_search(question, k=2)
    text_part = " ".join(doc.page_content for doc in hits)
    return f"Graph data:{graph_part}\nText data:{text_part}"

**Define Prompt Template for RAG**

In [51]:
from langchain_core.prompts import ChatPromptTemplate

# Answer-generation prompt. Template variables: {context} (retrieved graph and
# text data) and {question} (the user's question).
prompt = ChatPromptTemplate.from_messages(
    [
        ("system", "Answer this question using the provided context only."),
        (
            "human",
            # Adjacent string literals are concatenated verbatim, so the line
            # break must be explicit; without it the question ran straight on
            # from the last character of the context.
            "Context: {context}\n"
            "Question: {question}",
        ),
    ]
)

**Create RAG Chain**

In [52]:
from langchain_core.runnables import RunnablePassthrough

# RAG pipeline: the input question fans out to both keys of the mapping, the
# filled prompt is then sent to the chat model.
chain = (
    {
      # retriever(question) builds the combined graph + text context
      "context": retriever,
      # the question itself is forwarded unchanged
      "question": RunnablePassthrough()
    }
    | prompt
    | llm
)

**Invoke RAG Chain with Example Questions**

In [53]:
response = chain.invoke("who learn both X and Y subjects")

print(response.content)



Students A and B learn both X (Mathematics) and Y (Literature) subjects.


In [54]:
response = chain.invoke("who are the students in that school")

print(response.content)



The students in that school are A, B, and C.
