In [19]:
%pip --quiet install --upgrade neo4j langchain-community

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 25.0 -> 25.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip


### Neo4j setup

In [31]:

# from langchain_community.graphs import Neo4jGraph
from langchain_neo4j import Neo4jGraph
from dotenv import load_dotenv
import os
# from langchain.graphs.neo4j_graph import Neo4jGraph 
# from langchain.chains import GraphQAChain old
from langchain_neo4j import GraphCypherQAChain

from langchain_core.prompts import PromptTemplate
# from langchain_community.chains.graph_qa.cypher import GraphCypherQAChain

load_dotenv()
neo_pass = os.getenv("NEO4J_PASS")
neo_db_id = os.getenv("DB_ID")

graph = Neo4jGraph(
    url="neo4j+s://f5c81351.databases.neo4j.io",
    username="neo4j",
    password=neo_pass,
    enhanced_schema=True
    # refresh_schema=Fa lse
)

def clean_graph():
    query = """
    MATCH (n)
    DETACH DELETE n
    """
    graph.query(query)

### OpenAI setup

In [44]:
import getpass
import os
from dotenv import load_dotenv

load_dotenv()
api_key = os.getenv("OPENAI_API_KEY")
# os.environ[api_key] = getpass.getpass()
import os
from langchain_experimental.graph_transformers import LLMGraphTransformer
from langchain_openai import ChatOpenAI
from langchain_openai import OpenAIEmbeddings
from langchain_community.vectorstores import Neo4jVector

llm = ChatOpenAI(temperature=0, model_name="gpt-3.5-turbo")

additional_instructions = """
When creating entities, add a "document_id" property to each node and set it to the document's unique ID.
For example, if the document ID is "doc123", each created node should include document_id: "doc123".
Query example: 
CREATE (n:NodeLabel) 
SET n.document_id = "doc123" 
RETURN n
"""
llm_transformer = LLMGraphTransformer(llm=llm, additional_instructions=additional_instructions, ignore_tool_usage=True)

# Embeddings for complex search queries
embed = OpenAIEmbeddings(model="text-embedding-ada-002")
vector_index = Neo4jVector.from_existing_graph(
    embedding=embed,
    search_type="vector",
    node_label="Document",
    text_node_properties=["text"],
    embedding_node_property="embedding"
)
vector_retriever = vector_index.as_retriever()


### Split document into chunks

In [62]:
from langchain_community.document_loaders import PyPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter

pdf_path = "testdoc.pdf"
loader = PyPDFLoader(pdf_path)
pages = loader.load_and_split()
pages = loader.load() # load pages

# chunk overlap is the shared context window between chunks--allows context to be maintained across chunks
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200, length_function=len)

splits = text_splitter.split_documents(pages) # split the pages using LangChain's text_splitter

processed_chunks = []

for i, chunk in enumerate(splits):
    # Process the chunk
    metadata = {
        "chunk_id": i,
        "source": pdf_path,
        "page_number": chunk.metadata.get("page", None),
        "total_length": len(chunk.page_content),
        "text_preview": (
            chunk.page_content[:100] + "..."
            if len(chunk.page_content) > 100
                else chunk.page_content
        ),


    }
    # Store the metadata for each chunk after processing
    processed_chunks.append({"text": chunk.page_content, "metadata": metadata})

print(str(len(processed_chunks)) + " chunks processed")
# print(processed_chunks[6]['metadata']['text_preview'])
# print(processed_chunks[6]['metadata'])
# print(processed_chunks[0]['text_preview'])

    

4 chunks processed


In [66]:
# Check out processed chunks
print(processed_chunks[3]['metadata'])


{'chunk_id': 3, 'source': 'testdoc.pdf', 'page_number': 0, 'total_length': 576, 'text_preview': 'and\n \nflashing\n \nlights.\n \nIn\n \nmany\n \ncountries,\n \nthey\n \nfollow\n \nspecific\n \nroutes,\n \nstopping\n \n...'}


In [66]:
# from typing import List, Dict
# # Create graph using the processed chunks
# def create_graph(chunks: List[Dict]):
#     # cypher query to create the chunks & their attributes
#     create_chunk_query = """
#     MERGE (chunk:Chunk {chunk_id: $chunk_id})
#     ON CREATE SET
#         chunk.source = $source,
#         chunk.page_number = $page_number,
#         chunk.total_length = $total_length,
#         chunk.text_preview = $text_preview,
#         chunk.full_text = $full_text
#         RETURN chunk
#     """

#     for chunk in chunks:
#         graph.query

### Parse Document (prob able to optimize this)

In [68]:
import asyncio
from langchain_core.documents import Document


# Convert processed chunks to Langchain Document for Neo4j db
docs = [
    Document(
        page_content=chunk['text'],
        metadata=chunk['metadata']
    )
    for chunk in processed_chunks
]

# Function to process documents in batches
async def process_batches(docs, batch_size=2, retry_delay=20, max_retries=5):
    for i in range(0, len(docs), batch_size):
        batch = docs[i : i + batch_size] # Use subset of documents
        retries = 0
        
        while retries < max_retries:
            try:
                # generate embeddings for each doc
                for doc in batch:
                    embedding = embed.embed_query(doc.page_content)
                    doc.metadata["embedding"] = embedding


                graph_docs = await llm_transformer.aconvert_to_graph_documents(batch)
                print(f"Processed batch {i // batch_size + 1}: {graph_docs}")
                
                # Add to Neo4j
                graph.add_graph_documents(graph_docs, include_source=True, baseEntityLabel=True)
                print(f"Successfully added batch {i // batch_size + 1} to Neo4j")
                break  # exit if retry works
            
            except Exception as e:
                if "rate_limit_exceeded" in str(e):
                    retries += 1
                    print(f"Rate limit hit. Retrying batch {i // batch_size + 1} in {retry_delay} seconds... (Attempt {retries}/{max_retries})")
                    await asyncio.sleep(retry_delay)
                else:
                    raise  # other error

# Run batching process
await process_batches(docs, batch_size=2)



Processed batch 1: [GraphDocument(nodes=[Node(id='creating a partial vacuum', type='Process', properties={}), Node(id='Vacuums', type='Product', properties={}), Node(id='vitamins', type='Nutrient', properties={}), Node(id='Apples', type='Product', properties={}), Node(id='other surfaces', type='Surface', properties={}), Node(id='cooking', type='Activity', properties={}), Node(id='apple juice', type='Product', properties={}), Node(id='cider', type='Product', properties={}), Node(id='particles', type='Element', properties={}), Node(id='container', type='Container', properties={}), Node(id='fiber', type='Nutrient', properties={}), Node(id='floors', type='Surface', properties={}), Node(id='air', type='Element', properties={}), Node(id='bag', type='Container', properties={}), Node(id='applesauce', type='Product', properties={}), Node(id='apple trees', type='Plant', properties={}), Node(id='antioxidants', type='Nutrient', properties={})], relationships=[Relationship(source=Node(id='Apples', 

### Query Functions

In [72]:
query = "What are cells?"
vector_retriever.search_kwargs = {"k": 1}
results = vector_retriever.get_relevant_documents(query)

print(len(results))
for doc in results:
    print(doc.id)
    print(doc.page_content)
    print()

1
None

text: particles,
 
which
 
are
 
then
 
trapped
 
in
 
a
 
bag
 
or
 
container
 
for
 
disposal.
 
There
 
are
 
different
 
types
 
of
 
vacuums,
 
including
 
upright,
 
canister,
 
handheld,
 
and
 
robotic
 
models,
 
each
 
designed
 
for
 
specific
 
cleaning
 
needs.
 
Vacuums
 
are
 
commonly
 
used
 
in
 
homes,
 
offices,
 
and
 
industrial
 
settings
 
to
 
maintain
 
cleanliness
 
and
 
hygiene.
 
Some
 
modern
 
vacuums
 
also
 
feature
 
HEPA
 
filters,
 
which
 
help
 
remove
 
allergens
 
from
 
the
 
air.
 
  
Mitochondria  are  organelles  found  in  the  cells  of  most  eukaryotic  organisms.  Often  referred  to  
as
 
the
 
"powerhouses"
 
of
 
the
 
cell,
 
they
 
generate
 
most
 
of
 
the
 
cell’s
 
energy
 
in
 
the
 
form
 
of
 
ATP
 
(adenosine
 
triphosphate)
 
through
 
a
 
process
 
called
 
cellular
 
respiration.
 
In
 
addition
 
to
 
energy
 
production,
 
mitochondria
 
are
 
involved
 
in
 
regulating
 
cell
 
growth,
 
controlling
 
the
 


In [12]:
import openai
from langchain.prompts import ChatPromptTemplate
from langchain.llms import OpenAI
from pydantic import BaseModel, Field
from langchain.chains import LLMChain

# Define a prompt to extract entities from the input query
prompt = ChatPromptTemplate.from_messages([ 
    ("system", "Extract concepts and idea entities from the text."),
    ("human", "Extract entities from: {question}")
])
# Initialize the OpenAI model for entity extraction
llm = ChatOpenAI(model="gpt-3.5-turbo")  

# Combine the prompt and OpenAI model to create an entity extraction chain
entity_chain = LLMChain(prompt=prompt, llm=llm)
def graph_retriever(question: str) -> str:
    global entity_chain
    # Define a model for the extracted entities from the text
    class Entities(BaseModel):
        names: list[str] = Field(..., description="All entities from the text")

    


    # Use the entity extraction chain to get entities from the question
    response = entity_chain.run({"question": question})

    # Assuming response is structured as {'names': ['Person A', 'Organization B']}
    entities = response.get("names", [])
    print("Retrieved Entities:")
    print(entities)
    
    result = ""  # Initialize a variable to store the result

    # Iterate over each extracted entity and query Neo4j for relationships
    for entity in entities:
        query_response = graph.query(
            """MATCH (p:__Entity__ {id: $entity})-[r]->(e)
            RETURN p.id AS source_id, type(r) AS relationship, e.id AS target_id
            LIMIT 50""",
            {"entity": entity}
        )
        # Format the query results and append to the result string
        result += "\n".join([f"{el['source_id']} - {el['relationship']} -> {el['target_id']}" for el in query_response])

    # Return the formatted results containing entity relationships
    return result



  entity_chain = LLMChain(prompt=prompt, llm=llm)


In [20]:
from langchain_core.runnables import RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser

from langchain.chat_models import ChatOpenAI

def querying_ollama(question):
    global entity_chain
    # Define a function that combines data retrieved from both Neo4j and vector embeddings
    def full_retriever(question: str):
        # Retrieve graph data for the question using the graph_retriever function
        graph_data = graph_retriever(question)
        print("Graph Data")
        print(graph_data)
        
        # Retrieve vector data by invoking the vector retriever with the question
        vector_data = [el.page_content for el in vector_retriever.invoke(question)]
        
        # Combine the graph data and vector data into a formatted string
        return f"Graph data: {graph_data}\nVector data: {'#Document '.join(vector_data)}"

    # Define a prompt template for generating a response based on context
    template = """Answer the question based only on the following context:
    {context}
    Question: {question}
    Answer:"""
    
    # Create a prompt from the template, which takes the context and question as input
    prompt = ChatPromptTemplate.from_template(template)
    
    # Initialize the ChatOpenAI model (you can use your preferred model here)
    llm = ChatOpenAI(model="gpt-3.5-turbo", temperature=0)  # Use ChatOpenAI for chat models

    # Create a processing chain that:
    # 1. Generates context using the full_retriever function
    # 2. Passes through the question as-is using RunnablePassthrough
    # 3. Applies the prompt template to generate the final question
    # 4. Uses the LLM (language model) to generate the answer
    # 5. Uses StrOutputParser to format the output as a string
    chain = (
        {
            "context": lambda input: full_retriever(input),  # Generate context from the question
            "question": RunnablePassthrough(),  # Pass the question through without modification
        }
        | prompt  # Apply the prompt template
        | llm  # Use the language model to answer the question based on context
        | StrOutputParser()  # Parse the model's response as a string
    )

    # Call the chain to get the response
    import json

    response = entity_chain.run({"question": question})
    print(response)
    try:
        response_json = json.loads(response)  # Convert string to dictionary
        print(response)
        entities = response_json.get("names", [])  # Now we can safely use .get()
    except json.JSONDecodeError:
        entities = []  # Handle the case where the response isn't valid JSON

    print("Retrieved Entities:")
    print(entities)



### Test

In [None]:
querying_ollama("Explain the tools and technologies in the resume")

Entities:
- Tools
- Technologies
- Resume
Retrieved Entities:
[]


: 