## Dependencies

In [11]:
%pip install --quiet langchain-neo4j langchain_ollama pypdf

Note: you may need to restart the kernel to use updated packages.


## Connect to Neo4j

In [1]:
from langchain_neo4j import Neo4jGraph
from dotenv import load_dotenv
import os

# Load variables from a local .env file into the process environment, then read
# the database password from NEO4J_PASS so it is never hardcoded in the notebook.
load_dotenv()
neo_pass = os.environ.get("NEO4J_PASS")

# Connection settings; `url`, `username` and `password` are reused later when
# the vector store is created.
url = "neo4j+s://f5c81351.databases.neo4j.io"
username, password = "neo4j", neo_pass
graph = Neo4jGraph(username=username, password=password, url=url)

## Ollama embeddings

In [2]:
from langchain_ollama import OllamaEmbeddings
# Embedding client for the Ollama "llama3.2" model. It is used to embed chunks
# (generate_embedding), to embed search queries (vector_search) and by the
# Neo4jVector store in the RAG section.
# NOTE(review): the vector index is created with embedding_dim = 3072; confirm that
# this model really returns vectors of that length, otherwise generate_embedding
# pads or truncates them.
embeddings = OllamaEmbeddings(model="llama3.2")

## Process PDF Documents
Split the PDF text into overlapping chunks and extract metadata (keywords, page number, text preview) for each chunk.

In [None]:
from typing import List, Dict
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from collections import Counter
import re #Regex

def _extract_keywords(text: str, top_n: int = 5) -> List[str]:

    words = re.findall(r"r\w+", text.lower())

    stop_words = set(
        [
            "the",
            "a",
            "an",
            "and",
            "or",
            "but",
            "in",
            "on",
            "at",
            "to",
            "for",
            "of",
            "with",
            "by",
        ]
    )
    filtered_words = [
        word for word in words if word not in stop_words and len(word) > 2
    ]

    return [word for word, count in Counter(filtered_words).most_common(top_n)]

def load_and_process_pdf(pdf_path: str, chunk_size: int = 1000, chunk_overlap: int = 200) -> List[Dict]:
    """Load a PDF, split it into overlapping chunks and attach metadata to each.

    Args:
        pdf_path: Path of the PDF file to load.
        chunk_size: Maximum chunk length, measured with ``len``.
        chunk_overlap: Number of characters shared between neighbouring chunks.

    Returns:
        One dict per chunk with two keys: ``"text"`` (the chunk content) and
        ``"metadata"`` (chunk_id, source, page_number, total_length, keywords
        and text_preview).
    """
    documents = PyPDFLoader(pdf_path).load()

    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len,
    )
    pieces = splitter.split_documents(documents)

    def _preview(content: str) -> str:
        # Content longer than 100 characters is cut and marked with an ellipsis.
        return content if len(content) <= 100 else content[:100] + "..."

    return [
        {
            "text": piece.page_content,
            "metadata": {
                "chunk_id": index,  # position of the chunk within this document
                "source": pdf_path,
                "page_number": piece.metadata.get("page"),
                "total_length": len(piece.page_content),
                "keywords": _extract_keywords(piece.page_content),
                "text_preview": _preview(piece.page_content),
            },
        }
        for index, piece in enumerate(pieces)
    ]


### Process the PDF


In [20]:
# Document to ingest; the path is relative to the notebook's working directory.
pdf_path = "resume.pdf"
chunks = load_and_process_pdf(pdf_path)
print(f"Total chunks created: {len(chunks)}")

# Print a short summary of every chunk as a sanity check on the split.
for i, chunk in enumerate(chunks):
    print(
        f"\nChunk {i}:",
        f"Text Preview: {chunk['metadata']['text_preview']}",
        f"Keywords: {chunk['metadata']['keywords']}",
        f"Page Number: {chunk['metadata']['page_number']}",
        sep="\n",
    )

Total chunks created: 7

Chunk 0:
Text Preview: Abi Kakolla 
Toronto, ON | (647) 957-7403 | kakolla@usc.edu| linkedin.com/in/kakolla|kakolla.com 
 
...
Keywords: ['rvard', 'ring', 'rning', 'ral', 'ronto']
Page Number: 0

Chunk 1:
Text Preview: ● Modeled in-silico layers of the Hippocampus used to generate a dendritic tree as part of a neural ...
Keywords: ['ral', 'ration', 'rate', 'rid', 'rating']
Page Number: 0

Chunk 2:
Text Preview: Lead Software Developer                  Nov 2021 - June 2022 
inLoop (sponsored by Deloitte)       ...
Keywords: ['ronto', 'red', 'ript', 'rebase', 'riving']
Page Number: 0

Chunk 3:
Text Preview: PROJECTS                                                                                            ...
Keywords: ['rvard', 'rraform', 'ricks', 'raspberry', 'rest']
Page Number: 0

Chunk 4:
Text Preview: ● Implemented a Random Forest Classifier model in Databricks using Google Kubernetes Engine resource...
Keywords: ['rithm', 'react', 'rest', 'redictions', 're

## Store data chunks in Neo4j

In [None]:
def create_graph_from_chunks(chunks: List[Dict]):
    """Write each chunk to Neo4j as a ``Chunk`` node linked to ``Keyword`` nodes.

    One query is sent per chunk through the module-level ``graph`` connection.
    The node is matched on ``chunk_id`` with MERGE, and its properties are set
    only when the node is created (``ON CREATE SET``), so re-running does not
    overwrite existing nodes. Each keyword is merged as a ``Keyword`` node and
    connected with a ``HAS_KEYWORD`` relationship. Existing graph data is not
    deleted first.

    Args:
        chunks: Chunk dicts with ``"text"`` and ``"metadata"`` keys, as
            produced by ``load_and_process_pdf``.
    """
    # Metadata fields that are forwarded to Cypher under the same parameter name.
    metadata_fields = (
        "chunk_id",
        "source",
        "page_number",
        "total_length",
        "text_preview",
        "keywords",
    )

    create_chunk_query = """
    MERGE (chunk:Chunk {chunk_id: $chunk_id})
    ON CREATE SET
        chunk.source = $source,
        chunk.page_number = $page_number,
        chunk.total_length = $total_length,
        chunk.text_preview = $text_preview,
        chunk.full_text = $full_text
        WITH chunk
        UNWIND $keywords AS keyword
        MERGE (kw:Keyword {name: keyword})
        MERGE (chunk)-[:HAS_KEYWORD]->(kw)
        RETURN chunk
    """

    for record in chunks:
        metadata = record["metadata"]
        query_params = {field: metadata[field] for field in metadata_fields}
        query_params["full_text"] = record["text"]
        graph.query(create_chunk_query, params=query_params)

# Create the uniqueness constraint BEFORE writing any chunks, so that every
# MERGE on chunk_id runs against an enforced constraint instead of the
# constraint being added after the data is already stored. The statement is
# idempotent thanks to IF NOT EXISTS.
graph.query(
    """
CREATE CONSTRAINT unique_chunk IF NOT EXISTS 
    FOR (c:Chunk) REQUIRE c.chunk_id IS UNIQUE
"""
)

# Store at most the first 200 chunks.
create_graph_from_chunks(chunks[:200])

# Length of the vectors stored on Chunk nodes; used when creating the vector
# index and when padding/truncating generated embeddings.
embedding_dim = 3072



### Create vector index for similarity search using embeddings

In [23]:
def generate_embedding(text: str) -> List[float]:
    """Embed ``text`` and return a vector of exactly ``embedding_dim`` floats.

    The raw embedding from the module-level ``embeddings`` client is converted
    to floats and scaled to unit length (unless its magnitude is zero). It is
    then zero-padded or truncated to ``embedding_dim`` entries.

    Args:
        text: Text to embed.

    Returns:
        The processed vector. If anything raises, the error is printed and an
        all-zero vector of length ``embedding_dim`` is returned instead.
    """
    try:
        vector = list(map(float, embeddings.embed_query(text)))

        # Scale to unit length; a zero vector is left untouched to avoid
        # dividing by zero.
        magnitude = sum(value * value for value in vector) ** 0.5
        vector = [value / magnitude for value in vector] if magnitude > 0 else vector

        # Force the length expected by the vector index.
        missing = embedding_dim - len(vector)
        if missing > 0:
            vector.extend([0.0] * missing)
        elif missing < 0:
            vector = vector[:embedding_dim]

        return vector

    except Exception as e:
        print(f"Error generating embedding: {e}")
        return [0.0] * embedding_dim


# Create the vector index, then embed every chunk with generate_embedding() and store the vectors on the Chunk nodes.
def create_vector_index(chunks: List[Dict]):
    """Recreate ``chunk_vector_index`` and store an embedding on every chunk node.

    Steps, all run through the module-level ``graph`` connection:
      1. Drop ``chunk_vector_index`` if it exists.
      2. Create it again on ``Chunk.embedding`` with ``embedding_dim``
         dimensions and cosine similarity.
      3. Embed the chunks in batches of 10 and write each batch with a single
         UNWIND query, printing progress after every batch.

    Args:
        chunks: Chunk dicts with ``"text"`` and ``"metadata"`` keys.

    Raises:
        Exception: Any error is printed and then re-raised to the caller.
    """
    try:
        graph.query(
            """
            DROP INDEX chunk_vector_index IF EXISTS 
        """
        )

        graph.query(
            """
            CALL db.index.vector.createNodeIndex(
                'chunk_vector_index',
                'Chunk',
                'embedding',
                $dim,
                'cosine'
            )
            """,
            params={"dim": embedding_dim},
        )

        # The update statement is identical for every batch, so build it once.
        batch_update_query = """
            UNWIND $batch AS item
            MATCH (chunk:Chunk {chunk_id: item.chunk_id})
            SET chunk.embedding = item.embedding
            """

        batch_size = 10
        for start in range(0, len(chunks), batch_size):
            batch = chunks[start : start + batch_size]
            batch_embeddings = [
                {
                    "chunk_id": item["metadata"]["chunk_id"],
                    "embedding": generate_embedding(item["text"]),
                }
                for item in batch
            ]

            graph.query(batch_update_query, params={"batch": batch_embeddings})

            total_processed = start + len(batch)
            print(f"Processed {total_processed}/{len(chunks)} chunks")

    except Exception as e:
        print(f"Error creating vector index: {e}")
        raise


# Index and embed at most the first 200 chunks. create_vector_index() prints and
# re-raises its own errors; the exception is caught here and reported so that it
# does not propagate out of this cell.
try:
    create_vector_index(chunks[:200])
except Exception as e:
    print(f"Failed to create vector index: {e}")



Processed 7/7 chunks


### Perform vector search on data

In [24]:
def verify_vector_index():
    """Return the SHOW INDEXES rows describing ``chunk_vector_index``.

    Each row carries the index name, type, labelsOrTypes, properties and
    options, as returned by ``graph.query``.
    """
    return graph.query(
        """
    SHOW INDEXES
    YIELD name, type, labelsOrTypes, properties, options
    WHERE name = 'chunk_vector_index'
    """
    )


def vector_search(query: str, top_k: int = 3, min_score: float = 0.7) -> List[Dict]:
    """Return the chunks most similar to ``query`` by cosine similarity.

    The query text is embedded with the module-level ``embeddings`` client.
    The Cypher statement then matches every ``Chunk`` node, computes
    ``vector.similarity.cosine`` against its stored ``embedding`` and keeps the
    best-scoring rows. The statement scans the nodes directly rather than
    calling a vector-index procedure.

    Args:
        query: Natural-language search text.
        top_k: Maximum number of rows to return.
        min_score: Rows must score strictly above this similarity to be
            returned. Defaults to 0.7, the previously hard-coded threshold.

    Returns:
        A list of dicts with chunk_id, source, page_number, text_preview,
        full_text, total_length and score, ordered by descending score. If
        embedding or querying raises, the error is printed and an empty list
        is returned.
    """
    try:
        query_embedding = embeddings.embed_query(query)

        # The threshold is passed as a Cypher parameter rather than baked into
        # the query text, so callers can tune it per search.
        search_query = """
        MATCH (c:Chunk)
        WITH c, vector.similarity.cosine(c.embedding, $embedding) AS score
        WHERE score > $min_score
        RETURN 
            c.chunk_id AS chunk_id,
            c.source AS source,
            c.page_number AS page_number,
            c.text_preview AS text_preview,
            c.full_text AS full_text,
            c.total_length AS total_length,
            score
        ORDER BY score DESC
        LIMIT $limit
        """

        results = graph.query(
            search_query,
            params={
                "embedding": query_embedding,
                "limit": top_k,
                "min_score": min_score,
            },
        )

        return results

    except Exception as e:
        print(f"Vector search error: {e}")
        return []


# Show the index definition, then print every hit of a sample query followed by
# its main fields, one value per line.
print(verify_vector_index())
for x in vector_search("What is a biofuel?"):
    print(
        x,
        x['chunk_id'],
        x['source'],
        x['page_number'],
        x['text_preview'],
        sep="\n",
    )

[{'name': 'chunk_vector_index', 'type': 'VECTOR', 'labelsOrTypes': ['Chunk'], 'properties': ['embedding'], 'options': {'indexProvider': 'vector-2.0', 'indexConfig': {'vector.hnsw.m': 16, 'vector.hnsw.ef_construction': 100, 'vector.dimensions': 3072, 'vector.similarity_function': 'COSINE', 'vector.quantization.enabled': True}}}]


## RAG pipeline

In [25]:
from langchain.prompts import PromptTemplate
from langchain.llms import Ollama
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain_community.vectorstores import Neo4jVector


# Expose the existing Chunk nodes as a LangChain vector store. It reuses the
# connection settings from the top of the notebook, reads the chunk text from
# `full_text` and the vectors from `embedding`, and points at the index
# created earlier.
neo4j_vector_store = Neo4jVector.from_existing_graph(
    url=url,
    username=username,
    password=password,
    embedding=embeddings,
    node_label="Chunk",
    text_node_properties=["full_text"],
    embedding_node_property="embedding",
    index_name="chunk_vector_index",
)

# Retriever interface used as the context source of the RAG chain.
retriever = neo4j_vector_store.as_retriever()

def format_docs(docs):
    """Join the ``page_content`` of every document, separated by a blank line."""
    contents = [doc.page_content for doc in docs]
    return "\n\n".join(contents)


# Initialize the Ollama model that generates the final answer
# (the same "llama3.2" model name is used for the embeddings above).
llm = Ollama(model="llama3.2")

# Prompt text with two placeholders: {context} receives the retrieved chunk
# text and {question} receives the user's question.
template = """Use the following pieces of context to answer the question at the end.
If you don't know the answer, just say that you don't know, don't try to make up an answer.
Use three sentences maximum and keep the answer as concise as possible.
Always say "thanks for asking!" at the end of the answer.
{context}
Question: {question}
Helpful Answer:"""

custom_rag_prompt = PromptTemplate.from_template(template)

# Chain, read top to bottom: build the prompt inputs (retrieved documents are
# joined by format_docs; the question is passed through unchanged), fill the
# prompt, call the LLM, and parse the result to a plain string.
rag_chain = (
    {"context": retriever | format_docs, "question": RunnablePassthrough()}
    | custom_rag_prompt
    | llm
    | StrOutputParser()
)

# Sample question; as the last expression of the cell its answer is displayed.
rag_chain.invoke("What is the person's name?")

"I don't know, there is no mention of a person's name in the provided text. \n\nThanks for asking!"