In [3]:
import requests
from os.path import exists

# Source document and the local file name it is cached under.
remote_pdf_url = "https://arxiv.org/pdf/1709.00666.pdf"
pdf_filename = "einsteins_patents_and_inventions_ch02_data.pdf"

# Only touch the network when the PDF is not cached locally, and report
# "already exists" only in that case (not after a download or a failure).
if exists(pdf_filename):
    print("File Already Exists")
else:
    # requests has no default timeout; set one so a stalled connection
    # cannot hang the notebook indefinitely.
    response = requests.get(remote_pdf_url, timeout=60)
    if response.status_code == 200:
        print("Download Started")
        with open(pdf_filename, "wb") as pdf_file:
            pdf_file.write(response.content)
        print("Download Finished")
    else:
        print(f"Failed to Download the PDF. Status Code: {response.status_code}")

File Already Exists


In [4]:
import pdfplumber

# Concatenate the extracted text of every page into one string.
with pdfplumber.open(pdf_filename) as pdf:
    # NOTE(review): some pdfplumber versions appear to return None for a page
    # with no extractable text; `or ""` keeps such a page from raising a
    # TypeError during concatenation.
    text = "".join(page.extract_text() or "" for page in pdf.pages)

# Preview the beginning of the document as a sanity check.
print(text[:100])

Einstein’s Patents and Inventions
Asis Kumar Chaudhuri
Variable Energy Cyclotron Centre
1‐AF Bidhan 


# RAG with Vector Similarity

## Chunking

In [5]:
from typing import List


def chunk_text(
    text: str, chunk_size: int, overlap: int, split_on_whitespace_only: bool = True
) -> List[str]:
    """Split ``text`` into overlapping, whitespace-stripped chunks.

    Args:
        text: The string to split.
        chunk_size: Target number of characters each step advances by.
            Must be positive.
        overlap: Number of characters of surrounding context to include.
        split_on_whitespace_only: When True (default), chunk boundaries are
            moved to space characters (``" "`` only, not newlines or tabs):
            a chunk starts at the nearest space at or before
            ``index - overlap`` (or at the start of the text) and ends at the
            first space at or after ``index + chunk_size`` (or at the end of
            the text). When False, fixed windows
            ``[index - overlap + 1, index + chunk_size + overlap)`` are cut
            every ``chunk_size`` characters, regardless of word boundaries.

    Returns:
        The list of chunks in document order; empty for empty ``text``.

    Raises:
        ValueError: If ``chunk_size`` is not positive (the scan position
            could otherwise fail to advance and loop forever).
    """
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be a positive integer, got {chunk_size}")

    chunks = []
    index = 0
    text_length = len(text)

    while index < text_length:
        if split_on_whitespace_only:
            overlap_start = index - overlap
            # Look backwards for a space at or before `overlap_start`.
            # The explicit >= 0 guard matters: a negative `end` passed to
            # rfind would be interpreted relative to the end of the string.
            if overlap_start >= 0:
                prev_whitespace = max(text.rfind(" ", 0, overlap_start + 1), 0)
            else:
                prev_whitespace = 0

            next_whitespace = text.find(" ", index + chunk_size)
            if next_whitespace == -1:
                next_whitespace = text_length

            chunks.append(text[prev_whitespace:next_whitespace].strip())
            # Resume just past the space that closed this chunk.
            index = next_whitespace + 1
        else:
            start = max(0, index - overlap + 1)
            end = min(index + chunk_size + overlap, text_length)
            chunks.append(text[start:end].strip())
            index += chunk_size

    return chunks

In [6]:
# Split the document into ~500-character chunks with 40 characters of overlap.
chunks = chunk_text(text, chunk_size=500, overlap=40)
print(len(chunks))
print(chunks[0])

89
Einstein’s Patents and Inventions
Asis Kumar Chaudhuri
Variable Energy Cyclotron Centre
1‐AF Bidhan Nagar, Kolkata‐700 064
Abstract: Times magazine selected Albert Einstein, the German born Jewish Scientist as the person of the 20th
century. Undoubtedly, 20th century was the age of science and Einstein’s contributions in unravelling mysteries
of nature was unparalleled. However, few are aware that Einstein was also a great inventor. He and his
collaborators had patented a wide variety of inventions


## Embedding

In [7]:
from sentence_transformers import SentenceTransformer
model = SentenceTransformer('sentence-transformers/all-MiniLM-L12-v2')


def embed_huggingface(text):
    """Embed ``text`` with the locally loaded SentenceTransformer ``model``.

    Args:
        text: The input passed straight to ``model.encode``.

    Returns:
        Whatever ``model.encode`` returns for ``text``.
    """
    # Encode the argument itself; the previous version ignored it and always
    # embedded chunks[0].
    return model.encode(text)


  from .autonotebook import tqdm as notebook_tqdm


In [8]:
import os
from openai import OpenAI
from dotenv import load_dotenv, find_dotenv

# Load variables from the nearest .env file into the process environment.
load_dotenv(find_dotenv())

# Both values are None when the corresponding variable is not set.
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")
OPENAI_API_BASE_URL = os.environ.get("OPENAI_API_BASE_URL")

# Client shared by the embedding and chat-completion cells.
open_ai_client = OpenAI(
    base_url=OPENAI_API_BASE_URL,
    api_key=OPENAI_API_KEY,
)

In [9]:
def embed_openai(texts, model="text-embedding-3-small"):
    """Embed a batch of texts with the OpenAI embeddings endpoint.

    Args:
        texts: The inputs to embed, passed as ``input`` to the API.
        model: Name of the embedding model; defaults to the model this
            notebook was originally written against.

    Returns:
        A list with one embedding per entry of ``response.data``, in the
        order the API returned them. An empty list/tuple yields ``[]``.
    """
    # Nothing to embed: skip the network round trip entirely.
    if isinstance(texts, (list, tuple)) and not texts:
        return []

    response = open_ai_client.embeddings.create(input=texts, model=model)
    return [item.embedding for item in response.data]

In [10]:
# Embed every chunk in a single request.
embeddings = embed_openai(chunks)

# Should equal the number of chunks.
embedding_count = len(embeddings)
print(embedding_count)

# Dimensionality of a single embedding vector.
embedding_dimensions = len(embeddings[0])
print(embedding_dimensions)

89
1536


## Vector Indexing

In [11]:
from neo4j import GraphDatabase
# Read connection settings from the environment so credentials do not have to
# live in the notebook; the defaults reproduce the original local-dev values.
NEO4J_URI = os.getenv("NEO4J_URI", "neo4j://localhost:7687")
NEO4J_USERNAME = os.getenv("NEO4J_USERNAME", "neo4j")
NEO4J_PASSWORD = os.getenv("NEO4J_PASSWORD", "password")

driver = GraphDatabase.driver(NEO4J_URI, auth=(NEO4J_USERNAME, NEO4J_PASSWORD))

In [12]:
# Vector index named `pdf` over the `embedding` property of :Chunk nodes.
# IF NOT EXISTS keeps the cell safe to re-run.
create_vector_index_query = """CREATE VECTOR INDEX pdf IF NOT EXISTS
                    FOR (c:Chunk)
                    ON c.embedding"""
driver.execute_query(create_vector_index_query)

EagerResult(records=[], summary=<neo4j._work.summary.ResultSummary object at 0x126178ec0>, keys=[])

In [13]:
# Populate the vector index: one :Chunk node per chunk, keyed by its position,
# carrying the chunk text and its embedding as properties.
#
# Cypher's range(start, end) includes BOTH bounds, so the upper bound must be
# size - 1; range(0, size($chunks)) would MERGE one extra node whose text and
# embedding are null.
# NOTE(review): a stray node created by an earlier run with the inclusive
# bound is not removed by this cell and may need manual cleanup.
cypher_query = '''
WITH $chunks AS chunks, range(0, size($chunks) - 1) AS index
UNWIND index AS i
WITH i, chunks[i] AS chunk, $embeddings[i] AS embedding
MERGE (c:Chunk {index: i})
SET c.text = chunk, c.embedding = embedding
'''

driver.execute_query(cypher_query, chunks=chunks, embeddings=embeddings)

EagerResult(records=[], summary=<neo4j._work.summary.ResultSummary object at 0x1260e2990>, keys=[])

In [14]:
# Spot-check what was stored for the first chunk.
records, _, _ = driver.execute_query(
    "MATCH (c:Chunk) WHERE c.index = 0 RETURN c.embedding, c.text"
)

first_record = records[0]
print(first_record["c.text"][:30])
print(first_record["c.embedding"][:3])

Einstein’s Patents and Inventi
[0.02373969554901123, -0.0224456824362278, -0.014681604690849781]


## Vector Similarity Search with k Approximate Nearest Neighbors

In [15]:
question = "At what time was Einstein really interested in experimental works?"
# embed_openai works on batches, so wrap the question and take the only result.
question_embedding = embed_openai([question])[0]

# Ask the `pdf` vector index for the k approximate nearest neighbours.
query = '''
CALL db.index.vector.queryNodes('pdf', $k, $question_embedding) YIELD node as hits, score
RETURN hits.text as text, score, hits.index AS index
'''

similar_records, _, _ = driver.execute_query(
    query,
    k=4,
    question_embedding=question_embedding,
)

# Show each hit: its text, then its similarity score and chunk position.
for hit in similar_records:
    print(hit["text"])
    print(hit["score"], hit["index"])
    print("======")

CH‐Switzerland
Considering Einstein’s upbringing, his interest in inventions and patents was not unusual.
Being a manufacturer’s son, Einstein grew upon in an environment of machines and instruments.
When his father’s company obtained the contract to illuminate Munich city during beer festival, he
was actively engaged in execution of the contract. In his ETH days Einstein was genuinely interested
in experimental works. He wrote to his friend, “most of the time I worked in the physical laboratory,
fascinated by the direct contact with observation.” Einstein's
0.8108761310577393 42
Einstein
left his job at the Patent office and joined the University of Zurich on October 15, 1909. Thereafter, he
continued to rise in ladder. In 1911, he moved to Prague University as a full professor, a year later, he
was appointed as full professor at ETH, Zurich, his alma‐mater. In 1914, he was appointed Director of
the Kaiser Wilhelm Institute for Physics (1914–1932) and a professor at the Humboldt Unive

## Generation

In [16]:
system_message = "You're an Einstein expert, but can only use the provided documents to respond to the questions."

# Hand the retrieved passages to the model as plain text. Interpolating the
# Python list directly would inject its repr (brackets, quotes, escaped "\n")
# into the prompt.
documents = "\n\n".join(doc["text"] for doc in similar_records)

user_message = f"""
Use the following documents to answer the question that will follow: 
{documents}

---

The question to answer using information only from the above documents {question}
"""
print("Question:", question)

stream = open_ai_client.chat.completions.create(
    model="gpt-4.1",
    messages=[
        {"role": "system", "content": system_message},
        {"role": "user", "content": user_message}
    ],
    stream=True
)

# Print the answer incrementally as tokens arrive.
for event in stream:
    # A streamed chunk can carry an empty `choices` list (e.g. usage-only
    # chunks, or some OpenAI-compatible backends); skip those rather than
    # raising IndexError.
    if not event.choices:
        continue
    print(event.choices[0].delta.content or "", end="")

Question: At what time was Einstein really interested in experimental works?
According to the provided documents, Einstein was genuinely interested in experimental works during his ETH days. He wrote to his friend, “most of the time I worked in the physical laboratory, fascinated by the direct contact with observation.”

# Full-Text Search

In [17]:
# Create a full-text index named PdfChunkFulltext on the text property of :Chunk nodes.
# IF NOT EXISTS makes the cell re-runnable (same approach as the vector index
# cell) without a bare `except:` that would report every failure - connection
# errors, syntax errors, auth problems - as "index already exists".
driver.execute_query(
    "CREATE FULLTEXT INDEX PdfChunkFulltext IF NOT EXISTS FOR (c:Chunk) ON EACH [c.text]"
);

In [21]:
# Characters with special meaning in the Lucene query syntax used by
# db.index.fulltext.queryNodes.
LUCENE_SPECIAL_CHARACTERS = frozenset('+-&|!(){}[]^"~*?:\\/')


def escape_lucene_query(raw_query):
    """Backslash-escape Lucene query-syntax characters in ``raw_query``.

    A natural-language question is not a Lucene expression: left unescaped,
    its trailing "?" is parsed as a single-character wildcard, and characters
    such as ':' or an unbalanced '"' / '(' can make the query fail to parse.
    """
    return "".join(
        "\\" + char if char in LUCENE_SPECIAL_CHARACTERS else char
        for char in raw_query
    )


# Hybrid retrieval: take the top-k hits from the vector index and from the
# full-text index, normalise each result set by its own best score, then keep
# the best score per node and return the overall top k.
hybrid_query = '''
CALL {
    // vector index
    CALL db.index.vector.queryNodes('pdf', $k, $question_embedding) YIELD node, score
    WITH collect({node:node, score:score}) AS nodes, max(score) As max
    UNWIND nodes AS n
    // Normalize scores
    RETURN n.node as node, (n.score / max) AS score
    UNION
    // keyword index
    CALL db.index.fulltext.queryNodes('PdfChunkFulltext', $question, {limit: $k}) YIELD node, score
    WITH collect({node:node, score:score}) AS nodes, max(score) AS max UNWIND nodes AS n
    // We use 0 as min
    RETURN n.node AS node, (n.score / max) AS score
}
//deduplicate nodes
WITH node, max(score) AS score ORDER BY score DESC LIMIT $k
RETURN node, score
'''

similar_hybrid_records, _, _ = driver.execute_query(
    hybrid_query,
    question_embedding=question_embedding,
    # Only the keyword search needs the escaped form; the embedding was
    # computed from the original question.
    question=escape_lucene_query(question),
    k=4,
)

for record in similar_hybrid_records:
    print(record["node"]["text"])
    print(record["score"], record["node"]["index"])
    print("========")



CH‐Switzerland
Considering Einstein’s upbringing, his interest in inventions and patents was not unusual.
Being a manufacturer’s son, Einstein grew upon in an environment of machines and instruments.
When his father’s company obtained the contract to illuminate Munich city during beer festival, he
was actively engaged in execution of the contract. In his ETH days Einstein was genuinely interested
in experimental works. He wrote to his friend, “most of the time I worked in the physical laboratory,
fascinated by the direct contact with observation.” Einstein's
1.0 42
Einstein
left his job at the Patent office and joined the University of Zurich on October 15, 1909. Thereafter, he
continued to rise in ladder. In 1911, he moved to Prague University as a full professor, a year later, he
was appointed as full professor at ETH, Zurich, his alma‐mater. In 1914, he was appointed Director of
the Kaiser Wilhelm Institute for Physics (1914–1932) and a professor at the Humboldt University of
Berlin