In [1]:
import fitz

def extract_text_from_pdf(pdf_path):
    """Extract the plain text of every page of a PDF.

    Args:
        pdf_path: Path of the PDF file, passed to ``fitz.open``.

    Returns:
        A list with one dict per page, in document order. Each dict holds
        ``"page_number"`` (1-based) and ``"text"`` (the page text with leading
        and trailing whitespace stripped).
    """
    # Open the document in a ``with`` block so it (and its file handle) is
    # closed even if extraction fails part-way; it was previously left open.
    with fitz.open(pdf_path) as doc:
        return [
            {
                "page_number": page_number,
                "text": page.get_text("text").strip(),
            }
            for page_number, page in enumerate(doc, start=1)
        ]


In [2]:
pdf_path = "test_pdf_1.pdf"

In [3]:
pages = extract_text_from_pdf(pdf_path)

In [4]:
print("Total pages:", len(pages))
print("\nFirst 1000 characters:\n")
print(pages[17]["text"][:1000])

Total pages: 36

First 1000 characters:

Tenant Floors
Max Towers provides you with a state-of-the-art blank slate workspace 
that you can fully customise to suit your needs.
Designed such that 90% of regular occupied space gets direct  
line-of-sight to the outside environment, the average ceiling height on 
our tenant ﬂoor plates is 4.3 metres. Combined with optimum column 
spacing for eﬃcient ﬂoor planning, raised ﬂooring and a 10.5 meter 
planning grid, our spaces are agile, empowering you to build your 
workplace the way you want.
Above and facing page Oﬃces at Max Towers
35
34


In [5]:
import os
pdf_data = {}

pdf_folder = "pdfs"  # folder containing PDFs

# os.listdir() returns entries in arbitrary order; sort so the order of
# pdf_data (and of anything built by iterating over it) is the same on every run.
for filename in sorted(os.listdir(pdf_folder)):
    path = os.path.join(pdf_folder, filename)
    # Match the extension case-insensitively ("REPORT.PDF") and skip anything
    # that is not a regular file, e.g. a directory whose name ends in ".pdf".
    if filename.lower().endswith(".pdf") and os.path.isfile(path):
        pdf_data[filename] = extract_text_from_pdf(path)

print("Loaded PDFs:", list(pdf_data.keys()))


Loaded PDFs: ['test_pdf_1.pdf', 'test_pdf_2.pdf', 'test_pdf_3.pdf', 'test_pdf_4.pdf']


In [6]:
!pip install spacy
!python -m spacy download en_core_web_sm






[notice] A new release of pip is available: 24.3.1 -> 26.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip


Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
     ---------------------------------------- 0.0/12.8 MB ? eta -:--:--
     -- ------------------------------------- 0.8/12.8 MB 7.2 MB/s eta 0:00:02
     --------- ------------------------------ 2.9/12.8 MB 10.4 MB/s eta 0:00:01
     ------------------ --------------------- 6.0/12.8 MB 12.2 MB/s eta 0:00:01
     ------------------------------ -------- 10.0/12.8 MB 13.9 MB/s eta 0:00:01
     --------------------------------------- 12.8/12.8 MB 14.0 MB/s eta 0:00:00
[38;5;2m[+] Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')



[notice] A new release of pip is available: 24.3.1 -> 26.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [7]:
import spacy
nlp = spacy.load("en_core_web_sm")


In [8]:
import re

def preprocess(text):
    """Flatten extracted text into one whitespace-normalised line.

    A hyphen directly followed by a newline is removed (rejoining a word split
    across lines), newline runs and then any remaining whitespace runs are each
    replaced by a single space, and the result is stripped.
    """
    substitutions = (
        (r'-\n', ''),    # rejoin words hyphenated across a line break
        (r'\n+', ' '),   # newline runs become one space
        (r'\s+', ' '),   # collapse whatever whitespace runs remain
    )
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)

    return text.strip()


In [9]:
def spacy_chunk(text, max_chars=700):
    """Group the sentences of ``text`` into chunks of at most ``max_chars`` characters.

    Sentences come from the module-level spaCy pipeline ``nlp`` and are joined
    with single spaces. A sentence is never split: one that is longer than
    ``max_chars`` on its own becomes a chunk by itself.

    Args:
        text: Text to split.
        max_chars: Upper bound on chunk length, counting the joining spaces.

    Returns:
        A list of non-empty chunk strings in reading order.
    """
    doc = nlp(text)
    chunks = []
    current_chunk = ""

    for sent in doc.sents:
        sentence = sent.text.strip()
        if not sentence:
            continue

        # Measure the chunk as it would actually be stored, including the
        # joining space, so a chunk can no longer overshoot max_chars by one.
        candidate = f"{current_chunk} {sentence}" if current_chunk else sentence

        if len(candidate) <= max_chars:
            current_chunk = candidate
        else:
            # Only flush a real chunk: when the very first sentence is already
            # too long, current_chunk is still empty and must not be emitted.
            if current_chunk:
                chunks.append(current_chunk)
            current_chunk = sentence

    if current_chunk:
        chunks.append(current_chunk)

    return chunks


In [10]:
!pip install langchain langchain-community langchain-core




[notice] A new release of pip is available: 24.3.1 -> 26.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [48]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

splitter = RecursiveCharacterTextSplitter(
    chunk_size=350,        # characters, not words: no length_function is passed, so the default (len) applies
    chunk_overlap=100,     # characters repeated between neighbouring chunks
    separators=["\n\n", "\n", ".", " "]
)

documents = []

# The loop variable is `pdf_pages`, not `pages`, so this cell no longer
# rebinds the notebook-level `pages` list created by the single-PDF cell.
for pdf_name, pdf_pages in pdf_data.items():
    for page in pdf_pages:
        chunks = splitter.split_text(page["text"])

        # One record per chunk; the PDF name and page number let a retrieved
        # chunk be traced back to its source.
        for chunk in chunks:
            documents.append({
                "text": chunk,
                "pdf_name": pdf_name,
                "page_number": page["page_number"]
            })


print("Total document chunks:", len(documents))


Total document chunks: 375


In [49]:
documents[1]

{'text': '“It’s not what went into a building, \nit’s what’ll come out of it.”\n— Jonathan Ive\nChief Design Oﬃcer, Apple\nMax Towers at dusk  \nas seen from the plaza  \ninside Delhi One\n3\n2',
 'pdf_name': 'test_pdf_1.pdf',
 'page_number': 2}

In [13]:
!pip install sentence-transformers





[notice] A new release of pip is available: 24.3.1 -> 26.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [14]:
from sentence_transformers import SentenceTransformer

model = SentenceTransformer("all-MiniLM-L6-v2")


Loading weights: 100%|█████████████████████| 103/103 [00:00<00:00, 892.88it/s, Materializing param=pooler.dense.weight]
[1mBertModel LOAD REPORT[0m from: sentence-transformers/all-MiniLM-L6-v2
Key                     | Status     |  | 
------------------------+------------+--+-
embeddings.position_ids | UNEXPECTED |  | 

[3mNotes:
- UNEXPECTED[3m	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.[0m


In [50]:
# Texts to embed, in the same order as `documents`, so that row i of the
# embedding matrix can be mapped back to documents[i] by the retrieval helpers.
texts = [doc["text"] for doc in documents]

# normalize_embeddings=True requests normalised vectors from the model
# (presumably unit length, so inner product equals cosine similarity —
# confirm against the sentence-transformers documentation).
embeddings = model.encode(
    texts,
    convert_to_numpy=True,
    normalize_embeddings=True
)


# Sanity check: the first dimension should equal the number of chunks.
print("Embedding shape:", embeddings.shape)


Embedding shape: (375, 384)


In [41]:
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity


In [51]:
def retrieve(query, top_k=3):
    """Brute-force retrieval: score every chunk against ``query`` by cosine similarity.

    Args:
        query: Question text to embed with the module-level ``model``.
        top_k: How many of the highest-scoring chunks to return.

    Returns:
        A list of ``(document, score)`` pairs, best match first, where
        ``document`` is the entry of ``documents`` at the matching row of
        ``embeddings``.
    """
    query_vector = model.encode([query], convert_to_numpy=True)
    scores = cosine_similarity(query_vector, embeddings)[0]

    # argsort is ascending, so take the tail and reverse it for best-first order.
    best_first = np.argsort(scores)[-top_k:][::-1]

    return [(documents[idx], scores[idx]) for idx in best_first]


In [52]:
results = retrieve("What is the ceiling height?")

for chunk, score in results:
    print("\nScore:", score)
    print(chunk)



Score: 0.56067395
{'text': 'Wooden wall finish & panelling\nHigh ceilings with a height of 3.75 meters\nEfficient floor plates with a clean, efficient rectangular design \nPlants and nature wherever you look \nLobby of Max House\n\x1a', 'pdf_name': 'test_pdf_2.pdf', 'page_number': 7}

Score: 0.55218077
{'text': 'RESIDENCES\nGenerous ceiling height of 10.5ft \nAll residences have been designed  \nto have their own private gardens\nViews from every residence have \nbeen carefully curated to capture the \nstunning natural landscape\nProvision for 2 car parking spaces in \nevery residence\nPublic areas such as kitchen, staff \nrooms, service and utility have been', 'pdf_name': 'test_pdf_3.pdf', 'page_number': 31}

Score: 0.5036244
{'text': 'Generous ceiling height (10.5 ft)\nMaintenance free facade\nInsulated premium windows\nPerlato marble ﬂooring\nLandscaped terrace\nCOURTYARD VILLAS\nUnits 5\nPlot size range 364-565 sq.m.\nBuilt up area 4197 sq.ft.²\nCarpet area 3276 sq.ft.²\nBalcony a

In [19]:
!pip install faiss-cpu





[notice] A new release of pip is available: 24.3.1 -> 26.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [20]:
import faiss
import numpy as np


In [53]:
import numpy as np

# Normalize embeddings
faiss.normalize_L2(embeddings)


In [54]:
dimension = embeddings.shape[1]

index = faiss.IndexFlatIP(dimension)  # Inner Product (cosine similarity after normalization)
index.add(embeddings)

print("Total vectors in index:", index.ntotal)


Total vectors in index: 375


In [55]:
def retrieve_faiss(query, top_k=3):
    """Retrieve the best-matching chunks for ``query`` from the FAISS index.

    The query is embedded with the module-level ``model``, L2-normalised in
    place, and searched against the module-level inner-product ``index``.

    Args:
        query: Question text.
        top_k: Maximum number of hits to return.

    Returns:
        A list of ``(document, score)`` pairs in the order FAISS returns them.
        ``document`` is the matching entry of ``documents`` (the same object,
        not a copy). The list is shorter than ``top_k`` when the index holds
        fewer vectors than requested.
    """
    query_embedding = model.encode([query], convert_to_numpy=True)

    # Normalize query vector
    faiss.normalize_L2(query_embedding)

    distances, indices = index.search(query_embedding, top_k)

    results = []
    for doc_idx, score in zip(indices[0], distances[0]):
        # FAISS reports a missing hit as index -1; documents[-1] would silently
        # return the last chunk as if it were a match, so skip those slots.
        if doc_idx < 0:
            continue
        results.append((documents[doc_idx], score))

    return results


In [115]:
results = retrieve_faiss("How long does it take to reach suryaa hotel from Max House?")



def clean_text(text):
    """Tidy chunk text for display.

    Collapses every whitespace run (including newlines) to one space, removes
    whitespace before ``? . ! ,`` and before contraction suffixes such as
    ``'s`` / ``'t``, then strips the result.

    Args:
        text: Raw chunk text.

    Returns:
        The cleaned, single-line string.
    """
    text = re.sub(r"\s+", " ", text)
    text = re.sub(r"\s+(?=[?.!,])", "", text)
    # Only pull an apostrophe back when it starts a contraction/possessive
    # suffix. Treating every apostrophe as trailing punctuation glued opening
    # quotes to the previous word ("project 'Max Towers'" -> "project'Max Towers'").
    text = re.sub(r"\s+(?='(?:s|t|d|m|re|ve|ll)\b)", "", text)
    return text.strip()

# Clean a copy for display only. The dicts in `results` are the very objects
# stored in `documents`, so assigning to chunk["text"] would permanently
# rewrite the corpus text after it has been embedded and indexed.
for chunk, score in results:
    display_chunk = {**chunk, "text": clean_text(chunk["text"])}
    print("\nScore:", score)
    print(display_chunk)




Score: 0.6477129
{'text': "The Suryaa 5 minutes Eros Hotel 9 minutes Crowne Plaza 15 minutes Sarovar Portico 8 minutes Strategically located within walking distance of the Okhla NSIC Metro Station, Max House offers excellent accessibility and connectivity to Delhi's various central business districts, airports and residential neighbourhoods. \x16", 'pdf_name': 'test_pdf_2.pdf', 'page_number': 3}

Score: 0.596907
{'text': '25 minutes Terminal 1D (Airport) 30 minutes MG Road (Gurgaon) 40 minutes Jasola 8 minutes -Iپ\x03WN\x033IQTI[P 8 minutes New Friends Colony 10 minutes Greater Kailash 10 minutes Connaught Place 22 minutes Kalkaji Interconnect Walking distance 0W\\MT[,ZQ^QVO\x03<QUM The Suryaa 5 minutes Eros Hotel 9 minutes Crowne Plaza 15 minutes Sarovar Portico 8 minutes', 'pdf_name': 'test_pdf_2.pdf', 'page_number': 3}

Score: 0.51668584
{'text': "station, Max Towers provides excellent connectivity to Delhi's various central business districts, airports and residential neighbourhoo

In [61]:
!pip install pandas


Collecting pandas
  Downloading pandas-3.0.0-cp313-cp313-win_amd64.whl.metadata (19 kB)
Downloading pandas-3.0.0-cp313-cp313-win_amd64.whl (9.7 MB)
   ---------------------------------------- 0.0/9.7 MB ? eta -:--:--
   ---------- ----------------------------- 2.6/9.7 MB 17.5 MB/s eta 0:00:01
   ------------------------------- -------- 7.6/9.7 MB 21.5 MB/s eta 0:00:01
   ---------------------------------------- 9.7/9.7 MB 21.2 MB/s eta 0:00:00
Installing collected packages: pandas
Successfully installed pandas-3.0.0



[notice] A new release of pip is available: 24.3.1 -> 26.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [116]:
import pandas as pd

eval_df = pd.read_csv("evaluation_questions.csv", encoding="cp1252")



In [117]:
eval_df

Unnamed: 0,id,question,expected_answer_keyword
0,1,Does the project - Max Towers mention any cert...,UPRERA
1,2,"At Max Towers, solar heat gain is controlled t...",vertical-fin
2,3,Most prime location of max towers?,Noida
3,4,when was max estates established?,2016
4,5,What is the Solar Heat Gain Coefficient (SHGC)...,0.25
5,6,What percentage of visible light transmission ...,34%
6,7,What is the total Super Built Up Area of Max H...,105000
7,8,What is the green building certification ratin...,LEED Gold
8,9,How long does it take to reach suryaa hotel fr...,5
9,10,How far is Connaught Place from Max House?,11


In [118]:
import numpy as np
import time

In [119]:
for _, row in eval_df.iterrows():
    question = row["question"]
    results = retrieve_faiss(question)

In [120]:
len(results)

3

In [121]:
# Print the top three retrieved chunks, with source PDF and page, for each
# evaluation question.
for question in eval_df["question"]:
    print("\n" + "="*80)
    print("Question:", question)

    results = retrieve_faiss(question, top_k=3)

    for rank, (chunk, score) in enumerate(results, start=1):
        print(f"\nTop {rank} | Score: {score}")
        print("PDF:", chunk["pdf_name"])
        print("Page:", chunk["page_number"])
        print("Answer Chunk:\n", chunk["text"])



Question: Does the project - Max Towers mention any certifications, approvals, or RERA registration?

Top 1 | Score: 0.7475370168685913
PDF: test_pdf_1.pdf
Page: 36
Answer Chunk:
 The project 'Max Towers' is registered with the UPRERA with registration no. UPRERAPRJ12475. 
Please refer to project details on the website of UPRERA www.up-rera.in.
OWNERSHIP 
We would love to see you soon.
For a visit to our experience centre and the site, contact us at
+91- 95553 95222
info@maxestates.in
www.maxtowers.com

Top 2 | Score: 0.726293683052063
PDF: test_pdf_2.pdf
Page: 15
Answer Chunk:
 CIN : U70109UP2016PTC087374. Its Occupation Certificate has been received on21.12.2018. The project 'Max Towers' is registered with the UPRERA with registration no. UPRERAPRJ12475. 
Please refer to project details on the website of UPRERA www.up-rera.in.

Top 3 | Score: 0.7247656583786011
PDF: test_pdf_3.pdf
Page: 35
Answer Chunk:
 on 21.12.2018. The project 'Max Towers' is registered with the UPRERA with regi

In [122]:
import time
import numpy as np
import re

def normalize(text):
    """Lower-case ``text``, turn hyphens into spaces, drop ``~`` and collapse whitespace."""
    text = text.lower()
    text = text.replace("-", " ")
    text = text.replace("~", "")
    text = re.sub(r"\s+", " ", text)
    return text.strip()

def clean_number(text):
    """Remove commas that sit between two digits (``1,05,000`` -> ``105000``)."""
    # Remove commas from numbers
    return re.sub(r"(?<=\d),(?=\d)", "", text)

def contains_answer(chunk_text, expected):
    """Check whether a retrieved chunk contains the expected answer keyword.

    Both inputs are passed through ``normalize`` and ``clean_number``; the
    keyword is then split on whitespace and every resulting word must be
    found in the chunk.

    * Words without digits use substring matching, so ``fin`` still matches
      ``fins``.
    * Words containing a digit must match as a whole number: the hit may not
      be directly preceded or followed by a digit or by a decimal part.
      Otherwise the keyword ``5`` would be "found" in ``15 minutes`` or
      ``10.5 ft`` and inflate the accuracy figures.

    Args:
        chunk_text: Retrieved chunk text.
        expected: Expected keyword; non-string values (for example an int read
            from a numeric CSV column) are converted with ``str``.

    Returns:
        ``True`` if every keyword word is present, ``False`` otherwise. An
        empty keyword never matches.
    """
    chunk_text = clean_number(normalize(str(chunk_text)))
    expected = clean_number(normalize(str(expected)))

    expected_words = expected.split()
    if not expected_words:
        # all([]) is True, which would count a blank keyword as a hit.
        return False

    for word in expected_words:
        if any(ch.isdigit() for ch in word):
            # Reject hits embedded in a longer number: no digit, "<digit>."
            # prefix or ".<digit>" suffix may touch the match.
            pattern = r"(?<!\d)(?<!\d\.)" + re.escape(word) + r"(?!\d)(?!\.\d)"
            if re.search(pattern, chunk_text) is None:
                return False
        elif word not in chunk_text:
            return False

    return True



# Counters and failure logs for the retrieval evaluation.
top1_correct = 0
top3_correct = 0
latencies = []

top1_failed = []
top3_failed = []

total_questions = len(eval_df)

for _, row in eval_df.iterrows():
    question = row["question"]
    expected = row["expected_answer_keyword"]

    # perf_counter() is the clock intended for timing short intervals;
    # time.time() is wall-clock time and can jump or be too coarse at the
    # millisecond scale measured here.
    start = time.perf_counter()
    results = retrieve_faiss(question, top_k=3)
    latency = time.perf_counter() - start
    latencies.append(latency)

    retrieved_texts = [chunk["text"] for chunk, _score in results]

    # ---------- Top-1 ----------
    # Guard against an empty result list instead of indexing results[0] blindly.
    if retrieved_texts and contains_answer(retrieved_texts[0], expected):
        top1_correct += 1
    else:
        top1_failed.append({
            "question": question,
            "expected": expected,
            "retrieved_top1": retrieved_texts[0][:400] if retrieved_texts else ""
        })

    # ---------- Top-3 ----------
    found_in_top3 = any(contains_answer(text, expected) for text in retrieved_texts)

    if found_in_top3:
        top3_correct += 1
    else:
        top3_failed.append({
            "question": question,
            "expected": expected,
            "retrieved_chunks": [text[:300] for text in retrieved_texts]
        })


# -------- Final Metrics --------
# Summarise retrieval latency (mean and 95th percentile) and report the
# fraction of questions whose keyword appeared in the top-1 / top-3 chunks.
avg_latency = np.mean(latencies)
p95_latency = np.percentile(latencies, 95)

print("\n===== FINAL METRICS =====")
print(f"Total Questions: {total_questions}")
print(f"Top-1 Accuracy: {round(top1_correct / total_questions, 3)}")
print(f"Top-3 Accuracy: {round(top3_correct / total_questions, 3)}")
print(f"Average Latency (s): {round(avg_latency, 4)}")
print(f"P95 Latency (s): {round(p95_latency, 4)}")


# -------- Failure Analysis --------
# List every question whose keyword was missing from the top-1 chunk, then
# every question whose keyword was missing from all top-3 chunks.
print("\n===== TOP-1 FAILED QUESTIONS =====")
for failure in top1_failed:
    print(f"\nQuestion: {failure['question']}")
    print(f"Expected: {failure['expected']}")
    print(f"Top-1 Retrieved Snippet: {failure['retrieved_top1']}")
    print("-" * 60)


print("\n===== TOP-3 FAILED QUESTIONS =====")
for failure in top3_failed:
    print(f"\nQuestion: {failure['question']}")
    print(f"Expected: {failure['expected']}")
    print("Retrieved Top-3 Snippets:")
    for text in failure["retrieved_chunks"]:
        print(f"-> {text}")
    print("-" * 60)



===== FINAL METRICS =====
Total Questions: 20
Top-1 Accuracy: 0.8
Top-3 Accuracy: 0.85
Average Latency (s): 0.0102
P95 Latency (s): 0.0207

===== TOP-1 FAILED QUESTIONS =====

Question: At Max Towers, solar heat gain is controlled through which system?
Expected: vertical-fin
Top-1 Retrieved Snippet: Max Towers is  
LEED Platinum Certiﬁed.
Max Towers is a thought leader in sustainability and aims to minimise 
its ecological footprint. To do so is important to us because we feel  
a certain responsibility towards our planet, and we invite you to share 
our enthusiasm for the same.
Ground-water recharge
An extensive rainwater  
management system
------------------------------------------------------------

Question: What is the green building certification rating of Max House?
Expected: LEED Gold
Top-1 Retrieved Snippet: Designed Responsibly and Sustainably Green Building Max Square is IGBC Platinum Certiﬁed for Green Building Standards including appropriate orientation, adequate walls a