In [22]:
import fitz
from pathlib import Path

def extract_pdf_text(pdf_path):
    """Extract the plain text of every non-empty page of a PDF.

    Args:
        pdf_path: Path to the PDF file, passed straight to ``fitz.open``.

    Returns:
        A list with one dict per page that contains non-whitespace text, in
        page order. Each dict holds ``"pdf_name"`` (the file name without its
        directories), ``"page_number"`` (1-based) and ``"text"`` (the page
        text with leading/trailing whitespace removed).
    """
    pdf_name = Path(pdf_path).name  # same for every page, so compute it once
    pages_data = []

    # The context manager closes the document even when loading a page or
    # extracting its text raises; a trailing doc.close() would be skipped.
    with fitz.open(pdf_path) as doc:
        for page_num in range(len(doc)):
            text = doc.load_page(page_num).get_text("text").strip()

            if text:  # avoid empty pages
                pages_data.append({
                    "pdf_name": pdf_name,
                    "page_number": page_num + 1,
                    "text": text
                })

    return pages_data

In [23]:
# Extract per-page text from the brochure PDF and show the first page record.
# NOTE(review): the path is absolute and machine-specific, so this cell will
# not run on another machine as written; consider a configurable data dir.
pdf_data = extract_pdf_text(r"C:\Users\manav\OneDrive\Desktop\Interview_Question_Creator\data\introduction-to-Estate-360.pdf")
print(pdf_data[0])

{'pdf_name': 'introduction-to-Estate-360.pdf', 'page_number': 1, 'text': 'LiveWell\nat\nINTRODUCTION TO ESTATE 360 | HARERA no.: RC/REP/HARERA/GGM/860/592/2024/87 | HARERA website: https://haryanarera.gov.in'}


In [24]:
# Show the kernel's current working directory.
%pwd

'c:\\Users\\manav\\OneDrive\\Desktop\\Interview_Question_Creator'

In [25]:
# Move the kernel's working directory up one level.
# NOTE(review): this changes kernel state; re-running the cell moves up again,
# so any relative path used by other cells depends on how often it was run.
%cd ..

c:\Users\manav\OneDrive\Desktop


  self.shell.db['dhist'] = compress_dhist(dhist)[-100:]


In [26]:
# Show the working directory again to confirm the effect of the `%cd` above.
%pwd

'c:\\Users\\manav\\OneDrive\\Desktop'

In [27]:
import re

def _make_chunk(page, words):
    """Build one chunk record that carries the source page's metadata."""
    return {
        "pdf_name": page["pdf_name"],
        "page_number": page["page_number"],
        "text": " ".join(words)
    }


def chunk_text(pages_data, chunk_size=200, overlap=40):
    """Split page texts into word-based chunks with overlap between neighbours.

    Each page is split into paragraphs on blank lines, and paragraphs are
    packed greedily into a chunk. When adding the next paragraph would push
    the chunk past ``chunk_size`` words, the chunk is emitted and the next one
    starts with the last ``overlap`` words of the emitted chunk. Chunks never
    span pages.

    Paragraphs are never split, so a chunk can exceed ``chunk_size`` when a
    single paragraph (plus the carried-over overlap) is longer than the limit.

    Args:
        pages_data: Iterable of dicts with ``"pdf_name"``, ``"page_number"``
            and ``"text"`` keys.
        chunk_size: Target maximum number of words per chunk.
        overlap: Number of trailing words repeated at the start of the next
            chunk on the same page. ``0`` disables overlap.

    Returns:
        A list of dicts with ``"pdf_name"``, ``"page_number"`` and ``"text"``
        (the chunk's words joined by single spaces).

    Raises:
        ValueError: If ``overlap`` is negative.
    """
    if overlap < 0:
        raise ValueError("overlap must be non-negative")

    all_chunks = []

    for page in pages_data:
        paragraphs = re.split(r'\n\s*\n', page["text"])  # split by blank lines

        current_chunk = []

        for para in paragraphs:
            words = para.split()

            # Flush only a non-empty chunk that the paragraph would overflow.
            if current_chunk and len(current_chunk) + len(words) > chunk_size:
                all_chunks.append(_make_chunk(page, current_chunk))

                # `lst[-0:]` is the whole list, so zero overlap must be
                # handled explicitly or every chunk would carry all prior text.
                current_chunk = current_chunk[-overlap:] if overlap > 0 else []

            current_chunk.extend(words)

        # Emit whatever is left for this page.
        if current_chunk:
            all_chunks.append(_make_chunk(page, current_chunk))

    return all_chunks


In [28]:
# Chunk the extracted pages with the default chunk size and overlap, then
# report how many chunks were produced and show the first one.
chunks = chunk_text(pdf_data)
print(len(chunks))
print(chunks[0])

69
{'pdf_name': 'introduction-to-Estate-360.pdf', 'page_number': 1, 'text': 'LiveWell at INTRODUCTION TO ESTATE 360 | HARERA no.: RC/REP/HARERA/GGM/860/592/2024/87 | HARERA website: https://haryanarera.gov.in'}


In [29]:
import torch
# Report whether torch can see a CUDA device.
print(torch.cuda.is_available())

True


In [30]:
from sentence_transformers import SentenceTransformer
from sentence_transformers import CrossEncoder

import torch

# Use the GPU when torch can see one, otherwise fall back to the CPU so this
# cell also runs on machines without CUDA instead of failing at model load.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

# Bi-encoder used to embed both the chunks and the queries. Alternatives that
# were tried are kept commented out for reference.
# NOTE(review): E5 models are documented to expect "query: " / "passage: "
# prefixes on their inputs — confirm against the model card, since the encode
# calls in this notebook pass raw text.
# model = SentenceTransformer(
#     "all-MiniLM-L6-v2",
#     device="cuda"  # use GPU
# )
model = SentenceTransformer("intfloat/e5-small", device=DEVICE)
# model = SentenceTransformer("BAAI/bge-small-en-v1.5", device="cuda")

# Cross-encoder that re-scores (query, chunk) pairs after candidate retrieval.
reranker = CrossEncoder("cross-encoder/ms-marco-MiniLM-L-6-v2", device=DEVICE)
# reranker = CrossEncoder("BAAI/bge-reranker-base")

In [31]:
from tqdm import tqdm
import numpy as np

# Embed every chunk's text in batches; row i of `embeddings` corresponds to
# chunks[i].
# NOTE(review): the texts are encoded without any prefix. E5 models are
# documented to expect "passage: " on indexed text — confirm against the
# model card before relying on the retrieval scores.
texts = [chunk["text"] for chunk in chunks]

embeddings = model.encode(
    texts,
    batch_size=64,
    show_progress_bar=True,
    convert_to_numpy=True,
    normalize_embeddings=True  # IMPORTANT for cosine similarity
)

# Shape is (number of chunks, embedding dimension).
print(embeddings.shape)

Batches: 100%|██████████| 2/2 [00:00<00:00,  3.00it/s]

(69, 384)





In [32]:
import faiss

# Build an exact inner-product index over the chunk embeddings. Vectors are
# added in chunk order, so an index position maps directly to chunks[i].
dimension = embeddings.shape[1]

index = faiss.IndexFlatIP(dimension)  # cosine similarity (because normalized)
index.add(embeddings)

print("Total vectors in index:", index.ntotal)

Total vectors in index: 69


In [33]:
from rank_bm25 import BM25Okapi
import numpy as np

# Tokenize corpus for BM25: lowercase and split on whitespace, in chunk order
# so that BM25 score position i corresponds to chunks[i]. Queries must be
# tokenized the same way.
tokenized_corpus = [chunk["text"].lower().split() for chunk in chunks]
bm25 = BM25Okapi(tokenized_corpus)

In [34]:
# def search(query, top_k=3):
#     # Embed query
#     query_embedding = model.encode(
#         ["query: "+ query],
#         convert_to_numpy=True,
#         normalize_embeddings=True
#     )
    
#     # Search
#     scores, indices = index.search(query_embedding, top_k)
    
#     results = []
#     for score, idx in zip(scores[0], indices[0]):
#         results.append({
#             "score": float(score),
#             "text": chunks[idx]["text"],
#             "pdf_name": chunks[idx]["pdf_name"],
#             "page_number": chunks[idx]["page_number"]
#         })
    
#     return results


In [35]:
def _min_max_normalize(scores):
    """Rescale a non-empty score array to [0, 1]; the epsilon avoids 0/0 when all scores are equal."""
    lowest = scores.min()
    return (scores - lowest) / (scores.max() - lowest + 1e-8)


def search(query, top_k=3, candidate_k=8, alpha=0.5):
    """Hybrid (dense + BM25) retrieval followed by cross-encoder reranking.

    Relies on the notebook globals ``model``, ``index``, ``bm25``,
    ``reranker`` and ``chunks``.

    Args:
        query: Natural-language query string.
        top_k: Number of results to return after reranking.
        candidate_k: Number of candidates taken from each retriever, and the
            maximum size of the pool passed to the reranker.
        alpha: Weight of the normalized dense score in the hybrid score; the
            normalized BM25 score gets ``1 - alpha``.

    Returns:
        Up to ``top_k`` dicts ordered by descending reranker score, each with
        ``"score"`` (raw reranker score as a float), ``"text"``,
        ``"pdf_name"`` and ``"page_number"``. Empty when there are no
        candidates.
    """
    # -------------------------
    # 1. Dense retrieval
    # -------------------------
    # NOTE(review): the query is encoded without a prefix. E5 models are
    # documented to expect "query: " — confirm against the model card.
    query_embedding = model.encode(
        [query],
        convert_to_numpy=True,
        normalize_embeddings=True
    )

    dense_scores, dense_indices = index.search(query_embedding, candidate_k)
    dense_scores = dense_scores[0]
    dense_indices = dense_indices[0]

    # When the index holds fewer than candidate_k vectors, FAISS pads the
    # result with index -1 and a sentinel score. Drop those slots first:
    # -1 would silently address chunks[-1], and the sentinel would dominate
    # the min-max normalization below.
    found = dense_indices >= 0
    dense_scores = dense_scores[found]
    dense_indices = dense_indices[found]

    # Chunk index -> dense score normalized to [0, 1] among the candidates.
    dense_by_idx = {}
    if dense_indices.size:
        dense_by_idx = dict(
            zip(dense_indices.tolist(), _min_max_normalize(dense_scores))
        )

    # -------------------------
    # 2. BM25 retrieval
    # -------------------------
    tokenized_query = query.lower().split()
    bm25_scores = np.array(bm25.get_scores(tokenized_query))

    # Top BM25 candidates, picked before normalization (ordering is the same).
    bm25_top = set(np.argsort(bm25_scores)[::-1][:candidate_k].tolist())

    # Normalize BM25 scores over the whole corpus.
    if bm25_scores.size:
        bm25_scores = _min_max_normalize(bm25_scores)

    # -------------------------
    # 3. Hybrid candidate merge
    # -------------------------
    # Union of both candidate sets; a retriever that did not propose a
    # candidate contributes 0 to that candidate's hybrid score.
    hybrid_scores = {}
    for idx in set(dense_by_idx) | bm25_top:
        dense_score = dense_by_idx.get(idx, 0)
        bm25_score = bm25_scores[idx] if idx in bm25_top else 0
        hybrid_scores[idx] = alpha * dense_score + (1 - alpha) * bm25_score

    # Sort by hybrid score and keep the best candidates for reranking.
    sorted_candidates = sorted(
        hybrid_scores.items(),
        key=lambda x: x[1],
        reverse=True
    )
    rerank_pool = [idx for idx, _ in sorted_candidates[:candidate_k]]

    if not rerank_pool:
        return []

    # -------------------------
    # 4. Cross-encoder reranking
    # -------------------------
    pairs = [(query, chunks[idx]["text"]) for idx in rerank_pool]
    rerank_scores = reranker.predict(pairs)

    reranked = sorted(
        zip(rerank_pool, rerank_scores),
        key=lambda x: x[1],
        reverse=True
    )

    # -------------------------
    # 5. Final results
    # -------------------------
    return [
        {
            "score": float(score),
            "text": chunks[idx]["text"],
            "pdf_name": chunks[idx]["pdf_name"],
            "page_number": chunks[idx]["page_number"]
        }
        for idx, score in reranked[:top_k]
    ]


In [36]:
# Smoke-test the retriever with one query and print each hit's reranker
# score, source page and the first 300 characters of its text.
# NOTE(review): the query mentions "Estate 128" while the PDF loaded earlier
# in this notebook is the Estate 360 brochure — confirm this is intended.
results = search("Where is Estate 128 located?")
for r in results:
    print("\n---")
    print("Score:", r["score"])
    print("PDF:", r["pdf_name"])
    print("Page:", r["page_number"])
    print("Text:", r["text"][:300])


---
Score: 2.0487008094787598
PDF: introduction-to-Estate-360.pdf
Page: 5
Text: INTRODUCTION TO ESTATE 360 | HARERA no.: RC/REP/HARERA/GGM/860/592/2024/87 | HARERA website: https://haryanarera.gov.in Bringing real well-being to real estate. Established in 2016, Max Estates Limited is the real estate arm of Max Group. As a well-being company enabled by real estate, we endeavour 

---
Score: -3.231924057006836
PDF: introduction-to-Estate-360.pdf
Page: 18
Text: INTRODUCTION TO ESTATE 360 | HARERA no.: RC/REP/HARERA/GGM/860/592/2024/87 | HARERA website: https://haryanarera.gov.in Estate 360 offers you seamless access to a variety of amenities and environments. Located in Sector 36A-Dwarka Expressway, our estate is built close to the conﬂuence of the Dwarka 

---
Score: -4.4625139236450195
PDF: introduction-to-Estate-360.pdf
Page: 17
Text: INTRODUCTION TO ESTATE 360 | HARERA no.: RC/REP/HARERA/GGM/860/592/2024/87 | HARERA website: https://haryanarera.gov.in Our estate, located in Sector 36

In [37]:
import time
import numpy as np

def measure_latency(test_queries):
    """Time ``search`` once per query and summarize the latencies.

    Args:
        test_queries: Iterable of query strings; each is run through
            ``search(q, top_k=3)`` and the result is discarded.

    Returns:
        A ``(avg_latency, p95_latency)`` tuple in seconds. Both values are
        NaN when ``test_queries`` is empty.
    """
    latencies = []

    for q in test_queries:
        # perf_counter is monotonic and high-resolution; time.time() follows
        # the wall clock, which can be adjusted while a query is running.
        start = time.perf_counter()
        _ = search(q, top_k=3)
        latencies.append(time.perf_counter() - start)

    if not latencies:
        # Nothing was measured; make that explicit instead of relying on how
        # numpy reduces an empty list.
        return float("nan"), float("nan")

    avg_latency = np.mean(latencies)
    p95_latency = np.percentile(latencies, 95)

    return avg_latency, p95_latency


In [38]:
# test_queries = [
#     "Where is Estate 128 located?",
#     "What amenities are available?",
#     "Tell me about sustainability features.",
#     "What are the nearby schools?",
#     "What is the size of The Hub?",
#     "What kind of decks do the apartments have?",
#     "Is there a swimming pool?",
#     "What technology features are included?",
#     "What is the carpet area of the units?",
#     "What is the RERA registration number?",
#     "Is the project earthquake resistant?",
#     "What wellness features are offered?",
#     "What sports facilities are available?",
#     "Does the project have open spaces?",
#     "What is the landscape area?"
# ]
# test_queries = [
#     "Where is Max House situated?",
#     "How many floors does the building have?",
#     "What is the total area of the project?",
#     "What sustainability certification does it have?",
#     "Is there basement parking?",
#     "What is the typical office floor size?",
#     "What is the building height?",
#     "Is the building LEED certified?",
#     "What metro connectivity does it offer?",
#     "What is the ceiling height in common areas?",
#     "What kind of floor condition is provided?",
#     "How far is IGI Airport?",
#     "What is the air treatment technology mentioned?",
#     "What makes the facade unique?",
#     "Who owns Max House?"
# ]
# Queries used only for latency measurement (no expected answers attached).
# This active list targets the Estate 360 brochure; the commented-out lists
# around it are alternative query sets.
test_queries = [
    "What is Estate 360?",
    "Where is Estate 360 situated?",
    "How large is the total land parcel?",
    "How many residences are part of the estate?",
    "Is there senior living within the project?",
    "Who manages senior care at Estate 360?",
    "What is the total number of trees in the estate?",
    "What is the total area of amenities?",
    "Is Estate 360 IGBC certified?",
    "What does IGBC Platinum mean here?",
    "Is the project car-free?",
    "How does underground parking work?",
    "What makes the community intergenerational?",
    "What healthcare facilities are available?",
    "Is there an on-site wellness centre?",
    "What sports facilities are available?",
    "Is there a cricket pitch?",
    "Are there tennis or padel courts?",
    "Is there a jogging track?",
    "Does the estate have a lake?",
    "What is the Serpentine Lake?",
    "What is The Hub?",
    "What amenities are available at The Hub?",
    "Does The Hub have coworking space?",
    "Are there dining options within the estate?",
    "What retail services are available at the Community Plaza?",
    "Is there a pharmacy inside the estate?",
    "What home automation features are included?",
    "Does it include smart door locks?",
    "Are there motion sensor lights?",
    "What safety features are included?",
    "What is the total area of a 3BHK?",
    "What is the total area of a 4BHK?",
    "What is the RERA carpet area of Type A?",
    "Are there wraparound balconies?",
    "Is there a separate staff entry?",
    "What is the green belt mentioned in the brochure?",
    "How far is IGI Airport?",
    "How far is Medanta Hospital?",
    "What is the HARERA registration number?",
    "Who is the principal architect?",
    "Who is the landscape designer?",
    "Is there an early learning centre?",
    "What concierge services are available?"
]
# test_queries = [
#     "What is the LMS algorithm?",
#     "Where is the housing price dataset from?",
#     "How large is the Portland housing dataset?",
#     "How many bedrooms are part of the richer housing dataset?",
#     "Is there a cost function defined for linear regression?",
#     "Who updated these lecture notes?",
#     "What is the total number of input variables in the hypothesis function?",
#     "What is the total area of a house used as an example input?",
#     "Is the logistic function used for classification?",
#     "What does the term 'non-parametric' mean here?",
#     "Is the batch gradient descent algorithm car-free?"
# ]

In [39]:
# Run every latency query once and report the mean and 95th-percentile
# latency in seconds.
avg, p95 = measure_latency(test_queries)
print("Average latency:", avg)
print("P95 latency:", p95)


Average latency: 0.03469365835189819
P95 latency: 0.04252699613571167


In [40]:
# evaluation_set = [
#     {
#         "query": "Where is Estate 128 located?",
#         "expected_keyword": "Noida"
#     },
#     {
#         "query": "What is the total land area of the project?",
#         "expected_keyword": "10 acres"
#     },
#     {
#         "query": "How much open space does the project have?",
#         "expected_keyword": "80% open spaces"
#     },
#     {
#         "query": "What sustainability certification does Estate 128 have?",
#         "expected_keyword": "IGBC Platinum"
#     },
#     {
#         "query": "What are some nearby landmarks?",
#         "expected_keyword": "Mall of India"
#     },
#     {
#         "query": "What amenities are available at The Hub?",
#         "expected_keyword": "swimming pool"
#     },
#     {
#         "query": "What is the size of The Hub?",
#         "expected_keyword": "4273.5 sq. m"
#     },
#     {
#         "query": "What kind of decks do the apartments have?",
#         "expected_keyword": "wrap-around decks"
#     },
#     {
#         "query": "What technology features are provided?",
#         "expected_keyword": "home automation"
#     },
#     {
#         "query": "What is the RERA registration number?",
#         "expected_keyword": "UPRERAPRJ446459"
#     }
# ]
# evaluation_set = [
#     {
#         "query": "Where is Max House located?",
#         "expected_keyword": "Okhla"
#     },
#     {
#         "query": "What is the total super built up area?",
#         "expected_keyword": "1,05,425"
#     },
#     {
#         "query": "How tall is the building?",
#         "expected_keyword": "40 m"
#     },
#     {
#         "query": "What is the green rating of Max House?",
#         "expected_keyword": "LEED Gold"
#     },
#     {
#         "query": "What is the typical floor plate size?",
#         "expected_keyword": "13,000"
#     },
#     {
#         "query": "How many tenant floors are there?",
#         "expected_keyword": "8"
#     },
#     {
#         "query": "What is the floor condition offered?",
#         "expected_keyword": "Warmshell"
#     },
#     {
#         "query": "What is the ceiling height mentioned?",
#         "expected_keyword": "3.75 meters"
#     },
#     {
#         "query": "Which metro station is within walking distance?",
#         "expected_keyword": "Okhla NSIC"
#     },
#     {
#         "query": "What RERA registration number is mentioned?",
#         "expected_keyword": "UPRERAPRJ12475"
#     }
# ]
# Evaluation items for the Estate 360 brochure: each query is paired with a
# keyword that a correct chunk is expected to contain.
# NOTE(review): evaluate_accuracy matches keywords with a case-insensitive
# substring test, so very short keywords such as "6", "2" or "60" also match
# unrelated digits. The page header visible in the sample search output
# ("...GGM/860/592/2024/87") contains all three, as well as the full HARERA
# number, so these items may pass regardless of which chunk is retrieved.
# Consider more specific keywords.
evaluation_set = [
    {"query": "Where is Estate 360 located?", "expected_keyword": "Sector 36A"},
    {"query": "How many acres is Estate 360 spread across?", "expected_keyword": "11.8"},
    {"query": "How many residences are there in total?", "expected_keyword": "6"},
    {"query": "How many residences are dedicated to senior living?", "expected_keyword": "2"},
    {"query": "How many amenities are offered?", "expected_keyword": "60"},
    {"query": "How many trees are planned in the estate?", "expected_keyword": "700"},
    {"query": "What is the total amenities area in square feet?", "expected_keyword": "1 lac"},
    {"query": "What sustainability certification does Estate 360 have?", "expected_keyword": "IGBC Platinum"},
    {"query": "What is the HARERA registration number?", "expected_keyword": "RC/REP/HARERA/GGM/860/592/2024/87"},
    {"query": "Which organization manages the senior living residences?", "expected_keyword": "Antara"},
    {"query": "What expressway is Estate 360 located on?", "expected_keyword": "Dwarka Expressway"},
    {"query": "How far is Cyber City from Estate 360?", "expected_keyword": "30 min"},
    {"query": "How wide is the green belt to the north of the estate?", "expected_keyword": "50-metre"},
    {"query": "What is the total area of the 3 Bedroom Residence 1 & 3?", "expected_keyword": "2611"},
    {"query": "What is the RERA carpet area of the 4 Bedroom residence?", "expected_keyword": "1899"},
    {"query": "What feature makes Estate 360 a car-free community?", "expected_keyword": "underground vehicular"},
    {"query": "What is the name of the community clubhouse?", "expected_keyword": "The Hub"},
    {"query": "What is the name of the senior-focused clubhouse?", "expected_keyword": "Antara Club"},
    {"query": "Which global architecture firm is the principal architect?", "expected_keyword": "Gensler"},
    {"query": "What metro connectivity is mentioned near the estate?", "expected_keyword": "Delhi Metro"}
]
# evaluation_set = [
#     {"query": "Where is the housing dataset located?", "expected_keyword": "Portland, Oregon"},
#     {"query": "How many houses are in the Portland dataset?", "expected_keyword": "47"},
#     {"query": "How many features are in the richer housing dataset?", "expected_keyword": "2"},
#     {"query": "How many bedrooms are in the first example of the richer dataset?", "expected_keyword": "3"},
#     {"query": "What is the cost function for linear regression?", "expected_keyword": "least-squares"},
#     {"query": "What is the training set size n in the example?", "expected_keyword": "47"},
#     {"query": "What is the intercept term convention?", "expected_keyword": "x0 = 1"},
#     {"query": "What sustainability of convergence does batch gradient descent have?", "expected_keyword": "global minimum"},
#     {"query": "What is the specific name for the Widrow-Hoff rule?", "expected_keyword": "LMS update rule"},
#     {"query": "Which organization's course are these notes for?", "expected_keyword": "CS229"},
#     {"query": "What expressway of learning uses stochastic updates?", "expected_keyword": "stochastic gradient descent"},
#     {"query": "How far does the hypothesis function bound values in logistic regression?", "expected_keyword": "0 and 1"},
#     {"query": "How wide is the parameter vector for a 50,000 word vocabulary in Naive Bayes?", "expected_keyword": "50,000"}
# ]


In [41]:
def evaluate_accuracy(evaluation_set):
    """Compute keyword-based top-1 and top-3 retrieval accuracy.

    For every item, ``search(item["query"], top_k=3)`` is run and a result
    counts as a hit when ``item["expected_keyword"]`` occurs in its text as a
    case-insensitive substring. Because this is a substring test, very short
    keywords can match unrelated text.

    Args:
        evaluation_set: Sequence of dicts with ``"query"`` and
            ``"expected_keyword"`` keys.

    Returns:
        A ``(top1_accuracy, top3_accuracy)`` tuple of fractions in [0, 1].
        Returns ``(0.0, 0.0)`` for an empty evaluation set.
    """
    total = len(evaluation_set)
    if total == 0:
        # Avoid dividing by zero when there is nothing to evaluate.
        return 0.0, 0.0

    top1_correct = 0
    top3_correct = 0

    for item in evaluation_set:
        keyword = item["expected_keyword"].lower()
        results = search(item["query"], top_k=3)

        # One flag per returned result, in rank order.
        hits = [keyword in r["text"].lower() for r in results]

        # Top-1 check; a query with no results is a miss, not an IndexError.
        if hits and hits[0]:
            top1_correct += 1

        # Top-3 check
        if any(hits):
            top3_correct += 1

    return (
        top1_correct / total,
        top3_correct / total
    )


In [42]:
# Evaluate retrieval on the active evaluation set and report the fraction of
# queries whose expected keyword appears in the top-1 / top-3 results.
top1, top3 = evaluate_accuracy(evaluation_set)
print("Top-1 Accuracy:", top1)
print("Top-3 Accuracy:", top3)


Top-1 Accuracy: 0.9
Top-3 Accuracy: 1.0
