## Creation of Project Folders and a Simple FastAPI Backend

In [3]:
import os, textwrap, pathlib

# Root folder of the generated project.
base = "/content/rag-factsheet-bot"

# Mapping of file path -> file contents for the backend skeleton.
# NOTE: a later cell imports `google.colab.files` under the same name `files`,
# which replaces this dict; it is only needed inside this cell.
files = {
    f"{base}/backend/requirements.txt": textwrap.dedent("""\
        fastapi
        uvicorn[standard]
        langchain
        langchain-community
        pypdf
        sentence-transformers
        faiss-cpu
        python-multipart
    """),

    f"{base}/backend/app/__init__.py": "",

    f"{base}/backend/app/main.py": textwrap.dedent("""\
        from fastapi import FastAPI
        from pydantic import BaseModel

        app = FastAPI()

        class Question(BaseModel):
            query: str

        @app.get("/health")
        def health():
            return {"status": "ok"}

        @app.post("/ask")
        def ask_pdf(q: Question):
            # RAG pipeline not added yet
            return {"answer": "(placeholder) RAG not implemented yet"}
    """),
}

# Create files
for path, content in files.items():
    target = pathlib.Path(path)
    # Make sure the containing folder exists before writing.
    target.parent.mkdir(parents=True, exist_ok=True)
    # Explicit encoding so the written files do not depend on the platform default.
    target.write_text(content, encoding="utf-8")

# Folder that will hold the uploaded factsheet PDF.
pathlib.Path(f"{base}/data").mkdir(parents=True, exist_ok=True)

print("Backend skeleton created.")
print(f"Project root: {base}")


Backend skeleton created.
Project root: /content/rag-factsheet-bot


## Upload the PDF into the Project Data Folder


In [4]:
from google.colab import files
import shutil

# Destination folder for the uploaded factsheet.
base = "/content/rag-factsheet-bot/data"

# files.upload() returns a dict keyed by file name whose values are the uploaded bytes.
uploaded = files.upload()

if not uploaded:
    raise RuntimeError("No file was uploaded; please select the factsheet PDF.")

# Store the first uploaded file in the data folder as factsheet.pdf.
# The bytes are written directly instead of moving "/content/<name>": Colab may
# save its own copy under a de-duplicated name (e.g. "name (1).pdf"), so the dict
# key is not a reliable on-disk path. Colab's saved copy is left where it is.
pdf_name, pdf_bytes = next(iter(uploaded.items()))
with open(f"{base}/factsheet.pdf", "wb") as f:
    f.write(pdf_bytes)

print("PDF uploaded:", f"{base}/factsheet.pdf")

Saving bajaj_finserv_factsheet_Oct.pdf to bajaj_finserv_factsheet_Oct (1).pdf
PDF uploaded: /content/rag-factsheet-bot/data/factsheet.pdf


In [5]:
!pip -q install langchain langchain-community pypdf

## Load the PDF with LangChain from the Project Data Folder

In [6]:
from langchain_community.document_loaders import PyPDFLoader

# Path where the upload cell stored the factsheet.
pdf_path = "/content/rag-factsheet-bot/data/factsheet.pdf"

# Read the PDF into a list of LangChain documents (reported below as pages).
loader = PyPDFLoader(pdf_path)
documents = loader.load()

page_count = len(documents)
first_page_preview = documents[0].page_content[:500]

print("Loaded PDF as LangChain documents")
print("Total pages:", page_count)
print("\n--- Preview of Page 1 ---\n")
print(first_page_preview)

  from cryptography.hazmat.primitives.ciphers.algorithms import AES, ARC4


Loaded PDF as LangChain documents
Total pages: 56

--- Preview of Page 1 ---

Give your investmen ts the
3-in- 1 adv antage
Quality
Companies with dis tinct
compe titive adv antages
Growth
Companies with high
scalability potential
Value
Companies that present intrinsicvalue opportunitiesSMALL CAP. GIANT POTENTIAL.
  
BAJAJ FINSERV 
SMALL CAP FUND
Mutual  Fund investme nts are subj ect to market risks, read al l sche me related documents caref ully.BAJ AJ FIN SERV SMALL CAP FUND
An ope n ende d equity sche me predominantl y investing in small cap stocks
This product is sui


## Chunking with RecursiveCharacterTextSplitter
- Chunk size = 500 characters
- Chunk overlap = 100 characters, to preserve context between adjacent chunks

In [7]:
# The splitter lives in the standalone `langchain_text_splitters` package in
# current LangChain releases; fall back to the legacy path for older installs.
try:
    from langchain_text_splitters import RecursiveCharacterTextSplitter
except ImportError:
    from langchain.text_splitter import RecursiveCharacterTextSplitter

chunk_size = 500
chunk_overlap = 100

# Separators are tried in order: paragraphs, lines, sentence punctuation,
# spaces, and finally individual characters.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=chunk_size,
    chunk_overlap=chunk_overlap,
    separators=["\n\n", "\n", ".", "!", "?", " ", ""],
)

chunks = text_splitter.split_documents(documents)

print("Chunking done!")
print("Total chunks:", len(chunks))

# First Chunk
print("\n--- First chunk preview ---\n")
print(chunks[0].page_content[:500])

Chunking done!
Total chunks: 638

--- First chunk preview ---

Give your investmen ts the
3-in- 1 adv antage
Quality
Companies with dis tinct
compe titive adv antages
Growth
Companies with high
scalability potential
Value
Companies that present intrinsicvalue opportunitiesSMALL CAP. GIANT POTENTIAL.
  
BAJAJ FINSERV 
SMALL CAP FUND
Mutual  Fund investme nts are subj ect to market risks, read al l sche me related documents caref ully.BAJ AJ FIN SERV SMALL CAP FUND
An ope n ende d equity sche me predominantl y investing in small cap stocks


## Generating Embeddings for All Chunks using SentenceTransformer (MiniLM-L6-v2)


In [17]:
import faiss
import numpy as np
from sentence_transformers import SentenceTransformer

# Embedding model shared by the chunks and, later, the queries.
model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")

# Raw text of every chunk, in chunk order; search results index back into this list.
texts = [doc.page_content for doc in chunks]

# Normalized float32 embeddings, one row per chunk.
encode_options = {"convert_to_numpy": True, "normalize_embeddings": True}
vectors = model.encode(texts, **encode_options).astype(np.float32)

# Inner-product index sized to the embedding width; on normalized vectors the
# inner product equals cosine similarity.
dim = vectors.shape[1]
index = faiss.IndexFlatIP(dim)
index.add(vectors)

print("✅ FAISS index built!")
print("Total vectors:", index.ntotal)

✅ FAISS index built!
Total vectors: 638


In [18]:
# Sample question used to eyeball retrieval quality.
query = "What is the 5-year return of the fund?"
q_vec = model.encode(
    [query],
    convert_to_numpy=True,
    normalize_embeddings=True,
).astype(np.float32)

# Four nearest chunks by inner product.
scores, ids = index.search(q_vec, k=4)

print("Top chunks:")
for idx, score in zip(ids[0], scores[0]):
    print(f"\n--- Chunk {idx} (score={score:.4f}) ---\n")
    print(texts[idx][:500])


Top chunks:

--- Chunk 553 (score=0.6038) ---

Period for which scheme’s performance has been provided is computed basis last day of the previous month preceding the date of this material. Returns greater than 1 year are compounded annualized. Face
Value per unit: Rs. 10.The Fund managers of the scheme: Mr. Sorbh Gupta (Equity Portion), Mr. Anup Kulkarni (Equity Portion)(Fund Manager since June 10, 2025), Mr. Siddharth Choudhary (Debt Portion), Mr. Vinay Bafna (Commodity

--- Chunk 543 (score=0.5979) ---

Returns greater than 1 year are compounded annualized. Period for which scheme’s performance has been provided is computed basis last day of the previous month preceding the date of this material. Face 
Value per unit (Allotment NAV): Rs. 1000.34
The Fund managers of the scheme: Mr. Siddharth Chaudhary. For the performance of other schemes managed by Fund Managers which has completed 1 year or more than 1 year since inception, refer page no. 
48.

--- Chunk 386 (score=0.5911) ---

Not

In [19]:
# Recreate the inner-product index and load all chunk vectors into it,
# replacing whatever `index` currently refers to.
# NOTE(review): this repeats the index construction done in the embedding cell
# with the same `dim` and `vectors` — looks redundant; confirm before removing.
index = faiss.IndexFlatIP(dim)
index.add(vectors)

In [26]:
## Retrieval

In [20]:
def retrieve(query, k=4):#We are using top 4 values
    q_vec = model.encode([query], convert_to_numpy=True, normalize_embeddings=True).astype(np.float32)
    scores, ids = index.search(q_vec, k)

    results = []
    for i in range(k):
        results.append({
            "chunk_id": int(ids[0][i]),
            "score": float(scores[0][i]),
            "text": texts[ids[0][i]]
        })
    return results


In [23]:
# Try the retriever on a sample question; the cell displays the returned hits (default k=4).
retrieve("What is the benchmark return?")

[{'chunk_id': 548,
  'score': 0.5739936232566833,
  'text': '3.79% 3.95% -3.45% 10,379 10,395 9,655\n11.94% 12.19% 9.34% 12,113 12,157 11,638 Since InceptionValue of Investment of Rs.10,000\nFund Returns (%)Benchmark\nReturns (%)Additional Benchmark \nReturns (%)Fund (Rs) Benchmark (Rs)Additional \nBenchmark (Rs)PERFORMANCE DATA\nLast 1 Year\nReturns as on 30th September, 2025\nPast performance may or may not be sustained in future. Benchmark: NIFTY 50 TRI Additional Benchmark: BSE Sensex TRI. Inception Date: 19th January 2024'},
 {'chunk_id': 526,
  'score': 0.5588138103485107,
  'text': '(TRI) Additional Benchmark: Nifty 50 TRI. Inception Date: 27th December 2024\nPeriod for which scheme’s performance has been provided is computed basis last day of the previous month preceding the date of this material. Simple annualized returns have been provided as per the extant'},
 {'chunk_id': 531,
  'score': 0.5550969839096069,
  'text': 'Additional Benchmark: NIFTY 50 TRI. Inception Date: 29th

In [24]:
def answer_query(query, k=4, min_score=0.15):
    """Return the text of the retrieved chunks that pass a score threshold.

    Args:
        query: Question text passed to ``retrieve``.
        k: Number of chunks requested from ``retrieve``.
        min_score: Hits scoring below this value are dropped.

    Returns:
        The surviving chunk texts joined by blank lines, or a fixed apology
        message when no hit reaches ``min_score``.
    """
    relevant_texts = [
        hit["text"]
        for hit in retrieve(query, k)
        if hit["score"] >= min_score
    ]

    # Nothing cleared the threshold: report that instead of returning an empty string.
    if len(relevant_texts) == 0:
        return "Sorry, I couldn’t find this information in the factsheet."

    return "\n\n".join(relevant_texts)


In [29]:
# End-to-end check: print the concatenated chunk text answer_query returns for a sample question.
print(answer_query("What is the 3-year return of Bajaj Flexi Cap Fund"))

At Bajaj Finserv Asset Management Limited, September 
marked an important milestone. Bajaj Finserv Flexi  Cap Fund^ completed two years since launch in August end. In this short span, it has grown to become the largest fund in our product suite, with AUM exceeding ₹5,400 crores*, reinforcing the resilience of our investment approach.
Bajaj Finserv Flexi Cap Fund is built on the principle of

Last 1 Year
Since InceptionBajaj Finserv Flexi Cap Fund - Regular - Growth
Bajaj Finserv Flexi Cap Fund - Direct - GrowthFund Returns (%)Benchmark
Returns (%)Additional Benchmark 
Returns (%)Period Fund (Rs) Benchmark (Rs)Additional 
Benchmark (Rs)Bajaj Finserv Flexi Cap Fund
-2.68% -5.50% -3.45% 9,732 9,450 9,655
18.84% 15.93% 13.03% 14,447 13,703 12,982
-1.36% -5.50% -3.45% 9,864 9,450 9,655
20.55% 15.93% 13.03% 14,893 13,703 12,982Value of Investment of Rs.10,000
Last 1 Year
Since Inception

Bajaj Finserv Multi Cap Fund - Direct - GrowthFund Returns (%)Benchmark
Returns (%)Additional Benchmark 
