In [3]:
import os
import re
import json
import pdfplumber
from pathlib import Path
from langchain.text_splitter import RecursiveCharacterTextSplitter

In [4]:
RAW_DIR = Path("data/raw")
PROCESSED_DIR = Path("data/processed")
PROCESSED_DIR.mkdir(parents=True, exist_ok=True)

In [5]:
def clean_text(text: str) -> str:
    # Remove page numbers (standalone digits)
    text = re.sub(r"\n?\s*\d+\s*\n?", " ", text)
    # Remove multiple newlines
    text = re.sub(r"\n{2,}", "\n", text)
    # Normalize spaces
    text = re.sub(r" +", " ", text)
    return text.strip()

In [6]:
def extract_text_from_pdf(pdf_path: Path) -> str:
    full_text = ""
    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            text = page.extract_text() or ""
            full_text += text + "\n"
    return clean_text(full_text)

In [7]:
def chunk_text(text: str, chunk_size=500, chunk_overlap=100):
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        separators=["\n\n", "\n", ".", " ", ""]
    )
    return splitter.split_text(text)

In [8]:
def process_pdfs():
    for pdf_file in RAW_DIR.glob("*.pdf"):
        print(f"Processing: {pdf_file.name}")
        text = extract_text_from_pdf(pdf_file)
        chunks = chunk_text(text)

        # Save as JSON
        output_file = PROCESSED_DIR / f"{pdf_file.stem}.json"
        with open(output_file, "w", encoding="utf-8") as f:
            json.dump(
                [{"chunk_id": i, "text": chunk} for i, chunk in enumerate(chunks)],
                f,
                ensure_ascii=False,
                indent=2
            )
        print(f"Saved {len(chunks)} chunks → {output_file}")

In [9]:
process_pdfs()

Processing: banking_regulation_act_1949.pdf
Saved 993 chunks → data/processed/banking_regulation_act_1949.json
Processing: sebi_1999.pdf
Saved 167 chunks → data/processed/sebi_1999.json
Processing: sebi_2023.pdf
Saved 45 chunks → data/processed/sebi_2023.json
Processing: sebi_2015.pdf
Saved 52 chunks → data/processed/sebi_2015.json
Processing: fatca_crs_guidelines.pdf
Saved 1317 chunks → data/processed/fatca_crs_guidelines.json


Vector Database

In [6]:
import os
import json
import faiss
import numpy as np
from pathlib import Path
from sentence_transformers import SentenceTransformer

  from .autonotebook import tqdm as notebook_tqdm


In [7]:
PROCESSED_DIR = Path("data/processed")

# Load all chunks
documents = []
for file in PROCESSED_DIR.glob("*.json"):
    with open(file, "r", encoding="utf-8") as f:
        chunks = json.load(f)
        for c in chunks:
            documents.append({
                "text": c["text"],
                "source": file.stem,   # keep track of original PDF
                "chunk_id": c["chunk_id"]
            })

print(f"Loaded {len(documents)} chunks from {len(list(PROCESSED_DIR.glob('*.json')))} documents")

Loaded 2574 chunks from 5 documents


In [8]:
# Load small embedding model
model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")

# Generate embeddings for all chunks
embeddings = model.encode([doc["text"] for doc in documents], show_progress_bar=True)

# Convert to numpy array (FAISS needs float32)
embeddings = np.array(embeddings, dtype="float32")

Batches: 100%|██████████| 81/81 [00:08<00:00,  9.17it/s]


In [9]:
# Dimensions of embeddings
dim = embeddings.shape[1]

# Create FAISS index
index = faiss.IndexFlatIP(dim)

# Add embeddings to index
index.add(embeddings)

print(f"FAISS index contains {index.ntotal} vectors")

FAISS index contains 2574 vectors


In [10]:
def search(query, top_k=3):
    # Embed query
    query_vector = model.encode([query], convert_to_numpy=True).astype("float32")
    
    # Search in FAISS
    distances, indices = index.search(query_vector, top_k)
    
    results = []
    for idx, dist in zip(indices[0], distances[0]):
        if idx == -1:  # safeguard
            continue
        results.append({
            "text": documents[idx]["text"],
            "source": documents[idx]["source"],
            "chunk_id": documents[idx]["chunk_id"],
            "distance": float(dist)
        })
    return results

In [11]:
# Test retrieval
user_question="Who will be the competent authority to grant reward?"
results = search(user_question, top_k=10)
for r in results:
    print(f"[{r['source']} - chunk {r['chunk_id']}] (score={r['distance']:.4f})")
    print(r["text"][:1500], "\n---\n")

[sebi_2023 - chunk 24] (score=0.7631)
discretion of the authority competent to grant reward. The decision of Competent
Authority on such claim shall not be subject to challenge before any Court of law
by the informant or any other person on his behalf.
ii. The reward under these Guidelines shall not be assigned to any other person by
the informant. The Competent Authority may however grant reward to heirs or 
---

[sebi_2023 - chunk 28] (score=0.7014)
the Competent Authority to grant the reward by passing an order in this regard. The
Competent Authority shall pass the order based on the recommendation made by
the Informant Reward Committee. . Circumstances for determining the Amount of Reward.
In recommending the reward amount, the Informant Reward Committee shall
consider the following:
a) The accuracy of the information given by the informant;
b) The extent and nature of the assistance rendered by the informant; 
---

[sebi_2023 - chunk 27] (score=0.6552)
Protection and Education Fun

In [14]:
rag_context=""
count=1
for r in results:
    rag_context+=f"Extract {count}- "
    rag_context+=f"[{r['source']} - chunk {r['chunk_id']}] (score={r['distance']:.4f}) \n"
    rag_context+=f"{r["text"][:1500]} \n\n"
    count+=1

In [15]:
print(rag_context)

Extract 1- [sebi_2023 - chunk 24] (score=0.7631) 
discretion of the authority competent to grant reward. The decision of Competent
Authority on such claim shall not be subject to challenge before any Court of law
by the informant or any other person on his behalf.
ii. The reward under these Guidelines shall not be assigned to any other person by
the informant. The Competent Authority may however grant reward to heirs or 

Extract 2- [sebi_2023 - chunk 28] (score=0.7014) 
the Competent Authority to grant the reward by passing an order in this regard. The
Competent Authority shall pass the order based on the recommendation made by
the Informant Reward Committee. . Circumstances for determining the Amount of Reward.
In recommending the reward amount, the Informant Reward Committee shall
consider the following:
a) The accuracy of the information given by the informant;
b) The extent and nature of the assistance rendered by the informant; 

Extract 3- [sebi_2023 - chunk 27] (score=0.6552) 


In [31]:
import os
from dotenv import load_dotenv
load_dotenv()
openrouter_key = os.getenv('API_KEY')

In [32]:
import requests
import json
import os


url = "https://openrouter.ai/api/v1/chat/completions"
headers = {
    "Authorization": f"Bearer {openrouter_key}",
    "Content-Type": "application/json",
}

payload = {
    "model": "x-ai/grok-4-fast:free",
    "messages": [
        {"role": "user", "content": f"""
            You are a compliance assistant. Answer the question using the provided context.
            If you don’t know, say “No regulation found.” Always cite the source.
            Question: {user_question}
            Context: {rag_context}
            Answer:
    """}
    ]
}

try:
    response = requests.post(url, headers=headers, data=json.dumps(payload))
    response.raise_for_status()
    response_data = response.json()

    model_reply = response_data["choices"][0]["message"]["content"]
    print("Model Reply:")
    print(model_reply)

except requests.exceptions.RequestException as e:
    print(f"An error occurred during the API request: {e}")
    if hasattr(e, 'response') and e.response is not None:
        print(f"Response content: {e.response.text}")
except json.JSONDecodeError:
    print("Failed to decode JSON response.")
    print(f"Raw response content: {response.text}")
except Exception as e:
    print(f"An unexpected error occurred: {e}")

Model Reply:
**The Executive Director in-charge of the Recovery and Refund Department** shall be the Competent Authority to grant the reward by passing an order in this regard.

**Source:** Extract 3 - [sebi_2023 - chunk 27]
