In [4]:
import os
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser
from utilities.hashing import calculate_hash
# from utilities.redis_cache import get_cached_summary, cache_summary
from utilities.file_utils import (
    get_doc_by_hash,
    get_file,
    save_doc_data,
    extract_using_textract,
    load_pdf_using_PyPDF,
    save_file,
)
from utilities.redis_cache import get_cached_data
from utilities.llm_utils import (
    answer_from_structured_data,
    create_embeddings,
    generate_summary,
)

In [5]:
# URL of the document to download and query.
# NOTE(review): the query string carries a signature parameter (`sig=...`); this looks like a
# signed, time-limited access URL — confirm it is acceptable to keep it in a shared notebook.
doc_url = "https://hackrx.blob.core.windows.net/assets/policy.pdf?sv=2023-01-03&st=2025-07-04T09%3A11%3A24Z&se=2027-07-05T09%3A11%3A00Z&sr=b&sp=r&sig=N4a9OU0w0QXO6AOIBiu4bpl7AXvEZogeT%2FjUHNO7HzQ%3D"

# Questions to ask about the document; later cells use questions[0] and the full list.
questions= [
    "What is the grace period for premium payment under the National Parivar Mediclaim Plus Policy?",
]

In [6]:
import requests

# Download the document. An explicit timeout is passed because `requests.get`
# otherwise waits indefinitely, which would hang the kernel on a stalled connection.
response = requests.get(doc_url, timeout=60)
if response.status_code != 200:
    raise Exception("Failed to download the document from the provided URL.")

file_bytes = response.content
# Strip the query string first, then take the last path segment, so a "/"
# inside the query string cannot be mistaken for a path separator
# (".../assets/policy.pdf?sv=..." -> "policy.pdf").
filename = doc_url.split("?", 1)[0].rsplit("/", 1)[-1]

In [7]:
# Store the downloaded bytes under docs/<hash><original extension>, so a
# document whose file already exists on disk is not written again.
file_hash = calculate_hash(file_bytes)
os.makedirs("docs", exist_ok=True)
ext = os.path.splitext(filename)[1]
file_path = os.path.join("docs", file_hash + ext)
if os.path.exists(file_path):
    print("File already exists!")
else:
    with open(file_path, "wb") as f:
        f.write(file_bytes)

File already exists!


In [8]:
# Look the document up by its hash via the project helper; the result is printed in the next cell.
doc_data = get_doc_by_hash(file_hash)

In [9]:
# Inspect the lookup result from the previous cell.
print(doc_data)

None


In [10]:
# Load the stored summary for this document. `summary` is read as a
# notebook-level variable by simplify_query.
# NOTE(review): the captured output below is a bytes repr (b"..."), so get_file
# appears to return raw bytes here rather than str — confirm against get_file.
summary = get_file(f"docs/summary/{file_hash}.txt")
print(summary)


b"This document outlines the **National Parivar Mediclaim Plus Policy**, an indemnity-based health insurance plan offered by National Insurance Company Limited. It covers medical expenses for illness or injury requiring hospitalization, day care treatment, or domiciliary hospitalization, up to a Floater Sum Insured for the insured family.\r\n\r\nHere's a summary of its key aspects:\r\n\r\n**1. Core Coverage & Benefits:**\r\n*   **In-patient Treatment:** Covers room/ICU charges (with limits based on Plan A/B/C or PPN rates), medical practitioner fees, anesthesia, blood, oxygen, OT charges, surgical appliances, medicines, diagnostic procedures, internally implanted prosthetics, dental treatment due to injury, plastic surgery, medically necessary hormone replacement, vitamins/tonics (if part of treatment), and circumcision (if medically necessary).\r\n*   **Pre & Post Hospitalization:** Covers medical expenses incurred up to 30 days before hospitalization and up to 60 days after discharge

In [24]:

# Chat prompt used by simplify_query: a system instruction plus a human message
# with two placeholders, {summary} (document summary) and {question} (user question).
SIMPLIFY_USER_QUERY = ChatPromptTemplate.from_messages(
    [
        (
            "system",
            "You are a smart assistant. Your job is to take user's query and simplify it by removing specificity while preserving the core intent of the question, and rewriting to keep only the most relevant part of the question. Remove specific detail and make the question general.",
        ),
        (
            "human",
            "Document Summary:\n{summary}\n\nUser's Question:\n{question}\n\nRewrite this question so that it captures what the user is most likely asking based on the document.",
        ),
    ]
)

def simplify_query(query, document_summary=None):
    """Rewrite a user question into a more general form, guided by the document summary.

    Args:
        query: The user's original question.
        document_summary: Summary text inserted into the prompt. When omitted,
            the notebook-level ``summary`` variable is used, so existing
            one-argument calls keep working. ``bytes`` values are decoded as
            UTF-8 first; otherwise the prompt would contain the ``b"..."``
            repr with escaped ``\\r\\n`` sequences instead of the actual text.

    Returns:
        The rewritten question as a string.
    """
    if document_summary is None:
        document_summary = summary
    if isinstance(document_summary, (bytes, bytearray)):
        # errors="replace" keeps the call best-effort if the file is not clean UTF-8.
        document_summary = document_summary.decode("utf-8", errors="replace")

    llm = ChatGoogleGenerativeAI(
        model="gemini-2.0-flash",
        temperature=0,
        max_tokens=None,
        timeout=None,
        max_retries=2,
    )

    simplify_chain = SIMPLIFY_USER_QUERY | llm | StrOutputParser()

    return simplify_chain.invoke({
        "summary": document_summary,
        "question": query,
    })

In [12]:
# Try the rewrite on the first sample question; the cell's value is shown as output.
simplify_query(questions[0])

What is the premium payment grace period for the National Parivar Mediclaim Plus Policy?


In [27]:
import os
from pinecone import Pinecone
from langchain.prompts import PromptTemplate
from langchain_community.vectorstores import Pinecone as PineconeStore
from langchain_core.runnables import RunnableLambda, RunnablePassthrough

from prompts import BASIC_PROMPT
from utilities.text_utils import format_docs

# Pinecone client; the API key is read from the PINECONE_API_KEY environment
# variable (os.getenv yields None when it is unset).
pinecone_api_key = os.getenv("PINECONE_API_KEY")
pc = Pinecone(api_key=pinecone_api_key)

# Name of the Pinecone index queried by answer_from_structured_data.
index_name = "hackrx-embeddings"
def answer_from_structured_data(file_hash, questions):
    """Answer each question using chunks retrieved from the document's Pinecone namespace.

    Every question is first rewritten with ``simplify_query``; the rewritten
    text is used both to search the index and as the question given to the LLM.

    Note: this definition replaces the ``answer_from_structured_data`` name
    imported from ``utilities.llm_utils`` earlier in the notebook.

    Args:
        file_hash: Document hash, used as the Pinecone namespace to search.
        questions: Iterable of question strings.

    Returns:
        A list with one entry per question, in order. An entry is the answer
        string, or ``None`` when that question failed; failures are printed
        and do not stop the remaining questions from being processed.
    """
    # Load vector store from Pinecone
    index = pc.Index(index_name)

    def retrieve(query):
        """Return the chunk texts of the top 5 hits for `query` in this namespace."""
        res = index.search(
            namespace=file_hash, query={"inputs": {"text": query}, "top_k": 5}
        )
        return [hit["fields"]["chunk_text"] for hit in res["result"]["hits"]]

    llm = ChatGoogleGenerativeAI(
        model="gemini-2.0-flash",
        temperature=0,
        max_tokens=None,
        timeout=None,
        max_retries=2,
    )

    prompt = PromptTemplate.from_template(BASIC_PROMPT)

    rag_chain = (
        {
            "context": RunnablePassthrough() | retrieve | format_docs,
            "question": RunnablePassthrough(),
        }
        | prompt
        | llm
        | StrOutputParser()
    )

    answers = []
    for question in questions:
        # Errors are handled per question so one failure neither discards the
        # remaining questions nor shifts later answers out of position.
        try:
            simplified_question = simplify_query(question)
            print(simplified_question)
            answers.append(rag_chain.invoke(simplified_question))
        except Exception as e:
            print(f"Failed to answer {question!r}: {e}")
            answers.append(None)

    return answers

In [26]:
# Run the retrieval + answer pipeline for the sample questions against a hard-coded document hash.
answer_from_structured_data("042f627c5d8f619cf62cc21f864b08dfd59059d0b9aab805d132e0014489d625", questions)

What is the grace period for health insurance premium payments?
{'result': {'hits': [{'_id': '042f627c5d8f619cf62cc21f864b08dfd59059d0b9aab805d132e0014489d625-chunk-10',
                      '_score': 0.4727219045162201,
                      'fields': {'chunk_id': 10.0,
                                 'chunk_text': '2.19  Family members means '
                                               'spouse, children and parents '
                                               'of the insured, covered by the '
                                               'Policy. \n'
                                               ' \n'
                                               '2.20  Floater Sum Insured '
                                               'means the sum insured, as '
                                               'mentioned in the Schedule, '
                                               'available to all the insured '
                                               'persons, for any and 

['The grace period for health insurance premium payments is thirty days.']

In [11]:
from utilities.file_utils import (
    get_doc_by_hash,
    get_file,
    save_doc_data,
    extract_using_textract,
    load_pdf_using_PyPDF,
    save_file,
)

async def extract_and_save_pdf_text(pdf_file_path):
    """Extract the text of a PDF and save it next to the PDF as a ``.txt`` file.

    Pages are loaded with ``load_pdf_using_PyPDF``, their ``page_content`` is
    joined with newlines, and the UTF-8 encoded result is passed to
    ``save_file``.

    Args:
        pdf_file_path: Path of the PDF to read. The output path is the same
            path with its extension replaced by ``.txt``.
    """
    pages = await load_pdf_using_PyPDF(pdf_file_path)
    page_text = "\n".join(page.page_content for page in pages)
    # splitext only looks at the final path component, so a dot in a directory
    # name (e.g. "my.docs/report") is not mistaken for the file extension.
    txt_file_path = os.path.splitext(pdf_file_path)[0] + ".txt"
    save_file(page_text.encode("utf-8"), txt_file_path)
    print(f"Extracted text saved to {txt_file_path}")

In [17]:
# Extract text for one stored PDF; top-level `await` relies on the notebook's running event loop.
await extract_and_save_pdf_text("docs/07ff23e18f431ec812a6c954ce57c3ec1e92dbc2ea01e16f228eb22ba97de370.pdf")

Extracted text saved to docs/07ff23e18f431ec812a6c954ce57c3ec1e92dbc2ea01e16f228eb22ba97de370.txt


In [18]:
import re

def simplify_spaces(text):
    """Collapse every run of whitespace in `text` into a single space.

    Leading and trailing whitespace is collapsed to one space as well,
    not stripped.
    """
    whitespace_run = re.compile(r'\s+')
    return whitespace_run.sub(' ', text)

# Example usage: read an extracted text file, collapse its whitespace, and
# write the result back over the same file (the original line breaks are lost).
extracted_txt_path = "docs/07ff23e18f431ec812a6c954ce57c3ec1e92dbc2ea01e16f228eb22ba97de370.txt"
with open(extracted_txt_path, "r", encoding="utf-8") as file:
    sample_text = file.read()
simplified_text = simplify_spaces(sample_text)
new_text = simplified_text.encode("utf-8")
save_file(new_text, extracted_txt_path)

In [None]:
import os
from langchain_community.vectorstores import FAISS
from langchain_community.vectorstores import Pinecone as PineconeStore
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnableLambda, RunnablePassthrough
from langchain.schema import Document
from prompts import BASIC_PROMPT, SIMPLIFY_USER_QUERY
from utilities.file_utils import is_doc_already_processed, load_pdf_using_PyPDF
from utilities.text_utils import format_docs, log_chunks
from dotenv import load_dotenv

# HuggingFace embedding model (BAAI/bge-small-en-v1.5) used to build the FAISS index.
bge_embeddings = HuggingFaceEmbeddings(model_name="BAAI/bge-small-en-v1.5")
# Alias read by create_embeddings_using_faiss.
embeddings = bge_embeddings

def create_embeddings_using_faiss(doc_data, file_hash):
    """Build and persist a FAISS vector store for a document, unless one already exists.

    The store is saved under ``vector_store/<file_hash>``. If that path already
    exists the function returns immediately, before any chunking or embedding
    work is done.

    Args:
        doc_data: Full document text as a single string.
        file_hash: Document hash, used as the directory name of the saved store.

    Returns:
        None.
    """
    vector_store_path = f"vector_store/{file_hash}"
    if os.path.exists(vector_store_path):
        # Already indexed: skip splitting and embedding entirely.
        return

    # Chunks of 1000 characters with 200 characters of overlap.
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
    splits = text_splitter.split_documents([Document(page_content=doc_data)])

    # Create the vector store and save it locally to save processing time later.
    vector_store = FAISS.from_documents(splits, embedding=embeddings)
    vector_store.save_local(vector_store_path)

In [26]:
# NOTE: this cell rebinds the notebook-level `file_hash` and `doc_data` (both
# assigned in earlier cells) to a different, hard-coded document; any cell run
# afterwards sees these values.
file_hash = "9728ea60fce9e3e99b21c8b70abbf5d0b43236cbbc78901856311f77920efb3b"
with open(f"docs/extracted/{file_hash}.txt", "r", encoding="utf-8") as f:
    doc_data = f.read()

# Build the FAISS store for this text; nothing is written if vector_store/<file_hash> already exists.
create_embeddings_using_faiss(doc_data, file_hash)

In [None]:
# Show the current hash and the type of whatever get_doc_by_hash returns for it.
print(file_hash)
print(type(get_doc_by_hash(file_hash)))

9728ea60fce9e3e99b21c8b70abbf5d0b43236cbbc78901856311f77920efb3b
None
