In [1]:
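# Python packages used by this notebook (uncomment to install):
# !pip install anthropic
# !pip install PyPDF2
# !pip install sentence-transformers
# !pip install faiss-cpu
# !pip install pdf2image pytesseract requests numpy
# Note: pdf2image needs the Poppler utilities and pytesseract needs the Tesseract OCR binary installed on the system.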

In [2]:
import numpy as np
import faiss
from sentence_transformers import SentenceTransformer
import anthropic
import PyPDF2
from pdf2image import convert_from_path
import pytesseract
from typing import List, Tuple, Dict
import requests
import os

  from tqdm.autonotebook import tqdm, trange


In [3]:
class InsuranceDocument:
    """An insurance policy PDF loaded as text, split into chunks and indexed for similarity search.

    Construction is eager: the PDF is read (with OCR for the table pages), the
    text is chunked, every chunk is embedded, and a FAISS L2 index is built.

    Attributes:
        file_path: Path of the PDF that was loaded.
        text: Full extracted text, with ``=== ... ===`` marker lines inserted.
        index: FAISS index holding one embedding per entry of ``chunks``.
        chunks: Text chunks, in the same order as the vectors in ``index``.
        model: SentenceTransformer used for both chunk and query embeddings.
    """

    # First 1-based page number that is rasterised and OCR'd instead of text-extracted.
    OCR_START_PAGE = 41
    # Line written in front of OCR output; query_document uses it to spot table chunks.
    TABLE_LABEL = "TABLE CONTENT:"
    # Sentence-transformers model used for embeddings.
    EMBEDDING_MODEL_NAME = 'all-MiniLM-L6-v2'

    def __init__(self, file_path: str):
        self.file_path = file_path
        self.text = self.load_pdf_with_ocr()
        self.index, self.chunks, self.model = self.create_faiss_index()

    def load_pdf_with_ocr(self) -> str:
        """Extract the text of every page, using OCR for pages from OCR_START_PAGE onwards.

        Each page is emitted as a ``=== Page N ===`` marker line, optionally
        followed by a section marker line (Annexure / Exclusions, decided from
        the text of *that* page), followed by the page text. OCR'd pages have
        their text prefixed with a ``TABLE CONTENT:`` line.

        Returns:
            The concatenated, marker-annotated text of the whole document.
        """
        parts = []
        with open(self.file_path, 'rb') as file:
            reader = PyPDF2.PdfReader(file)
            for page_num, page in enumerate(reader.pages, 1):
                if page_num >= self.OCR_START_PAGE:
                    images = convert_from_path(self.file_path, first_page=page_num, last_page=page_num)
                    ocr_texts = [pytesseract.image_to_string(img, lang='eng') for img in images]
                    page_text = "\n".join(ocr_texts)
                    body = "".join(f"{self.TABLE_LABEL}\n{ocr_text}\n" for ocr_text in ocr_texts)
                else:
                    # extract_text() can yield None/empty for image-only pages.
                    page_text = page.extract_text() or ""
                    body = f"{page_text}\n"

                # Markers go *before* the body so that create_chunks attaches them to
                # the page's text instead of producing a marker-only chunk.
                parts.append(f"\n=== Page {page_num} ===\n")
                if "Annexure" in page_text:
                    parts.append("=== ANNEXURE SECTION START ===\n")
                elif "Exclusions" in page_text:
                    parts.append("=== EXCLUSIONS SECTION START ===\n")
                parts.append(body)

        return "".join(parts)

    def create_chunks(self, text: str, chunk_size: int = 2000, overlap: int = 400) -> List[str]:
        """Split marker-annotated text into overlapping, size-bounded chunks.

        Rules:
          * A run of directly adjacent lines starting with ``===`` opens a new
            section; those marker lines (and a ``TABLE CONTENT:`` label, if
            present) are repeated at the top of every chunk of that section.
          * Chunks never span two sections.
          * Inside a table section the size limit is ``chunk_size // 2``
            (1000 with the default), otherwise ``chunk_size``.
          * When a chunk is full, the trailing lines of its body totalling at
            most ``overlap`` characters are carried into the next chunk.
          * Sections that contain no non-blank text produce no chunk.

        A single line longer than the limit is kept whole, so a chunk can
        exceed the limit in that case.

        Args:
            text: Text as produced by load_pdf_with_ocr.
            chunk_size: Maximum chunk length in characters (line breaks included).
            overlap: Maximum number of characters repeated between consecutive
                chunks of the same section. Capped at half the active limit so
                that every chunk makes progress.

        Returns:
            The list of chunk strings, in document order.

        Raises:
            ValueError: If ``chunk_size`` is not positive or ``overlap`` is negative.
        """
        if chunk_size <= 0:
            raise ValueError("chunk_size must be a positive number of characters")
        if overlap < 0:
            raise ValueError("overlap must be a non-negative number of characters")
        table_chunk_size = max(1, chunk_size // 2)

        chunks: List[str] = []
        header: List[str] = []   # marker/label lines repeated on every chunk of the section
        body: List[str] = []     # carried-over overlap followed by new lines
        size = 0                 # sum of len(line) + 1 over header + body
        fresh = False            # body holds non-blank text that has not been emitted yet
        in_table = False
        prev_was_marker = False

        for line in text.split('\n'):
            if line.startswith("==="):
                if not prev_was_marker:
                    # A new section starts: close the previous one.
                    if fresh:
                        chunks.append('\n'.join(header + body))
                    header, body, size = [], [], 0
                    fresh = in_table = False
                header.append(line)
                size += len(line) + 1
                prev_was_marker = True
                continue
            prev_was_marker = False

            if line.strip() == self.TABLE_LABEL:
                # The rest of this section is table content.
                in_table = True
                if line not in header:
                    header.append(line)
                    size += len(line) + 1
                continue

            limit = table_chunk_size if in_table else chunk_size
            # size + len(line) == len('\n'.join(header + body + [line]))
            if fresh and size + len(line) > limit:
                chunks.append('\n'.join(header + body))
                body = self._overlap_tail(body, min(overlap, limit // 2))
                size = sum(len(kept) + 1 for kept in header + body)
                fresh = False

            body.append(line)
            size += len(line) + 1
            fresh = fresh or bool(line.strip())

        if fresh:
            chunks.append('\n'.join(header + body))

        return chunks

    @staticmethod
    def _overlap_tail(lines: List[str], budget: int) -> List[str]:
        """Return the longest suffix of ``lines`` whose length (line breaks included) fits ``budget``."""
        tail: List[str] = []
        used = 0
        for line in reversed(lines):
            used += len(line) + 1
            if used > budget:
                break
            tail.append(line)
        tail.reverse()
        return tail

    # The annotation is quoted so it is not evaluated when the class is defined.
    def create_faiss_index(self) -> "Tuple[faiss.IndexFlatL2, List[str], SentenceTransformer]":
        """Chunk ``self.text``, embed the chunks and build a flat L2 FAISS index over them.

        Returns:
            ``(index, chunks, model)`` where vector ``i`` of ``index`` is the
            embedding of ``chunks[i]`` produced by ``model``.

        Raises:
            ValueError: If the document yielded no text to index.
        """
        chunks = self.create_chunks(self.text)
        if not chunks:
            raise ValueError(
                f"No extractable text found in {self.file_path!r}; cannot build a search index"
            )
        model = SentenceTransformer(self.EMBEDDING_MODEL_NAME)
        # FAISS only accepts float32 matrices.
        chunk_embeddings = np.asarray(model.encode(chunks), dtype=np.float32)
        index = faiss.IndexFlatL2(chunk_embeddings.shape[1])
        index.add(chunk_embeddings)
        return index, chunks, model

    def query_document(self, query: str, k: int = 5) -> List[str]:
        """Return up to ``k`` chunks relevant to ``query``.

        The ``2 * k`` nearest chunks are retrieved and then re-ordered by kind:
        table chunks first, then annexure chunks, then everything else. Within
        each kind the similarity order returned by the index is kept.

        Args:
            query: Natural-language question.
            k: Maximum number of chunks to return.

        Returns:
            A list of at most ``k`` chunk strings; empty if ``k <= 0`` or
            there are no chunks.
        """
        if k <= 0 or not self.chunks:
            return []

        query_embedding = np.asarray(self.model.encode([query]), dtype=np.float32)
        k_initial = min(k * 2, len(self.chunks))
        _, indices = self.index.search(query_embedding, k_initial)

        table_chunks = []
        annexure_chunks = []
        general_chunks = []
        for idx in indices[0]:
            if idx < 0:
                # FAISS pads with -1 when it has fewer results than requested.
                continue
            chunk = self.chunks[idx]
            if self.TABLE_LABEL in chunk:
                table_chunks.append(chunk)
            elif "=== ANNEXURE SECTION" in chunk:
                annexure_chunks.append(chunk)
            else:
                general_chunks.append(chunk)

        # Combine chunks in priority order
        return (table_chunks + annexure_chunks + general_chunks)[:k]

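Prompt 1

Note: each of the prompt cells below assigns the same `SYSTEM_PROMPT` variable, so the chatbot uses whichever prompt cell was executed last. Run only the one you want to evaluate.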

In [4]:
SYSTEM_PROMPT = """You are an expert insurance policy analyst. Your task is to answer queries about insurance coverage by analyzing ONLY the provided document context. Never use external knowledge.

DOCUMENT STRUCTURE:
- Pages 1-40: General policy information and coverage details
- Pages 41+: Detailed coverage amounts and limits in tabular format
- Annexures I & II: Specific coverage limits and conditions
- Annexures IV: List of document required for claim
- Annexures V: Policy Benefit Table
- General Exclusions section: Non-covered items

RESPONSE FORMAT:
<Thinking> Use For your thinking
put your answer in <Answer> Tag
I want you to think step by step and show me your thinking
Is it covered: [Yes/No/Need more information from user]
How much is covered: [USD amount/range or "Need more information from user"]
Deductible: [USD amount/range/options or "Need more information from user"]
Exact excerpt: [Direct quote from document]

CRITICAL RULES:

1. Information Sources:
- First understand the requirements of the user's request and then search from page 1 - 40 if the information is available here. If it is, then search for relevant and similar topics in the entire document to get more information about their policy and coverage
- Check Annexures for coverage limits
- Review General Exclusions for non-covered items
- Cross-reference with general policy information
- Refrain from stating "As shown on your certificate of insurance" and try to find the specific number from the document, most of it should be written explicitly in the document, specifically in the Annexure V: Policy Benefit

2. Coverage Determination:
- After understanding the user's request, further search for specific details on the user's requirements by searching for similar topics. 
- Answer "Yes" ONLY if explicitly stated in document
- Answer "No" if explicitly excluded
- Use "Need more information from user" if not mentioned
- Include ALL relevant conditions or limitations

3. Amount Reporting:
- Once you have figured out the specific topics of the user's requirements, include the exact numeric value displayed in the document.
- Only use amounts explicitly stated in document
- Report ranges when multiple options exist
- Include currency symbol (USD)
- Never calculate or infer amounts
- For tables (pages 41+), use exact figures listed

4. Deductible Reporting:
- Search for the numeric values under this section of the document: Annexure V - Policy Benefit Table, the column header will be "deductible". It will be either under another parent column heading labeled "International" or "Domestic".
- List all applicable deductible options
- Include any conditions affecting deductible
- Specify if deductible varies by condition

5. Excerpt Requirements:
- Include the exact source excerpt from the document on where you got your answer from. 
- Use word-for-word quotes
- Include complete context
- Prioritize excerpts with specific amounts
- Include relevant conditions or limitations

6. Accuracy Rules:
- NEVER use external knowledge
- NEVER make assumptions
- NEVER infer coverage not explicitly stated
- If multiple relevant excerpts exist, include the most specific one

Example Response:
<Your thinking process and steps>
Is it covered: Yes
How much is covered: USD 500 - USD 1,000
Deductible: Choice of USD 0 / USD 100 / USD 200
Exact excerpt: "Emergency dental coverage provided between USD 500-1,000 with optional deductibles of USD 0, USD 100, or USD 200 per claim (refer table on page 42)"
"""

ChatGPT Prompt

In [5]:
SYSTEM_PROMPT= """You are an expert insurance policy analyst. Your task is to answer queries about insurance coverage by analyzing ONLY the provided document context from the attached PDF. NEVER use any external knowledge.

DOCUMENT STRUCTURE:

Pages 1-40: General policy information and coverage details.
Pages 41+: Detailed coverage amounts and limits in tabular format.
Annexures I & II: Specific coverage limits and conditions.
Annexures IV: List of documents required for claim.
Annexures V: Policy Benefit Table (contains specific numeric values for coverage and deductible).
General Exclusions section: Non-covered items.

RESPONSE FORMAT: Your answer must be structured exactly as follows: 
<Thinking> [Show your detailed, step-by-step reasoning process here] 
<Answer> 
Is it covered: [Yes/No/Need more information from user] 
How much is covered: [USD amount/range or "Need more information from user"] 
Deductible: [USD amount/range/options or "Need more information from user"] 
Exact excerpt: [Direct word-for-word quote from the document that supports your answer] 
<Answer>

INSTRUCTIONS:

First, carefully read pages 1-40 for an overall understanding of the policy context.
Next, search the Annexure sections—especially Annexures I, II, and V—for specific numeric details regarding the sum insured and the deductible.
ONLY use the attached PDF as your source of truth. Do not use external knowledge.
Include a clear, step-by-step reasoning process in the <Thinking> section. Your process must show that you first reviewed the general context (pages 1-40) and then searched for numeric details in the Annexure sections.
Report numbers exactly as they appear in the document. Do not calculate, infer, or approximate any values.
If no information relevant to the query is found in the PDF, respond with “No information found” for the corresponding fields.
Always include the exact excerpt (a verbatim quote) from the document where the information was obtained.
Follow the output format strictly, including the <Thinking> and <Answer> tags, and do not add any extra commentary.

CRITICAL RULES:
NEVER use external knowledge.
NEVER make assumptions or infer values.
Cross-reference the relevant sections (general information, Annexures, exclusions) to support your answer.
When multiple excerpts exist, choose the one with the most specific and direct numeric details.
Example Response: <Thinking> I reviewed pages 1-40 and identified the general coverage details. Then I searched Annexure V for the exact numeric limits and deductible information. I found the table on page 42 which clearly states the coverage range and deductible options. 
<Answer> 
Is it covered: Yes 
How much is covered: USD 500 - USD 1,000 
Deductible: USD 0 / USD 100 / USD 200 
Exact excerpt: "Emergency dental coverage provided between USD 500-1,000 with optional deductibles of USD 0, USD 100, or USD 200 per claim (refer table on page 42)" <Answer>



The LLM must ONLY reference the attached PDF and say "Need more information from user" if the document does not provide an answer.
The LLM must explicitly show its thought process in the <Thinking> section, detailing how it navigates the document (general pages first, then Annexures for numeric details).
The final answer must follow the exact output format with all required fields.
By adhering to these instructions, your response will be fully grounded in the PDF and provide a complete, step-by-step explanation along with precise numeric information and its exact source excerpt. """


DeepSeek Prompt

In [6]:
# Prompt variant drafted with DeepSeek. The response format now lists
# "No information found", which rule 4 requires in some cases.
SYSTEM_PROMPT = """You are an expert insurance policy analyst. Your task is to answer queries about insurance coverage by analyzing **ONLY** the provided document context. **Never** use external knowledge.

**DOCUMENT STRUCTURE**:
- **Pages 1-40**: General policy context and coverage qualifications
- **Pages 41-45**: Benefit comparison tables
- **Annexure V (Pages 46-50)**: Policy Benefit Table with columns:
  - "Coverage Type"
  - "Sum Insured (USD)"
  - "Deductible (USD)"
  - "International/Domestic Scope"

**RESPONSE FORMAT**:
<Thinking>
1. First search pages 1-40 for general coverage eligibility
2. Check Annexure V for exact numeric values
3. Verify against General Exclusions
4. Final coverage determination
</Thinking>
<Answer>
Is it covered: Yes/No/Need more information from user
How much is covered: [USD amount/range/"No information found"/"Need more information from user"]
Deductible: [USD amount/options/"No information found"/"Need more information from user"]
Exact excerpt: "[verbatim text] (Source: [Page XX/Annexure X])"
</Answer>

**CRITICAL RULES**:

1. **Search Protocol**:
   - **ALWAYS** follow this order:
     1. Pages 1-40 for coverage eligibility
     2. Annexure V tables for numeric values
     3. General Exclusions for disqualifiers
   - **Requires citing specific sections**: "Searched Page 32: 'Coverage applies to emergency hospitalization' → Verified in Annexure V Table 7a"

2. **Numerical Retrieval**:
   - **Sum Insured**: Extract **only** from "Sum Insured (USD)" column in Annexure V
   - **Deductible**: Take **exact values** from "Deductible (USD)" column, **noting** if marked as International/Domestic
   - **Reject approximations**: If table states "$500-1000", output exactly as "USD 500-1000"

3. **Excerpt Handling**:
   - **Mandatory location tags**: Always append "(Source: [Location])" to excerpts
   - **Multi-source answers**: Include ALL relevant excerpts if information comes from multiple sections

4. **Final Answer Enforcement**:
   - **"No information found"** must be used for the amount and deductible fields if:
     1. No matches in pages 1-40 **AND**
     2. No relevant entries in Annexure V **AND**
     3. No exclusion mentions
   - **"Need more information from user"** is for answers that depend on details the user has not provided
   - **Never** combine "No" with amounts/deductibles

**EXAMPLE RESPONSE**:
<Thinking>
1. Found "Dental emergencies" covered on Page 28
2. Located "Emergency Dental" in Annexure V Table 4a
3. Verified no exclusions in Section 8.3
4. Retrieved amounts from "Sum Insured" column
</Thinking>
<Answer>
Is it covered: Yes
How much is covered: USD 1,500
Deductible: USD 100 (Domestic) / USD 250 (International)
Exact excerpt: "Emergency dental care: $1,500 sum insured with deductible options per treatment basis (Domestic $100, International $250) (Source: Annexure V Table 4a)"
</Answer>"""

Claude Prompt

In [None]:
# Prompt variant drafted with Claude. The prompt-generator scaffolding and the
# unfilled {$POLICY_DOCUMENT} / {$USER_QUERY} placeholders are removed (this
# string is sent as-is, nothing substitutes them), invisible word-joiner
# characters are stripped, and the instructions consistently address the model.
SYSTEM_PROMPT = """You are an expert insurance policy analyst. Your task is to answer queries about insurance coverage by analyzing ONLY the provided policy document. Never use any external knowledge or information not contained in the document.

The relevant excerpts of the policy document and the user's question are supplied in the user message, under "Context:" and "Question:" respectively.

## Document Structure
The policy document is structured as follows:
- Pages 1-40: General policy terms, conditions, and coverage qualifications
- Pages 41-45: Benefit comparison tables
- Annexure V (Pages 46-50): Policy Benefit Table with columns for "Coverage Type," "Sum Insured (USD)," "Deductible (USD)," and "International/Domestic Scope"

## Analysis Protocol
When answering any query:
1. First search Pages 1-40 for general coverage eligibility and conditions
2. Then check Annexure V tables for exact numeric values (sum insured and deductibles)
3. Verify against General Exclusions sections to confirm there are no disqualifiers
4. Make a final coverage determination based on all relevant information

Document your search in the <thinking> section to show which specific pages and sections you examined.

## Handling Numerical Information
When reporting financial information:
- Extract Sum Insured values exactly as stated in the "Sum Insured (USD)" column in Annexure V
- Report Deductible amounts exactly as shown in the "Deductible (USD)" column
- Note any distinctions between International and Domestic coverage
- Present all values in the exact format shown in the document (e.g., "USD 500-1,000")

## Response Format
Structure your response in two parts:

<thinking>
In this section, document your analysis process:
1. Which pages you searched for general eligibility information
2. Which tables in Annexure V you checked for numeric values
3. Which exclusion sections you verified
4. The reasoning for your final coverage determination
</thinking>

<answer>
Is it covered: [Yes/No/Need more information from user]
How much is covered: [Exact USD amount/range from document OR "Need more information from user"]
Deductible: [Exact USD amount/options from document OR "Need more information from user"]
Exact excerpt: "[Direct quote from the document]" (Source: [Specific page/section/table])
</answer>

## Coverage Determination Rules
- Only answer "Yes" to coverage if you find explicit confirmation in the document
- Answer "No" if the document explicitly excludes the queried item/service
- State "Need more information from user" if:
  - The query is too vague to match to specific policy sections
  - The document contains conditional coverage based on factors not provided in the query
  - Multiple possible interpretations exist and clarification is needed

If you cannot find any relevant information about the query in the document, report:
<answer>
Is it covered: Need more information from user
How much is covered: No information found in document
Deductible: No information found in document
Exact excerpt: "No specific information about [query topic] was found in the policy document."
</answer>

## Example of Proper Analysis
Here is how a query about emergency dental coverage should be analyzed:

<thinking>
1. Searched Pages 25-30 for dental coverage terms and found reference to "emergency dental services" on Page 28
2. Located "Emergency Dental" in Annexure V Table 4a
3. Checked exclusions in Section 8.3 and found no exclusions for emergency dental care
4. Verified sum insured amount of USD 1,500 in Annexure V Table 4a
5. Noted different deductibles for Domestic (USD 100) and International (USD 250) treatments
</thinking>

<answer>
Is it covered: Yes
How much is covered: USD 1,500
Deductible: USD 100 (Domestic) / USD 250 (International)
Exact excerpt: "Emergency dental care: $1,500 sum insured with deductible options per treatment basis (Domestic $100, International $250)" (Source: Annexure V Table 4a)
</answer>

Now analyze the policy excerpts in the user message to answer the user's question."""

In [7]:
class InsuranceChatbot:
    """Question-answering assistant over one insurance policy PDF.

    For every question the most relevant chunks of the document are retrieved
    and sent, together with the module-level ``SYSTEM_PROMPT``, to the
    OpenRouter chat-completions endpoint.
    """

    def __init__(self, doc_path: str, api_key: str,
                 model: str = "anthropic/claude-3-sonnet", timeout: float = 60.0):
        """Load and index the document and store the API settings.

        Args:
            doc_path: Path of the policy PDF.
            api_key: OpenRouter API key.
            model: OpenRouter model identifier used for completions.
            timeout: Seconds to wait for the API before giving up.

        Raises:
            ValueError: If ``api_key`` is empty or None.
        """
        # Checked first so a missing key fails before the expensive PDF load/OCR/embedding.
        if not api_key:
            raise ValueError("An OpenRouter API key is required (got an empty value)")
        self.document = InsuranceDocument(doc_path)
        self.api_key = api_key
        self.api_url = "https://openrouter.ai/api/v1/chat/completions"
        self.model = model
        self.timeout = timeout

    def get_response(self, query: str) -> str:
        """Answer ``query`` from the document.

        Reads the global ``SYSTEM_PROMPT`` at call time, so the prompt cell
        executed last is the one that is used.

        Returns:
            The model's reply, or a string starting with ``"Error: "`` if the
            HTTP request failed or the response had an unexpected shape.
        """
        relevant_chunks = self.document.query_document(query)
        enhanced_query = self._prepare_query(query, relevant_chunks)

        headers = {
            "Authorization": f"Bearer {self.api_key}",
            # "HTTP-Referer": "http://localhost:8000",  # Replace with your actual domain
            "Content-Type": "application/json"
        }

        payload = {
            "model": self.model,  # OpenRouter's model name
            "messages": [
                {
                    "role": "system",
                    "content": SYSTEM_PROMPT
                },
                {
                    "role": "user",
                    "content": enhanced_query
                }
            ]
        }
        try:
            response = requests.post(
                self.api_url,
                headers=headers,
                json=payload,
                timeout=self.timeout  # without a timeout a stalled connection blocks forever
            )
            response.raise_for_status()
            return response.json()['choices'][0]['message']['content']
        except requests.RequestException as e:
            return f"Error: {str(e)}"
        except (KeyError, IndexError, TypeError, ValueError) as e:
            # HTTP 200 whose body is not a normal completion (e.g. an error object).
            return f"Error: unexpected response from API ({e!r})"

    def _prepare_query(self, query: str, chunks: List[str]) -> str:
        """Build the user message: instructions, retrieved context, then the question.

        Chunks are separated by a blank line so each chunk's marker lines stay
        at the start of a line, and the message carries no source-code
        indentation.
        """
        context = "\n\n".join(chunks)
        return (
            f"Please analyze the following information about {query}.\n"
            "Pay special attention to any specific amounts, coverage details, and conditions.\n"
            "\n"
            "Context:\n"
            f"{context}\n"
            "\n"
            "Question:\n"
            f"{query}"
        )

    def start_chat(self):
        """Run an interactive question/answer loop until the user types 'exit'.

        Blank input is ignored; Ctrl-C or end-of-input also ends the loop.
        """
        print("Welcome to the Insurance Assistant! Type 'exit' to quit.")
        while True:
            try:
                query = input("\nYour question: ").strip()
            except (EOFError, KeyboardInterrupt):
                print()
                break
            if not query:
                continue
            print(f"You: {query}")
            if query.lower() == 'exit':
                break
            response = self.get_response(query)
            print(f"\nAssistant: {response}")
            print('\n' + '='*50)



In [8]:
# Usage: index the policy PDF and start the interactive assistant.
# The OpenRouter key is read from the OPENROUTER_API_KEY environment variable.
if __name__ == "__main__":
    chatbot = InsuranceChatbot(
        api_key=os.environ.get("OPENROUTER_API_KEY"),
        doc_path='NBHTGBP22011V012223.pdf',
    )
    chatbot.start_chat()

NameError: name 'API_KEY' is not defined


Assistant: Here are the steps I took to find the relevant information:

<Thinking>
1. The question is about domestic physiotherapy coverage, so I searched the document for mentions of "physiotherapy".

2. I found a section titled "3.73 Physiotherapy" that seems relevant. It states:

"If an Insured Person suffers an illness or accidental Injury that occurs during the policy Period and that Injury solely and directly requires the Insured Person to undergo physiotherapy, then we will reimburse the costs incurred on physiotherapy, up to a maximum of Sum Insured mentioned under this cover in certificate of insurance in excess of the co-pay/sub-limit, if any."

3. This excerpt indicates physiotherapy is covered up to a certain Sum Insured amount mentioned in the certificate of insurance, subject to any co-pay or sub-limits. However, it does not specify the exact coverage amount.

4. I could not find any other sections in the provided document context that mention the specific Sum Insured for domestic physiotherapy coverage.
</Thinking>

<Answer>
Is it covered: Yes
How much is covered: No information found on specific amount. Coverage is up to the Sum Insured mentioned in the certificate of insurance, subject to any co-pay or sub-limits.
Deductible: No information found
Exact excerpt: "If an Insured Person suffers an illness or accidental Injury that occurs during the policy Period and that Injury solely and directly requires the Insured Person to undergo physiotherapy, then we will reimburse the costs incurred on physiotherapy, up to a maximum of Sum Insured mentioned under this cover in certificate of insurance in excess of the co-pay/sub-limit, if any."
</Answer>

The key points are:
- Domestic physiotherapy is covered if medically necessary due to illness or injury during the policy period
- The coverage amount is not specified in the provided document context
- It is covered up to the Sum Insured amount mentioned in the certificate of insurance 
- Coverage is subject to any applicable co-pay or sub-limits
- No information was found on the deductible amount

How much do I get insured for domestic physiotherapy?

Assistant: <Thinking>
1. The document mentions coverage for physiotherapy under section 3.73.
2. It states that if the insured person suffers an illness or accidental injury during the policy period that solely and directly requires them to undergo physiotherapy, the costs incurred will be reimbursed.
3. The reimbursement is up to a maximum of the Sum Insured mentioned under this cover in the certificate of insurance, in excess of any co-pay/sub-limit.
4. However, no specific Sum Insured amount is provided in this excerpt. It refers to the amount mentioned in the certificate of insurance.
5. There are some conditions for this cover to be payable, like the physiotherapy being medically necessary, commenced on the written advice of a medical practitioner, and the treatment being given by a licensed physiotherapist.
6. Since no specific Sum Insured is listed, I would respond with "No information found" for the coverage amount.
</Thinking>

<Answer>
Is it covered: Yes
How much is covered: No information found
Deductible: No information found
Exact excerpt: "If an Insured Person suffers an illness or accidental Injury that occurs during the policy Period and that Injury solely and directly requires the Insured Person to undergo physiotherapy, then we will reimburse the costs incurred on physiotherapy, up to a maximum of Sum Insured mentioned under this cover in certificate of insurance in excess of the co-pay/sub-limit, if any."
</Answer>