# **0. Install Dependencies**

In [2]:
!pip install gradio pymupdf sentence-transformers huggingface_hub python-dotenv numpy



# **1. Extraction Code**


In [1]:
import fitz
import re
from google.colab import files


KEY_SECTIONS = [
    "exclusion", "waiting period", "specified disease", "specified procedure",
    "pre-existing", "joint replacement", "coverage", "benefit", "day care"
]
SECTION_REGEX = re.compile("|".join(rf"({k})" for k in KEY_SECTIONS), re.IGNORECASE)

def extract_text_from_pdf(pdf_path):
    doc = fitz.open(pdf_path)
    pages = [page.get_text() for page in doc]
    return "\n".join(pages)

def smart_chunk_policy(text, max_words=500, stride=250):
    lines = text.splitlines()
    chunks = []
    words = []
    headers = []
    for i, line in enumerate(lines):
        # Detect key section starts, save current buffer as its own chunk
        if SECTION_REGEX.search(line):
            if words:
                chunks.append(" ".join(words))
                words = []
            headers.append(line.strip())
            continue
        # Normal chunking
        for word in line.split():
            words.append(word)
            if len(words) >= max_words:
                chunk = ""
                if headers:
                    chunk += " ".join(headers) + "\n"
                chunk += " ".join(words)
                chunks.append(chunk)
                # Overlap/stride
                words = words[-stride:] if stride else []
    # Flush last
    if words:
        chunk = ""
        if headers:
            chunk += " ".join(headers) + "\n"
        chunk += " ".join(words)
        chunks.append(chunk)
    return chunks

# Usage
uploaded = files.upload()
pdf_path = list(uploaded.keys())[0]
text = extract_text_from_pdf(pdf_path)
chunks = smart_chunk_policy(text, max_words=500, stride=250)
with open("chunks.txt", "w", encoding="utf-8") as f:
    for c in chunks:
        f.write(c.replace('\n', ' ') + '\n')
print(f"Produced {len(chunks)} chunks.")

Saving Doc 1.pdf to Doc 1.pdf
Produced 184 chunks.


# **2. Evaluation**

## **a. Structural**

In [None]:
import os
from huggingface_hub import InferenceClient

# 1) Read your pre‐chunked document
with open("chunks.txt", "r", encoding="utf-8") as f:
    document_chunks = f.read()

# 2) Define the user query
user_query = "46-year-old male, knee surgery in Pune, 3-month-old insurance policy"

# 3) Build the chat messages
messages = [
    {
        "role": "system",
        "content": "You are an expert policy-decision assistant."
    },
    {
        "role": "user",
        "content": f"""
User Query:
"{user_query}"

Document Chunks:
{document_chunks}

Instructions:
- Extract the fields: age, procedure, location, and policy duration.
- Identify the relevant clause(s) in the document chunks that govern knee surgery coverage.
- Decide whether the claim is APPROVED or REJECTED.
- Return STRICT JSON with keys:
    • decision: "approved" or "rejected"
    • amount: numeric (or null)
    • justification: a list of objects, each with:
        {{
          "clause": <exact clause text>,
          "reason": <how it applies>
        }}
"""
    }
]

# 4) Initialize the HF Inference client
# Make sure HF_TOKEN is set in your Colab env (via !export or colab secrets)
client = InferenceClient(token=os.getenv("HF_TOKEN"))

# 5) Call chat_completion
response = client.chat_completion(
    model="deepseek-ai/DeepSeek-V3-0324",
    messages=messages,
    max_tokens=512
)

# 6) Extract and print the assistant’s reply
# The content is in response.choices[0].message.content
decision_json = response.choices[0].message.content
print(decision_json)

```json
{
  "decision": "approved",
  "amount": null,
  "justification": [
    {
      "clause": "SECTION C) BENEFITS COVERED UNDER THE POLICY exclusions contained or otherwise expressed in this Policy. PART A- COVERAGE- Domestic (Within India Only, for Imperial and Imperial Plus Plans) I. IN-PATIENT BENEFITS FOR DOMESTIC COVER ii. We have accepted Your Claim under \"In-patient Hospitalization Treatment\" or \"Day Care Procedures\" section of Subject otherwise to the terms, conditions and exclusions of the Policy. 5. Day Care Procedures for Day care procedures / Surgeries taken as an Inpatient in a Hospital or Day Care Centre but not in the outpatient department.",
      "reason": "The policy covers inpatient hospitalizations and day care procedures, which includes knee surgery."
    },
    {
      "clause": "The above coverage is subject to fulfilment of following conditions: Exclusions: Mental Illness Treatment does not cover: d. For autism spectrum disorder, admissions, stays or day

## **b. Practical**

In [None]:
import os
from huggingface_hub import InferenceClient

# -----------------------------------------------------------------------------
# 1) Read your pre‐chunked document from chunks.txt
# -----------------------------------------------------------------------------
with open("chunks.txt", "r", encoding="utf-8") as f:
    document_chunks = f.read()

# -----------------------------------------------------------------------------
# 2) Define the user query
# -----------------------------------------------------------------------------
user_query = "46-year-old male, knee surgery in Pune, 3-month-old insurance policy"

# -----------------------------------------------------------------------------
# 3) Build the chat messages, enforcing decision = “approved” or “rejected”
# -----------------------------------------------------------------------------
messages = [
    {
        "role": "system",
        "content": "You are an expert policy-decision assistant."
    },
    {
        "role": "user",
        "content": f"""
User Query:
"{user_query}"

Document Chunks:
{document_chunks}

Instructions:
1. Extract the fields: age, procedure, location, and policy duration.
2. Identify the relevant clause(s) in the document chunks that govern knee surgery coverage.
3. Decide whether the claim is "approved" or "rejected" (exactly those strings).
4. Return STRICT JSON with keys:
   • decision: "approved" or "rejected"
   • amount: numeric (or null)
   • justification: a list of objects, each with:
       {{
         "clause": <exact clause text>,
         "reason": <how it applies>
       }}
"""
    }
]

# -----------------------------------------------------------------------------
# 4) Initialize the HF Inference client
# -----------------------------------------------------------------------------
# Ensure your HF token is set:
#   export HF_TOKEN="your_token_here"
client = InferenceClient(token=os.getenv("HF_TOKEN"))

# -----------------------------------------------------------------------------
# 5) Call the conversational endpoint
# -----------------------------------------------------------------------------
response = client.chat_completion(
    model="deepseek-ai/DeepSeek-V3-0324",
    messages=messages,
    max_tokens=512
)

# -----------------------------------------------------------------------------
# 6) Extract and print the model’s JSON reply
# -----------------------------------------------------------------------------
reply = response.choices[0].message.content
print(reply)

```json
{
  "decision": "approved",
  "amount": null,
  "justification": [
    {
      "clause": "In-patient Hospitalization Treatment Limits INR 3,750,000 INR 5,600,000 INR 7,500,000 INR 11,200,000 INR 18,750,000 INR 37,500,000",
      "reason": "The policy covers knee surgery under the In-patient Hospitalization Treatment limits, which are sufficiently high to cover the procedure."
    },
    {
      "clause": "Pre-hospitalisation 60 days Post-hospitalisation 180 days",
      "reason": "The policy allows for pre-hospitalization and post-hospitalization expenses, which are relevant for knee surgery."
    },
    {
      "clause": "The Policy shall be void and all premium paid thereon shall be forfeited to the Company, in the event of misrepresentation, mis-description or non-disclosure of any material fact.",
      "reason": "No misrepresentation or non-disclosure is indicated in the query, so the claim is not void."
    }
  ]
}
```
