In [1]:
import json

with open('docs/042f627c5d8f619cf62cc21f864b08dfd59059d0b9aab805d132e0014489d625.json', 'r') as file:
    doc_data = json.load(file)

In [14]:
list(doc_data[0].keys())

['DocumentMetadata',
 'JobStatus',
 'NextToken',
 'Blocks',
 'AnalyzeDocumentModelVersion',
 'ResponseMetadata']

In [None]:
len(doc_data)

21

In [28]:
doc_data[0]['Blocks'][3]

{'BlockType': 'LINE',
 'Confidence': 96.06722259521484,
 'Text': 'IRDAI Regn. No. - 58',
 'Geometry': {'BoundingBox': {'Width': 0.12233010679483414,
   'Height': 0.009105673059821129,
   'Left': 0.6049689650535583,
   'Top': 0.06507617235183716},
  'Polygon': [{'X': 0.6049690246582031, 'Y': 0.06507617235183716},
   {'X': 0.7272990942001343, 'Y': 0.06507691740989685},
   {'X': 0.7272990345954895, 'Y': 0.07418184727430344},
   {'X': 0.6049689650535583, 'Y': 0.07418110966682434}]},
 'Id': '56237cae-0bdf-4926-a140-b9d44550acda',
 'Relationships': [{'Type': 'CHILD',
   'Ids': ['4820c19c-0edd-4057-b5fe-0f1b4b0ecebd',
    'c7f4ce19-15b8-46bf-b86b-379cb67928ce',
    '814f97a3-2844-495b-892a-a5251b9ea619',
    '93934542-481a-4d48-9117-2e1d523dacb2',
    '01a136f1-2a46-449c-8901-2562b0ec05c4']}],
 'Page': 1}

In [37]:
doc = doc_data[1]['Blocks']
lines = [block["Text"] for block in doc if block.get("Text")]

In [40]:
text = ""
for page in doc_data:
    for block in page.get("Blocks"):
        if block.get("Text"):
            text += block.get("Text")

In [43]:
len(text)

197949

In [44]:
def extract_text_from_textract(doc_data: list) -> str:
    """Extracts and concatenates text from LINE blocks in Textract result."""
    lines = []
    for page in doc_data:
        for block in page.get("Blocks", []):
            if block["BlockType"] == "LINE" and "Text" in block:
                lines.append(block["Text"])
    return "\n".join(lines)

In [47]:
str(extract_text_from_textract(doc_data))

'National Insurance Company Limited\nCIN U10200WB1906GOI001713\nIRDAI Regn. No. - 58\nNational Insurance\nTrusted Since 1906\nIssuing Office\nNational Parivar Mediclaim Plus Policy\nWhereas the Proposer designated in the schedule hereto has by a Proposal together with Declaration, which shall be the basis of\nthis contract and is deemed to be incorporated herein, has applied to National Insurance Company Ltd. (hereinafter called the\nCompany), for the insurance hereinafter set forth, in respect of person(s)/ family members named in the schedule hereto\n(hereinafter called the Insured Persons) and has paid the premium as consideration for such insurance.\n1 PREAMBLE\nThe Company undertakes that if during the Policy Period, any Insured Person shall suffer any illness or disease (hereinafter called\nIllness) or sustain any bodily injury due to an Accident (hereinafter called Injury) requiring Hospitalisation of such Insured\nPerson(s) for In-Patient Care at any hospital/nursing home (here

In [49]:
from collections import defaultdict
from typing import List, Dict

def parse_textract_result(textract_response_pages):
    """Parses Textract result into readable structures: lines, tables, forms."""
    blocks = []
    for page in textract_response_pages:
        blocks.extend(page["Blocks"])

    block_map = {block["Id"]: block for block in blocks}

    lines = []
    tables = []

    for block in blocks:
        if block["BlockType"] == "LINE":
            lines.append(block["Text"])

        if block["BlockType"] == "TABLE":
            table = parse_table(block, block_map)
            tables.append(table)

    return {
        "text": "\n".join(lines),
        "tables": tables,
        # You can add 'forms': parse_forms(block_map) later
    }


def parse_table(table_block, block_map):
    """Reconstructs a table into a 2D list using CELL blocks."""
    cells = [
        block_map[rel["Ids"][0]]
        for rel in table_block.get("Relationships", [])
        if block_map[rel["Ids"][0]]["BlockType"] == "CELL"
    ]
    table = defaultdict(lambda: defaultdict(str))

    max_row, max_col = 0, 0
    for cell in cells:
        row, col = cell["RowIndex"], cell["ColumnIndex"]
        max_row, max_col = max(max_row, row), max(max_col, col)
        text = ""
        if "Relationships" in cell:
            for rel in cell["Relationships"]:
                if rel["Type"] == "CHILD":
                    text = " ".join(
                        [
                            block_map[cid]["Text"]
                            for cid in rel["Ids"]
                            if block_map[cid]["BlockType"] == "WORD"
                        ]
                    )
        table[row][col] = text

    # Convert to list of lists
    structured_table = []
    for row in range(1, max_row + 1):
        structured_table.append([table[row][col] for col in range(1, max_col + 1)])

    return structured_table


In [51]:
parsed_data = parse_textract_result(doc_data)

In [56]:
parsed_data.get("tables")

[[['Modern Treatment']],
 [['a.']],
 [['Notification of claim for Cashless facility']],
 [['Notification of claim for Reimbursement']],
 [['Notification of claim for vaccination']],
 [['Type of claim']],
 [['First year']],
 [['Features']],
 [['Time interval']],
 [['14']],
 [['']],
 [['']],
 [['']],
 [['Areas of Jurisdiction']]]

In [59]:
from trp import Document

In [60]:
docs = Document(doc_data)

In [61]:
docs

<trp.Document at 0x2137150bf40>

In [72]:
for row in docs.pages[7].tables[0].rows:
    for cell in row.cells:
        print(cell.text.strip(), end=",")
    print()

Modern Treatment,Coverage,
UAE & HIFU,Limit is for Procedure cost only,
Balloon Sinuplasty,Limit is for Balloon cost only,
Deep Brain Stimulation,Limit is for implants including batteries only,
Oral Chemotherapy,Only cost of medicines payable under this limit, other incidental charges like investigations and consultation charges not payable.,
Immunotherapy,Limit is for cost of injections only.,
Intravitreal injections,Limit is for complete treatment, including Pre & Post Hospitalization,
Robotic Surgery,Limit is for robotic component only.,
Stereotactic Radio surgeries,Limit is for radiation procedure.,
Bronchial Thermoplasty,Limit is for complete treatment, including Pre & Post Hospitalization,
Vaporization of the prostrate,Limit is for LASER component only.,
IONM,Limit is for IONM procedure only.,
Stem cell therapy,Limit is for complete treatment, including Pre & Post Hospitalization,


In [76]:
from trp import Document
import json
from typing import List, Dict


def extract_all_tables_from_doc(doc_data: list) -> List[Dict]:
    """
    Parses all tables from all pages in the Textract response using TRP,
    and structures them into a list of dictionaries for LLM consumption.
    """
    doc = Document(doc_data)
    parsed_tables = []

    for i, page in enumerate(doc.pages):
        if not page.tables:
            continue

        for table in page.tables:
            # Format table as list of rows -> list of cells (strings)
            structured_table = []
            for row in table.rows:
                structured_row = [cell.text.strip() for cell in row.cells]
                structured_table.append(structured_row)

            parsed_tables.append({
                "page": i + 1,
                "table": structured_table
            })

    return parsed_tables


def format_table_for_llm(table_data: List[Dict]) -> str:
    """
    Converts the list of parsed tables into a markdown string or readable format for LLM.
    """
    formatted_output = ""

    for table_info in table_data:
        formatted_output += f"\n### Table from Page {table_info['page']}\n\n"

        table = table_info["table"]
        if not table:
            continue

        # Format as Markdown table if possible
        headers = table[0]
        formatted_output += "| " + " | ".join(headers) + " |\n"
        formatted_output += "| " + " | ".join(["---"] * len(headers)) + " |\n"

        for row in table[1:]:
            formatted_output += "| " + " | ".join(row) + " |\n"

    return formatted_output


In [77]:
all_tables = extract_all_tables_from_doc(doc_data)

In [78]:
all_tables[0]

{'page': 8,
 'table': [['Modern Treatment', 'Coverage'],
  ['UAE & HIFU', 'Limit is for Procedure cost only'],
  ['Balloon Sinuplasty', 'Limit is for Balloon cost only'],
  ['Deep Brain Stimulation', 'Limit is for implants including batteries only'],
  ['Oral Chemotherapy',
   'Only cost of medicines payable under this limit, other incidental charges like investigations and consultation charges not payable.'],
  ['Immunotherapy', 'Limit is for cost of injections only.'],
  ['Intravitreal injections',
   'Limit is for complete treatment, including Pre & Post Hospitalization'],
  ['Robotic Surgery', 'Limit is for robotic component only.'],
  ['Stereotactic Radio surgeries', 'Limit is for radiation procedure.'],
  ['Bronchial Thermoplasty',
   'Limit is for complete treatment, including Pre & Post Hospitalization'],
  ['Vaporization of the prostrate', 'Limit is for LASER component only.'],
  ['IONM', 'Limit is for IONM procedure only.'],
  ['Stem cell therapy',
   'Limit is for complete t

In [79]:
llm_ready_string = format_table_for_llm(all_tables)

In [81]:
print(llm_ready_string)


### Table from Page 8

| Modern Treatment | Coverage |
| --- | --- |
| UAE & HIFU | Limit is for Procedure cost only |
| Balloon Sinuplasty | Limit is for Balloon cost only |
| Deep Brain Stimulation | Limit is for implants including batteries only |
| Oral Chemotherapy | Only cost of medicines payable under this limit, other incidental charges like investigations and consultation charges not payable. |
| Immunotherapy | Limit is for cost of injections only. |
| Intravitreal injections | Limit is for complete treatment, including Pre & Post Hospitalization |
| Robotic Surgery | Limit is for robotic component only. |
| Stereotactic Radio surgeries | Limit is for radiation procedure. |
| Bronchial Thermoplasty | Limit is for complete treatment, including Pre & Post Hospitalization |
| Vaporization of the prostrate | Limit is for LASER component only. |
| IONM | Limit is for IONM procedure only. |
| Stem cell therapy | Limit is for complete treatment, including Pre & Post Hospitalization

In [86]:
def extract_all_text_from_doc(doc_data: list):
    text = ""
    docs = Document(doc_data)
    for page in docs.pages:
        text += page.text

    return text

In [87]:
len(extract_all_text_from_doc(doc_data))

108541

In [88]:
def extract_all_text_from_doc(doc_data: list):
    text = ""
    docs = Document(doc_data)
    for page in docs.pages:
        text += page.text

    return text


def extract_all_tables_from_doc(doc_data: list) -> List[Dict]:
    """
    Parses all tables from all pages in the Textract response using TRP,
    and structures them into a list of dictionaries for LLM consumption.
    """
    doc = Document(doc_data)
    parsed_tables = []

    for i, page in enumerate(doc.pages):
        if not page.tables:
            continue

        for table in page.tables:
            # Format table as list of rows -> list of cells (strings)
            structured_table = []
            for row in table.rows:
                structured_row = [cell.text.strip() for cell in row.cells]
                structured_table.append(structured_row)

            parsed_tables.append({"page": i + 1, "table": structured_table})

    return parsed_tables



In [89]:
def get_extracted_text_and_tables(doc_data):

    text = extract_all_text_from_doc(doc_data)
    tables = extract_all_tables_from_doc(doc_data)

    return {"text": text, "tables": tables}

In [90]:
import os
result = get_extracted_text_and_tables(doc_data)

save_path = "docs/extracted/042f627c5d8f619cf62cc21f864b08dfd59059d0b9aab805d132e0014489d625.json"
os.makedirs(os.path.dirname(save_path), exist_ok=True)
with open(save_path, "w") as f:
    json.dump(result, f, indent=2)