In [15]:
import os
import requests
from utilities.hashing import calculate_hash
from utilities.redis_cache import get_cached_data
from utilities.file_utils import get_doc_by_hash, save_doc, extract_using_textract


In [3]:
doc_url = "https://hackrx.blob.core.windows.net/assets/policy.pdf?sv=2023-01-03&st=2025-07-04T09%3A11%3A24Z&se=2027-07-05T09%3A11%3A00Z&sr=b&sp=r&sig=N4a9OU0w0QXO6AOIBiu4bpl7AXvEZogeT%2FjUHNO7HzQ%3D"

In [5]:
response = requests.get(doc_url)
file_bytes = response.content
filename = doc_url.split("/")[-1].split("?")[0]

In [10]:
file_hash = calculate_hash(file_bytes)
os.makedirs("docs", exist_ok=True)
_, ext = os.path.splitext(filename)
file_path = os.path.join("docs", file_hash + ext)
with open(file_path, "wb") as f:
    f.write(file_bytes)

In [14]:
# cached_data = get_cached_data(file_hash)
# if cached_data:
    # summary = cached_data.decode()

In [None]:
# doc_data = get_doc_by_hash(file_hash)
# if not doc_data:
#     # Step 2: Process new doc
#     doc_data = extract_using_textract(file_bytes, filename)
#     save_doc(file_hash, doc_data)

In [16]:
import boto3
from pathlib import Path

s3 = boto3.client("s3")

bucket_name = "bajaj-hackrx"
file_name = Path(file_path).name
s3.upload_file(file_path, bucket_name, file_name)
print(f"Uploaded {file_path} to s3://{bucket_name}/{file_name}")

Uploaded docs\042f627c5d8f619cf62cc21f864b08dfd59059d0b9aab805d132e0014489d625.pdf to s3://bajaj-hackrx/042f627c5d8f619cf62cc21f864b08dfd59059d0b9aab805d132e0014489d625.pdf


In [17]:
textract = boto3.client("textract")

features = ["TABLES", "FORMS"]
bucket_name = "bajaj-hackrx"
response = textract.start_document_analysis(
    DocumentLocation={"S3Object": {"Bucket": bucket_name, "Name": file_name}},
    FeatureTypes=features,
)
job_id = response["JobId"]
print(f"Started Textract job: {job_id}")

Started Textract job: 5f50bc3a91c53a892fd8f4551df2cfc83c915b2a7ca430f62108550c8de10ca8


In [19]:
import time

elapsed = 0
timeout = 300
poll_interval=5
while elapsed < timeout:
    response = textract.get_document_analysis(JobId=job_id)
    status = response["JobStatus"]
    if status in ["SUCCEEDED", "FAILED"]:
        print(f"Textract Job Status: {status}")
        break
        
    time.sleep(poll_interval)
    elapsed += poll_interval
print("Timeout waiting for Textract job.")

Textract Job Status: SUCCEEDED
Timeout waiting for Textract job.


In [20]:
pages = []
response = textract.get_document_analysis(JobId=job_id)
pages.append(response)

while "NextToken" in response:
    response = textract.get_document_analysis(
        JobId=job_id, NextToken=response["NextToken"]
    )
    pages.append(response)

In [None]:
import json

def save_result_locally(job_id: str, result: dict, save_path = None):
    """Stores the result JSON locally for reuse."""
    if not save_path:
        save_path = f"docs/{job_id}.json"

    os.makedirs(os.path.dirname(save_path), exist_ok=True)
    with open(save_path, "w") as f:
        json.dump(result, f, indent=2)
    print(f"[✓] Result cached locally at {save_path}")

In [24]:
def extract_text_and_tables(blocks):
    """
    Parses Textract blocks into a structured format (simple version).
    You can later expand this for full table/form parsing.
    """
    lines = []
    tables = []
    forms = []

    for block in blocks:
        if block["BlockType"] == "LINE":
            lines.append(block["Text"])
        elif block["BlockType"] == "TABLE":
            tables.append(block)
        elif block["BlockType"] == "KEY_VALUE_SET":
            forms.append(block)

    return {
        "lines": lines,
        "tables_raw": tables,  # You can further process this into structured tables
        "forms_raw": forms,  # Can be parsed into key-value pairs
    }


In [25]:
save_result_locally(job_id, pages, f"docs/{file_name}.json")

[✓] Result cached locally at docs/042f627c5d8f619cf62cc21f864b08dfd59059d0b9aab805d132e0014489d625.pdf.json


In [26]:
file_path

'docs\\042f627c5d8f619cf62cc21f864b08dfd59059d0b9aab805d132e0014489d625.pdf'

In [27]:

base_name = os.path.splitext(os.path.basename(file_path))[0]
local_cache_path = f"{base_name}.json"

In [28]:
local_cache_path

'042f627c5d8f619cf62cc21f864b08dfd59059d0b9aab805d132e0014489d625.json'