In [38]:
# Required imports
import re
from typing import List
from langchain_core.documents import Document
from langchain_community.document_loaders import PyMuPDFLoader



In [39]:
#PyMuPDFLoader (Fast and accurate)
print("\nLoading LIC PDF using PyMuPDFLoader")
try:
    pymupdf_loader = PyMuPDFLoader("../data/raw/LIC_Bima_Shree.pdf")  # Adjust path as needed
    pymupdf_docs = pymupdf_loader.load()
    print(f"Loaded {len(pymupdf_docs)} pages")
except Exception as e:
    print(f"Error loading PDF: {e}")



Loading LIC PDF using PyMuPDFLoader
Loaded 24 pages


In [40]:
# Manual metadata to be attached to each chunk
CUSTOM_METADATA = {
    "policy_name": "Money Back Plans",
    "product_name": "LIC’s Bima Shree",
    "plan_no": "748",
    "uin_no": "512N316V03"
}


In [41]:
def clean_text(text: str) -> str:
    """
    Removes excessive newlines and fixes spacing in paragraphs.
    """
    text = re.sub(r'\n+', ' ', text)            # Replace multiple \n with space
    text = re.sub(r'\s{2,}', ' ', text)         # Replace multiple spaces with single
    return text.strip()

def extract_section_chunks_cleaned(docs):
    """
    Parses text into section-based chunks (no subsections), and removes redundant newlines.
    """
    section_chunks = []
    current_section = "Introduction"
    # section_pattern = re.compile(r"(?<=\n|^)(\d{1,2})\.\s+([^\n]+)")
    section_pattern = re.compile(r"^\s*(\d{1,2})\.\s+(.+)")

    for doc in docs:
        page_num = doc.metadata.get("page", -1)
        page_text = doc.page_content

        lines = page_text.split("\n")
        buffer = ""

        for line in lines:
            match = section_pattern.match(line.strip())
            if match:
                if buffer.strip():
                    cleaned = clean_text(buffer)
                    section_chunks.append(Document(
                        page_content=cleaned,
                        metadata={
                            **CUSTOM_METADATA,
                            "page": page_num,
                            "section": current_section
                        }
                    ))
                current_section = f"{match.group(1)}. {match.group(2).strip()}"
                buffer = ""
            else:
                buffer += line + "\n"

        if buffer.strip():
            cleaned = clean_text(buffer)
            section_chunks.append(Document(
                page_content=cleaned,
                metadata={
                    **CUSTOM_METADATA,
                    "page": page_num,
                    "section": current_section
                }
            ))

    return section_chunks


In [42]:
sectional_chucks = extract_section_chunks_cleaned(pymupdf_docs)
print(f"Final cleaned + section-tagged chunk count: {len(sectional_chucks)}")

Final cleaned + section-tagged chunk count: 34


In [43]:
# Preview first few chunks
for i, doc in enumerate(sectional_chucks[:40]):
    print(f"\n--- Chunk {i+1} ---")
    print(f"Metadata: {doc.metadata}")
    print(f"Content:\n{doc.page_content}...")
    print(f"Length:{len(doc.page_content)}")



--- Chunk 1 ---
Metadata: {'policy_name': 'Money Back Plans', 'product_name': 'LIC’s Bima Shree', 'plan_no': '748', 'uin_no': '512N316V03', 'page': 0, 'section': 'Introduction'}
Content:
LIC’s Bima Shree plan offers a combination of protection and savings. This plan is specially designed for High Net-worth Individuals. This plan provides financial support for the family in case of unfortunate death of the policyholders during the policy term. Periodic payments shall also be made on survival of the policyholder at specified durations during the policy term and a lump sum payment to the surviving policyholder at the time of maturity. This Plan can be purchased Offline through Licensed agents, Corporate agents, Brokers and Insurance Marketing Firms. Key Features: • The plan provides for protection and savings. • Limited premium payment. • Flexibility to - Choose the premium payment frequency as per convenience. - Choose the period for which protection is required – 14, 16, 18, 20, 24 and

In [44]:
def split_by_char_length(text, max_len=700, overlap=100):
    """
    Splits a long text into overlapping chunks by character count.
    """
    chunks = []
    start = 0
    while start < len(text):
        end = min(start + max_len, len(text))
        chunks.append(text[start:end].strip())
        start += max_len - overlap
    return chunks


In [45]:
final_chunks = []

for doc in sectional_chucks:
    text = doc.page_content
    if len(text) <= 700:
        final_chunks.append(doc)
    else:
        split_chunks = split_by_char_length(text, max_len=700, overlap=100)

        for i, chunk in enumerate(split_chunks):
            final_chunks.append(Document(
                page_content=chunk,
                metadata={**doc.metadata, "chunk_part": i + 1}
            ))

print(f"Final character-based chunk count: {len(final_chunks)}")


Final character-based chunk count: 98


In [46]:
for i, doc in enumerate(final_chunks[:50]):
    print(f"\n--- Chunk {i+1} ---")
    print(f"Length: {len(doc.page_content)} chars")
    print(f"Metadata: {doc.metadata}")
    print(f"Content:\n{doc.page_content}...")



--- Chunk 1 ---
Length: 700 chars
Metadata: {'policy_name': 'Money Back Plans', 'product_name': 'LIC’s Bima Shree', 'plan_no': '748', 'uin_no': '512N316V03', 'page': 0, 'section': 'Introduction', 'chunk_part': 1}
Content:
LIC’s Bima Shree plan offers a combination of protection and savings. This plan is specially designed for High Net-worth Individuals. This plan provides financial support for the family in case of unfortunate death of the policyholders during the policy term. Periodic payments shall also be made on survival of the policyholder at specified durations during the policy term and a lump sum payment to the surviving policyholder at the time of maturity. This Plan can be purchased Offline through Licensed agents, Corporate agents, Brokers and Insurance Marketing Firms. Key Features: • The plan provides for protection and savings. • Limited premium payment. • Flexibility to - Choose the premium pay...

--- Chunk 2 ---
Length: 700 chars
Metadata: {'policy_name': 'Money Back 

In [48]:
import json

output_path = "../output/bima_shree_chunks.jsonl"

with open(output_path, "w", encoding="utf-8") as f:
    for doc in final_chunks:
        json_line = {
            "page_content": doc.page_content,
            "metadata": doc.metadata
        }
        f.write(json.dumps(json_line, ensure_ascii=False) + "\n")

print(f"Saved {len(final_chunks)} chunks to '{output_path}'")


Saved 98 chunks to '../output/bima_shree_chunks.jsonl'
