In [1]:
! pip install pdf2image pillow python-docx ollama


Collecting python-docx
  Using cached python_docx-1.2.0-py3-none-any.whl.metadata (2.0 kB)
Using cached python_docx-1.2.0-py3-none-any.whl (252 kB)
Installing collected packages: python-docx
Successfully installed python-docx-1.2.0



[notice] A new release of pip is available: 25.1.1 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip


In [12]:
from docx import Document
from docx.shared import Pt, RGBColor, Inches
from docx.enum.text import WD_ALIGN_PARAGRAPH

def create_stylish_bank_template(path: str = "stylish_bank_template.docx"):
    doc = Document()

    # --- Title ---
    title = doc.add_paragraph("Federal Bank - Account Opening Form")
    title.alignment = WD_ALIGN_PARAGRAPH.CENTER
    title.runs[0].font.size = Pt(18)
    title.runs[0].font.bold = True
    title.runs[0].font.color.rgb = RGBColor(0, 51, 153)  # Navy blue

    doc.add_paragraph("")  # blank line for spacing

    # --- Subtitle ---
    subtitle = doc.add_paragraph("Please provide your personal details accurately. All fields are mandatory.")
    subtitle.alignment = WD_ALIGN_PARAGRAPH.CENTER
    subtitle.runs[0].font.size = Pt(10)
    subtitle.runs[0].font.color.rgb = RGBColor(80, 80, 80)

    doc.add_paragraph("")  # blank line for spacing

    # --- Add Table for Information ---
    table = doc.add_table(rows=5, cols=2)
    table.style = "Table Grid"
    table.autofit = True

    fields = [
        ("Full Name", "{{name}}"),
        ("Age", "{{age}}"),
        ("Address", "{{address}}"),
        ("Account Number", "{{account_number}}"),
        ("IFSC Code", "{{ifsc_code}}")
    ]

    for i, (label, placeholder) in enumerate(fields):
        cell_label = table.cell(i, 0)
        cell_label.text = label
        for run in cell_label.paragraphs[0].runs:
            run.font.bold = True
            run.font.size = Pt(11)
            run.font.color.rgb = RGBColor(0, 0, 0)
        cell_value = table.cell(i, 1)
        cell_value.text = placeholder
        for run in cell_value.paragraphs[0].runs:
            run.font.size = Pt(11)
            run.font.color.rgb = RGBColor(50, 50, 50)

    doc.add_paragraph("")  # spacing

    # --- Declaration ---
    declaration = doc.add_paragraph(
        "I hereby declare that the above information provided is true and correct to the best of my knowledge."
    )
    declaration.alignment = WD_ALIGN_PARAGRAPH.JUSTIFY
    declaration.runs[0].font.size = Pt(11)

    doc.add_paragraph("")  # spacing

    # --- Signature Section ---
    signature_table = doc.add_table(rows=1, cols=2)
    signature_table.style = "Table Grid"
    signature_table.autofit = True
    signature_table.cell(0, 0).text = "Signature of Applicant: ___________________________"
    signature_table.cell(0, 1).text = "Date: ___________________"

    # --- Footer ---
    doc.add_paragraph("")  # blank line
    footer = doc.add_paragraph("Federal Bank Ltd ¬© 2025 | Confidential Document")
    footer.alignment = WD_ALIGN_PARAGRAPH.CENTER
    for run in footer.runs:
        run.font.size = Pt(9)
        run.font.color.rgb = RGBColor(120, 120, 120)

    doc.save(path)
    print(f"[üíé] Stylish bank form template created successfully ‚Üí {path}")

create_stylish_bank_template()
 

 Stylish bank form template created successfully ‚Üí stylish_bank_template.docx


passport template 

In [13]:
from docx import Document
from docx.shared import Pt, RGBColor
from docx.enum.text import WD_ALIGN_PARAGRAPH

def create_passport_application_template(path: str = "passport_application_form.docx"):
    doc = Document()

    # --- Title ---
    title = doc.add_paragraph("Government of India")
    title.alignment = WD_ALIGN_PARAGRAPH.CENTER
    title.runs[0].font.size = Pt(16)
    title.runs[0].font.bold = True
    title.runs[0].font.color.rgb = RGBColor(0, 51, 153)

    subtitle = doc.add_paragraph("Ministry of External Affairs - Passport Seva")
    subtitle.alignment = WD_ALIGN_PARAGRAPH.CENTER
    subtitle.runs[0].font.size = Pt(12)
    subtitle.runs[0].font.bold = True
    subtitle.runs[0].font.color.rgb = RGBColor(0, 102, 204)

    doc.add_paragraph("")
    header = doc.add_paragraph("PASSPORT APPLICATION FORM")
    header.alignment = WD_ALIGN_PARAGRAPH.CENTER
    header.runs[0].font.size = Pt(14)
    header.runs[0].font.bold = True
    header.runs[0].font.color.rgb = RGBColor(0, 0, 0)

    doc.add_paragraph("")  # spacing

    # --- Section 1: Applicant Information ---
    section1 = doc.add_paragraph("Section 1: Applicant Details")
    section1.runs[0].font.size = Pt(12)
    section1.runs[0].font.bold = True
    section1.runs[0].font.color.rgb = RGBColor(0, 51, 102)

    table1 = doc.add_table(rows=10, cols=2)
    table1.style = "Table Grid"
    fields1 = [
        ("Full Name", "{{full_name}}"),
        ("Date of Birth (DD/MM/YYYY)", "{{dob}}"),
        ("Place of Birth (City, State, Country)", "{{birth_place}}"),
        ("Gender", "{{gender}}"),
        ("Marital Status", "{{marital_status}}"),
        ("Citizenship/Nationality", "{{nationality}}"),
        ("PAN Card No.", "{{pan_number}}"),
        ("Aadhaar No.", "{{aadhaar_number}}"),
        ("Email ID", "{{email}}"),
        ("Mobile Number", "{{mobile}}"),
    ]

    for i, (label, placeholder) in enumerate(fields1):
        table1.cell(i, 0).text = label
        table1.cell(i, 1).text = placeholder
        for cell in table1.row_cells(i):
            for paragraph in cell.paragraphs:
                for run in paragraph.runs:
                    run.font.size = Pt(11)

    doc.add_paragraph("")

    # --- Section 2: Address Information ---
    section2 = doc.add_paragraph("Section 2: Address Details")
    section2.runs[0].font.size = Pt(12)
    section2.runs[0].font.bold = True
    section2.runs[0].font.color.rgb = RGBColor(0, 51, 102)

    table2 = doc.add_table(rows=5, cols=2)
    table2.style = "Table Grid"
    fields2 = [
        ("Residential Address", "{{res_address}}"),
        ("City", "{{city}}"),
        ("State", "{{state}}"),
        ("Pincode", "{{pincode}}"),
        ("Police Station", "{{police_station}}")
    ]

    for i, (label, placeholder) in enumerate(fields2):
        table2.cell(i, 0).text = label
        table2.cell(i, 1).text = placeholder
        for cell in table2.row_cells(i):
            for paragraph in cell.paragraphs:
                for run in paragraph.runs:
                    run.font.size = Pt(11)

    doc.add_paragraph("")

    # --- Section 3: Family Details ---
    section3 = doc.add_paragraph("Section 3: Family Information")
    section3.runs[0].font.size = Pt(12)
    section3.runs[0].font.bold = True
    section3.runs[0].font.color.rgb = RGBColor(0, 51, 102)

    table3 = doc.add_table(rows=4, cols=2)
    table3.style = "Table Grid"
    fields3 = [
        ("Father's Name", "{{father_name}}"),
        ("Mother's Name", "{{mother_name}}"),
        ("Spouse's Name (if applicable)", "{{spouse_name}}"),
        ("Guardian's Name (if minor)", "{{guardian_name}}"),
    ]

    for i, (label, placeholder) in enumerate(fields3):
        table3.cell(i, 0).text = label
        table3.cell(i, 1).text = placeholder
        for cell in table3.row_cells(i):
            for paragraph in cell.paragraphs:
                for run in paragraph.runs:
                    run.font.size = Pt(11)

    doc.add_paragraph("")

    # --- Section 4: Passport Details ---
    section4 = doc.add_paragraph("Section 4: Passport Information")
    section4.runs[0].font.size = Pt(12)
    section4.runs[0].font.bold = True
    section4.runs[0].font.color.rgb = RGBColor(0, 51, 102)

    table4 = doc.add_table(rows=4, cols=2)
    table4.style = "Table Grid"
    fields4 = [
        ("Type of Application (Fresh/Reissue)", "{{application_type}}"),
        ("Passport Booklet Type (36/60 pages)", "{{booklet_type}}"),
        ("If Reissue, specify reason", "{{reissue_reason}}"),
        ("Previous Passport No.", "{{previous_passport_no}}"),
    ]

    for i, (label, placeholder) in enumerate(fields4):
        table4.cell(i, 0).text = label
        table4.cell(i, 1).text = placeholder
        for cell in table4.row_cells(i):
            for paragraph in cell.paragraphs:
                for run in paragraph.runs:
                    run.font.size = Pt(11)

    doc.add_paragraph("")

    # --- Section 5: Emergency Contact ---
    section5 = doc.add_paragraph("Section 5: Emergency Contact Details")
    section5.runs[0].font.size = Pt(12)
    section5.runs[0].font.bold = True
    section5.runs[0].font.color.rgb = RGBColor(0, 51, 102)

    table5 = doc.add_table(rows=3, cols=2)
    table5.style = "Table Grid"
    fields5 = [
        ("Contact Name", "{{emergency_contact_name}}"),
        ("Contact Number", "{{emergency_contact_number}}"),
        ("Relationship", "{{emergency_contact_relation}}"),
    ]

    for i, (label, placeholder) in enumerate(fields5):
        table5.cell(i, 0).text = label
        table5.cell(i, 1).text = placeholder
        for cell in table5.row_cells(i):
            for paragraph in cell.paragraphs:
                for run in paragraph.runs:
                    run.font.size = Pt(11)

    doc.add_paragraph("")

    # --- Declaration Section ---
    declaration = doc.add_paragraph(
        "I hereby declare that all the information given above is true and correct to the best of my knowledge and belief. "
        "I am aware that providing false information is a punishable offence under the Passport Act."
    )
    declaration.alignment = WD_ALIGN_PARAGRAPH.JUSTIFY
    declaration.runs[0].font.size = Pt(11)

    doc.add_paragraph("")
    doc.add_paragraph("Place: ___________________________")
    doc.add_paragraph("Date: ____________________________")

    doc.add_paragraph("")
    signature = doc.add_paragraph("Signature/Thumb Impression of Applicant: ___________________________")
    signature.alignment = WD_ALIGN_PARAGRAPH.LEFT

    # --- Footer ---
    doc.add_paragraph("")
    footer = doc.add_paragraph("Government of India ¬© 2025 | Passport Application Form | Confidential Document")
    footer.alignment = WD_ALIGN_PARAGRAPH.CENTER
    for run in footer.runs:
        run.font.size = Pt(9)
        run.font.color.rgb = RGBColor(120, 120, 120)

    doc.save(path)
    print(f" Passport Application Form created successfully ‚Üí {path}")


# Generate the passport form
create_passport_application_template()


 Passport Application Form created successfully ‚Üí passport_application_form.docx


corrected code 

In [14]:
import ollama
import json
import re
from pdf2image import convert_from_path
from io import BytesIO
from pathlib import Path
import time


def pdf_to_images_bytes(pdf_path):
    """Convert all PDF pages to JPEG image bytes."""
    pages = convert_from_path(pdf_path, dpi=150)
    image_bytes_list = []
    for i, page in enumerate(pages):
        buf = BytesIO()
        page.save(buf, format="JPEG")
        image_bytes_list.append(buf.getvalue())
    return image_bytes_list


def extract_fields_from_pdf(pdf_path: str, model_name="qwen2.5vl:7b"):
    print(f"\n Processing PDF: {pdf_path}")
    start_time = time.time()

    # Convert PDF to image bytes
    image_bytes_list = pdf_to_images_bytes(pdf_path)

    # LLM OCR prompt
    prompt = """
    You are a senior OCR document understanding model.
    Analyze the uploaded passport or visa page image(s).

    Rules:
    - Keep both Arabic and English text exactly as visible.
    - If no Arabic words are present, just keep English text as is.
    - Do NOT translate.
    - If a label (key) exists in both Arabic and English, combine them using " / ".
    - If a value exists in both Arabic and English, also combine them using " / ".
    - Output must be valid JSON only, no explanations.
    - Don't skip any visible fields ‚Äî include all possible fields.

    Example format:
    {
      "ID Number / ÿ±ŸÇŸÖ ÿßŸÑŸáŸàŸäÿ©": "784199787632597",
      "File No / ÿ±ŸÇŸÖ ÿßŸÑŸÖŸÑŸÅ": "201/2023/7/663922",
      "Passport No / ÿ±ŸÇŸÖ ÿßŸÑÿ¨Ÿàÿßÿ≤": "VT1337002",
      "Name / ÿßŸÑÿßÿ≥ŸÖ": "MUHAMMAD AMIR IQBAL / ŸÖÿ≠ŸÖÿØ ÿßŸÖŸäÿ± ÿßŸÇÿ®ÿßŸÑ",
      "Profession / ÿßŸÑŸÖŸáŸÜÿ©": "PARTNER / ÿ¥ÿ±ŸäŸÉ",
      "Employer / ÿµÿßÿ≠ÿ® ÿßŸÑÿπŸÖŸÑ": "H S P INTERNATIONAL FOODSTUFF TRADING L.L.C / ÿßÿ™ÿ¥ ÿßÿ≥ ÿ®Ÿä ÿßŸÜÿ™ÿ±ŸÜÿßÿ¥ŸàŸÜÿßŸÑ ŸÑÿ™ÿ¨ÿßÿ±ÿ© ÿßŸÑŸÖŸàÿßÿØ ÿßŸÑÿ∫ÿ∞ÿßÿ¶Ÿäÿ© ÿ¥ ÿ∞ ŸÖ ŸÖ",
      "Place of Issue / ÿ¨Ÿáÿ© ÿßŸÑÿ•ÿµÿØÿßÿ±": "ÿØÿ®Ÿä",
      "Issue Date / ÿ™ÿßÿ±ŸäÿÆ ÿ•ÿµÿØÿßÿ± ÿßŸÑÿ•ŸÇÿßŸÖÿ©": "15/11/2023",
      "Expiry Date / ÿ™ÿßÿ±ŸäÿÆ ÿ•ŸÜÿ™Ÿáÿßÿ° ÿßŸÑÿ•ŸÇÿßŸÖÿ©": "14/11/2025",
      "Country / ÿßŸÑÿØŸàŸÑÿ©": "UNITED ARAB EMIRATES / ÿØŸàŸÑÿ© ÿßŸÑÿ•ŸÖÿßÿ±ÿßÿ™ ÿßŸÑÿπÿ±ÿ®Ÿäÿ© ÿßŸÑŸÖÿ™ÿ≠ÿØÿ©",
      "Type / ŸÜŸàÿπ ÿßŸÑÿ•ŸÇÿßŸÖÿ©": "RESIDENCE / ÿ•ŸÇÿßŸÖÿ©"
    }

    Only output JSON.
    """

    print(" Running model inference on all pages together...")
    response = ollama.chat(
        model=model_name,
        messages=[
            {
                "role": "user",
                "content": prompt,
                "images": image_bytes_list
            }
        ],
    )

    # Extract model output
    content = response["message"]["content"]

    # Safely parse JSON if present
    try:
        match = re.search(r"(\{[\s\S]*\})", content)
        data = json.loads(match.group(1)) if match else {"raw_text": content}
    except Exception as e:
        data = {"raw_text": content, "error": str(e)}

    # Prepare output
    output = {"file": str(pdf_path), "extracted_data": data}

    # Save as JSON file
    output_file = Path(pdf_path).with_suffix(".json")
    with open(output_file, "w", encoding="utf-8") as f:
        json.dump(output, f, ensure_ascii=False, indent=2)

    print(f"\n Extraction completed in {time.time() - start_time:.2f} seconds.")
    print(f" Output saved to: {output_file}")

    return output


# ------------------------------
# Example Usage
# ------------------------------
if __name__ == "__main__":
    pdf_path = Path(r"D:\AI Projects\passport_extraction\passport\EID_.pdf")
    extracted_text = extract_fields_from_pdf(pdf_path)
    # Optional: print the JSON content
    print(json.dumps(extracted_text, indent=2, ensure_ascii=False))




 Processing PDF: D:\AI Projects\passport_extraction\passport\EID_.pdf
 Running model inference on all pages together...

 Extraction completed in 121.36 seconds.
 Output saved to: D:\AI Projects\passport_extraction\passport\EID_.json
{
  "file": "D:\\AI Projects\\passport_extraction\\passport\\EID_.pdf",
  "extracted_data": {
    "ID Number / ÿ±ŸÇŸÖ ÿßŸÑŸáŸàŸäÿ©": "784199787632597",
    "File No / ÿ±ŸÇŸÖ ÿßŸÑŸÖŸÑŸÅ": "133079502",
    "Passport No / ÿ±ŸÇŸÖ ÿßŸÑÿ¨Ÿàÿßÿ≤": "133079502",
    "Name / ÿßŸÑÿßÿ≥ŸÖ": "MUHAMMAD AMIR IQBAL / ŸÖÿ≠ŸÖÿØ ÿßŸÖŸäÿ± ÿßŸÇÿ®ÿßŸÑ",
    "Profession / ÿßŸÑŸÖŸáŸÜÿ©": "PARTNER / ÿ¥ÿ±ŸäŸÉ",
    "Employer / ÿµÿßÿ≠ÿ® ÿßŸÑÿπŸÖŸÑ": "H S P INTERNATIONAL FOODSTUFF TRADING L.L.C / ÿßÿ™ÿ¥ ÿßÿ≥ ÿ®Ÿä ÿßŸÜÿ™ÿ±ŸÜÿßÿ¥ŸàŸÜÿßŸÑ ŸÑÿ™ÿ¨ÿßÿ±ÿ© ÿßŸÑŸÖŸàÿßÿØ ÿßŸÑÿ∫ÿ∞ÿßÿ¶Ÿäÿ© ÿ¥ ÿ∞ ŸÖ ŸÖ",
    "Place of Issue / ÿ¨Ÿáÿ© ÿßŸÑÿ•ÿµÿØÿßÿ±": "ÿØÿ®Ÿä",
    "Issue Date / ÿ™ÿßÿ±ŸäÿÆ ÿ•ÿµÿØÿßÿ± ÿßŸÑÿ•ŸÇÿßŸÖÿ©": "15/11/2023",
    "Expiry Date / ÿ™ÿßÿ±ŸäÿÆ ÿ•ŸÜÿ™Ÿáÿßÿ° ÿßŸÑÿ•ŸÇÿßŸÖÿ©": "14/11/2

In [10]:
import json
import ollama
import re

SCHEMA = {
    "name": "",
    "dob": "",
    "nationality": "",
    "address": "",
    "account_number": "",
    "ifsc_code": ""
}


def map_to_defined_schema(extracted_text, schema, model_name="qwen2.5vl:7b"):
    response = ollama.chat(model=model_name, messages=[
        {"role": "user", "content": f"""
You are a data mapper. Given this extracted text:
{extracted_text}

Map it to this schema:
{json.dumps(schema, indent=2)}

Return only valid JSON following the schema field names. 
Do NOT wrap the output in markdown code fences.
"""}
    ])
    mapped = response["message"]["content"]

    #  Remove markdown fences if present
    mapped = mapped.strip()
    mapped = re.sub(r"^```(?:json)?", "", mapped)
    mapped = re.sub(r"```$", "", mapped)
    mapped = mapped.strip()

    try:
        mapped_json = json.loads(mapped)
    except Exception as e:
        print(f" JSON Parse Error: {e}")
        print("Raw Output:\n", mapped)
        mapped_json = {}
    return mapped_json


#  Extracted content only (not file path)
extracted_only = json.dumps(extracted_text["extracted_data"], ensure_ascii=False, indent=2)
mapped_data = map_to_defined_schema(extracted_only, SCHEMA)

print("=== Mapped JSON ===")
print(json.dumps(mapped_data, indent=2, ensure_ascii=False))


=== Mapped JSON ===
{
  "name": "MUHAMMAD AMIR IQBAL / ŸÖÿ≠ŸÖÿØ ÿßŸÖŸäÿ± ÿßŸÇÿ®ÿßŸÑ",
  "dob": "",
  "nationality": "",
  "address": "",
  "account_number": "",
  "ifsc_code": ""
}


In [11]:
from docx import Document
from docx.shared import RGBColor

def fill_template(template_path, output_path, data):
    doc = Document(template_path)

    def replace_text_in_paragraph(paragraph, replacements):
        full_text = "".join(run.text for run in paragraph.runs)
        for key, value in replacements.items():
            placeholder = "{{" + key + "}}"
            if placeholder in full_text:
                full_text = full_text.replace(placeholder, str(value))
        if full_text:
            for run in paragraph.runs:
                run.text = ""  # clear
            new_run = paragraph.add_run(full_text)
            # highlight inserted values in blue for verification
            for key, value in replacements.items():
                if value and value in full_text:
                    new_run.font.color.rgb = RGBColor(0, 0, 255)  # Blue

    def replace_text_in_table(table, replacements):
        for row in table.rows:
            for cell in row.cells:
                for paragraph in cell.paragraphs:
                    replace_text_in_paragraph(paragraph, replacements)

    # Replace in body
    for paragraph in doc.paragraphs:
        replace_text_in_paragraph(paragraph, data)

    # Replace in tables
    for table in doc.tables:
        replace_text_in_table(table, data)

    doc.save(output_path)
    print(f" Filled DOCX saved successfully ‚Üí {output_path}")


# üîπ Run the final fill step
template_path = "stylish_bank_template.docx"
output_path = "filled_output.docx"
fill_template(template_path, output_path, mapped_data)


 Filled DOCX saved successfully ‚Üí filled_output.docx


passport template mapping code 

In [15]:
import json
import re
import ollama
from pydantic import BaseModel, Field

# -------------------------------
#  Step 1: Define Schema (Pydantic)
# -------------------------------
class PassportApplicationSchema(BaseModel):
    full_name: str = Field(default="", description="Full name of applicant")
    dob: str = Field(default="", description="Date of Birth (DD/MM/YYYY)")
    birth_place: str = Field(default="", description="Place of Birth (City, State, Country)")
    gender: str = Field(default="", description="Gender of the applicant")
    marital_status: str = Field(default="", description="Marital status")
    nationality: str = Field(default="", description="Nationality or Citizenship")
    pan_number: str = Field(default="", description="PAN Card Number")
    aadhaar_number: str = Field(default="", description="Aadhaar Number")
    email: str = Field(default="", description="Email ID")
    mobile: str = Field(default="", description="Mobile Number")
    res_address: str = Field(default="", description="Residential Address")
    city: str = Field(default="", description="City")
    state: str = Field(default="", description="State")
    pincode: str = Field(default="", description="Pincode")
    police_station: str = Field(default="", description="Nearest Police Station")
    father_name: str = Field(default="", description="Father‚Äôs Full Name")
    mother_name: str = Field(default="", description="Mother‚Äôs Full Name")
    spouse_name: str = Field(default="", description="Spouse Name (if applicable)")
    guardian_name: str = Field(default="", description="Guardian Name (if minor)")
    application_type: str = Field(default="", description="Type of Application (Fresh/Reissue)")
    booklet_type: str = Field(default="", description="Passport Booklet Type (36/60 pages)")
    reissue_reason: str = Field(default="", description="Reason for Reissue (if applicable)")
    previous_passport_no: str = Field(default="", description="Previous Passport Number")
    emergency_contact_name: str = Field(default="", description="Emergency Contact Name")
    emergency_contact_number: str = Field(default="", description="Emergency Contact Number")
    emergency_contact_relation: str = Field(default="", description="Emergency Contact Relationship")

# -------------------------------
#  Step 2: LLM Mapping Function
# -------------------------------
def map_to_defined_schema(extracted_text: str, schema_model, model_name="qwen2.5vl:7b"):
    """
    Uses an LLM to map extracted OCR text to a structured Pydantic schema.
    """
    schema_json = schema_model.model_json_schema()
    response = ollama.chat(
        model=model_name,
        messages=[
            {
                "role": "user",
                "content": f"""
You are a data mapping expert.

Given this extracted document data:
{extracted_text}

Map it to this structured schema:
{json.dumps(schema_json, indent=2)}

Rules:
- Return ONLY valid JSON.
- The keys must match the schema exactly.
- Fill empty fields with empty string if not found.
- Do not include any explanations or markdown formatting.
"""
            }
        ],
    )

    mapped_output = response["message"]["content"].strip()

    # Clean markdown fences if LLM adds them
    mapped_output = re.sub(r"^```(?:json)?", "", mapped_output)
    mapped_output = re.sub(r"```$", "", mapped_output)
    mapped_output = mapped_output.strip()

    try:
        mapped_json = json.loads(mapped_output)
        # Validate & auto-complete missing fields using Pydantic
        mapped_instance = schema_model(**mapped_json)
    except Exception as e:
        print(f" JSON Parse Error: {e}")
        print("Raw Output:\n", mapped_output)
        mapped_instance = schema_model()

    return mapped_instance


# -------------------------------
# ‚öôÔ∏è Step 3: Run Mapping
# -------------------------------
# Assume `extracted_text` is already loaded (from your extraction step)
extracted_only = json.dumps(extracted_text["extracted_data"], ensure_ascii=False, indent=2)

mapped_instance = map_to_defined_schema(extracted_only, PassportApplicationSchema)
print("===  Mapped & Validated Schema ===")
print(mapped_instance.model_dump_json(indent=2, ensure_ascii=False))


===  Mapped & Validated Schema ===
{
  "full_name": "MUHAMMAD AMIR IQBAL / ŸÖÿ≠ŸÖÿØ ÿßŸÖŸäÿ± ÿßŸÇÿ®ÿßŸÑ",
  "dob": "",
  "birth_place": "",
  "gender": "",
  "marital_status": "",
  "nationality": "",
  "pan_number": "",
  "aadhaar_number": "",
  "email": "",
  "mobile": "",
  "res_address": "",
  "city": "",
  "state": "",
  "pincode": "",
  "police_station": "",
  "father_name": "",
  "mother_name": "",
  "spouse_name": "",
  "guardian_name": "",
  "application_type": "",
  "booklet_type": "",
  "reissue_reason": "",
  "previous_passport_no": "",
  "emergency_contact_name": "",
  "emergency_contact_number": "",
  "emergency_contact_relation": ""
}


In [18]:
from docx import Document
from docx.shared import RGBColor


def fill_template(template_path, output_path, data):
    """
    Fill placeholders {{field}} in a DOCX template with actual data.
    Highlights replaced text in blue for verification.
    """
    #  Ensure we have a dictionary (convert Pydantic model if needed)
    if hasattr(data, "model_dump"):
        data = data.model_dump()

    doc = Document(template_path)

    def replace_text_in_paragraph(paragraph, replacements):
        full_text = "".join(run.text for run in paragraph.runs)
        for key, value in replacements.items():
            placeholder = "{{" + key + "}}"
            if placeholder in full_text:
                full_text = full_text.replace(placeholder, str(value))
        if full_text:
            for run in paragraph.runs:
                run.text = ""  # Clear old text
            new_run = paragraph.add_run(full_text)
            # Highlight inserted values in blue
            for key, value in replacements.items():
                if value and str(value) in full_text:
                    new_run.font.color.rgb = RGBColor(0, 0, 255)

    def replace_text_in_table(table, replacements):
        for row in table.rows:
            for cell in row.cells:
                for paragraph in cell.paragraphs:
                    replace_text_in_paragraph(paragraph, replacements)

    # Replace placeholders in both paragraphs and tables
    for paragraph in doc.paragraphs:
        replace_text_in_paragraph(paragraph, data)

    for table in doc.tables:
        replace_text_in_table(table, data)

    doc.save(output_path)
    print(f" Filled DOCX saved successfully ‚Üí {output_path}")


# ------------------------------
#  Run the final fill step
# ------------------------------
template_path = "passport_application_form.docx"
output_path = "filled_output.docx"
fill_template(template_path, output_path, mapped_instance)


 Filled DOCX saved successfully ‚Üí filled_output.docx
