In [1]:
import os
import re
import json
import base64
from datetime import datetime, timedelta
from typing import List, Dict, Any, Optional

import pandas as pd
from google.auth.transport.requests import Request
from google.oauth2.credentials import Credentials
from google_auth_oauthlib.flow import InstalledAppFlow
from googleapiclient.discovery import build
from googleapiclient.errors import HttpError

In [2]:
SCOPES = ["https://www.googleapis.com/auth/gmail.readonly"]  # read-only for safety
CLIENT_SECRET_PATH = "client_secret.json"   # download from GCP > APIs & Services > Credentials (Desktop App)
TOKEN_PATH = "token.json"                   # saved after first consent (delete to force re-consent)
ATTACH_SAVE_DIR = "gmail_attachments_poc"
os.makedirs(ATTACH_SAVE_DIR, exist_ok=True)

In [3]:
def build_gmail_service() -> Any:
    """On-screen OAuth: opens a browser for consent via run_local_server()."""
    creds = None
    if os.path.exists(TOKEN_PATH):
        creds = Credentials.from_authorized_user_file(TOKEN_PATH, SCOPES)

    if not creds or not creds.valid:
        if creds and creds.expired and creds.refresh_token:
            creds.refresh(Request())
        else:
            if not os.path.exists(CLIENT_SECRET_PATH):
                raise FileNotFoundError(
                    f"Missing {CLIENT_SECRET_PATH}. Create OAuth Client ID (Desktop App) and place the JSON here."
                )
            flow = InstalledAppFlow.from_client_secrets_file(CLIENT_SECRET_PATH, SCOPES)
            # This opens your browser for on-screen consent:
            creds = flow.run_local_server(port=0, prompt='consent')  # prompt='consent' ensures the screen appears
        with open(TOKEN_PATH, "w") as f:
            f.write(creds.to_json())
    return build("gmail", "v1", credentials=creds)
service = build_gmail_service()

In [4]:
# --- Core helpers ---
def list_messages(service, query, max_results=100):
    resp = service.users().messages().list(userId="me", q=query, maxResults=max_results).execute()
    return resp.get("messages", [])

def get_message(service, msg_id):
    return service.users().messages().get(userId="me", id=msg_id, format="full").execute()

def iter_pdf_attachments(msg):
    """Yield (filename, attachmentId) for PDF attachments found in message parts."""
    payload = msg.get("payload", {}) or {}
    stack = [payload]
    while stack:
        part = stack.pop()
        # push nested parts
        for sub in part.get("parts", []) or []:
            stack.append(sub)
        # check current part for attachment
        filename = part.get("filename") or ""
        body = part.get("body") or {}
        if filename and "attachmentId" in body and filename.lower().endswith(".pdf"):
            yield filename, body["attachmentId"]

def download_attachment(service, msg_id, attachment_id, filename):
    att = service.users().messages().attachments().get(
        userId="me", messageId=msg_id, id=attachment_id
    ).execute()
    data = att.get("data")
    if not data:
        return None
    file_bytes = base64.urlsafe_b64decode(data.encode("utf-8"))
    safe = re.sub(r'[\\/:*?"<>|]+', "_", filename) or f"{msg_id}.pdf"
    path = os.path.join(ATTACH_SAVE_DIR, safe)
    with open(path, "wb") as f:
        f.write(file_bytes)
    return path

In [5]:
from datetime import datetime, timezone
import os, re, json, base64, calendar


now = datetime.now(timezone.utc)
year = now.year
month = now.month
# compute previous month
prev_year = year if month > 1 else year - 1
prev_month = month - 1 if month > 1 else 12
start_day = 1
end_day = calendar.monthrange(prev_year, prev_month)[1]
# Gmail expects YYYY/MM/DD
after_str  = f"{prev_year}/{prev_month:02d}/{start_day:02d}"
before_str = f"{prev_year}/{prev_month:02d}/{end_day:02d}"

In [6]:
query = (
    f"subject:\"HSBC GOLD VISA e-Statement\" after:{after_str} before:{before_str}"
)

In [7]:
# --- Run: fetch PDFs for last month only ---
service = build_gmail_service()
msgs = list_messages(service, query, max_results=100)

saved = []
for m in msgs:
    full = get_message(service, m["id"])
    for fname, att_id in iter_pdf_attachments(full):
        out = download_attachment(service, m["id"], att_id, fname)
        if out:
            saved.append(out)

print(f"Search query:\n  {query}")
print(f"Matched emails: {len(msgs)}")
print("Saved files:")
for p in saved:
    print(" -", os.path.abspath(p))
print(f"\nDownload folder: {os.path.abspath(ATTACH_SAVE_DIR)}")

Search query:
  subject:"HSBC GOLD VISA e-Statement" after:2025/10/01 before:2025/10/31
Matched emails: 1
Saved files:
 - /root/dev/ledgerx/ledgerx-api/notebooks/gmail_attachments_poc/20251012.pdf

Download folder: /root/dev/ledgerx/ledgerx-api/notebooks/gmail_attachments_poc


In [8]:
from getpass import getpass
from pathlib import Path
import shutil
import sys

import pikepdf
import fitz  # PyMuPDF
import ocrmypdf

def has_selectable_text(pdf_path: str, password: str | None = None, sample_pages: int = 3) -> bool:
    """Return True if at least one of the first N pages has selectable text."""
    doc = fitz.open(pdf_path)  # just open
    if password:
        ok = doc.authenticate(password)
        if not ok:
            raise ValueError("Password incorrect")

    n = min(sample_pages, len(doc))
    for i in range(n):
        txt = doc.load_page(i).get_text().strip()
        if txt:
            return True
    return False

def decrypt_pdf(enc_pdf: str, out_pdf: str, password: str) -> None:
    """Fully decrypt (remove encryption) to a new PDF."""
    with pikepdf.open(enc_pdf, password=password) as pdf:
        pdf.save(out_pdf)

def ocr_to_searchable_pdf(input_pdf: str, output_pdf: str, lang: str = "eng") -> None:
    """
    Run OCR with ocrmypdf; keeps images, adds invisible text layer.
    """
    ocrmypdf.ocr(
        input_file=input_pdf,
        output_file=output_pdf,
        language=lang,
        deskew=True,
        rotate_pages=True,
        progress_bar=False,   # set True in notebooks if you want
        pdfa_image_compression="lossless",
    )

def process_encrypted_pdf(
    encrypted_pdf: str,
    output_pdf: str,
    lang: str = "eng",
    keep_plain_copy: bool = False,
) -> str:
    """
    1) Ask for password
    2) If PDF has text, just decrypt and save (or copy as-is)
    3) If no text, decrypt then OCR to searchable pdf
    Returns the path of the produced file.
    """
    enc_path = Path(encrypted_pdf)
    if not enc_path.exists():
        raise FileNotFoundError(f"Input not found: {enc_path}")

    password = getpass("Enter PDF password: ")

    # Quick text check without saving a decrypted copy yet
    text_exists = has_selectable_text(str(enc_path), password=password)

    tmp_decrypted = enc_path.with_suffix(".decrypted.tmp.pdf")
    decrypt_pdf(str(enc_path), str(tmp_decrypted), password=password)

    out_path = Path(output_pdf)

    if text_exists:
        # Already has text -> you might just want a decrypted copy
        if keep_plain_copy:
            shutil.move(str(tmp_decrypted), str(out_path))
            print(f"[OK] Decrypted (no OCR needed): {out_path}")
            return str(out_path)
        else:
            # If you still want to normalize to PDF/A (optional), you can feed it through ocrmypdf with --skip-text
            ocrmypdf.ocr(
                input_file=str(tmp_decrypted),
                output_file=str(out_path),
                language=lang,
                skip_text=True,        # do not OCR existing text
                progress_bar=False
            )
            tmp_decrypted.unlink(missing_ok=True)
            print(f"[OK] Decrypted + normalized (skip_text): {out_path}")
            return str(out_path)
    else:
        # No selectable text -> OCR it
        ocr_to_searchable_pdf(str(tmp_decrypted), str(out_path), lang=lang)
        tmp_decrypted.unlink(missing_ok=True)
        print(f"[OK] OCR complete: {out_path}")
        return str(out_path)

# ===== Run it =====
# Example usage:
# - encrypted input: "input_encrypted.pdf"
# - final output:    "output_searchable.pdf"
# - language packs:  "eng" or "eng+deu" etc.
try:
    produced = process_encrypted_pdf(
        encrypted_pdf="./gmail_attachments_poc/20250914.pdf",
        output_pdf="output_searchable.pdf",
        lang="eng",           # change to "eng+fil" etc. if you installed those packs
        keep_plain_copy=False # True = just decrypt if text already exists
    )
except Exception as e:
    print("Error:", e, file=sys.stderr)
    raise


Error: document closed or encrypted


ValueError: document closed or encrypted

In [None]:
from __future__ import annotations
import re, os, tempfile
from pathlib import Path
from typing import Optional, List, Tuple, Dict
from dateutil import parser as dtparser

import pikepdf
import fitz  # PyMuPDF
from pdf2image import convert_from_path
import pytesseract

# -------------------------------
# 1) Helpers: decrypt + text extraction
# -------------------------------
def decrypt_to_temp(encrypted_pdf: str, password: str) -> str:
    """Decrypts to a temporary PDF path and returns its filename."""
    tmp = Path(tempfile.gettempdir()) / (Path(encrypted_pdf).stem + ".decrypted.tmp.pdf")
    with pikepdf.open(encrypted_pdf, password=password) as pdf:
        pdf.save(str(tmp))
    return str(tmp)

def pdf_text_lines(pdf_path: str, max_pages: Optional[int] = None) -> List[str]:
    """Extract text lines with PyMuPDF (selectable text)."""
    lines: List[str] = []
    with fitz.open(pdf_path) as doc:
        n = len(doc) if max_pages is None else min(max_pages, len(doc))
        for i in range(n):
            page = doc.load_page(i)
            # Use 'text' (layout-agnostic) or 'blocks' if you need coordinates
            txt = page.get_text("text")
            if txt:
                lines.extend(s for s in txt.splitlines() if s.strip())
    return lines

def ocr_text_lines(pdf_path: str, dpi: int = 300, lang: str = "eng", max_pages: Optional[int] = None) -> List[str]:
    """OCR each page (no Ghostscript needed). Requires poppler + Tesseract installed."""
    images = convert_from_path(pdf_path, dpi=dpi)
    if max_pages is not None:
        images = images[:max_pages]
    lines: List[str] = []
    for idx, img in enumerate(images, 1):
        txt = pytesseract.image_to_string(img, lang=lang)
        lines.extend(s for s in txt.splitlines() if s.strip())
    return lines

def get_text_lines_smart(encrypted_pdf: str, password: str, lang: str = "eng") -> Tuple[List[str], str]:
    """Decrypt, try PyMuPDF; if no text found, fall back to OCR."""
    dec_path = decrypt_to_temp(encrypted_pdf, password)
    lines = pdf_text_lines(dec_path)
    if not any(lines):
        # Fallback to OCR (can be slow on big PDFs; adjust dpi/lang as needed)
        lines = ocr_text_lines(dec_path, dpi=300, lang=lang)
    return lines, dec_path

# -------------------------------
# 2) Parsing logic (Due Date & Amount)
# -------------------------------
DATE_KEYWORDS = [
    r"due\s*date", r"payment\s*due", r"pay\s*by", r"statement\s*due", r"bill\s*due",
    r"payment\s*deadline", r"date\s*due"
]
AMOUNT_KEYWORDS_PRIMARY = [
    r"total\s+amount\s+due", r"amount\s+due", r"total\s+due", r"statement\s+balance",
    r"outstanding\s+balance", r"current\s+balance"
]
AMOUNT_KEYWORDS_AVOID = [
    r"minimum\s+amount\s+due", r"minimum\s+due"
]

CURRENCY_SYMS = r"(?:₱|\bPHP\b|(?<!\S)Php)"
AMOUNT_NUM = r"(?:\d{1,3}(?:,\d{3})+|\d+)(?:\.\d{2})?"
AMOUNT_RX = re.compile(rf"{CURRENCY_SYMS}?\s*({AMOUNT_NUM})", re.IGNORECASE)

def normalize_amount(s: str) -> Optional[float]:
    m = AMOUNT_RX.search(s)
    if not m:
        return None
    num = m.group(1).replace(",", "")
    try:
        return float(num)
    except ValueError:
        return None

def parse_date_any(s: str) -> Optional[str]:
    """Return ISO date (YYYY-MM-DD) if parsable."""
    s = s.strip()
    # Try several liberal parses (handles 'Oct 15, 2025', '15 Oct 2025', '10/15/2025')
    for dayfirst in (False, True):
        try:
            dt = dtparser.parse(s, dayfirst=dayfirst, fuzzy=True, yearfirst=False)
            return dt.date().isoformat()
        except Exception:
            continue
    return None

def find_nearby(lines: List[str], idx: int, window: int = 2) -> List[str]:
    L = max(0, idx - window)
    R = min(len(lines), idx + window + 1)
    return lines[L:R]

def match_any(line: str, patterns: List[str]) -> Optional[re.Match]:
    for pat in patterns:
        m = re.search(pat, line, flags=re.IGNORECASE)
        if m:
            return m
    return None

def extract_due_and_amount(lines: List[str]) -> Dict[str, Optional[str]]:
    """
    Strategy:
      - For due date: look for date keywords; parse date on the same line or the next 2 lines.
      - For amount: prioritize PRIMARY keywords; avoid 'Minimum Amount Due' if a better one exists.
      - If conflicting amounts exist, prefer the one nearest to a primary keyword.
    """
    due_date_iso: Optional[str] = None
    amount_value: Optional[float] = None
    amount_source: Optional[str] = None

    # Pass 1: Due Date
    for i, line in enumerate(lines):
        if match_any(line, DATE_KEYWORDS):
            # Try same line first
            dd = parse_date_any(line)
            if dd:
                due_date_iso = dd
                break
            # Try next lines (within small window)
            for ctx in find_nearby(lines, i, window=2):
                if ctx == line:
                    continue
                dd = parse_date_any(ctx)
                if dd:
                    due_date_iso = dd
                    break
            if due_date_iso:
                break

    # Pass 2: Amounts - collect candidates with simple scoring
    candidates: List[Tuple[float, int, str]] = []  # (amount, score, context)
    for i, line in enumerate(lines):
        # Skip minimum due if possible
        if match_any(line, AMOUNT_KEYWORDS_AVOID):
            am = normalize_amount(line)
            if am is not None:
                # Lower score for minimum due
                candidates.append((am, 1, line))
            continue

        pri_hit = match_any(line, AMOUNT_KEYWORDS_PRIMARY)
        am = normalize_amount(line)
        if am is not None:
            score = 3 if pri_hit else 2
            # small bonus if currency symbol present
            if re.search(CURRENCY_SYMS, line, flags=re.IGNORECASE):
                score += 1
            candidates.append((am, score, line))

        # Look-ahead: amount on next line after a primary keyword
        if pri_hit and i + 1 < len(lines):
            am2 = normalize_amount(lines[i + 1])
            if am2 is not None:
                candidates.append((am2, 4, lines[i] + " | " + lines[i + 1]))

    if candidates:
        # Pick highest score; if tie, pick the largest amount (credit card/utility “Total Due” is usually max)
        candidates.sort(key=lambda t: (t[1], t[0]), reverse=True)
        amount_value, _, amount_source = candidates[0]

    return {
        "due_date": due_date_iso,                          # e.g., "2025-10-15"
        "amount": f"{amount_value:.2f}" if amount_value is not None else None,  # stringified amount
        "amount_context": amount_source                    # the line where we found it (useful for debugging)
    }

# -------------------------------
# 3) One-call entry point
# -------------------------------
def extract_bill_fields(encrypted_pdf: str, password: str, lang: str = "eng") -> Dict[str, Optional[str]]:
    lines, dec_path = get_text_lines_smart(encrypted_pdf, password, lang=lang)
    out = extract_due_and_amount(lines)
    # Clean up decrypted temp file
    print(lines)
    try:
        os.remove(dec_path)
    except Exception:
        pass
    return out

# -------------------------------
# Example usage
# -------------------------------
#result = extract_bill_fields("./gmail_attachments_poc/20250914.pdf", password="20Oct1997814614", lang="eng")
result = extract_bill_fields("./gmail_attachments_poc/BPI Rewards - September 2025.pdf", password="20Oct1997814614", lang="eng")

print(result)
# -> {'due_date': '2025-10-17', 'amount': '12345.67', 'amount_context': 'Total Amount Due  ₱12,345.67'}


['ñðòðñùù÷', 'öóùóøöøóð÷÷ó', 'ÑÖâÈäÁÙÖãÕÁÃ|ÇÔÁÉÓKÃÖÔ', 'ÑÖâÈäÁ@é@ÃÁÕãÖÙ', '\\', 'Â\x93\x92@ò@Ó\x96£@ñô@×', '£', '\x99@â£\x99', '£', 'Ã\x89', '\x93\x89£\x96@È\x96\x94', '¢@Â\x99\x87¨@ñ÷÷', 'ñôòò@Ã\x81\x93\x96\x96\x83\x81\x95@Ã\x89£¨', '×\x99', '\x97\x81\x99', '\x84@\x86\x96\x99', 'Ù', '\x86', '\x99', '\x95\x83', '@Õ\x96K@ð÷ñÁ÷ùõö`ô', 'ÃäâãÖÔÅÙ@ÕäÔÂÅÙ', 'âãÁãÅÔÅÕã@ÄÁãÅ', '×ÁèÔÅÕã@ÄäÅ@ÄÁãÅ', 'ÃÙÅÄÉã@ÓÉÔÉã', 'ãÖãÁÓ@ÁÔÖäÕã@ÄäÅ', 'ÔÉÕÉÔäÔ@ÁÔÖäÕã@ÄäÅ', 'ðòðñðð`ô`ñð`÷ùõöð÷ñ', 'ÁäÇäâã@òøk@òðòõ', 'âÅ×ãÅÔÂÅÙ@ñ÷k@òðòõ', 'óñôkðððKðð', 'ó÷kòöõKóõ', 'øõðKðð', '×\x99', '¥\x89\x96¤¢', 'M`]@×\x81¨\x94', '\x95£¢@a', 'MN]@×¤\x99\x83\x88\x81¢', '¢', 'MN]@É\x95¢£\x81\x93\x93\x94', '\x95£', 'MN]@Æ\x89\x95\x81\x95\x83', 'MN]@Ó\x81£', '@×\x81¨\x94', '\x95£', 'Ã\x81\x99\x84@ã¨\x97', 'Â\x81\x93\x81\x95\x83', 'Ã\x99', '\x84\x89£¢@\x81\x95\x84', '\x81\x95\x84@Á\x84¥\x81\x95\x83', '¢', 'Ä¤', 'Ã\x88\x81\x99\x87', '¢@\x81\x95\x84', 'Ã\x88\x81\x99\x87', '¢', 'Á\x94\x96¤\x95£@Ä¤', 'Ù', '\x82\x81£', '¢', 'Ö£\x88', '\x99

  pdf = Pdf._open(


In [None]:
from __future__ import annotations
from pathlib import Path
import fitz  # PyMuPDF
import cv2, numpy as np, pytesseract, re

LANG = "eng+fil"            # add/remove as needed
PSMS = [6, 4, 12, 11]       # try in this order
OEM = 1                     # LSTM only
DPI = 600                   # bump for tiny fonts

def render_page(page, dpi=DPI):
    z = dpi/72.0
    mat = fitz.Matrix(z, z)
    pix = page.get_pixmap(matrix=mat, alpha=False)
    img = np.frombuffer(pix.samples, dtype=np.uint8).reshape(pix.height, pix.width, pix.n)[:, :, :3]
    return cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)

def deskew(gray):
    # binarize roughly to find text for angle estimation
    th = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY+cv2.THRESH_OTSU)[1]
    coords = np.column_stack(np.where(th == 0))
    if len(coords) < 100: return gray
    rect = cv2.minAreaRect(coords)
    angle = rect[-1]
    angle = -(90 + angle) if angle < -45 else -angle
    M = cv2.getRotationMatrix2D((gray.shape[1]//2, gray.shape[0]//2), angle, 1.0)
    return cv2.warpAffine(gray, M, (gray.shape[1], gray.shape[0]),
                          flags=cv2.INTER_CUBIC, borderMode=cv2.BORDER_REPLICATE)

def sauvola(gray, win=31, k=0.2):
    # approximate Sauvola (use skimage if available); fallback to adaptive Gaussian
    try:
        from skimage.filters import threshold_sauvola
        thresh_s = threshold_sauvola(gray, window_size=win, k=k)
        return (gray > thresh_s).astype(np.uint8) * 255
    except Exception:
        return cv2.adaptiveThreshold(gray, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
                                     cv2.THRESH_BINARY, 31, 11)

def remove_table_lines(bw):
    # remove horizontal & vertical lines to help OCR
    inv = 255 - bw
    h_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (40, 1))
    v_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (1, 40))
    h = cv2.morphologyEx(inv, cv2.MORPH_OPEN, h_kernel, iterations=1)
    v = cv2.morphologyEx(inv, cv2.MORPH_OPEN, v_kernel, iterations=1)
    cleaned = cv2.bitwise_and(inv, cv2.bitwise_not(cv2.bitwise_or(h, v)))
    return 255 - cleaned

def tesseract_text(img, psm=6, numeric=False):
    cfg = f"--oem {OEM} --psm {psm}"
    if numeric:
        cfg += " -c tessedit_char_whitelist=0123456789.,-/₱"
    return pytesseract.image_to_string(img, lang=LANG, config=cfg)

def best_text(img):
    # vote by picking the variant with most alnum characters
    best = ("", -1)
    for psm in PSMS:
        txt = tesseract_text(img, psm=psm)
        score = sum(c.isalnum() for c in txt)
        if score > best[1]:
            best = (txt, score)
    return best[0]

def extract_fields(text: str):
    date = re.search(r"(due\s*date|payment\s*due|pay\s*by)\s*[:\-]?\s*([A-Za-z]{3,9}\s+\d{1,2},\s+\d{4})", re.IGNORECASE)
    amt  = re.search(r"(amount\s*due|total\s*amount)\s*[:\-]?\s*(₱?\s*[0-9][0-9,]*\.?[0-9]{0,2})", re.IGNORECASE)
    return {
        "due_date": date.group(2) if date else None,
        "amount_due": amt.group(2).replace(" ", "") if amt else None
    }

def pdf_ocr_fields(pdf_path: str | Path):
    import json
    pdf_path = Path(pdf_path)
    texts = []
    with fitz.open(pdf_path) as doc:
        for page in doc:
            g = render_page(page)
            g = deskew(g)
            bw = sauvola(g)
            bw = remove_table_lines(bw)
            texts.append(best_text(bw))
    full = "\n".join(texts)
    #fields = extract_fields(full)
    fields = None

    # If amount missing, try ROI numeric OCR (top 35% of first page as example)
    if fields["amount_due"] is None:
        first = sauvola(deskew(render_page(fitz.open(pdf_path)[0])))
        h = first.shape[0]
        roi = first[: int(h*0.35), :]
        amt_text = tesseract_text(roi, psm=6, numeric=True)
        m = re.search(r"(₱?\s*[0-9][0-9,]*\.?[0-9]{0,2})", amt_text)
        if m: fields["amount_due"] = m.group(1).replace(" ", "")
    return {"fields": fields, "text": full}

# usage:
# result = pdf_ocr_fields("soa.pdf")
# print(result["fields"])
def pdf_to_ocr_text(pdf_path: str | Path):
    return pdf_ocr_fields(pdf_path)

data = pdf_to_ocr_text("./gmail_attachments_poc/BPI Rewards - September 2025.pdf")
print(json.dumps(data["fields"], indent=2))

null


In [None]:
data

{'fields': None,
 'text': '—_\n\n4.4-795607"\n\n—\n\n= NUME\n\nCu\n\nWi |\n\naR\n\nCuSO\n\nONE\n\n28, 2025\n\n—\n\nAUG Us|\n\nPrepared or\n\ntT\n\nfi!\n\n=\\2 4\n\n74 A7T956-4\n\nSyATEMENT DAT:\n\nLl?\n\nLem™\n\n7, 2025\n\nReference | No. 0\n\nHY Tell VA\n\nSEPTEM 2\n\n\\V] Wiyill itl ayy AN\n\nA \\=\n\nOd)\n\naeiimeeere\n\nju\n\nPAYME! Nv DU\n\nA741\n\npy ddd pb i). LA j iL\n\nLMI\n\n37 985 35\n\nry\n\nesi AZ\n\nre ee\n\nUL CAN. ne}\n\no>\n\na !\n\n=IN\n\nate\n\nre ier 1 St acl\n\noR\n\n850) |. 0)\n\nWe Bik 2 I ol 1 AA!\n\nTAL AMO INT DUE\n\n~\n\n3TGy © wai\n\nCielito |r iomes \'2\n\nMAINA ;AMOUN DUE\n\n“199 Caloocan Cily\n\n61L1 viata sO\n\n(+ ‘Late Paymen\n\nAmoun Du\n\nen!\n\n(+) Finance\n\nClaarqes\n\n(+) Inst tallim\n\nCrnarges anc\n\n(+) Purcinases\n\nDue\n\ni) Paymen\'s !\n\nanc Advances\n\nOtner Fees\n\nPrevious\n\nCredits anc\n\nBalance\n\n0.00\n\n37,265.39\n\nRe\'paiies\n\nCard Type\n\n0.00\n\n9,604.19\n\n7,66’\n\n6\n\n22,645.27\n\n22,645.27\n\nBP Rewarc\n\nI\n\n0.00\n\n37 2

In [None]:
from pdfminer.high_level import extract_text_to_fp
from io import StringIO, BytesIO

output = StringIO()
with open("./gmail_attachments_poc/BPI Rewards - September 2025.pdf", "rb") as f:
    extract_text_to_fp(f, output, output_type='text', codec='latin-1')  # try cp1252, latin1, etc.
print(output.getvalue()[:500])

æ(cid:240)(cid:242)(cid:240)æøø(cid:247)(cid:246)(cid:243)ø(cid:243)ł(cid:246)ł(cid:243)(cid:240)(cid:247)(cid:247)(cid:243)(cid:209)(cid:214)(cid:226)¨(cid:228)`(cid:217)(cid:214)ª(cid:213)`ˆ|˙(cid:212)`(cid:201)(cid:211)Kˆ(cid:214)(cid:212)(cid:209)(cid:214)(cid:226)¨(cid:228)`@Ø@ˆ`(cid:213)ª(cid:214)(cid:217)\´(cid:147)(cid:146)@(cid:242)@(cid:211)(cid:150)£@æ(cid:244)@(cid:215)(cid:133)£(cid:133)(cid:153)@(cid:226)£(cid:153)(cid:133)(cid:133)£ˆ(cid:137)(cid:133)(cid:147)(cid:137)£(cid:150)@¨


In [None]:
from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import (
    PdfPipelineOptions,
    TesseractCliOcrOptions,
)
from docling.document_converter import DocumentConverter, PdfFormatOption

source = "./gmail_attachments_poc/BPI Rewards - September 2025.pdf"

pipeline_options = PdfPipelineOptions()
pipeline_options.do_ocr = True
pipeline_options.do_table_structure = True
pipeline_options.table_structure_options.do_cell_matching = True
ocr_options = TesseractCliOcrOptions(force_full_page_ocr=True)
pipeline_options.ocr_options = ocr_options

converter = DocumentConverter(
    format_options={
        InputFormat.PDF: PdfFormatOption(
            pipeline_options=pipeline_options,
        )
    }
)
result = converter.convert(source)
print(result.document.export_to_markdown())  # output: "## Docling Technical Report[...]"

2025-10-31 23:36:06,683 - INFO - detected formats: [<InputFormat.PDF: 'pdf'>]
2025-10-31 23:36:06,697 - INFO - Going to convert document batch...
2025-10-31 23:36:06,698 - INFO - Initializing pipeline for StandardPdfPipeline with options hash 53106240f2272fb11f43fd10d9d82596
2025-10-31 23:36:06,722 - INFO - command: tesseract --list-langs
2025-10-31 23:36:06,745 - INFO - Accelerator device: 'cuda:0'
2025-10-31 23:36:07,911 - INFO - Processing document BPI Rewards - September 2025.pdf
2025-10-31 23:36:08,415 - INFO - command: tesseract --psm 0 -l osd /tmp/tmpgjvb5yyd.png stdout
2025-10-31 23:36:09,132 - INFO - command: tesseract -l fra+deu+spa+eng /tmp/tmpgjvb5yyd.png stdout tsv


len(pages)=1, 0-0
len(valid_pages)=1
len(valid_page_images)=1


2025-10-31 23:36:10,348 - INFO - command: tesseract --psm 0 -l osd /tmp/tmpzf5yxlrq.png stdout
2025-10-31 23:36:11,049 - INFO - command: tesseract -l fra+deu+spa+eng /tmp/tmpzf5yxlrq.png stdout tsv
2025-10-31 23:36:13,273 - INFO - command: tesseract --psm 0 -l osd /tmp/tmpgnhr8plg.png stdout
2025-10-31 23:36:13,886 - INFO - command: tesseract -l fra+deu+spa+eng /tmp/tmpgnhr8plg.png stdout tsv
2025-10-31 23:36:14,758 - INFO - command: tesseract --psm 0 -l osd /tmp/tmpz6rplr6a.png stdout
2025-10-31 23:36:15,021 - ERROR - OSD failed (doc BPI Rewards - September 2025.pdf, page: 2, OCR rectangle: 0, processed image file <tempfile._TemporaryFileWrapper object at 0x721852955490>):
2025-10-31 23:36:15,023 - INFO - command: tesseract -l fra+deu+spa+eng /tmp/tmpz6rplr6a.png stdout tsv


len(pages)=3, 1-3
len(valid_pages)=3
len(valid_page_images)=3


2025-10-31 23:36:15,949 - INFO - Finished converting document BPI Rewards - September 2025.pdf in 9.27 sec.


<!-- image -->

6ILT PPM StStONVITMAIOMa

## BPI Credit Cards

Prepared for HITTIN AUN QM ATER

<!-- image -->

&amp; Blk 2 Lot 14 Peter Street Cielito Homes Brgy 177 1422 Caloocan City

## Statement of Account

CUSTOMER NUMBER STATEMENT DATE CREDIT LIMIT TOTAL AMOUNT DUE MINIMUM AMOUNT DUE PAYMENT DUE DATE

020100-4-10-7956071 AUGUST 28, 2025 314,000.00 37,265.35 850.00 SEPTEMBER 17, 2025

| Previous  (-)  Payments  /  |  (+)  Purchases  |  (+)  Installment  |  (+)  Finance  (+)  Late  Payment  Card  Type  Balance  Credits  and  and  Advances  |  Due  Charges  and  |  Charges  Amount  Due  Rebates  Other  Fees  BPI  Rewards  22,645.27  22,645.27  17,661.16  19,604.19  0.00  0.00  37,265.35  Total  22,645.27  22,645.27  17,661.16  19,604.19  0.00  0.00  37,265.35  Past Due Amount Minimum Amount Due Unbilled Installment Amount Balance  0.00  850.00  627,303.76  Total Outstanding  664,569.11   |
|--------------------------------------------------------------------------------------------

In [None]:
import re
from datetime import datetime
from decimal import Decimal

TEXT = """CUSTOMER NUMBER STATEMENT DATE CREDIT LIMIT TOTAL AMOUNT DUE MINIMUM AMOUNT DUE PAYMENT DUE DATE

020123-4-10-7956071 AUGUST 28, 2025 314,000.00 37,265.35 850.00 SEPTEMBER 17, 2025
"""

MONTH = r"(?:January|February|March|April|May|June|July|August|September|October|November|December)"
DATE = rf"{MONTH}\s+\d{{1,2}},\s+\d{{4}}"
MONEY = r"\d{1,3}(?:,\d{3})*(?:\.\d{2})?"  # 314,000.00 / 850.00 / 37,265.35

# Strict, positional pattern: values appear after the header line, in order.
PATTERN = re.compile(
    rf"""
    (?P<customer>\d[\d-]+)                     # 020123-4-10-7956071
    \s+(?P<statement_date>{DATE})              # AUGUST 28, 2025
    \s+(?P<credit_limit>{MONEY})               # 314,000.00
    \s+(?P<total_due>{MONEY})                  # 37,265.35
    \s+(?P<min_due>{MONEY})                    # 850.00
    \s+(?P<payment_due>{DATE})                 # SEPTEMBER 17, 2025
    """,
    re.IGNORECASE | re.VERBOSE | re.DOTALL,
)

def parse_date(s: str) -> datetime.date:
    s = " ".join(s.split())  # collapse odd spaces from OCR
    for fmt in ("%B %d, %Y", "%b %d, %Y"):  # e.g., AUGUST 28, 2025 / Aug 28, 2025
        try:
            return datetime.strptime(s.title(), fmt).date()
        except ValueError:
            pass
    raise ValueError(f"Unrecognized date format: {s!r}")

def parse_money(s: str) -> Decimal:
    return Decimal(s.replace(",", ""))

def extract_fields(text: str) -> dict:
    m = PATTERN.search(text)
    if not m:
        raise ValueError("Could not match expected sequence. Check OCR noise or adjust the pattern.")
    d = m.groupdict()
    return {
        "customer_number": d["customer"],
        "statement_date": parse_date(d["statement_date"]),
        "credit_limit": parse_money(d["credit_limit"]),
        "total_amount_due": parse_money(d["total_due"]),
        "minimum_amount_due": parse_money(d["min_due"]),
        "payment_due_date": parse_date(d["payment_due"]),
    }

fields = extract_fields(TEXT)
for k, v in fields.items():
    print(f"{k}: {v}")

customer_number: 020123-4-10-7956071
statement_date: 2025-08-28
credit_limit: 314000.00
total_amount_due: 37265.35
minimum_amount_due: 850.00
payment_due_date: 2025-09-17


In [None]:
from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import (
    PdfPipelineOptions,
    TesseractCliOcrOptions,
)
from docling.document_converter import DocumentConverter, PdfFormatOption

source = "./output_searchable.pdf"

pipeline_options = PdfPipelineOptions()
pipeline_options.do_ocr = True
pipeline_options.do_table_structure = True
pipeline_options.table_structure_options.do_cell_matching = True
ocr_options = TesseractCliOcrOptions(force_full_page_ocr=True)
pipeline_options.ocr_options = ocr_options

converter = DocumentConverter(
    format_options={
        InputFormat.PDF: PdfFormatOption(
            pipeline_options=pipeline_options,
        )
    }
)
result = converter.convert(source)
print(result.document.export_to_markdown())  # output: "## Docling Technical Report[...]"

text2 = result.document.export_to_markdown()

2025-11-01 00:11:01,349 - INFO - detected formats: [<InputFormat.PDF: 'pdf'>]
2025-11-01 00:11:01,352 - INFO - Going to convert document batch...
2025-11-01 00:11:01,353 - INFO - Initializing pipeline for StandardPdfPipeline with options hash bdac8f1a8b06fc8c9f7596031da59d30
2025-11-01 00:11:01,373 - INFO - command: tesseract --list-langs
2025-11-01 00:11:01,393 - INFO - Accelerator device: 'cuda:0'
2025-11-01 00:11:04,249 - INFO - Accelerator device: 'cuda:0'
2025-11-01 00:11:05,106 - INFO - Processing document output_searchable.pdf
2025-11-01 00:11:05,526 - INFO - command: tesseract --psm 0 -l osd /tmp/tmpxx3cpue5.png stdout
2025-11-01 00:11:06,199 - INFO - command: tesseract -l fra+deu+spa+eng /tmp/tmpxx3cpue5.png stdout tsv


len(pages)=1, 0-0
len(valid_pages)=1
len(valid_page_images)=1


2025-11-01 00:11:07,898 - INFO - command: tesseract --psm 0 -l osd /tmp/tmpq6hzcp2r.png stdout
2025-11-01 00:11:08,962 - INFO - command: tesseract -l fra+deu+spa+eng /tmp/tmpq6hzcp2r.png stdout tsv
2025-11-01 00:11:11,662 - INFO - command: tesseract --psm 0 -l osd /tmp/tmpci6ashya.png stdout
2025-11-01 00:11:12,258 - INFO - command: tesseract -l fra+deu+spa+eng /tmp/tmpci6ashya.png stdout tsv
2025-11-01 00:11:13,354 - INFO - command: tesseract --psm 0 -l osd /tmp/tmpuulbmee1.png stdout
2025-11-01 00:11:13,957 - INFO - command: tesseract -l fra+deu+spa+eng /tmp/tmpuulbmee1.png stdout tsv


len(pages)=3, 1-3
len(valid_pages)=3
len(valid_page_images)=3


2025-11-01 00:11:18,950 - INFO - Finished converting document output_searchable.pdf in 18.18 sec.


<!-- image -->

## HSBC GOLD VISA

The Hongkong and Shanghai Banking Corporation Limited Card Products Centre, PO BOX 1096 Makati Central Post Office, 1250 Makati Metro Manila,

## JOSHUA CANTOR

Statement From 13 AUG 2025 to 14 SEP 2025

Please examine your statement immediately upon receipt. If no error is reported within 30 days, the account will be considered correct.

TRAN

POST

DATE

DATE

DESCRIPTION

AMOUNT(PHP

Joshua

Cantor

4028-XXXX-XXXX-4614

Sa

## CONTACT US

Customer

Service

(02) 8858 0000

From

Overseas

63

2

7976

8000

| ACCOUNT SUMMARY            | PHP         |
|----------------------------|-------------|
| Previous Statement Balance | 11,578.85   |
| Payments & Credits         | 13,927.33CR |
| Purchases & Debits         | 34,182.09   |
| Outstanding Installments   | 1,019.55    |
| Total Account Balance      | 32,853.16   |

| PAYMENT SUMMARY   | PHP         |
|-------------------|-------------|
| Payment Due Date  | 06 Oct 2025 |
| Minimum Payment   | 2,5

In [None]:
import re
from datetime import datetime
from decimal import Decimal


MONTH = r"(?:January|February|March|April|May|June|July|August|September|October|November|December)"
DATE_LONG = rf"{MONTH}\s+\d{{1,2}},\s+\d{{4}}"   # e.g., August 28, 2025
DATE_DMY  = r"\d{1,2}\s+(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Sept|Oct|Nov|Dec)\s+\d{4}"  # 06 Oct 2025
DATE_ANY  = rf"(?:{DATE_LONG}|{DATE_DMY})"

MONEY_CORE = r"\d{1,3}(?:,\d{3})*(?:\.\d{2})?"
MONEY = rf"(?:[\(]?\s*(?:₱|P)?\s*{MONEY_CORE}\s*[\)]?)"  # optional ₱/P, parentheses

# 1) Original strict, positional pattern (kept as-is, just reusing DATE_ANY):
STRICT_SEQUENCE = re.compile(
    rf"""
    (?P<customer>\d[\d-]+)
    \s+(?P<statement_date>{DATE_ANY})
    \s+(?P<credit_limit>{MONEY_CORE})
    \s+(?P<total_due>{MONEY_CORE})
    \s+(?P<min_due>{MONEY_CORE})
    \s+(?P<payment_due>{DATE_ANY})
    """,
    re.IGNORECASE | re.VERBOSE | re.DOTALL,
)

CUSTOMER_PATTERN = re.compile(
    r"(?P<customer>\d[\d-]+)", re.IGNORECASE | re.VERBOSE
)

STATEMENT_DATE_PATTERN = re.compile(
    rf"(?P<statement_date>{DATE_ANY})", re.IGNORECASE | re.VERBOSE
)

CREDIT_LIMIT_PATTERN = re.compile(
    rf"(?P<credit_limit>{MONEY_CORE})", re.IGNORECASE | re.VERBOSE
)

TOTAL_DUE_PATTERN = re.compile(
    rf"(?P<total_due>{MONEY_CORE})", re.IGNORECASE | re.VERBOSE
)

MIN_DUE_PATTERN = re.compile(
    rf"(?P<min_due>{MONEY_CORE})", re.IGNORECASE | re.VERBOSE
)

PAYMENT_DUE_PATTERN = re.compile(
    rf"(?P<payment_due>{DATE_ANY})", re.IGNORECASE | re.VERBOSE
)


# 2) NEW: Markdown-like table row matcher for the data row (4 cells).
#    We don't rely on the header text being perfect; we match any row of 4 cells following a header line.
TABLE_ROW = re.compile(
    rf"""
    ^\|\s*(?P<total_due>[\s₱P\(\)\d,\.]+?)\s*\|
      \s*(?P<min_due>[\s₱P\(\)\d,\.]+?)\s*\|
      \s*(?P<payment_due>{DATE_ANY})\s*\|
      \s*(?P<amount_paid>[\s₱P\(\)\d,\.]*)\|?
    """,
    re.IGNORECASE | re.VERBOSE | re.MULTILINE,
)

def parse_date(s: str):
    s = " ".join(s.split())
    # Try multiple common OCR-safe formats
    fmts = ("%B %d, %Y", "%b %d, %Y", "%d %b %Y", "%d %B %Y")
    for fmt in fmts:
        try:
            # title() helps when OCR shouts (e.g., "AUGUST 28, 2025")
            return datetime.strptime(s.title(), fmt).date()
        except ValueError:
            pass
    raise ValueError(f"Unrecognized date format: {s!r}")

def parse_money(s: str) -> Decimal | None:
    s = (s or "").strip()
    if not s:
        return None
    neg = "(" in s and ")" in s
    # strip currency and punctuation we don't need
    clean = s.replace("₱", "").replace("P", "").replace("p", "").replace(" ", "")
    clean = clean.replace("(", "").replace(")", "")
    val = Decimal(clean.replace(",", ""))
    return -val if neg else val

def extract_fields(text: str) -> dict:
    fields = {}

    for name, pattern in [
        ("customer", CUSTOMER_PATTERN),
        ("statement_date", STATEMENT_DATE_PATTERN),
        ("credit_limit", CREDIT_LIMIT_PATTERN),
        ("total_due", TOTAL_DUE_PATTERN),
        ("min_due", MIN_DUE_PATTERN),
        ("payment_due", PAYMENT_DUE_PATTERN),
    ]:
        if match := pattern.search(text):
            fields[name] = match.group(name)

    return fields

def extract_fields2(text: str) -> dict:
    # First try the strict, positional sequence
    m = STRICT_SEQUENCE.search(text)
    print(m)
    if m:
        d = m.groupdict()
        return {
            "customer_number": d.get("customer"),
            "statement_date": parse_date(d["statement_date"]),
            "credit_limit": Decimal(d["credit_limit"].replace(",", "")),
            "total_amount_due": Decimal(d["total_due"].replace(",", "")),
            "minimum_amount_due": Decimal(d["min_due"].replace(",", "")),
            "payment_due_date": parse_date(d["payment_due"]),
            "source_layout": "strict_sequence",
        }

    # Then try the markdown/table row layout
    t = TABLE_ROW.search(text)
    if t:
        d = t.groupdict()
        return {
            "customer_number": None,  # not present in this layout
            "statement_date": None,   # not present in this layout
            "credit_limit": None,     # not present in this layout
            "total_amount_due": parse_money(d["total_due"]),
            "minimum_amount_due": parse_money(d["min_due"]),
            "payment_due_date": parse_date(d["payment_due"]),
            "source_layout": "table_row",
        }
    print(t)
    raise ValueError("Could not match either strict sequence or table-row layout. Inspect OCR text and adjust patterns.")

result = extract_fields(TEXT)
for k, v in result.items():
    print(f"{k}: {v}")


customer: 020123-4-10-7956071
statement_date: AUGUST 28, 2025
credit_limit: 020
total_due: 020
min_due: 020
payment_due: AUGUST 28, 2025


In [None]:
text2

"<!-- image -->\n\n## HSBC GOLD VISA\n\nThe Hongkong and Shanghai Banking Corporation Limited Card Products Centre, PO BOX 1096 Makati Central Post Office, 1250 Makati Metro Manila,\n\n## JOSHUA CANTOR\n\nStatement From 13 AUG 2025 to 14 SEP 2025\n\nPlease examine your statement immediately upon receipt. If no error is reported within 30 days, the account will be considered correct.\n\nTRAN\n\nPOST\n\nDATE\n\nDATE\n\nDESCRIPTION\n\nAMOUNT(PHP\n\nJoshua\n\nCantor\n\n4028-XXXX-XXXX-4614\n\nSa\n\n## CONTACT US\n\nCustomer\n\nService\n\n(02) 8858 0000\n\nFrom\n\nOverseas\n\n63\n\n2\n\n7976\n\n8000\n\n| ACCOUNT SUMMARY            | PHP         |\n|----------------------------|-------------|\n| Previous Statement Balance | 11,578.85   |\n| Payments & Credits         | 13,927.33CR |\n| Purchases & Debits         | 34,182.09   |\n| Outstanding Installments   | 1,019.55    |\n| Total Account Balance      | 32,853.16   |\n\n| PAYMENT SUMMARY   | PHP         |\n|-------------------|-------------|

In [None]:
text = """
| ## ACCOUNT SUMMARY  PHP  .  Previous  Statement  Balance  11,578.85  Payments  &amp;  Credits  13,927.33CR  Purchases  &amp;  Debits  34,182.09  Outstanding  Installments  1,019.55  Total  Account  Balance  32,853.16   |
|----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|

| PAYMENT  SUMMARY  PHP  Payment  Due  Date  06  Oct  2025  Minimum  Payment  2,519.55   |
|----------------------------------------------------------------------------------------|
"""

import re

patterns = {
    "total_balance": r"(?i)total\s+account\s+balance\s+([\d,]+\.\d{2})",
    "due_date": r"(?i)(?:payment\s+)?due\s+date\s+(\d{1,2}\s+\w+\s+\d{4})",
    "min_payment": r"(?i)minimum\s+payment\s+([\d,]+\.\d{2})"
}

result = {}
for key, pattern in patterns.items():
    match = re.search(pattern, text)
    result[key] = match.group(1) if match else None

print(result)

{'total_balance': '32,853.16', 'due_date': '06  Oct  2025', 'min_payment': '2,519.55'}


In [None]:
lol = {
    "asdlamsd":{
        "bills_path": "bills_path",
        "password": "password"
    }
}

for l, v in lol.items():
    print(v["bills_path"])

bills_path


In [14]:
import re

text = """<!-- image -->

ACCOUNT SUMMARY

PREVIOUS BALANCE

189.03 CR

:

TOTAL AMOUNT DUE

PHP

38,864.35

LESS:

PAYMENTS/CREDITS

73.37.

|

MINIMUM AMOUNT DUE

PHP

1,165.93

PLUS:

PURCHASES/ADV/DEBITS

39,126.75

## REBATES SUMMARY

## A better way to track your cashback and savings

Introducing, BetterBanking Rewards. Your one-stop shop to view all your savings from cash rebates, the latest promos, and exclusive offers tastefully curated just for you!

Register with your credit card number and account details here: bit.ly/SBMastercardRebates, or scan the QR Code now!

Reminders: Rebates are credited every 2nd of the month, based on the posting date of transactions from the previous calendar month, regardless of Statement Date. The monthly rebate cap of PHP250 and annual cap of PHP3,000 for online transactions still applies.

<!-- image -->

Important Reminder: Paying less than the total amount due will increase the amount of interest and other charges you pay and the time it takes to repay your balance.

## ONLINE DEALS

<!-- image -->

@)

Enjoy 10% off on full-priced items for a min. spend of USD300 Promo runs until December 31, 2025

Enjoy up to PHP600 off

<!-- image -->

<!-- image -->

Min. spend requirement applies Promo runs until May 31, 2026

<!-- image -->

Enjoy up to 18% off on shipping fees

Promo runs until December 31, 2025

—

:

:

:

b

:

Terms and Conditions apply. Visit www.securitybank.com/promos/ for more details. Per DTI Fair Trade Permit Nos. FTEB-235398, 215691, 234138, Series of 2025.

## INSTALLMENT DEALS

<!-- image -->

Enjoy 0% ChargeLight Installment or Charge Now, Pay Later for up to 24 months

<!-- image -->

|

METRO

Enjoy 0% ChargeLight Installment up to 3 months for a min. spend of PHP3,000 Promo runs until December 31, 2025

<!-- image -->

Enjoy 0% ChargeLight Installment or Charge Now, Pay Later for up to 24 months at select SM Retail stores

;

y

‘

‘

;

i

H

:

Terms and Conditions apply. Visit www.securitybank.com/promos/ for more details.

<!-- image -->

—

## CREDIT CARD ACCOUNT NUMBER 5101-8600-1144-0298

CUT-OFF STATEMENT DATE

12 OCT 2025

PAYMENT DUE DATE CREDIT LIMIT

03 NOV 2025

PHP

315,000.00

TOTAL AMOUNT DUE

PHP

38,864.35

MINIMUM AMOUNT DUE

PHP

1,165.93

="""

due_date = re.search(r"PAYMENT\s+DUE\s+DATE\b[^\n\r]*[\r\n]+([0-9]{1,2}\s+[A-Za-z]{3,9}\s+[0-9]{4})", text, re.IGNORECASE)
total_due = re.search(r"TOTAL AMOUNT DUE[\sA-Z]*([0-9][0-9,]*\.\d{2})", text)
min_due = re.search(r"MINIMUM AMOUNT DUE[\sA-Z]*([0-9][0-9,]*\.\d{2})", text)

print("Due Date:", due_date.group(1) if due_date else None)
print("Total Amount Due:", total_due.group(1) if total_due else None)
print("Minimum Amount Due:", min_due.group(1) if min_due else None)


Due Date: 03 NOV 2025
Total Amount Due: 38,864.35
Minimum Amount Due: 1,165.93
