## This file contains the code for extracting tables from PDFs


# In [None]:
"""
extract_tables_multi.py
---------------------------------
Robust PDF table extraction helper using pdfplumber + pandas.

This script is designed to process multiple PDF files (for example, five PSDP reports),
extract all tables found on each page, save each table as a CSV under
`documents/<pdf_basename>/tables/` and write a small Markdown summary per PDF
under `documents/<pdf_basename>/summary.md`.

Key features:
- Safe handling when a PDF has zero tables.
- Attempts to use the first row of a table as header; falls back to numeric column names.
- Saves example rows (head) in the Markdown summary for quick review.
- Command-line friendly: pass a list of PDF paths or a glob pattern.

Usage (from project root):
    python documents/extract_tables_multi.py "Expenditure-Summary-2025-26-Sept-2025.pdf" "other_report.pdf"

Or process many with a glob (PowerShell / bash):
    python documents/extract_tables_multi.py "docs/*.pdf"

Requirements:
    pip install pdfplumber pandas

"""
from pathlib import Path
import pdfplumber
import pandas as pd
import argparse
import textwrap
import json


def _unique_column_names(header, width):
    """Build `width` distinct, non-empty column names from a raw header row.

    Blank or missing header cells are named ``col_<position>`` and repeated
    names get a numeric suffix (``Name``, ``Name_2``, ...). Unique names are
    required because selecting a duplicated label from a DataFrame returns a
    DataFrame instead of a Series.
    """
    names = []
    taken = set()
    for pos in range(width):
        cell = header[pos] if pos < len(header) else None
        base = "" if cell is None else str(cell).strip()
        if not base:
            base = f"col_{pos + 1}"
        candidate = base
        suffix = 2
        while candidate in taken:
            candidate = f"{base}_{suffix}"
            suffix += 1
        taken.add(candidate)
        names.append(candidate)
    return names


def _table_to_dataframe(raw_table):
    """Convert one raw table (a non-empty list of rows) into a cleaned DataFrame.

    Cells are expected to be strings or None. Rows are padded with None to
    the widest row so the DataFrame constructor never sees ragged input,
    string cells are stripped, and missing cells stay missing (they are not
    turned into the literal text "None").

    The first row is used as the header when it contains at least one
    non-blank string; otherwise it is dropped and generic ``col_<n>`` names
    are used.
    """
    rows = [list(row) if row is not None else [] for row in raw_table]
    width = max(len(row) for row in rows)
    cleaned = [
        [cell.strip() if isinstance(cell, str) else cell for cell in row]
        + [None] * (width - len(row))
        for row in rows
    ]
    header, body = cleaned[0], cleaned[1:]

    if any(isinstance(h, str) and h for h in header):
        columns = _unique_column_names(header, width)
    else:
        columns = [f"col_{n}" for n in range(1, width + 1)]

    return pd.DataFrame(body, columns=columns)


def extract_tables_from_pdf(pdf_path: Path):
    """Extract tables from a single PDF file using pdfplumber.

    Args:
        pdf_path: Path of the PDF to read.

    Returns:
        A ``(tables, report)`` tuple. ``tables`` is a list of
        pandas.DataFrame objects in page order. ``report`` is a dict with the
        keys ``pdf`` (file name), ``tables_found`` (count) and
        ``table_shapes`` (one ``{"page", "index_on_page", "shape"}`` entry per
        table, aligned with ``tables``).

    A page whose extraction raises is skipped with a printed warning so that
    one bad page does not lose the rest of the document.
    """
    tables = []
    report = {"pdf": str(pdf_path.name), "tables_found": 0, "table_shapes": []}

    with pdfplumber.open(pdf_path) as pdf:
        for page_num, page in enumerate(pdf.pages, start=1):
            try:
                page_tables = page.extract_tables()
            except Exception as e:
                # Deliberate best-effort: log the failing page and keep going.
                print(f"Warning: failed to extract tables from {pdf_path.name} page {page_num}: {e}")
                continue

            for i, raw_table in enumerate(page_tables, start=1):
                if not raw_table:
                    continue

                df = _table_to_dataframe(raw_table)

                tables.append(df)
                report["tables_found"] += 1
                report["table_shapes"].append({"page": page_num, "index_on_page": i, "shape": df.shape})
                print(f"Found table on {pdf_path.name} page {page_num} (table #{i}) — shape {df.shape}")

    return tables, report


def save_tables_and_summary(pdf_path: Path, tables, report, out_root: Path):
    """Save extracted tables as CSV and write a markdown summary for the PDF.

    Args:
        pdf_path: Path of the source PDF; its stem names the output folder.
        tables: List of pandas.DataFrame objects to save.
        report: Dict with ``tables_found`` and ``table_shapes`` entries, where
            ``table_shapes`` is aligned with ``tables``.
        out_root: Root folder under which ``<pdf_stem>/tables/table_<n>.csv``
            and ``<pdf_stem>/summary.md`` are written.
    """
    pdf_folder = out_root / pdf_path.stem
    tables_folder = pdf_folder / "tables"
    # parents=True creates pdf_folder as well.
    tables_folder.mkdir(parents=True, exist_ok=True)

    # Save each table
    for idx, df in enumerate(tables, start=1):
        csv_path = tables_folder / f"table_{idx}.csv"
        try:
            df.to_csv(csv_path, index=False)
        except Exception as e:
            # fallback: save as JSON if CSV fails
            json_path = tables_folder / f"table_{idx}.json"
            df.to_json(json_path, orient="records", force_ascii=False)
            print(f"Warning: failed to save CSV for {csv_path.name} ({e}), saved JSON instead: {json_path}")

    # Write a small markdown summary with shapes and preview rows
    summary_md = pdf_folder / "summary.md"
    with summary_md.open("w", encoding="utf-8") as f:
        f.write(f"# Summary for {pdf_path.name}\n\n")
        f.write(f"**Total tables found:** {report['tables_found']}\n\n")

        if report["tables_found"] == 0:
            f.write("No tables were detected in this PDF using pdfplumber's table extraction.\n")
            f.write("Consider alternative extraction strategies (OCR, PDFMiner, manual review).\n")
        else:
            pairs = zip(report["table_shapes"], tables)
            for idx, (info, df) in enumerate(pairs, start=1):
                f.write(f"## Table {idx}\n")
                f.write(f"- Page: {info['page']}\n")
                f.write(f"- Shape: {info['shape']}\n\n")
                # to_csv() returning a string terminates lines with os.linesep;
                # normalise to "\n" so the text-mode file does not translate
                # the newline a second time on Windows.
                preview = df.head(5).to_csv(index=False).replace("\r\n", "\n")
                f.write("```csv\n")
                f.write(preview)
                f.write("```\n\n")

    # Reached for zero-table PDFs too, so every processed PDF is reported.
    print(f"Saved {len(tables)} tables and summary for {pdf_path.name} -> {pdf_folder}")


def process_pdfs(pdf_paths, out_root: Path):
    """Extract and save tables for every PDF in `pdf_paths`.

    Args:
        pdf_paths: Iterable of PDF paths (strings or Path objects).
        out_root: Output root folder; created if missing.

    Paths that do not exist or are not regular files (for example a
    directory) are skipped with a warning instead of being handed to the
    extractor. A ``master_report.json`` listing the per-PDF reports is
    written to `out_root` once all inputs have been handled.
    """
    out_root.mkdir(parents=True, exist_ok=True)
    master_report = []

    for pdf in pdf_paths:
        pdf_path = Path(pdf)
        if not pdf_path.exists():
            print(f"Warning: file not found: {pdf}")
            continue
        if not pdf_path.is_file():
            # exists() is also true for directories, which cannot be opened as a PDF.
            print(f"Warning: not a regular file, skipping: {pdf}")
            continue

        print(f"Processing {pdf_path}...")
        tables, report = extract_tables_from_pdf(pdf_path)
        save_tables_and_summary(pdf_path, tables, report, out_root)
        master_report.append(report)

    # Save a master JSON report listing processed PDFs
    master_path = out_root / "master_report.json"
    with master_path.open("w", encoding="utf-8") as f:
        json.dump(master_report, f, indent=2)

    print(f"Done. Master report saved to {master_path}")


def parse_args(argv=None):
    """Parse command-line arguments.

    Args:
        argv: Optional list of argument strings. When None (the default),
            argparse reads ``sys.argv[1:]``, so existing ``parse_args()``
            calls behave as before.

    Returns:
        An argparse.Namespace with ``pdfs`` (list of paths or glob patterns)
        and ``out`` (output root folder, default ``"documents"``).
    """
    parser = argparse.ArgumentParser(description="Extract tables from multiple PDF files and save CSVs + summaries.")
    parser.add_argument("pdfs", nargs="+", help="PDF file paths or glob patterns")
    parser.add_argument("--out", default="documents", help="Output root folder (default: documents)")
    return parser.parse_args(argv)


def expand_globs(paths):
    """Expand glob patterns in `paths` into concrete path strings.

    Args:
        paths: Iterable of path strings; entries containing ``*``, ``?``,
            ``[`` or ``]`` are treated as glob patterns.

    Returns:
        A list of path strings. Plain paths are passed through unchanged
        (whether or not they exist). Matches for a pattern are returned in
        sorted order so results do not depend on filesystem order.

    Wildcards may appear in any path component (``docs/*/report.pdf``,
    ``docs/**/*.pdf``). An entry that names an existing file is kept
    literally even if it contains wildcard characters, e.g.
    ``report[1].pdf``. A pattern with no matches prints a warning.
    """
    wildcard_chars = "*?[]"
    expanded = []
    for p in paths:
        has_wildcard = any(ch in p for ch in wildcard_chars)
        if not has_wildcard or Path(p).exists():
            expanded.append(str(p))
            continue

        # Split at the first component that contains a wildcard: everything
        # before it is the directory to search, the rest is the pattern.
        parts = Path(p).parts
        first_wild = next(
            i for i, part in enumerate(parts)
            if any(ch in part for ch in wildcard_chars)
        )
        base = Path(*parts[:first_wild]) if first_wild else Path(".")
        pattern = "/".join(parts[first_wild:])

        matched = sorted(str(m) for m in base.glob(pattern))
        if not matched:
            print(f"Warning: no files matched pattern: {p}")
        expanded.extend(matched)
    return expanded


if __name__ == "__main__":
    # Script entry point: parse the CLI, expand any glob patterns, then
    # extract and save tables for every resulting PDF path.
    cli_args = parse_args()
    process_pdfs(expand_globs(cli_args.pdfs), Path(cli_args.out))
