In [None]:
# System dependency: poppler-utils provides the PDF rendering command-line
# tools that pdf2image shells out to (see the pdf2image documentation).
!apt-get -qq update
!apt-get -qq install -y poppler-utils
# Python packages imported by the cells below. NOTE(review): versions are not
# pinned, so a fresh runtime may resolve different releases than this run used.
!pip -q install pdf2image pillow pandas google-generativeai

W: Skipping acquire of configured file 'main/source/Sources' as repository 'https://r2u.stat.illinois.edu/ubuntu jammy InRelease' does not seem to provide it (sources.list entry misspelt?)
Selecting previously unselected package poppler-utils.
(Reading database ... 126371 files and directories currently installed.)
Preparing to unpack .../poppler-utils_22.02.0-2ubuntu0.10_amd64.deb ...
Unpacking poppler-utils (22.02.0-2ubuntu0.10) ...
Setting up poppler-utils (22.02.0-2ubuntu0.10) ...
Processing triggers for man-db (2.10.2-1) ...


In [None]:
import os, json
import pandas as pd
from datetime import datetime
from typing import List, Dict, Any

from PIL import Image
from pdf2image import convert_from_path
import google.generativeai as genai
from google.colab import files

# ---- Set Gemini API key securely ----
# The key is read with getpass so it is not echoed and never lands in the
# notebook source or its saved output.
# NOTE: this cell always prompts and overwrites any GEMINI_API_KEY that is
# already present in the environment.
from getpass import getpass
os.environ["GEMINI_API_KEY"] = getpass("Enter your GEMINI_API_KEY: ")

def get_api_key() -> str:
    """Return the Gemini API key stored in the GEMINI_API_KEY environment variable.

    Returns:
        The non-empty key string.

    Raises:
        ValueError: If the variable is unset or set to an empty string.
    """
    key = os.environ.get("GEMINI_API_KEY")
    if key:
        return key
    # Unset and empty are treated the same: there is nothing usable to return.
    raise ValueError("Please set GEMINI_API_KEY (use input above).")

# Configure Gemini
# Register the API key with the SDK, then create the module-level model handle
# that extract_table_info() uses for every page request.
genai.configure(api_key=get_api_key())
model = genai.GenerativeModel("models/gemini-1.5-flash")

Enter your GEMINI_API_KEY: ··········


In [None]:
def get_pdf_dpi() -> int:
    """Return the rasterisation DPI taken from the PDF_DPI environment variable.

    Falls back to 200 when the variable is not set.

    Raises:
        ValueError: If PDF_DPI is set to something ``int()`` cannot parse.
    """
    raw_dpi = os.environ.get("PDF_DPI", "200")
    return int(raw_dpi)

def convert_pdf_to_images(pdf_path: str, dpi: int = None) -> List[Image.Image]:
    """Convert a PDF into images via pdf2image's ``convert_from_path``.

    Args:
        pdf_path: Path of the PDF file to convert.
        dpi: Resolution passed to the converter. When None, the value from
            ``get_pdf_dpi()`` is used instead.

    Returns:
        Whatever ``convert_from_path`` returns for the file (the callers in
        this notebook treat it as a sequence of page images).
    """
    effective_dpi = get_pdf_dpi() if dpi is None else dpi
    return convert_from_path(pdf_path, dpi=effective_dpi)

def extract_table_info(image: Image.Image, page_number: int) -> List[Dict[str, Any]]:
    """Ask the module-level Gemini ``model`` which tables appear on one page image.

    Args:
        image: Image of a single PDF page, sent to the model with the prompt.
        page_number: Page number recorded on every returned entry. It always
            overrides whatever page number the model put in its answer.

    Returns:
        A list of dicts, one per detected table, each carrying at least
        ``"title"`` (``"Unknown"`` when the model gave none) and
        ``"page_number"``. An empty list means the model reported no tables.
        Problems with the response are reported in-band rather than raised:
        the list then holds an entry with an additional ``"error"`` key
        (unreadable or empty response text, invalid JSON, or a JSON element
        that is not an object).

    Raises:
        Whatever ``model.generate_content`` raises; the API call itself is
        not wrapped.
    """
    prompt = (
        "You are a JSON-only response system. Analyze this PDF page image and identify tables.\n\n"
        "1. Detect any tables.\n"
        "2. For each table, identify its title (above or near).\n"
        "3. If no clear title: 'Unknown'.\n\n"
        "Respond ONLY with JSON array like:\n"
        '[{\"title\": \"Table Title or Unknown\", \"page_number\": 1}]\n\n'
        "If no tables: return []."
    )
    response = model.generate_content([prompt, image])

    # `response.text` is a property that raises ValueError when the response
    # has no text part (for example a blocked or empty candidate). A getattr()
    # default only covers AttributeError, so the access is guarded explicitly
    # to keep one bad page from aborting the whole document.
    try:
        raw_text = response.text or ""
    except (ValueError, AttributeError) as e:
        return [{"title": "Unknown", "page_number": page_number, "error": f"No text in response: {e}"}]

    if not raw_text.strip():
        return [{"title": "Unknown", "page_number": page_number, "error": "Empty response"}]

    # The model often wraps its answer in a Markdown code fence; drop the fence markers.
    cleaned = raw_text.strip().replace("```json", "").replace("```", "").strip()
    try:
        result = json.loads(cleaned)
    except json.JSONDecodeError as e:
        return [{"title": "Unknown", "page_number": page_number, "error": f"Invalid JSON: {e}"}]

    # A single JSON value is treated as a one-element array.
    if not isinstance(result, list):
        result = [result]

    entries: List[Dict[str, Any]] = []
    for entry in result:
        if not isinstance(entry, dict):
            # Valid JSON of the wrong shape (null, string, number, nested list)
            # cannot carry a title or page number; record it as an error entry
            # instead of failing with TypeError.
            entries.append({
                "title": "Unknown",
                "page_number": page_number,
                "error": f"Unexpected entry type: {type(entry).__name__}",
            })
            continue
        entry.setdefault("title", "Unknown")
        entry["page_number"] = page_number
        entries.append(entry)
    return entries

def save_results(results: List[Dict[str, Any]], pdf_path: str) -> Dict[str, str]:
    """Write the results for one PDF to JSON and CSV, then trigger their download.

    Both files go into the ``output`` directory (created if missing) and are
    named after the PDF's base name plus a ``YYYYMMDD_HHMMSS`` timestamp.

    Args:
        results: Entries to persist; dumped as JSON and as a DataFrame CSV.
        pdf_path: Source PDF path; only its base name (without extension) is
            used for naming, and the full path is echoed in the status message.

    Returns:
        A dict with the written paths under the keys ``"json"`` and ``"csv"``.
    """
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    base, _extension = os.path.splitext(os.path.basename(pdf_path))

    output_dir = "output"
    os.makedirs(output_dir, exist_ok=True)
    json_path = os.path.join(output_dir, f"{base}_{timestamp}.json")
    csv_path  = os.path.join(output_dir, f"{base}_{timestamp}.csv")

    # ensure_ascii=False keeps non-ASCII titles readable in the JSON file.
    with open(json_path, "w", encoding="utf-8") as json_file:
        json.dump(results, json_file, indent=2, ensure_ascii=False)

    results_frame = pd.DataFrame(results)
    results_frame.to_csv(csv_path, index=False, encoding="utf-8")

    print(f"✅ Saved for {pdf_path}:\n - {json_path}\n - {csv_path}")
    # Hand both files to the Colab download helper, JSON first.
    for saved_path in (json_path, csv_path):
        files.download(saved_path)
    return {"json": json_path, "csv": csv_path}

def process_pdf_with_gemini(pdf_path: str) -> List[Dict[str, Any]]:
    """Run table detection over every page of a PDF and persist the results.

    The PDF is converted with ``convert_pdf_to_images``, each page is passed to
    ``extract_table_info`` with a 1-based page number, and the combined entries
    are written out through ``save_results``.

    Args:
        pdf_path: Path of the PDF to process.

    Returns:
        The combined list of entries from all pages, in page order.

    Raises:
        RuntimeError: If the conversion produced no pages.
    """
    pages = convert_pdf_to_images(pdf_path)
    if not pages:
        raise RuntimeError("No images extracted from PDF.")

    collected: List[Dict[str, Any]] = []
    page_number = 0
    for page in pages:
        page_number += 1
        try:
            page_entries = extract_table_info(page, page_number)
            collected.extend(page_entries)
        finally:
            # Release the page image whether or not extraction succeeded.
            page.close()

    save_results(collected, pdf_path)
    return collected

In [None]:
def validate_pdf(pdf_path: str) -> bool:
    """Check that ``pdf_path`` points at an existing file with a ``.pdf`` extension.

    A message is printed for each failed check so the caller can simply skip
    the path.

    Args:
        pdf_path: Path to check.

    Returns:
        True when the path exists, is a regular file, and ends in ``.pdf``
        (case-insensitive); False otherwise.
    """
    if not os.path.exists(pdf_path):
        print(f"❌ Error: {pdf_path} does not exist!")
        return False
    # os.path.exists() is also true for directories; a folder named
    # "something.pdf" must not be handed to the PDF converter.
    if not os.path.isfile(pdf_path):
        print(f"❌ Error: {pdf_path} is not a file!")
        return False
    if not pdf_path.lower().endswith(".pdf"):
        print(f"❌ Error: {pdf_path} is not a PDF!")
        return False
    return True

def main(pdf_paths: List[str]):
    """Process one or many PDFs.

    Each path is validated and then run through ``process_pdf_with_gemini``.
    Paths that fail validation are skipped; any exception raised while
    handling a path is printed and that path is left out of the result, so one
    bad file does not stop the rest.

    Args:
        pdf_paths: Paths of the PDFs to process.

    Returns:
        A dict mapping each successfully processed path to its result list.
    """
    outputs_by_pdf = {}
    for pdf_path in pdf_paths:
        try:
            if validate_pdf(pdf_path):
                print(f"\n📄 Processing {pdf_path} ...")
                pdf_results = process_pdf_with_gemini(pdf_path)
                print(f"✅ Done: {pdf_path}")
                outputs_by_pdf[pdf_path] = pdf_results
        except Exception as e:
            # Report and move on to the next file.
            print(f"⚠️ Error in {pdf_path}: {e}")
    return outputs_by_pdf

In [None]:
# Open the Colab upload dialog; one or several PDFs may be selected.
uploaded = files.upload()
# The mapping is keyed by the file names as saved on the runtime, which can
# differ from the original name when a file of that name already exists.
pdf_paths = list(uploaded)
print("Uploaded files:", pdf_paths)

Saving Document1.pdf to Document1 (2).pdf
Saving Document3.pdf to Document3.pdf
Uploaded files: ['Document1 (2).pdf', 'Document3.pdf']


In [None]:
# Process every uploaded PDF; all_results maps each successfully processed
# path to its list of entries (skipped or failed files are absent).
all_results = main(pdf_paths)


📄 Processing Document1 (2).pdf ...
✅ Saved for Document1 (2).pdf:
 - output/Document1 (2)_20250823_133059.json
 - output/Document1 (2)_20250823_133059.csv


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

✅ Done: Document1 (2).pdf

📄 Processing Document3.pdf ...
✅ Saved for Document3.pdf:
 - output/Document3_20250823_133219.json
 - output/Document3_20250823_133219.csv


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

✅ Done: Document3.pdf


In [None]:
# One summary row per processed PDF: an entry counts as an error when it
# carries an "error" key, and as a detected table otherwise.
summary = [
    {
        "pdf": name,
        "total_entries": len(res),
        "tables_detected": sum(1 for r in res if "error" not in r),
        "errors": sum(1 for r in res if "error" in r),
    }
    for name, res in all_results.items()
]
pd.DataFrame(summary)

Unnamed: 0,pdf,total_entries,tables_detected,errors
0,Document1 (2).pdf,6,6,0
1,Document3.pdf,19,19,0
