<a href="https://colab.research.google.com/github/Lekhana2004/table-detection-/blob/main/table_detection_final.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Refresh the apt package index, then install the poppler-utils system package (quiet mode).
!apt-get -qq update
!apt-get -qq install -y poppler-utils
# Python packages imported by the cells below (PIL, pdf2image, pandas, google.generativeai).
!pip -q install pdf2image pillow pandas google-generativeai

W: Skipping acquire of configured file 'main/source/Sources' as repository 'https://r2u.stat.illinois.edu/ubuntu jammy InRelease' does not seem to provide it (sources.list entry misspelt?)
Selecting previously unselected package poppler-utils.
(Reading database ... 126371 files and directories currently installed.)
Preparing to unpack .../poppler-utils_22.02.0-2ubuntu0.10_amd64.deb ...
Unpacking poppler-utils (22.02.0-2ubuntu0.10) ...
Setting up poppler-utils (22.02.0-2ubuntu0.10) ...
Processing triggers for man-db (2.10.2-1) ...


In [None]:
import os
import json
import pandas as pd
from datetime import datetime
from typing import List, Dict, Any

from PIL import Image
from pdf2image import convert_from_path
import google.generativeai as genai
from google.colab import files

# ---- Gemini API key: asked for at run time, never hard-coded ----
# The value typed at the getpass prompt is stored in the process environment
# under GEMINI_API_KEY, which is where get_api_key() looks it up.

from getpass import getpass
os.environ.update(GEMINI_API_KEY=getpass("Enter your GEMINI_API_KEY: "))

def get_api_key() -> str:
    """Return the Gemini API key held in the GEMINI_API_KEY environment variable.

    Raises:
        ValueError: if the variable is unset or empty.
    """
    key = os.environ.get("GEMINI_API_KEY")
    if key:
        return key
    raise ValueError("Please set GEMINI_API_KEY (use input above).")

# Configure the Gemini client and build the module-level model handle that
# extract_table_info() sends its requests through.
# The model name can be overridden with the GEMINI_MODEL environment variable
# (same pattern as PDF_DPI); when it is unset or blank the original default is used.
genai.configure(api_key=get_api_key())
model = genai.GenerativeModel(os.getenv("GEMINI_MODEL", "").strip() or "models/gemini-1.5-flash")

Enter your GEMINI_API_KEY: ··········


In [None]:
def get_pdf_dpi() -> int:
    """Return the DPI used when rasterizing PDF pages.

    The value is read from the PDF_DPI environment variable. When the variable
    is unset or blank, the default of 200 is returned.

    Raises:
        ValueError: if PDF_DPI is set but is not a positive integer.
    """
    raw = os.getenv("PDF_DPI", "").strip()
    if not raw:
        # Unset or blank (e.g. `PDF_DPI=`) means "use the default" instead of crashing in int("").
        return 200
    try:
        dpi = int(raw)
    except ValueError:
        raise ValueError(f"PDF_DPI must be a positive integer, got {raw!r}") from None
    if dpi <= 0:
        # A zero or negative resolution cannot be rendered; fail here with a clear message.
        raise ValueError(f"PDF_DPI must be a positive integer, got {raw!r}")
    return dpi

def convert_pdf_to_images(pdf_path: str, dpi: int = None) -> List[Image.Image]:
    """Render a PDF file to images via pdf2image's convert_from_path.

    Args:
        pdf_path: path of the PDF file to render.
        dpi: rendering resolution; when None, the value of get_pdf_dpi() is used.

    Returns:
        The images produced by convert_from_path (callers treat them as one
        image per page, in order).
    """
    resolution = get_pdf_dpi() if dpi is None else dpi
    return convert_from_path(pdf_path, dpi=resolution)

def _parse_table_response(raw_text: str, page_number: int) -> List[Dict[str, Any]]:
    """Convert the model's raw text reply for one page into a list of table records."""
    # The model sometimes wraps its JSON in a Markdown code fence; drop the fence markers.
    cleaned = raw_text.strip().replace("```json", "").replace("```", "").strip()

    try:
        parsed = json.loads(cleaned)
    except json.JSONDecodeError as e:
        return [{"title": "Unknown", "page_number": page_number, "error": f"Invalid JSON: {e}"}]

    # A single object instead of an array is accepted and treated as a one-element array.
    entries = parsed if isinstance(parsed, list) else [parsed]

    tables: List[Dict[str, Any]] = []
    for entry in entries:
        if not isinstance(entry, dict):
            # Valid JSON but not an object (null, string, number, nested list): item
            # assignment on it would raise TypeError, so report it as an error record.
            tables.append({
                "title": "Unknown",
                "page_number": page_number,
                "error": f"Unexpected entry: {entry!r}",
            })
            continue
        entry.setdefault("title", "Unknown")
        # The page number we rendered is authoritative; overwrite whatever the model echoed back.
        entry["page_number"] = page_number
        tables.append(entry)
    return tables


def extract_table_info(image: Image.Image, page_number: int) -> List[Dict[str, Any]]:
    """Ask the Gemini model which tables appear on one rendered PDF page.

    Args:
        image: the rendered page image sent to the model together with the prompt.
        page_number: page number stamped on every returned record.

    Returns:
        A list of dicts, each with at least "title" and "page_number". Pages whose
        reply is missing, unreadable or not valid JSON yield a single record with
        title "Unknown" and an "error" message instead of raising. An empty list
        means the model reported no tables.

    Raises:
        Whatever model.generate_content raises (the request itself is not guarded here).
    """
    prompt = (
        "You are a JSON-only response system designed to identify tables in PDF pages.\n\n"
        "Your task is to analyze the provided image of a PDF page and identify any tables present.\n"
        "For each table found, determine its main title. The main title is typically located directly above or very close to the table.\n"
        "Crucially, distinguish main titles from subheadings, row/column headers, or any other text *within* the table itself. The main title is *not* part of the table content.\n"
        "If no clear main title can be identified for a table, use 'Unknown'.\n\n"
        "Respond ONLY with a JSON array containing objects for each identified table. Each object should have the following structure:\n"
        '[{"title": "Main Table Title or Unknown", "page_number": 1}]\n\n'
        "If no tables are detected on the page, return an empty JSON array: [].\n"
        "Ensure your response is valid JSON and contains only the requested data."
    )

    response = model.generate_content([prompt, image])

    try:
        raw_text = getattr(response, "text", "") or ""
    except ValueError as e:
        # `response.text` is an accessor that raises ValueError when the reply carries no
        # usable text part (for example a blocked candidate); getattr's default only
        # covers AttributeError, so this case has to be caught explicitly.
        return [{"title": "Unknown", "page_number": page_number, "error": f"No response text: {e}"}]

    if not raw_text.strip():
        return [{"title": "Unknown", "page_number": page_number, "error": "Empty response"}]

    return _parse_table_response(raw_text, page_number)

def save_results(results: List[Dict[str, Any]], pdf_path: str) -> Dict[str, str]:
    """Write the extracted table records to timestamped JSON and CSV files and download them.

    Files are created in the "output" directory as "<pdf base name>_<YYYYMMDD_HHMMSS>.json"
    and ".csv", then handed to Colab's files.download.

    Args:
        results: table records to persist; may be empty.
        pdf_path: path of the source PDF; only its base name is used for the file names.

    Returns:
        {"json": <json path>, "csv": <csv path>}
    """
    output_dir = "output"
    os.makedirs(output_dir, exist_ok=True)

    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    base = os.path.splitext(os.path.basename(pdf_path))[0]

    json_path = os.path.join(output_dir, f"{base}_{timestamp}.json")
    csv_path  = os.path.join(output_dir, f"{base}_{timestamp}.csv")

    with open(json_path, "w", encoding="utf-8") as f:
        json.dump(results, f, indent=2, ensure_ascii=False)

    frame = pd.DataFrame(results)
    if len(frame.columns) == 0:
        # No records (no tables found): a column-less frame would be written without a
        # header line, and pd.read_csv fails on such a file. Emit the standard header
        # so the CSV stays readable.
        frame = pd.DataFrame(columns=["title", "page_number"])
    frame.to_csv(csv_path, index=False, encoding="utf-8")

    print("Saved files:\n -", json_path, "\n -", csv_path)

    files.download(json_path)
    files.download(csv_path)

    return {"json": json_path, "csv": csv_path}

def process_pdf_with_gemini(pdf_path: str) -> List[Dict[str, Any]]:
    """Run table detection over every page of a PDF and save the combined records.

    Each page image is passed to extract_table_info with a 1-based page number and
    is closed afterwards, whether or not extraction succeeded. The collected
    records are handed to save_results before being returned.

    Raises:
        RuntimeError: if convert_pdf_to_images yields no images.
    """
    pages = convert_pdf_to_images(pdf_path)
    if not pages:
        raise RuntimeError("No images extracted from PDF.")

    tables = []
    page_number = 0
    for page_image in pages:
        page_number += 1
        try:
            tables += extract_table_info(page_image, page_number)
        finally:
            # Release the page image even when extraction raised.
            page_image.close()

    save_results(tables, pdf_path)
    return tables

In [None]:
def validate_pdf(pdf_path: str) -> bool:
    """Check that pdf_path points at an existing file with a .pdf extension.

    Prints a message describing the first failed check.

    Args:
        pdf_path: path to check; the extension comparison is case-insensitive.

    Returns:
        True when the path exists, is a file (not a directory) and ends in ".pdf";
        False otherwise.
    """
    if not os.path.exists(pdf_path):
        print(f"Error: File '{pdf_path}' does not exist!")
        return False
    if not os.path.isfile(pdf_path):
        # A directory named "something.pdf" exists but cannot be rendered as a PDF.
        print(f"Error: '{pdf_path}' is not a file!")
        return False
    if not pdf_path.lower().endswith(".pdf"):
        print(f"Error: File '{pdf_path}' must be a PDF!")
        return False
    return True

def main(pdf_paths: List[str]):
    """Validate and process each PDF in turn, reporting progress and failures by printing.

    Paths rejected by validate_pdf are skipped. An exception raised while processing
    one PDF is printed and does not stop the remaining PDFs from being handled.
    """
    for path in pdf_paths:
        try:
            if validate_pdf(path):
                print(f"\nProcessing PDF: {path}...")
                try:
                    process_pdf_with_gemini(path)
                    print(f"✅ Tables extracted successfully from {path}!")
                except Exception as processing_error:
                    print(f"Error processing PDF {path}: {processing_error}")
        except Exception as unexpected_error:
            # Catches anything raised outside the processing step (e.g. during validation).
            print(f"Unexpected error processing {path}: {unexpected_error}")

In [None]:
# Open Colab's upload dialog; the returned mapping is keyed by the saved file names.
uploaded = files.upload()  # Choose your PDF files
print("Uploaded:", list(uploaded))

Saving Document1.pdf to Document1 (2).pdf
Saving Document3.pdf to Document3.pdf
Uploaded: ['Document1 (2).pdf', 'Document3.pdf']


In [None]:
pdf_paths = list(uploaded.keys())
if pdf_paths:
    main(pdf_paths)
else:
    print("No PDF files uploaded.")

# Display the results of the last uploaded PDF (aggregate across files separately if needed).
if pdf_paths:
    last_pdf_path = pdf_paths[-1]
    output_dir = "output"
    base = os.path.splitext(os.path.basename(last_pdf_path))[0]

    # save_results names its files "<base>_<YYYYMMDD_HHMMSS>.csv". Match exactly that shape
    # so that, e.g., "Document1" does not pick up the results of "Document10" or "Document1 (2)".
    prefix = f"{base}_"
    expected_len = len(prefix) + len("YYYYMMDD_HHMMSS") + len(".csv")

    candidates = []
    # The directory is missing when no PDF got as far as save_results (all invalid or failed).
    if os.path.isdir(output_dir):
        candidates = [
            name for name in os.listdir(output_dir)
            if name.startswith(prefix) and name.endswith(".csv") and len(name) == expected_len
        ]

    # Most recently modified CSV for this PDF, or None when there is none.
    latest_csv = max(
        candidates,
        key=lambda name: os.path.getmtime(os.path.join(output_dir, name)),
        default=None,
    )

    if latest_csv:
        try:
            results_df = pd.read_csv(os.path.join(output_dir, latest_csv))
        except pd.errors.EmptyDataError:
            # A CSV written from zero records has no header line and cannot be parsed.
            print(f"No tables recorded for {last_pdf_path}")
        else:
            display(results_df)
    else:
        print(f"No results found for {last_pdf_path}")


Processing PDF: Document1 (2).pdf...
Saved files:
 - output/Document1 (2)_20250830_024429.json 
 - output/Document1 (2)_20250830_024429.csv


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

✅ Tables extracted successfully from Document1 (2).pdf!

Processing PDF: Document3.pdf...
Saved files:
 - output/Document3_20250830_024543.json 
 - output/Document3_20250830_024543.csv


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

✅ Tables extracted successfully from Document3.pdf!


Unnamed: 0,title,page_number
0,Unknown,5
1,"Balance Sheet as at 31st March, 2022",6
2,Statement of Profit and loss for the year ende...,7
3,Share Capital,8
4,The details of shareholders holding more than ...,8
5,Reserves & Surplus,8
6,Long Term Provisions,8
7,Current Liabilities,9
8,Non Current Investments,9
9,Long Term Loans & Advances,9


In [None]:
import glob
import os
import pandas as pd

output_dir = "output"

# Collect every CSV currently present in the output directory.
csv_files = glob.glob(os.path.join(output_dir, "*.csv"))
all_results = []

if csv_files:
    for csv_file in csv_files:
        try:
            # Load the file and tag each row with the CSV it came from.
            df = pd.read_csv(csv_file).assign(source_file=os.path.basename(csv_file))
            all_results.append(df)
        except Exception as e:
            print(f"Error reading {csv_file}: {e}")

    if not all_results:
        print("No data to summarize after attempting to read CSV files.")
    else:
        # Stack the per-file frames into one table with a fresh 0..n-1 index.
        summary_df = pd.concat(all_results, ignore_index=True)

        print("Summary of Tables Found Across PDFs:")
        display(summary_df[['source_file', 'page_number', 'title']])
else:
    print("No CSV files found in the output directory.")

Summary of Tables Found Across PDFs:


Unnamed: 0,source_file,page_number,title
0,Document3_20250830_024543.csv,5,Unknown
1,Document3_20250830_024543.csv,6,"Balance Sheet as at 31st March, 2022"
2,Document3_20250830_024543.csv,7,Statement of Profit and loss for the year ende...
3,Document3_20250830_024543.csv,8,Share Capital
4,Document3_20250830_024543.csv,8,The details of shareholders holding more than ...
5,Document3_20250830_024543.csv,8,Reserves & Surplus
6,Document3_20250830_024543.csv,8,Long Term Provisions
7,Document3_20250830_024543.csv,9,Current Liabilities
8,Document3_20250830_024543.csv,9,Non Current Investments
9,Document3_20250830_024543.csv,9,Long Term Loans & Advances
