In [None]:
pip install PyMuPDF opencv-python numpy pandas pillow pytesseract

Collecting PyMuPDF
  Downloading pymupdf-1.26.6-cp310-abi3-manylinux_2_28_x86_64.whl.metadata (3.4 kB)
Collecting pytesseract
  Downloading pytesseract-0.3.13-py3-none-any.whl.metadata (11 kB)
Downloading pymupdf-1.26.6-cp310-abi3-manylinux_2_28_x86_64.whl (24.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.1/24.1 MB[0m [31m38.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pytesseract-0.3.13-py3-none-any.whl (14 kB)
Installing collected packages: pytesseract, PyMuPDF
Successfully installed PyMuPDF-1.26.6 pytesseract-0.3.13


In [None]:
import os
from abc import ABC, abstractmethod
from dataclasses import dataclass
from typing import List, Dict, Tuple, Optional, Union
from enum import Enum
import numpy as np
from PIL import Image
import cv2
import fitz  # PyMuPDF
import pandas as pd
from collections import defaultdict


In [None]:
# ============================================================================
# CORE DATA STRUCTURES
# ============================================================================

class ElementType(Enum):
    """Labels for the kinds of elements the extraction pipeline can report.

    Each member's value is a lowercase string; ``Graph.to_dict`` emits that
    string under its ``'type'`` key.
    """
    # NOTE(review): TABLE, SCATTER_PLOT and HEATMAP are declared but no
    # classifier visible in this notebook returns them.
    TABLE = "table"
    LINE_CHART = "line_chart"
    BAR_CHART = "bar_chart"
    PIE_CHART = "pie_chart"
    SCATTER_PLOT = "scatter_plot"
    HEATMAP = "heatmap"
    # Fallback returned by GraphDetector._classify_graph when no heuristic matches.
    UNKNOWN = "unknown"


@dataclass
class BoundingBox:
    """Axis-aligned rectangle described by two corners, (x1, y1) and (x2, y2)."""
    x1: float
    y1: float
    x2: float
    y2: float

    @property
    def width(self) -> float:
        """Horizontal extent, ``x2 - x1``."""
        return self.x2 - self.x1

    @property
    def height(self) -> float:
        """Vertical extent, ``y2 - y1``."""
        return self.y2 - self.y1

    @property
    def area(self) -> float:
        """Product of ``width`` and ``height``."""
        return self.width * self.height

    @property
    def aspect_ratio(self) -> float:
        """Width divided by height; 0 when the height is not positive."""
        if self.height > 0:
            return self.width / self.height
        return 0

    def to_dict(self) -> Dict:
        """Return the four corner coordinates plus the derived width and height."""
        corners = {name: getattr(self, name) for name in ('x1', 'y1', 'x2', 'y2')}
        return {**corners, 'width': self.width, 'height': self.height}


@dataclass
class Cell:
    """One cell of a parsed table: its text, where it sits, and its grid slot."""
    # Cell text; GridBasedTableParser fills this with stripped OCR output.
    text: str
    # Rectangle of the cell, in the coordinates of the image that was parsed.
    bbox: BoundingBox
    # Zero-based grid indices; GridBasedTableParser._cells_to_array uses them
    # directly as list indexes.
    row: int
    col: int
    # Span counts default to a single grid slot.
    # NOTE(review): nothing visible in this notebook sets or reads them.
    rowspan: int = 1
    colspan: int = 1
    # NOTE(review): defaults to 1.0 and is not updated by any visible code.
    confidence: float = 1.0


class Table:
    """
    An extracted table: raw cell text plus location and quality metadata.
    Modeled on Camelot's Table object.
    """

    def __init__(self, data: List[List[str]], bbox: BoundingBox,
                 page: int, accuracy: float = 0.0):
        self.data = data
        self.bbox = bbox
        self.page = page
        self.accuracy = accuracy
        self._df = None  # DataFrame cache, filled on first access to ``df``
        self.cells = []
        self.parsing_report = {}

    @property
    def df(self) -> pd.DataFrame:
        """The table as a DataFrame, built once and then cached.

        With two or more rows the first row becomes the column header; a
        single row is kept as data; no rows gives an empty frame.
        """
        if self._df is not None:
            return self._df

        if not self.data:
            frame = pd.DataFrame()
        elif len(self.data) > 1:
            frame = pd.DataFrame(self.data[1:], columns=self.data[0])
        else:
            frame = pd.DataFrame(self.data)

        self._df = frame
        return frame

    @property
    def shape(self) -> Tuple[int, int]:
        """(rows, cols) of the raw data, header row included."""
        if not self.data:
            return (0, 0)
        return (len(self.data), len(self.data[0]))

    def to_csv(self, path: str):
        """Write the DataFrame view to ``path`` as CSV, without the index."""
        self.df.to_csv(path, index=False)

    def to_excel(self, path: str):
        """Write the DataFrame view to ``path`` as an Excel file, without the index."""
        self.df.to_excel(path, index=False)

    def to_dict(self) -> Dict:
        """Return page, bbox, shape, accuracy and raw data as a plain dict."""
        return dict(
            page=self.page,
            bbox=self.bbox.to_dict(),
            shape=self.shape,
            accuracy=self.accuracy,
            data=self.data,
        )


class Graph:
    """A chart cropped from a page, with its location, predicted type and score."""

    def __init__(self, image: np.ndarray, bbox: BoundingBox,
                 page: int, graph_type: ElementType, confidence: float = 0.0):
        self.image, self.bbox = image, bbox
        self.page, self.graph_type = page, graph_type
        self.confidence = confidence
        self.metadata = {}
        self.extracted_data = None

    def save_image(self, path: str):
        """Write the cropped chart image to ``path`` via OpenCV."""
        cv2.imwrite(path, self.image)

    def to_dict(self) -> Dict:
        """Return page, type string, bbox, confidence and metadata as a plain dict."""
        return dict(
            page=self.page,
            type=self.graph_type.value,
            bbox=self.bbox.to_dict(),
            confidence=self.confidence,
            metadata=self.metadata,
        )


class ExtractionResult:
    """Everything extracted from one PDF: its tables, its graphs and the page count."""

    def __init__(self, pdf_path: str):
        self.pdf_path = pdf_path
        self.tables: List[Table] = []
        self.graphs: List[Graph] = []
        self.n_pages = 0

    def __repr__(self):
        n_tables, n_graphs = len(self.tables), len(self.graphs)
        return f"ExtractionResult(tables={n_tables}, graphs={n_graphs})"

    def filter_by_page(self, page: int):
        """Return a new result holding only the elements whose ``page`` equals ``page``."""
        return ExtractionResult._filter(self, page)

    @staticmethod
    def _filter(result, page):
        """Build a fresh ExtractionResult restricted to one page; ``result`` is not modified."""
        def on_page(elements):
            return [element for element in elements if element.page == page]

        subset = ExtractionResult(result.pdf_path)
        subset.tables = on_page(result.tables)
        subset.graphs = on_page(result.graphs)
        subset.n_pages = result.n_pages
        return subset


In [None]:
# ============================================================================
# ABSTRACT BASE CLASSES FOR EXTRACTORS
# ============================================================================

class BaseDetector(ABC):
    """Interface implemented by every element-detection strategy."""

    @abstractmethod
    def detect(self, page_image: np.ndarray, page_num: int) -> List[BoundingBox]:
        """Locate elements in ``page_image`` and return their bounding boxes."""


class BaseParser(ABC):
    """Interface implemented by every strategy that parses a detected region."""

    @abstractmethod
    def parse(self, region: np.ndarray, bbox: BoundingBox) -> Union[Table, Graph]:
        """Turn the image ``region`` located at ``bbox`` into structured data."""


In [None]:
# ============================================================================
# TABLE DETECTION STRATEGIES
# ============================================================================

class LineBasedTableDetector(BaseDetector):
    """
    Detect tables based on line structures (borders)
    Similar to Camelot's lattice mode

    Long horizontal and vertical strokes are isolated with morphological
    opening, merged into a single mask, and every sufficiently large outer
    contour of that mask is reported as a table candidate.
    """

    def __init__(self, min_lines: int = 4, line_scale: int = 15):
        """
        Args:
            min_lines: Stored on the instance.
                NOTE(review): detect() never reads it.
            line_scale: The page width/height is divided by this value to get
                the length, in pixels, of the horizontal/vertical structuring
                element; larger values therefore keep shorter strokes.
        """
        self.min_lines = min_lines
        self.line_scale = line_scale

    def detect(self, page_image: np.ndarray, page_num: int) -> List[BoundingBox]:
        """Detect tables using line detection.

        Args:
            page_image: Page raster; converted with ``COLOR_BGR2GRAY``, so a
                3-channel BGR image is expected.
            page_num: Not used by this detector.

        Returns:
            One BoundingBox, in pixel coordinates of ``page_image``, per
            contour wider and taller than 100 px with a width/height ratio
            strictly between 0.1 and 10.
        """
        gray = cv2.cvtColor(page_image, cv2.COLOR_BGR2GRAY)

        # Inverted adaptive threshold: dark ink becomes white foreground.
        thresh = cv2.adaptiveThreshold(
            gray, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
            cv2.THRESH_BINARY_INV, 11, 2
        )

        # Clamp kernel lengths to >= 1 px: when a page dimension is smaller
        # than line_scale the integer division gives 0, which is not a valid
        # structuring-element size.
        page_h, page_w = page_image.shape[:2]
        h_len = max(1, page_w // self.line_scale)
        v_len = max(1, page_h // self.line_scale)

        horizontal_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (h_len, 1))
        vertical_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (1, v_len))

        # Opening keeps only strokes at least as long as the kernel.
        h_lines = cv2.morphologyEx(thresh, cv2.MORPH_OPEN, horizontal_kernel)
        v_lines = cv2.morphologyEx(thresh, cv2.MORPH_OPEN, vertical_kernel)

        # Combine lines
        table_mask = cv2.add(h_lines, v_lines)

        contours, _ = cv2.findContours(
            table_mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE
        )

        bboxes = []
        for contour in contours:
            x, y, w, h = cv2.boundingRect(contour)

            # Filter by size and aspect ratio
            if w > 100 and h > 100 and 0.1 < (w / h) < 10:
                bboxes.append(BoundingBox(x, y, x + w, y + h))

        return bboxes


class TextClusterTableDetector(BaseDetector):
    """
    Borderless-table detector driven by where text blobs sit on the page.
    Similar to Camelot's stream mode.
    """

    def __init__(self, row_tol: float = 2, col_tol: float = 5):
        self.row_tol = row_tol
        self.col_tol = col_tol

    def detect(self, page_image: np.ndarray, page_num: int) -> List[BoundingBox]:
        """Find ink blobs on the page and cluster them into table regions."""
        # Image-only approximation; a fuller version would use PyMuPDF text extraction.
        gray = cv2.cvtColor(page_image, cv2.COLOR_BGR2GRAY)

        # Otsu picks the threshold; inversion makes ink the foreground.
        _, binary = cv2.threshold(
            gray, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU
        )

        # Outer contours stand in for text blocks.
        contours, _ = cv2.findContours(
            binary, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE
        )

        # Keep only blobs wider and taller than 10 px.
        rects = (cv2.boundingRect(contour) for contour in contours)
        text_blocks = [(x, y, w, h) for (x, y, w, h) in rects if w > 10 and h > 10]

        return self._cluster_text_blocks(text_blocks)

    def _cluster_text_blocks(self, blocks: List[Tuple]) -> List[BoundingBox]:
        """Group (x, y, w, h) blocks into rows and return the enclosing region.

        Blocks are ordered by top edge; a block joins the current row when its
        top edge is within ``row_tol`` of the previous block's. If at least
        two rows form, a single box around all blocks is returned; otherwise
        the result is empty.
        """
        if not blocks:
            return []

        ordered = sorted(blocks, key=lambda blk: blk[1])

        rows = [[ordered[0]]]
        for blk in ordered[1:]:
            previous_top = rows[-1][-1][1]
            if abs(blk[1] - previous_top) < self.row_tol:
                rows[-1].append(blk)
            else:
                rows.append([blk])

        if len(rows) < 2:
            return []

        # The rows together contain exactly the blocks in ``ordered``.
        left = min(blk[0] for blk in ordered)
        top = min(blk[1] for blk in ordered)
        right = max(blk[0] + blk[2] for blk in ordered)
        bottom = max(blk[1] + blk[3] for blk in ordered)

        return [BoundingBox(left, top, right, bottom)]


class MLTableDetector(BaseDetector):
    """
    Hook for model-based table detection.
    Intended to be extended with a custom model (YOLO, Faster R-CNN, etc.).
    """

    def __init__(self, model_path: Optional[str] = None, confidence: float = 0.5):
        self.model_path = model_path
        self.confidence = confidence
        self.model = None

        # A model is only loaded when a path was supplied.
        if model_path:
            self._load_model()

    def _load_model(self):
        """Placeholder for model loading (TensorFlow, PyTorch, ONNX, ...); does nothing."""
        pass

    def detect(self, page_image: np.ndarray, page_num: int) -> List[BoundingBox]:
        """Return detected table boxes; currently always an empty list."""
        if self.model is None:
            return []

        # Inference is not implemented yet - plug the model call in here.
        return []

In [None]:

# ============================================================================
# TABLE PARSING STRATEGIES
# ============================================================================

class GridBasedTableParser(BaseParser):
    """Parse tables by detecting grid structure.

    Ruling lines are found with a probabilistic Hough transform, reduced to a
    sorted list of row and column separators, and the rectangle between each
    pair of neighbouring separators is OCR'd as one cell.
    """

    def __init__(self, line_tol: int = 10):
        """
        Args:
            line_tol: Separator coordinates that lie within this many pixels
                of the first coordinate of a group are merged into one
                separator. The Hough transform usually reports one physical
                ruling several times at slightly different offsets; without
                merging, each offset becomes its own row or column and the
                table is split into pixel-thin cells. ``0`` merges only
                identical coordinates.
        """
        self.line_tol = line_tol

    def parse(self, region: np.ndarray, bbox: BoundingBox) -> Table:
        """Parse table region into structured data.

        Args:
            region: Image crop of the table; converted with ``COLOR_BGR2GRAY``.
            bbox: Location stored on the returned Table unchanged.

        Returns:
            A Table whose ``page`` is 0 (callers assign the real page). Its
            data is empty when no lines are found or no cells are produced.
        """
        gray = cv2.cvtColor(region, cv2.COLOR_BGR2GRAY)

        # Detect lines
        edges = cv2.Canny(gray, 50, 150, apertureSize=3)
        lines = cv2.HoughLinesP(edges, 1, np.pi/180, threshold=100,
                                minLineLength=50, maxLineGap=10)

        if lines is None:
            return Table([], bbox, 0, 0.0)

        # Separate near-horizontal and near-vertical segments; each segment
        # contributes the y (resp. x) coordinate of its first endpoint.
        h_lines = []
        v_lines = []

        for line in lines:
            x1, y1, x2, y2 = line[0]
            angle = np.abs(np.arctan2(y2 - y1, x2 - x1) * 180 / np.pi)

            if angle < 10 or angle > 170:  # Horizontal
                h_lines.append(y1)
            elif 80 < angle < 100:  # Vertical
                v_lines.append(x1)

        # Collapse repeated detections of the same ruling and sort.
        h_lines = self._merge_positions(h_lines, self.line_tol)
        v_lines = self._merge_positions(v_lines, self.line_tol)

        # Extract cells using OCR
        cells = self._extract_cells(region, h_lines, v_lines)

        # Convert to 2D array
        data = self._cells_to_array(cells)

        # Calculate accuracy
        accuracy = self._calculate_accuracy(len(h_lines), len(v_lines), len(cells))

        return Table(data, bbox, 0, accuracy)

    @staticmethod
    def _merge_positions(positions: List, tol: int) -> List[int]:
        """Sort coordinates and replace each run lying within ``tol`` of the run's first value by the run's integer mean."""
        merged = []
        group = []
        for pos in sorted(int(p) for p in positions):
            # Comparing against the first member bounds every group's width by tol.
            if group and pos - group[0] > tol:
                merged.append(sum(group) // len(group))
                group = []
            group.append(pos)
        if group:
            merged.append(sum(group) // len(group))
        return merged

    def _extract_cells(self, image: np.ndarray, h_lines: List, v_lines: List) -> List[Cell]:
        """Extract text from grid cells using OCR.

        Args:
            image: The table image.
            h_lines: Sorted y coordinates of row separators.
            v_lines: Sorted x coordinates of column separators.

        Returns:
            One Cell per rectangle between neighbouring separators, in
            row-major order; an empty list when pytesseract cannot be
            imported.
        """
        try:
            import pytesseract
        except ImportError:
            # Best effort: without OCR the table is returned without data,
            # but say so instead of failing silently.
            print("Warning: pytesseract is not installed; table cells cannot be read.")
            return []

        cells = []
        for i in range(len(h_lines) - 1):
            y1, y2 = h_lines[i], h_lines[i+1]
            for j in range(len(v_lines) - 1):
                x1, x2 = v_lines[j], v_lines[j+1]

                cell_img = image[y1:y2, x1:x2]
                # Do not hand an empty crop to the OCR engine.
                if cell_img.size:
                    text = pytesseract.image_to_string(cell_img).strip()
                else:
                    text = ""

                cells.append(Cell(
                    text=text,
                    bbox=BoundingBox(x1, y1, x2, y2),
                    row=i,
                    col=j
                ))

        return cells

    def _cells_to_array(self, cells: List[Cell]) -> List[List[str]]:
        """Convert cells to a rectangular 2D list of strings; unfilled slots stay ""."""
        if not cells:
            return []

        max_row = max(c.row for c in cells) + 1
        max_col = max(c.col for c in cells) + 1

        data = [["" for _ in range(max_col)] for _ in range(max_row)]

        for cell in cells:
            data[cell.row][cell.col] = cell.text

        return data

    def _calculate_accuracy(self, n_h_lines: int, n_v_lines: int, n_cells: int) -> float:
        """Return the percentage (capped at 100) of expected grid cells that were produced.

        A grid needs at least two separators in each direction to enclose a
        cell; with fewer, the score is 0.0 rather than the result of a
        zero or negative denominator.
        """
        if n_h_lines < 2 or n_v_lines < 2:
            return 0.0
        expected_cells = (n_h_lines - 1) * (n_v_lines - 1)
        return min(100.0, (n_cells / expected_cells) * 100)


class TextBasedTableParser(BaseParser):
    """Table parser intended to work from text positions; currently a stub."""

    def parse(self, region: np.ndarray, bbox: BoundingBox) -> Table:
        """Return a fixed sample table located at ``bbox``.

        ``region`` is ignored. A real implementation would use PyMuPDF's
        positional text extraction.
        """
        placeholder_rows = [
            ["Sample", "Data"],
            ["Row1", "Value1"],
            ["Row2", "Value2"],
        ]
        return Table(placeholder_rows, bbox, 0, 85.0)


In [None]:
# ============================================================================
# GRAPH DETECTION AND CLASSIFICATION
# ============================================================================

class GraphDetector(BaseDetector):
    """Detect and classify graphs/charts.

    Candidate regions are the outer contours of the saturated, non-dark
    pixels of the page; each candidate is then labelled by a chain of
    shape heuristics.
    """

    def __init__(self, min_area: int = 5000):
        """
        Args:
            min_area: Contours whose area is below this value are ignored.
        """
        self.min_area = min_area

    def detect(self, page_image: np.ndarray, page_num: int) -> List[Tuple[BoundingBox, ElementType]]:
        """Detect graphs and return bbox with type.

        Unlike the base-class signature, each result is a
        ``(BoundingBox, ElementType)`` pair rather than a bare box.

        Args:
            page_image: Page raster; converted with ``COLOR_BGR2HSV``.
            page_num: Not used by this detector.

        Returns:
            One pair per contour whose area is at least ``min_area``; boxes
            are in pixel coordinates of ``page_image``.
        """
        # Detect colored/shaded regions (charts often have these): keep pixels
        # whose saturation and value are both at least 30.
        hsv = cv2.cvtColor(page_image, cv2.COLOR_BGR2HSV)
        mask = cv2.inRange(hsv, np.array([0, 30, 30]), np.array([180, 255, 255]))

        # Find contours
        contours, _ = cv2.findContours(mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)

        results = []
        for contour in contours:
            if cv2.contourArea(contour) < self.min_area:
                continue

            x, y, w, h = cv2.boundingRect(contour)
            bbox = BoundingBox(x, y, x+w, y+h)

            # Extract region for classification
            region = page_image[y:y+h, x:x+w]
            results.append((bbox, self._classify_graph(region, bbox)))

        return results

    def _classify_graph(self, image: np.ndarray, bbox: BoundingBox) -> ElementType:
        """Classify graph type using heuristics.

        Checks run in order and the first match wins: any Hough circle gives
        PIE_CHART; three or more edge contours taller than wide and over
        20 px high give BAR_CHART; more than five Hough line segments give
        LINE_CHART; otherwise UNKNOWN. ``bbox`` is not used.
        """
        gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)

        # Check for circles (pie charts)
        circles = cv2.HoughCircles(
            gray, cv2.HOUGH_GRADIENT, dp=1, minDist=20,
            param1=50, param2=30, minRadius=20, maxRadius=min(image.shape[:2])//2
        )

        if circles is not None and len(circles[0]) > 0:
            return ElementType.PIE_CHART

        # Check for bars (bar charts) - vertical rectangles
        edges = cv2.Canny(gray, 50, 150)
        contours, _ = cv2.findContours(edges, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE)

        vertical_rects = 0
        for cnt in contours:
            x, y, w, h = cv2.boundingRect(cnt)
            if h > w and h > 20:
                vertical_rects += 1

        if vertical_rects >= 3:
            return ElementType.BAR_CHART

        # Check for lines (line charts)
        lines = cv2.HoughLinesP(edges, 1, np.pi/180, threshold=50,
                                minLineLength=30, maxLineGap=10)

        if lines is not None and len(lines) > 5:
            return ElementType.LINE_CHART

        return ElementType.UNKNOWN


In [None]:
import os
from abc import ABC, abstractmethod
from dataclasses import dataclass
from typing import List, Dict, Tuple, Optional, Union
from enum import Enum
import numpy as np
from PIL import Image
import cv2
import fitz  # PyMuPDF
import pandas as pd
from collections import defaultdict




# ============================================================================
# MAIN EXTRACTION ENGINE
# ============================================================================

class TableGraphX:
    """
    Main extraction engine - the public API
    Usage: result = TableGraphX.extract('document.pdf', flavor='lattice')
    """

    def __init__(self):
        # Table detectors selectable through the ``flavor`` argument of extract().
        self.detectors = {
            'lattice': LineBasedTableDetector(),
            'stream': TextClusterTableDetector(),
            'ml': MLTableDetector()
        }
        # Table parsers selectable through the ``parser`` argument of extract().
        self.parsers = {
            'grid': GridBasedTableParser(),
            'text': TextBasedTableParser()
        }
        self.graph_detector = GraphDetector()

    @staticmethod
    def extract(pdf_path: str,
                pages: str = 'all',
                flavor: str = 'lattice',
                extract_graphs: bool = True,
                parser: str = 'grid') -> "Optional[ExtractionResult]":
        """
        Main extraction method

        Args:
            pdf_path: Path to PDF file
            pages: Pages to extract ('all', '1', '1,2,3', '1-5')
            flavor: Detection method ('lattice', 'stream', 'ml')
            extract_graphs: Whether to extract graphs
            parser: Parsing method ('grid', 'text')

        Returns:
            ExtractionResult containing tables and graphs, with bounding
            boxes in PDF page coordinates and 1-based page numbers. Any
            exception raised while processing (including an unknown
            ``flavor`` or ``parser``) is printed and ``None`` is returned
            instead.
        """
        print(f"\n{'='*60}")
        print("Starting TableGraphX Extraction")
        print(f"{'='*60}")
        print(f"PDF: {pdf_path}")
        print(f"Flavor: {flavor}, Parser: {parser}, Extract Graphs: {extract_graphs}")

        engine = TableGraphX()
        result = ExtractionResult(pdf_path)
        doc = None

        try:
            # Open PDF
            print("\n[1/4] Opening PDF...")
            doc = fitz.open(pdf_path)
            result.n_pages = len(doc)
            print(f"      ✓ PDF opened successfully - {result.n_pages} pages found")

            pages_to_process = engine._parse_pages(pages, result.n_pages)
            print(f"\n[2/4] Processing pages: {', '.join(str(p+1) for p in pages_to_process)}")

            table_detector = engine.detectors.get(flavor)
            if not table_detector:
                raise ValueError(f"Unknown table detection flavor: {flavor}")
            table_parser = engine.parsers.get(parser)
            if not table_parser:
                raise ValueError(f"Unknown table parser: {parser}")

            for page_num in pages_to_process:
                page = doc.load_page(page_num)
                page_image, scale_x, scale_y = engine._render_page(page)

                # 3. Detect tables (detectors work in pixel coordinates)
                print(f"      > Detecting tables on page {page_num+1} using '{flavor}' flavor...")
                table_bboxes = table_detector.detect(page_image.copy(), page_num)
                for bbox in table_bboxes:
                    table_region_image = engine._crop(page_image, bbox)
                    if table_region_image.size > 0:
                        table = table_parser.parse(
                            table_region_image,
                            engine._scale_bbox(bbox, scale_x, scale_y)
                        )
                        table.page = page_num + 1
                        result.tables.append(table)
                print(f"        Found {len(table_bboxes)} potential tables on page {page_num+1}")

                # 4. Detect and classify graphs
                if extract_graphs:
                    print(f"      > Detecting graphs on page {page_num+1}...")
                    graph_detections = engine.graph_detector.detect(page_image.copy(), page_num)
                    for bbox, graph_type in graph_detections:
                        graph_image_region = engine._crop(page_image, bbox)
                        if graph_image_region.size > 0:
                            graph = Graph(
                                graph_image_region,
                                engine._scale_bbox(bbox, scale_x, scale_y),
                                page_num + 1,
                                graph_type
                            )
                            result.graphs.append(graph)
                    print(f"        Found {len(graph_detections)} potential graphs on page {page_num+1}")

        except Exception as e:
            print(f"Error during extraction: {e}")
            return None # Return None or raise an exception to indicate failure
        finally:
            # Compare against None explicitly: a document's truthiness follows
            # its page count, so ``if doc:`` would skip closing an empty one.
            if doc is not None:
                doc.close()

        print(f"\n{'='*60}")
        print("Extraction complete.")
        print(f"Total tables found: {len(result.tables)}")
        print(f"Total graphs found: {len(result.graphs)}")
        print(f"{'='*60}")

        return result

    @staticmethod
    def _render_page(page, zoom: int = 2):
        """Rasterize ``page`` at ``zoom``x and return (BGR image, scale_x, scale_y); the scales map pixels back to page units."""
        # Render above the native resolution for better detection/OCR.
        pix = page.get_pixmap(matrix=fitz.Matrix(zoom, zoom))
        page_image = np.frombuffer(pix.samples, dtype=np.uint8).reshape(pix.h, pix.w, pix.n)
        # OpenCV expects BGR channel order; other channel counts pass through unchanged.
        if pix.n == 3:  # RGB
            page_image = cv2.cvtColor(page_image, cv2.COLOR_RGB2BGR)
        elif pix.n == 4:  # RGBA
            page_image = cv2.cvtColor(page_image, cv2.COLOR_RGBA2BGR)
        return page_image, page.rect.width / pix.width, page.rect.height / pix.height

    @staticmethod
    def _scale_bbox(bbox, scale_x, scale_y):
        """Convert a pixel-space bounding box into page coordinates."""
        return BoundingBox(
            bbox.x1 * scale_x,
            bbox.y1 * scale_y,
            bbox.x2 * scale_x,
            bbox.y2 * scale_y
        )

    @staticmethod
    def _crop(page_image, bbox):
        """Return the view of ``page_image`` covered by a pixel-space bounding box."""
        return page_image[int(bbox.y1):int(bbox.y2), int(bbox.x1):int(bbox.x2)]

    def _parse_pages(self, pages: str, total_pages: int) -> List[int]:
        """Parse page selection string

        Args:
            pages: 'all', or a comma-separated list of 1-based page numbers
                and inclusive 'start-end' ranges.
            total_pages: Number of pages in the document.

        Returns:
            Sorted, de-duplicated 0-based page indices that exist in the
            document. Malformed entries are reported with a printed warning
            and skipped.
        """
        if pages == 'all':
            return list(range(total_pages))

        page_nums = []
        for part in pages.split(','):
            if '-' in part:
                try:
                    # Split inside the try and only once, so an entry such as
                    # '1-2-3' fails in int() and is skipped with a warning
                    # instead of raising on the tuple unpacking.
                    start_str, end_str = part.split('-', 1)
                    start = int(start_str)
                    end = int(end_str)
                except ValueError:
                    print(f"Warning: Invalid page range format '{part}'. Skipping.")
                else:
                    # Clamp to the document so a huge range is never materialized;
                    # out-of-range pages would be filtered out below anyway.
                    page_nums.extend(range(max(start - 1, 0), min(end, total_pages)))
            else:
                try:
                    page_nums.append(int(part) - 1)
                except ValueError:
                    print(f"Warning: Invalid page number format '{part}'. Skipping.")

        # Filter out invalid page numbers and remove duplicates
        return sorted({p for p in page_nums if 0 <= p < total_pages})


# ============================================================================
# CONVENIENCE FUNCTIONS (like camelot.read_pdf)
# ============================================================================

def read_pdf(filepath: str, **kwargs) -> Optional[ExtractionResult]:
    """
    Convenience function similar to camelot.read_pdf()

    Forwards ``filepath`` and every keyword argument unchanged to
    ``TableGraphX.extract`` and returns its result.

    Args:
        filepath: Path to the PDF file.
        **kwargs: Options accepted by ``TableGraphX.extract``
            (``pages``, ``flavor``, ``extract_graphs``, ``parser``).

    Returns:
        The ExtractionResult, or ``None`` when ``TableGraphX.extract`` hit an
        error (it prints the error and returns ``None`` rather than raising),
        so check for ``None`` before using the result.

    Example:
        tables = read_pdf('document.pdf', pages='all', flavor='lattice')
        print(f"Found {len(tables.tables)} tables")
        tables.tables[0].to_csv('output.csv')
    """
    return TableGraphX.extract(filepath, **kwargs)

In [None]:


if __name__ == "__main__":
    # Single place to change the input document for every example below.
    pdf_path = '/content/image-to-text-13.pdf'

    # Example 1: Extract tables (Camelot-style API)
    result = read_pdf(pdf_path, pages='all', flavor='lattice')

    # TableGraphX.extract prints the error and returns None when extraction
    # fails, so the result must be checked before it is used.
    if result is not None:
        print(f"Found {len(result.tables)} tables")
        print(f"Found {len(result.graphs)} graphs")

        # Save tables
        for i, table in enumerate(result.tables):
            print(f"\nTable {i+1} (Page {table.page}):")
            print(f"  Shape: {table.shape}")
            print(f"  Accuracy: {table.accuracy:.2f}%")
            table.to_csv(f'table_{i+1}.csv')

        # Save graphs
        for i, graph in enumerate(result.graphs):
            print(f"\nGraph {i+1} (Page {graph.page}):")
            print(f"  Type: {graph.graph_type.value}")
            print(f"  Confidence: {graph.confidence:.2f}")
            graph.save_image(f'graph_{i+1}.png')

    # Example 2: Different detection methods
    result_stream = read_pdf(pdf_path, flavor='stream')
    result_ml = read_pdf(pdf_path, flavor='ml')

    if result is not None:
        # Example 3: Filter by page
        page_1_results = result.filter_by_page(1)

        # Example 4: Access DataFrame directly
        if result.tables:
            df = result.tables[0].df
            print("\nFirst table as DataFrame:")
            print(df.head())


Starting TableGraphX Extraction
PDF: /content/image-to-text-13.pdf
Flavor: lattice, Parser: grid, Extract Graphs: True

[1/4] Opening PDF...
      ✓ PDF opened successfully - 1 pages found

[2/4] Processing pages: 1
      > Detecting tables on page 1 using 'lattice' flavor...
        Found 1 potential tables on page 1
      > Detecting graphs on page 1...
        Found 3 potential graphs on page 1

Extraction complete.
Total tables found: 1
Total graphs found: 3
Found 1 tables
Found 3 graphs

Table 1 (Page 1):
  Shape: (241, 62)
  Accuracy: 100.00%

Graph 1 (Page 1):
  Type: pie_chart
  Confidence: 0.00

Graph 2 (Page 1):
  Type: line_chart
  Confidence: 0.00

Graph 3 (Page 1):
  Type: unknown
  Confidence: 0.00

Starting TableGraphX Extraction
PDF: /content/image-to-text-13.pdf
Flavor: stream, Parser: grid, Extract Graphs: True

[1/4] Opening PDF...
      ✓ PDF opened successfully - 1 pages found

[2/4] Processing pages: 1
      > Detecting tables on page 1 using 'stream' flavor...
 