In [17]:
# Install the third-party packages this notebook needs into the running
# kernel's environment (%pip targets the kernel's environment, unlike !pip).
# matplotlib and numpy are imported by the chart-generator cell below.
# NOTE(review): opencv-python, PyMuPDF and pandas are not imported by any cell
# visible here - presumably they are dependencies of the local `docxtract`
# package; confirm before removing them.
# TODO: pin versions (pkg==x.y.z) so a fresh run reproduces the same environment.
%pip install matplotlib numpy opencv-python PyMuPDF pandas

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 24.2 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip


In [18]:
# Import required libraries
import json
import os
import random
from datetime import date
from typing import Dict, List, Optional

import matplotlib.pyplot as plt
import matplotlib.style as mplstyle
import numpy as np

class ChartDatasetGenerator:
    """Generate synthetic bar and line charts as PDFs plus a JSON index.

    Charts are written to ``<output_dir>/charts``. A description of every
    generated chart is accumulated in ``self.metadata`` and written to
    ``<output_dir>/metadata.json`` by :meth:`generate_dataset`.
    """

    def __init__(self, output_dir: str = "chart_dataset"):
        """Create the output directories and initialise empty metadata.

        Args:
            output_dir: Root directory of the dataset; created if missing.
        """
        self.output_dir = output_dir
        self.charts_dir = os.path.join(output_dir, "charts")
        self.metadata_file = os.path.join(output_dir, "metadata.json")

        # Create directories
        os.makedirs(self.charts_dir, exist_ok=True)

        # Available styles and color schemes
        self.styles = ['default', 'seaborn', 'ggplot', 'bmh', 'fivethirtyeight']
        self.color_schemes = ['viridis', 'plasma', 'inferno', 'magma', 'cividis', 'tab10', 'tab20']

        # Initialize metadata; the date records when this generator was
        # created instead of a hard-coded literal that goes stale.
        self.metadata = {
            "total_charts": 0,
            "generation_date": date.today().isoformat(),
            "charts": []
        }

    def generate_dataset(self, num_charts: int = 10):
        """Generate ``num_charts`` random charts and write the metadata file.

        Each chart is randomly a bar chart or a line chart. Chart ids continue
        from the number of charts this instance has already produced, so
        calling this method again neither overwrites earlier PDFs nor records
        duplicate ids in the metadata.

        Args:
            num_charts: Number of charts to add; values <= 0 add none but
                still (re)write the metadata file.
        """
        print(f"Generating {num_charts} charts...")

        # Dispatch table: add more chart types here as needed.
        builders = {
            'bar_chart': self.generate_bar_chart,
            'line_chart': self.generate_line_chart,
        }
        chart_types = list(builders)
        first_idx = self.metadata["total_charts"]

        for i in range(num_charts):
            chart_type = random.choice(chart_types)
            result = builders[chart_type](first_idx + i)

            self.metadata["charts"].append(result)
            self.metadata["total_charts"] += 1

            if (i + 1) % 10 == 0:
                print(f"Generated {i + 1}/{num_charts} charts...")

        # Save metadata
        with open(self.metadata_file, 'w', encoding='utf-8') as f:
            json.dump(self.metadata, f, indent=2)

        print(f"Dataset generation complete! {num_charts} charts saved to {self.charts_dir}")

    def _pick_style(self) -> str:
        """Return a random entry of ``self.styles`` that matplotlib can load.

        Legacy ``seaborn*`` names are mapped to their ``seaborn-v0_8*``
        replacements when those are available; any other unknown name falls
        back to ``'default'``.
        """
        name = random.choice(self.styles)
        if name == 'default' or name in plt.style.available:
            return name
        renamed = name.replace('seaborn', 'seaborn-v0_8', 1)
        if renamed in plt.style.available:
            return renamed
        return 'default'

    def _save_pdf(self, fig, filename: str) -> None:
        """Apply a tight layout to ``fig`` and save it into ``self.charts_dir``."""
        fig.tight_layout()
        fig.savefig(os.path.join(self.charts_dir, filename), bbox_inches='tight')

    def generate_line_chart(self, idx: int, num_lines: Optional[int] = None) -> Dict:
        """Draw a monthly multi-series line chart and save it as a PDF.

        Args:
            idx: Chart id; used in the file name ``line_chart_<idx>.pdf``.
            num_lines: Number of series to draw; random in 1..5 when ``None``.

        Returns:
            Metadata dict with keys ``id``, ``type``, ``filename``,
            ``num_series`` and ``data_points``.
        """
        if num_lines is None:
            num_lines = random.randint(1, 5)

        months = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun',
                  'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']
        steps = np.arange(len(months))
        filename = f'line_chart_{idx:04d}.pdf'

        # The style must be active *before* the figure is created to take
        # effect; the context manager also restores the global rcParams.
        with plt.style.context(self._pick_style()):
            fig, ax = plt.subplots(figsize=(10, 6))
            try:
                for i in range(num_lines):
                    base = random.randint(50, 200)
                    trend = random.randint(-5, 15)
                    noise = np.random.randn(len(months)) * 10
                    values = base + trend * steps + noise
                    ax.plot(months, values, marker='o', label=f'Series {i+1}', linewidth=2)

                ax.set_xlabel('Month', fontsize=12)
                ax.set_ylabel('Value', fontsize=12)
                ax.set_title('Line Chart - Trend Analysis', fontsize=14, fontweight='bold')
                if num_lines > 0:
                    # legend() warns when there are no labelled artists
                    ax.legend()
                ax.grid(True, alpha=0.3)

                # Save as PDF
                self._save_pdf(fig, filename)
            finally:
                # Always release the figure, even if drawing or saving failed
                plt.close(fig)

        return {
            'id': idx,
            'type': 'line_chart',
            'filename': filename,
            'num_series': num_lines,
            'data_points': len(months)
        }

    def generate_bar_chart(self, idx: int, orientation: Optional[str] = None) -> Dict:
        """Draw a bar chart with 4-8 random categories and save it as a PDF.

        Args:
            idx: Chart id; used in the file name ``bar_chart_<idx>.pdf``.
            orientation: ``'vertical'`` or ``'horizontal'``; chosen at random
                when ``None``. Any other value is drawn as horizontal.

        Returns:
            Metadata dict with keys ``id``, ``type``, ``filename``,
            ``orientation`` and ``num_categories``.
        """
        if orientation is None:
            orientation = random.choice(['vertical', 'horizontal'])

        categories = [f'Cat {i+1}' for i in range(random.randint(4, 8))]
        values = [random.randint(20, 100) for _ in categories]
        filename = f'bar_chart_{idx:04d}.pdf'

        style = self._pick_style()
        cmap = plt.colormaps.get_cmap(random.choice(self.color_schemes))
        # Sample the colormap with floats in [0, 1]. Integer inputs index the
        # lookup table directly, which on 256-entry maps such as viridis gives
        # nearly identical colours for every bar.
        colors = cmap(np.linspace(0, 1, len(categories)))

        with plt.style.context(style):
            fig, ax = plt.subplots(figsize=(10, 6))
            try:
                if orientation == 'vertical':
                    ax.bar(categories, values, color=colors, edgecolor='black', linewidth=1.2)
                    ax.set_xlabel('Categories', fontsize=12)
                    ax.set_ylabel('Value', fontsize=12)
                    plt.setp(ax.xaxis.get_majorticklabels(), rotation=45)
                else:
                    ax.barh(categories, values, color=colors, edgecolor='black', linewidth=1.2)
                    ax.set_ylabel('Categories', fontsize=12)
                    ax.set_xlabel('Value', fontsize=12)

                ax.set_title(f'Bar Chart - {orientation.capitalize()}', fontsize=14, fontweight='bold')
                # Grid lines run along the value axis only
                ax.grid(axis='x' if orientation == 'horizontal' else 'y', alpha=0.3)

                # Save as PDF
                self._save_pdf(fig, filename)
            finally:
                plt.close(fig)

        return {
            'id': idx,
            'type': 'bar_chart',
            'filename': filename,
            'orientation': orientation,
            'num_categories': len(categories)
        }

In [19]:
# Generate sample PDFs for testing.
# Seed both random sources the generator draws from (`random` for chart
# parameters, `np.random` for the line-chart noise) so that Restart & Run All
# reproduces the same set of charts.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)

generator = ChartDatasetGenerator()
generator.generate_dataset(num_charts=5)  # Generate 5 sample charts

Generating 5 charts...
Dataset generation complete! 5 charts saved to chart_dataset\charts
Dataset generation complete! 5 charts saved to chart_dataset\charts


In [1]:
# Test generated PDFs with DocXtract
import sys
import os
sys.path.append(os.path.join(os.getcwd(), 'docxtract'))

from docxtract import read_pdf
import json

def test_pdf_with_docxtract(pdf_path: str):
    """Test a PDF file with DocXtract and return results"""
    try:
        result = read_pdf(pdf_path, pages='all', flavor='lattice')

        test_results = {
            'pdf_path': pdf_path,
            'tables_found': len(result.tables),
            'graphs_found': len(result.graphs),
            'tables': [],
            'graphs': []
        }

        # Extract table information
        for i, table in enumerate(result.tables):
            table_info = {
                'index': i,
                'page': table.page,
                'shape': table.shape,
                'accuracy': table.accuracy,
                'data_preview': table.data[:3] if table.data else []  # First 3 rows
            }
            test_results['tables'].append(table_info)

        # Extract graph information
        for i, graph in enumerate(result.graphs):
            graph_info = {
                'index': i,
                'page': graph.page,
                'type': graph.graph_type.value,
                'confidence': graph.confidence,
                'bbox': graph.bbox.to_dict()
            }
            test_results['graphs'].append(graph_info)

        return test_results

    except Exception as e:
        return {
            'pdf_path': pdf_path,
            'error': str(e),
            'tables_found': 0,
            'graphs_found': 0
        }

# Example usage: run DocXtract over the generated PDFs and save a JSON report
charts_dir = 'chart_dataset/charts'  # Adjust path as needed
if os.path.isdir(charts_dir):
    # Sort the listing: os.listdir returns names in arbitrary order, so an
    # unsorted [:5] slice could pick different files on each run or platform.
    pdf_files = sorted(f for f in os.listdir(charts_dir) if f.endswith('.pdf'))

    all_test_results = []
    for pdf_file in pdf_files[:5]:  # Test first 5 PDFs for demonstration
        pdf_path = os.path.join(charts_dir, pdf_file)
        result = test_pdf_with_docxtract(pdf_path)
        all_test_results.append(result)
        if 'error' in result:
            # Surface failures instead of reporting them as "0 tables, 0 graphs"
            print(f"Tested {pdf_file}: FAILED - {result['error']}")
        else:
            print(f"Tested {pdf_file}: {result['tables_found']} tables, {result['graphs_found']} graphs")

    # Save test results
    with open('docxtract_test_results.json', 'w', encoding='utf-8') as f:
        json.dump(all_test_results, f, indent=2)

    print(f"DocXtract testing completed for {len(all_test_results)} PDFs")
else:
    print(f"Charts directory {charts_dir} not found")

Tested bar_chart_0000.pdf: 0 tables, 0 graphs
Tested bar_chart_0001.pdf: 0 tables, 0 graphs
Tested bar_chart_0002.pdf: 0 tables, 0 graphs
Tested bar_chart_0003.pdf: 3 tables, 0 graphs
Tested line_chart_0000.pdf: 0 tables, 2 graphs
DocXtract testing completed for 5 PDFs
