In [7]:
import os
import json
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime, timedelta
from typing import List, Dict, Tuple
import random

class ChartDatasetGenerator:
    """
    Generate diverse chart datasets for training/testing ML models.

    Each ``generate_*`` method renders one randomly parameterised chart to a
    PNG file inside ``<output_dir>/charts`` and returns a metadata dict that
    describes it. ``generate_dataset`` drives those methods and writes the
    collected metadata to ``<output_dir>/metadata.json``.

    All randomness comes from the global ``random`` and ``numpy.random``
    generators; pass ``seed`` to the constructor to make a run repeatable.
    """

    # Month labels shared by the monthly charts (line and area).
    MONTHS = ('Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun',
              'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec')

    def __init__(self, output_dir: str = 'chart_dataset', seed: int = None,
                 clear_existing: bool = False):
        """
        Prepare the output directories and the generator configuration.

        Args:
            output_dir: Root directory of the dataset (created if missing).
            seed: Optional seed applied to both ``random`` and
                ``numpy.random`` (must be valid for ``np.random.seed``).
                ``None`` (default) leaves the global generators untouched.
            clear_existing: When True, delete ``*.png`` files already present
                in the charts directory so images left over from earlier runs
                do not end up next to a metadata file that does not list
                them. Defaults to False (nothing is deleted).
        """
        self.output_dir = output_dir
        self.charts_dir = os.path.join(output_dir, 'charts')
        self.metadata_path = os.path.join(output_dir, 'metadata.json')

        # Create directories
        os.makedirs(self.charts_dir, exist_ok=True)
        if clear_existing:
            self._remove_existing_charts()

        if seed is not None:
            random.seed(seed)
            np.random.seed(seed)

        # Dataset metadata: one dict per successfully generated chart
        self.metadata = []

        # Style configurations (fixed for newer matplotlib versions)
        self.styles = ['default', 'ggplot', 'bmh', 'fivethirtyeight', 'seaborn-v0_8']
        self.color_schemes = ['Set1', 'Set2', 'Set3', 'Pastel1', 'Pastel2', 'tab10']

    # ------------------------------------------------------------------
    # Internal helpers
    # ------------------------------------------------------------------
    def _remove_existing_charts(self) -> None:
        """Delete every ``*.png`` file directly inside the charts directory."""
        for name in os.listdir(self.charts_dir):
            path = os.path.join(self.charts_dir, name)
            if name.lower().endswith('.png') and os.path.isfile(path):
                os.remove(path)

    def _pick_style(self) -> str:
        """Return a random configured style, or 'default' if it is not installed."""
        style = random.choice(self.styles)
        # 'default' is always accepted by matplotlib; any other name is checked
        # against the installed style sheets.
        if style != 'default' and style not in plt.style.available:
            return 'default'
        return style

    def _render_chart(self, filename: str, figsize, draw) -> None:
        """
        Create a figure under a random style, draw on it and save it as PNG.

        The style is applied through ``plt.style.context`` *before* the figure
        is created, so the whole chart uses one style and the global rcParams
        are restored afterwards. The figure is closed even when drawing or
        saving raises.

        Args:
            filename: File name (not path) of the PNG inside the charts dir.
            figsize: ``(width, height)`` passed to ``plt.subplots``.
            draw: Callable taking the ``Axes`` and drawing the chart on it.
        """
        with plt.style.context(self._pick_style()):
            fig, ax = plt.subplots(figsize=figsize)
            try:
                draw(ax)
                fig.tight_layout()
                fig.savefig(os.path.join(self.charts_dir, filename),
                            dpi=150, bbox_inches='tight')
            finally:
                plt.close(fig)

    # ------------------------------------------------------------------
    # Chart generators
    # ------------------------------------------------------------------
    def generate_line_chart(self, idx: int, num_lines: int = None) -> Dict:
        """
        Generate a line chart with random monthly series.

        Args:
            idx: Chart id, used in the file name and the metadata.
            num_lines: Number of series; random in [1, 5] when None.

        Returns:
            Metadata dict with id, type, filename, num_series and data_points.
        """
        if num_lines is None:
            num_lines = random.randint(1, 5)

        months = list(self.MONTHS)
        steps = np.arange(len(months))

        def draw(ax):
            for i in range(num_lines):
                base = random.randint(50, 200)
                trend = random.randint(-5, 15)
                noise = np.random.randn(len(months)) * 10
                # Linear trend plus Gaussian noise
                values = base + trend * steps + noise
                ax.plot(months, values, marker='o', label=f'Series {i+1}', linewidth=2)

            ax.set_xlabel('Month', fontsize=12)
            ax.set_ylabel('Value', fontsize=12)
            ax.set_title('Line Chart - Trend Analysis', fontsize=14, fontweight='bold')
            ax.legend()
            ax.grid(True, alpha=0.3)

        filename = f'line_chart_{idx:04d}.png'
        self._render_chart(filename, (10, 6), draw)

        return {
            'id': idx,
            'type': 'line_chart',
            'filename': filename,
            'num_series': num_lines,
            'data_points': len(months)
        }

    def generate_bar_chart(self, idx: int, orientation: str = None) -> Dict:
        """
        Generate a bar chart with random data.

        Args:
            idx: Chart id, used in the file name and the metadata.
            orientation: 'vertical' or 'horizontal'; chosen at random when
                None. Any value other than 'vertical' draws horizontal bars.

        Returns:
            Metadata dict with id, type, filename, orientation and
            num_categories.
        """
        if orientation is None:
            orientation = random.choice(['vertical', 'horizontal'])

        categories = [f'Cat {i+1}' for i in range(random.randint(4, 8))]
        values = [random.randint(20, 100) for _ in categories]

        def draw(ax):
            # Integer inputs index directly into the colormap's colour table
            cmap = plt.colormaps.get_cmap(random.choice(self.color_schemes))
            colors = cmap(range(len(categories)))

            if orientation == 'vertical':
                ax.bar(categories, values, color=colors, edgecolor='black', linewidth=1.2)
                ax.set_xlabel('Categories', fontsize=12)
                ax.set_ylabel('Value', fontsize=12)
                plt.setp(ax.xaxis.get_majorticklabels(), rotation=45)
            else:
                ax.barh(categories, values, color=colors, edgecolor='black', linewidth=1.2)
                ax.set_ylabel('Categories', fontsize=12)
                ax.set_xlabel('Value', fontsize=12)

            ax.set_title(f'Bar Chart - {orientation.capitalize()}', fontsize=14, fontweight='bold')
            # Grid lines run along the value axis only
            ax.grid(axis='x' if orientation == 'horizontal' else 'y', alpha=0.3)

        filename = f'bar_chart_{idx:04d}.png'
        self._render_chart(filename, (10, 6), draw)

        return {
            'id': idx,
            'type': 'bar_chart',
            'filename': filename,
            'orientation': orientation,
            'num_categories': len(categories)
        }

    def generate_pie_chart(self, idx: int) -> Dict:
        """
        Generate a pie chart with 3-7 random segments.

        Args:
            idx: Chart id, used in the file name and the metadata.

        Returns:
            Metadata dict with id, type, filename and num_segments.
        """
        num_slices = random.randint(3, 7)
        labels = [f'Segment {i+1}' for i in range(num_slices)]
        sizes = [random.randint(10, 50) for _ in range(num_slices)]

        def draw(ax):
            cmap = plt.colormaps.get_cmap(random.choice(self.color_schemes))
            colors = cmap(range(num_slices))
            # Each slice is pulled out slightly with probability ~0.3
            explode = [0.05 if random.random() > 0.7 else 0 for _ in range(num_slices)]

            ax.pie(sizes, labels=labels, colors=colors, autopct='%1.1f%%',
                   startangle=random.randint(0, 360), explode=explode,
                   textprops={'fontsize': 11})
            ax.set_title('Pie Chart - Distribution', fontsize=14, fontweight='bold')
            ax.axis('equal')

        filename = f'pie_chart_{idx:04d}.png'
        self._render_chart(filename, (8, 8), draw)

        return {
            'id': idx,
            'type': 'pie_chart',
            'filename': filename,
            'num_segments': num_slices
        }

    def generate_scatter_plot(self, idx: int) -> Dict:
        """
        Generate a scatter plot with 1-4 random point groups.

        Args:
            idx: Chart id, used in the file name and the metadata.

        Returns:
            Metadata dict with id, type, filename, num_groups and
            points_per_group.
        """
        n_points = random.randint(30, 100)
        num_groups = random.randint(1, 4)

        def draw(ax):
            for i in range(num_groups):
                # Gaussian cloud (std 10) around a random integer centre
                x = np.random.randn(n_points) * 10 + random.randint(-20, 20)
                y = np.random.randn(n_points) * 10 + random.randint(-20, 20)
                ax.scatter(x, y, s=100, alpha=0.6, label=f'Group {i+1}',
                           edgecolors='black', linewidth=0.5)

            ax.set_xlabel('X Variable', fontsize=12)
            ax.set_ylabel('Y Variable', fontsize=12)
            ax.set_title('Scatter Plot - Correlation Analysis', fontsize=14, fontweight='bold')
            ax.legend()
            ax.grid(True, alpha=0.3)

        filename = f'scatter_plot_{idx:04d}.png'
        self._render_chart(filename, (10, 6), draw)

        return {
            'id': idx,
            'type': 'scatter_plot',
            'filename': filename,
            'num_groups': num_groups,
            'points_per_group': n_points
        }

    def generate_heatmap(self, idx: int) -> Dict:
        """
        Generate an annotated heatmap of a random normal matrix.

        Args:
            idx: Chart id, used in the file name and the metadata.

        Returns:
            Metadata dict with id, type, filename and dimensions, where
            dimensions is a ``(rows, cols)`` tuple (stored as a list once
            written to JSON).
        """
        rows, cols = random.randint(5, 10), random.randint(5, 10)
        data = np.random.randn(rows, cols)

        def draw(ax):
            sns.heatmap(data, annot=True, fmt='.1f', cmap='coolwarm',
                        center=0, linewidths=0.5, cbar_kws={'label': 'Value'}, ax=ax)
            ax.set_title('Heatmap - Correlation Matrix', fontsize=14, fontweight='bold')
            ax.set_xlabel('Columns', fontsize=12)
            ax.set_ylabel('Rows', fontsize=12)

        filename = f'heatmap_{idx:04d}.png'
        self._render_chart(filename, (10, 8), draw)

        return {
            'id': idx,
            'type': 'heatmap',
            'filename': filename,
            'dimensions': (rows, cols)
        }

    def generate_stacked_bar(self, idx: int) -> Dict:
        """
        Generate a stacked bar chart over four quarters.

        Args:
            idx: Chart id, used in the file name and the metadata.

        Returns:
            Metadata dict with id, type, filename, num_groups and
            num_categories.
        """
        categories = [f'Q{i+1}' for i in range(4)]
        num_groups = random.randint(2, 4)

        data = {
            f'Group {i+1}': [random.randint(20, 80) for _ in categories]
            for i in range(num_groups)
        }
        df = pd.DataFrame(data, index=categories)

        def draw(ax):
            df.plot(kind='bar', stacked=True, ax=ax,
                    colormap=random.choice(self.color_schemes),
                    edgecolor='black', linewidth=1.2)

            ax.set_xlabel('Quarter', fontsize=12)
            ax.set_ylabel('Value', fontsize=12)
            ax.set_title('Stacked Bar Chart - Quarterly Performance', fontsize=14, fontweight='bold')
            ax.legend(title='Groups')
            plt.setp(ax.xaxis.get_majorticklabels(), rotation=0)
            ax.grid(axis='y', alpha=0.3)

        filename = f'stacked_bar_{idx:04d}.png'
        self._render_chart(filename, (10, 6), draw)

        return {
            'id': idx,
            'type': 'stacked_bar_chart',
            'filename': filename,
            'num_groups': num_groups,
            'num_categories': len(categories)
        }

    def generate_area_chart(self, idx: int) -> Dict:
        """
        Generate a stacked area chart of 2-4 random monthly series.

        Args:
            idx: Chart id, used in the file name and the metadata.

        Returns:
            Metadata dict with id, type, filename and num_series.
        """
        months = list(self.MONTHS)
        positions = range(len(months))
        num_series = random.randint(2, 4)

        def draw(ax):
            data = []
            for _ in range(num_series):
                base = random.randint(20, 50)
                trend = random.randint(0, 10)
                noise = np.random.randn(len(months)) * 5
                # Clamp at zero so the stacked areas never go negative
                data.append([max(0, base + trend*j + noise[j]) for j in positions])

            ax.stackplot(positions, *data,
                         labels=[f'Series {i+1}' for i in range(num_series)],
                         alpha=0.7)

            ax.set_xticks(positions)
            ax.set_xticklabels(months)
            ax.set_xlabel('Month', fontsize=12)
            ax.set_ylabel('Value', fontsize=12)
            ax.set_title('Area Chart - Cumulative Growth', fontsize=14, fontweight='bold')
            ax.legend(loc='upper left')
            ax.grid(True, alpha=0.3)

        filename = f'area_chart_{idx:04d}.png'
        self._render_chart(filename, (10, 6), draw)

        return {
            'id': idx,
            'type': 'area_chart',
            'filename': filename,
            'num_series': num_series
        }

    # ------------------------------------------------------------------
    # Dataset driver
    # ------------------------------------------------------------------
    def generate_dataset(self, num_samples: int = 100,
                        chart_types: List[str] = None) -> None:
        """
        Generate complete dataset with specified number of samples.

        A chart whose rendering fails is reported and skipped; it does not
        consume an id. Ids continue from the number of charts this instance
        has already produced, so calling the method again appends new charts
        instead of overwriting files and duplicating ids. The metadata file
        is rewritten with every chart generated by this instance so far.

        Args:
            num_samples: Total number of charts to attempt.
            chart_types: List of chart types to generate (None = all types).
                Valid names: 'line', 'bar', 'pie', 'scatter', 'heatmap',
                'stacked_bar', 'area'.

        Raises:
            ValueError: If ``chart_types`` contains an unknown name, or is
                empty while ``num_samples`` is positive.
        """
        generators = {
            'line': self.generate_line_chart,
            'bar': self.generate_bar_chart,
            'pie': self.generate_pie_chart,
            'scatter': self.generate_scatter_plot,
            'heatmap': self.generate_heatmap,
            'stacked_bar': self.generate_stacked_bar,
            'area': self.generate_area_chart
        }

        chart_types = list(generators) if chart_types is None else list(chart_types)

        # Fail fast on bad input instead of reporting one error per sample
        unknown = [name for name in chart_types if name not in generators]
        if unknown:
            raise ValueError(
                f"Unknown chart type(s): {unknown}. Valid types: {sorted(generators)}"
            )
        if num_samples > 0 and not chart_types:
            raise ValueError("chart_types must contain at least one chart type")

        print(f"Generating {num_samples} charts...")
        print(f"Chart types: {', '.join(chart_types)}")
        print(f"Output directory: {self.output_dir}\n")

        idx = len(self.metadata)  # continue numbering after earlier runs
        failures = 0
        for i in range(num_samples):
            chart_type = random.choice(chart_types)

            try:
                metadata = generators[chart_type](idx)
            except Exception as e:
                # Best effort: one failed chart must not abort the whole run
                failures += 1
                print(f"Error generating {chart_type} chart: {e}")
                continue

            self.metadata.append(metadata)
            idx += 1

            if (i + 1) % 10 == 0:
                print(f"Generated {i + 1}/{num_samples} charts...")

        # Save metadata
        with open(self.metadata_path, 'w') as f:
            json.dump({
                'total_charts': len(self.metadata),
                'generation_date': datetime.now().isoformat(),
                'charts': self.metadata
            }, f, indent=2)

        print(f"\n✓ Dataset generation complete!")
        print(f"  Total charts: {len(self.metadata)}")
        if failures:
            print(f"  Failed charts: {failures}")
        print(f"  Charts directory: {self.charts_dir}")
        print(f"  Metadata file: {self.metadata_path}")

        # Print statistics
        self._print_statistics()

    def _print_statistics(self):
        """Print the number of generated charts per chart type."""
        type_counts = {}
        for item in self.metadata:
            chart_type = item['type']
            type_counts[chart_type] = type_counts.get(chart_type, 0) + 1

        print("\nDataset Statistics:")
        print("-" * 40)
        for chart_type, count in sorted(type_counts.items()):
            print(f"  {chart_type:20s}: {count:4d}")
        print("-" * 40)


# Example usage
if __name__ == "__main__":
    # Build a generator that writes under the relative path 'chart_dataset',
    # then render 100 charts drawn at random from every supported chart type.
    generator = ChartDatasetGenerator('chart_dataset')
    generator.generate_dataset(100)

    # To restrict the chart types, pass chart_types explicitly, e.g.:
    # generator.generate_dataset(num_samples=50, chart_types=['line', 'bar', 'pie'])

Generating 100 charts...
Chart types: line, bar, pie, scatter, heatmap, stacked_bar, area
Output directory: chart_dataset

Generated 10/100 charts...
Generated 20/100 charts...
Generated 30/100 charts...
Generated 40/100 charts...
Generated 50/100 charts...
Generated 60/100 charts...
Generated 70/100 charts...
Generated 80/100 charts...
Generated 90/100 charts...
Generated 100/100 charts...

✓ Dataset generation complete!
  Total charts: 100
  Charts directory: chart_dataset/charts
  Metadata file: chart_dataset/metadata.json

Dataset Statistics:
----------------------------------------
  area_chart          :   12
  bar_chart           :   15
  heatmap             :   18
  line_chart          :   13
  pie_chart           :   15
  scatter_plot        :   12
  stacked_bar_chart   :   15
----------------------------------------


In [3]:
# Install the Hugging Face libraries imported by later cells (versions are not pinned).
pip install datasets huggingface_hub



In [14]:
# Authenticate with the Hugging Face Hub through the `hf` CLI; the command prompts for an access token.
!hf auth login


    _|    _|  _|    _|    _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|_|_|_|    _|_|      _|_|_|  _|_|_|_|
    _|    _|  _|    _|  _|        _|          _|    _|_|    _|  _|            _|        _|    _|  _|        _|
    _|_|_|_|  _|    _|  _|  _|_|  _|  _|_|    _|    _|  _|  _|  _|  _|_|      _|_|_|    _|_|_|_|  _|        _|_|_|
    _|    _|  _|    _|  _|    _|  _|    _|    _|    _|    _|_|  _|    _|      _|        _|    _|  _|        _|
    _|    _|    _|_|      _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|        _|    _|    _|_|_|  _|_|_|_|

    A token is already saved on your machine. Run `hf auth whoami` to get more information or `hf auth logout` if you want to log out.
    Setting a new token will erase the existing one.
    To log in, `huggingface_hub` requires a token generated from https://huggingface.co/settings/tokens .
Enter your token (input will not be visible): 
Add token as git credential? (Y/n) y
Token is valid (permission: fineGrained).
The tok

In [15]:
from datasets import load_dataset

# Load the generated chart images from the chart_dataset directory with the "imagefolder" builder.
# NOTE(review): the sample row shown in a later cell's output contains only an 'image' field, so the
# chart metadata does not appear to be attached as columns — presumably metadata.json is not a
# metadata file the builder recognises; confirm against the ImageFolder documentation.
# NOTE(review): the recorded progress bar resolves 232 data files although the generation log
# reports 100 charts; images left over from earlier runs may be included — verify.
dataset = load_dataset("imagefolder", data_dir="chart_dataset")

Resolving data files:   0%|          | 0/232 [00:00<?, ?it/s]

In [10]:
# Peek at the first example of the "train" split to see which fields were loaded.
dataset["train"][0]

{'image': <PIL.PngImagePlugin.PngImageFile image mode=RGBA size=1482x880>}

In [20]:
# Upload the loaded dataset to the Hub repository "Rasanjali9/chartdatasetDOCX".
# NOTE(review): the recorded output is a 401 Unauthorized from the repo-creation endpoint,
# so the upload did not succeed in this run.
dataset.push_to_hub("Rasanjali9/chartdatasetDOCX")

HfHubHTTPError: 401 Client Error: Unauthorized for url: https://huggingface.co/api/repos/create (Request ID: Root=1-6937f571-788a9db570940ed4287c61b3;d7ad1424-258f-468b-a1a4-9a259661d16b)

Invalid username or password.

In [24]:
# Log out of the Hugging Face CLI (the recorded output confirms all access tokens were logged out).
!huggingface-cli logout

Successfully logged out from all access tokens.


In [30]:
# Log in again through the CLI; the command prompts for an access token.
!huggingface-cli login


    _|    _|  _|    _|    _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|_|_|_|    _|_|      _|_|_|  _|_|_|_|
    _|    _|  _|    _|  _|        _|          _|    _|_|    _|  _|            _|        _|    _|  _|        _|
    _|_|_|_|  _|    _|  _|  _|_|  _|  _|_|    _|    _|  _|  _|  _|  _|_|      _|_|_|    _|_|_|_|  _|        _|_|_|
    _|    _|  _|    _|  _|    _|  _|    _|    _|    _|    _|_|  _|    _|      _|        _|    _|  _|        _|
    _|    _|    _|_|      _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|        _|    _|    _|_|_|  _|_|_|_|

    To log in, `huggingface_hub` requires a token generated from https://huggingface.co/settings/tokens .
Enter your token (input will not be visible): 
Add token as git credential? (Y/n) y
Token is valid (permission: fineGrained).
The token `Docxtract` has been saved to /root/.cache/huggingface/stored_tokens
[1m[31mCannot authenticate through git-credential as no helper is defined on your machine.
You might have to re-au

In [33]:
# Retry the upload to "Rasanjali9/chartdatasetDOCX" after logging in again.
# NOTE(review): the recorded output is again a 401 Unauthorized from the repo-creation endpoint.
dataset.push_to_hub("Rasanjali9/chartdatasetDOCX")

HfHubHTTPError: 401 Client Error: Unauthorized for url: https://huggingface.co/api/repos/create (Request ID: Root=1-6937fc7a-72e701501b3ee3b11b7b7412;cf8fed62-5e6f-44ab-96c4-f8f519804c38)

Invalid username or password.

In [29]:
# Delete the ~/.huggingface directory (presumably to clear cached credentials — confirm intent).
# NOTE(review): the login output in this notebook reports the token being saved under
# /root/.cache/huggingface/stored_tokens, which is a different directory from the one removed here.
!rm -rf ~/.huggingface

In [36]:
from huggingface_hub import HfApi

api = HfApi()

# Create the dataset repository on the Hub.
# exist_ok=True keeps this cell re-runnable: without it, running the cell again
# after the repository has been created raises an error.
# NOTE(review): this repo id differs from the one used by the push_to_hub cells
# ("Rasanjali9/chartdatasetDOCX") — confirm which repository is intended.
api.create_repo(
    repo_id="Rasanjali9/chart_dataset",
    repo_type="dataset",
    private=False,  # change to True if you want it private
    exist_ok=True,
)


HfHubHTTPError: 401 Client Error: Unauthorized for url: https://huggingface.co/api/repos/create (Request ID: Root=1-6937fdad-242f7d8957b540952b7f852d;2fa96c32-ec25-41ba-9ca0-9a98793d9647)

Invalid username or password.

In [37]:
from google.colab import drive
# Mount Google Drive at /content/drive (the recorded output shows it was already mounted).
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [38]:
import shutil

# Source: the generated dataset directory; destination: a backup folder under the mounted Drive.
# NOTE(review): the generator was given the relative path 'chart_dataset'; this absolute source
# path matches it only if the working directory is /content — confirm.
src = "/content/chart_dataset"
dst = "/content/drive/MyDrive/chartdataset_backup"

# dirs_exist_ok=True allows copying into a destination directory that already exists.
shutil.copytree(src, dst, dirs_exist_ok=True)
print("Dataset saved to Google Drive!")


Dataset saved to Google Drive!
