In [6]:
import json
from datasets import load_dataset

In [2]:
# Open the Jupyter-notebook subset of The Stack ("train" split) in streaming
# mode; records are consumed later via ``ds.take(...)``.
ds = load_dataset(
    "bigcode/the-stack",
    data_dir="data/jupyter-notebook",
    split="train",
    streaming=True,
)

In [None]:
import pandas as pd
import base64
from io import StringIO
import json

def _is_table_delimiter_row(line):
    """Return True if *line* is a pipe-table delimiter row such as ``|---|:-:|``."""
    stripped = line.strip()
    if '|' not in stripped:
        return False
    # Outer pipes are optional; drop at most one on each side.
    if stripped.startswith('|'):
        stripped = stripped[1:]
    if stripped.endswith('|'):
        stripped = stripped[:-1]
    for part in stripped.split('|'):
        core = part.strip()
        # A leading/trailing colon only marks column alignment.
        if core.startswith(':'):
            core = core[1:]
        if core.endswith(':'):
            core = core[:-1]
        if not core or set(core) != {'-'}:
            return False
    return True


# Function to extract markdown tables
def extract_markdown_tables(cells):
    """Collect the markdown cells that contain a pipe table.

    A cell is reported only when some line containing ``|`` is immediately
    followed by a delimiter row (dashes separated by pipes, with optional
    alignment colons). A lone ``|`` in prose, math or inline code is not
    treated as a table.

    Args:
        cells: Sequence of notebook cell dicts. ``source`` may be a list of
            strings, a single string, missing, or ``None``.

    Returns:
        A list of ``{'cell_index': int, 'table': str}`` dicts, where
        ``table`` is the full joined markdown source of the matching cell.
    """
    tables = []
    for idx, cell in enumerate(cells):
        if cell.get('cell_type') != 'markdown':
            continue
        raw = cell.get('source') or []
        source = raw if isinstance(raw, str) else "".join(raw)
        lines = source.splitlines()
        # Look for a header line directly followed by a delimiter row.
        has_table = any(
            '|' in header and _is_table_delimiter_row(delimiter)
            for header, delimiter in zip(lines, lines[1:])
        )
        if has_table:
            tables.append({'cell_index': idx, 'table': source})
    return tables

# Function to extract images from code outputs
def extract_images(cells):
    """Collect PNG payloads attached to code-cell outputs.

    Both ``display_data`` and ``execute_result`` outputs are inspected, the
    same output types ``extract_code_output_tables`` looks at, so an image
    that is the value of a cell's last expression is not missed.

    Args:
        cells: Sequence of notebook cell dicts.

    Returns:
        A list of ``{'cell_index': int, 'image': ...}`` dicts in output
        order. ``image`` is the ``image/png`` entry of the output's ``data``
        mapping; when it is stored as a list of string chunks the chunks are
        joined into a single string.
    """
    rich_output_types = {'display_data', 'execute_result'}
    images = []
    for idx, cell in enumerate(cells):
        if cell.get('cell_type') != 'code':
            continue
        for output in cell.get('outputs') or []:
            if output.get('output_type') not in rich_output_types:
                continue
            data = output.get('data') or {}
            if 'image/png' not in data:
                continue
            image_data = data['image/png']
            # The payload may be split into a list of lines; normalise it.
            if isinstance(image_data, list):
                image_data = "".join(image_data)
            images.append({'cell_index': idx, 'image': image_data})
    return images

# Function to extract pandas DataFrames from code outputs
def extract_code_output_tables(cells):
    """Extract DataFrame-like text outputs from code cells.

    For every ``display_data`` / ``execute_result`` output that has a
    ``text/plain`` entry, the text is checked with a rough heuristic (a run
    of three spaces and at least one newline) and, if it passes, parsed as
    whitespace-separated columns with ``pd.read_csv``.

    Args:
        cells: Sequence of notebook cell dicts.

    Returns:
        A list of ``{'cell_index': int, 'dataframe': pandas.DataFrame}``
        dicts, one per output that parsed successfully.
    """
    tables = []
    for idx, cell in enumerate(cells):
        if cell.get('cell_type') != 'code':
            continue
        for output in cell.get('outputs') or []:
            if output.get('output_type') not in {'display_data', 'execute_result'}:
                continue
            data = output.get('data') or {}
            if 'text/plain' not in data:
                continue
            text_data = data['text/plain']
            # The text may be stored as a list of lines. Join it first:
            # on a list, ``in`` would test element equality rather than
            # substring containment, and StringIO requires a str.
            if isinstance(text_data, list):
                text_data = "".join(text_data)
            if not isinstance(text_data, str):
                continue
            # Rudimentary check for tabular format
            if "   " not in text_data or "\n" not in text_data:
                continue
            try:
                df = pd.read_csv(StringIO(text_data), sep=r'\s+')
            except Exception:
                # Best-effort heuristic: text that merely looks tabular but
                # cannot be parsed is skipped on purpose.
                continue
            tables.append({'cell_index': idx, 'dataframe': df})
    return tables

# Build one JSON-friendly summary record per notebook
def create_notebook_json(notebooks):
    """Summarise the tables and images found in each notebook.

    Args:
        notebooks: Iterable of notebooks, each given as its list of cells.

    Returns:
        A list with one dict per notebook holding ``notebook_index`` (its
        position in *notebooks*) and ``cells``: the markdown-table entries,
        then the image entries, then the code-output-table entries. Every
        entry has ``cell_index``, ``type`` and ``content`` keys.
    """
    summaries = []

    for position, cell_list in enumerate(notebooks):
        # Markdown tables come first ...
        entries = [
            {
                'cell_index': found['cell_index'],
                'type': 'markdown_table',
                'content': found['table'],
            }
            for found in extract_markdown_tables(cell_list)
        ]

        # ... followed by images from code outputs ...
        entries.extend(
            {
                'cell_index': found['cell_index'],
                'type': 'image',
                'content': found['image'],
            }
            for found in extract_images(cell_list)
        )

        # ... and finally parsed DataFrames, converted to plain dicts so the
        # result can be serialised.
        entries.extend(
            {
                'cell_index': found['cell_index'],
                'type': 'code_output_table',
                'content': found['dataframe'].to_dict(),
            }
            for found in extract_code_output_tables(cell_list)
        )

        summaries.append({'notebook_index': position, 'cells': entries})

    return summaries

# Example Usage
if __name__ == "__main__":
    # Take the first 50 records from the stream and keep the cell list of each
    # one that parses as a notebook with a top-level 'cells' list. Records
    # with invalid JSON, or without such a list, are skipped so one bad
    # record does not abort the whole run.
    notebooks = []
    for record in ds.take(50):
        try:
            parsed = json.loads(record['content'])
        except (TypeError, ValueError):
            # json.JSONDecodeError is a ValueError subclass.
            continue
        cells = parsed.get('cells') if isinstance(parsed, dict) else None
        if isinstance(cells, list):
            notebooks.append(cells)

    # Create JSON data for notebooks
    notebooks_json = create_notebook_json(notebooks)

    # Print JSON-like structure (convert to actual JSON for saving or processing)
    json_output = json.dumps(notebooks_json, indent=4)
    print(json_output)