In [1]:
import json
from datasets import load_dataset

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Open the Jupyter-notebook subset of The Stack in streaming mode.
ds = load_dataset(
    "bigcode/the-stack",
    data_dir="data/jupyter-notebook",
    split="train",
    streaming=True,
)

In [11]:
import pandas as pd
import base64
from io import StringIO
import json
import html

# Collect markdown cells whose text contains LaTeX- or table-like markers
def extract_markdown_outputs(cells):
    """Return the markdown cells whose source contains a marker substring.

    A markdown cell is kept when its joined source contains at least one of
    ``$$``, ``\\begin{``, ``\\end{``, ``|`` or ``---``.

    Args:
        cells: Iterable of notebook cell dicts.

    Returns:
        A list of ``{'cell_index': int, 'output': str}`` dicts, in cell order,
        where ``output`` is the joined source text of the cell.
    """
    markers = ('$$', '\\begin{', '\\end{', '|', '---')
    selected = []
    for position, entry in enumerate(cells):
        if entry.get('cell_type') != 'markdown':
            continue
        text = "".join(entry.get('source', []))
        for marker in markers:
            if marker in text:
                selected.append({'cell_index': position, 'output': text})
                break  # one hit is enough; never add the same cell twice
    return selected

# Function to extract images from code outputs
def extract_images(cells):
    """Collect image payloads emitted by code cells.

    Both ``display_data`` and ``execute_result`` outputs are inspected, the
    same output types the other extractors in this notebook accept, so an
    image returned as the value of a cell's last expression is not missed.

    Args:
        cells: Iterable of notebook cell dicts.

    Returns:
        A list of ``{'cell_index': int, 'image_type': str, 'image': ...}``
        dicts, one per image MIME entry found, in cell/output order. The
        ``image`` value is passed through exactly as stored in the output.
    """
    image_types = ('image/png', 'image/jpeg', 'image/svg+xml', 'image/gif')
    rich_output_types = ('display_data', 'execute_result')
    images = []
    for idx, cell in enumerate(cells):
        if cell.get('cell_type') != 'code':
            continue
        for output in cell.get('outputs', []):
            if output.get('output_type') not in rich_output_types:
                continue
            data = output.get('data', {})
            # A single output may carry several image representations;
            # every one that is present is recorded.
            for image_type in image_types:
                if image_type in data:
                    images.append({
                        'cell_index': idx,
                        'image_type': image_type,
                        'image': data[image_type],
                    })
    return images

# Function to extract pandas DataFrames from code outputs
def extract_code_output_tables(cells):
    """Extract table-like ``text/plain`` outputs of code cells as DataFrames.

    For every ``display_data`` / ``execute_result`` output that has a
    ``text/plain`` entry, the text is parsed with a whitespace-separated
    ``pd.read_csv`` when it looks tabular (contains a run of three spaces and
    a newline). Outputs that fail to parse are skipped.

    Args:
        cells: Iterable of notebook cell dicts.

    Returns:
        A list of ``{'cell_index': int, 'dataframe': pandas.DataFrame}`` dicts.
    """
    tables = []
    for idx, cell in enumerate(cells):
        if cell.get('cell_type') != 'code':
            continue
        for output in cell.get('outputs', []):
            if output.get('output_type') not in {'display_data', 'execute_result'}:
                continue
            data = output.get('data', {})
            if 'text/plain' not in data:
                continue
            text_data = data['text/plain']
            # Multi-line text may be stored as a list of lines. Join it first,
            # otherwise the substring checks below turn into list-membership
            # tests and never match.
            if isinstance(text_data, list):
                text_data = ''.join(text_data)
            if not isinstance(text_data, str):
                continue
            # Rudimentary check for tabular format
            if "   " in text_data and "\n" in text_data:
                try:
                    # Attempt to parse it as a whitespace-delimited table
                    df = pd.read_csv(StringIO(text_data), sep=r'\s+')
                except Exception:
                    # Best effort: anything that does not parse is not a table.
                    continue
                tables.append({'cell_index': idx, 'dataframe': df})
    return tables

def extract_json_html_outputs(cells):
    """Gather JSON and HTML payloads from the outputs of code cells.

    Only ``display_data`` and ``execute_result`` outputs are considered. For
    each such output, an ``application/json`` entry is recorded as-is and a
    ``text/html`` entry is recorded after joining it (when stored as a list)
    and passing it through ``html.unescape``.

    Args:
        cells: Iterable of notebook cell dicts.

    Returns:
        A list of ``{'cell_index': int, 'type': 'json' | 'html', 'content': ...}``
        dicts in cell/output order; within one output JSON precedes HTML.
    """
    rich_output_types = ('display_data', 'execute_result')
    collected = []
    for position, entry in enumerate(cells):
        if entry.get('cell_type') != 'code':
            continue
        for result in entry.get('outputs', []):
            if result.get('output_type') not in rich_output_types:
                continue
            bundle = result.get('data', {})

            # JSON payload is passed through untouched.
            if 'application/json' in bundle:
                collected.append({
                    'cell_index': position,
                    'type': 'json',
                    'content': bundle['application/json'],
                })

            # HTML payload: normalise list-of-lines to one string first.
            if 'text/html' in bundle:
                markup = bundle['text/html']
                if isinstance(markup, list):
                    markup = ''.join(markup)
                collected.append({
                    'cell_index': position,
                    'type': 'html',
                    'content': html.unescape(markup),
                })

    return collected

# Example function to create a JSON structure for each notebook
def create_notebook_json(notebooks):
    """Build one summary dict per notebook from its extracted outputs.

    For each notebook the extractors are run in a fixed order (markdown,
    images, code-output tables, then JSON/HTML) and their results are
    appended to that notebook's ``cells`` list in that same order.

    Args:
        notebooks: Iterable of notebooks, each given as its list of cell dicts.

    Returns:
        A list of ``{'notebook_index': int, 'cells': list}`` dicts. Each entry
        in ``cells`` has ``cell_index``, ``type`` and ``content`` keys; table
        content is the DataFrame converted with ``to_dict()``.
    """
    notebooks_json = []

    for notebook_idx, cells in enumerate(notebooks):
        entries = []

        # Markdown cells that matched the marker heuristics
        entries.extend(
            {'cell_index': md['cell_index'], 'type': 'markdown_output', 'content': md['output']}
            for md in extract_markdown_outputs(cells)
        )

        # Images found in code-cell outputs
        entries.extend(
            {'cell_index': img['cell_index'], 'type': 'image', 'content': img['image']}
            for img in extract_images(cells)
        )

        # Tables parsed from plain-text outputs
        entries.extend(
            {'cell_index': tbl['cell_index'], 'type': 'code_output_table', 'content': tbl['dataframe'].to_dict()}
            for tbl in extract_code_output_tables(cells)
        )

        # JSON / HTML records already have the final shape
        entries.extend(extract_json_html_outputs(cells))

        notebooks_json.append({'notebook_index': notebook_idx, 'cells': entries})

    return notebooks_json


In [13]:
# Pull the first five streamed records and show the first one.
list(ds.take(5))[0]

{'hexsha': 'f7000a1b2071da53b0b775f45373d43d26e15ece',
 'size': 101336,
 'ext': 'ipynb',
 'lang': 'Jupyter Notebook',
 'max_stars_repo_path': 'HandCrafted_features_Evaluation.ipynb',
 'max_stars_repo_name': 'renatosjoao/infotweets',
 'max_stars_repo_head_hexsha': 'e81661ae18591686c656f099fa39bfdf04cd96fd',
 'max_stars_repo_licenses': ['Apache-2.0'],
 'max_stars_count': None,
 'max_stars_repo_stars_event_min_datetime': None,
 'max_stars_repo_stars_event_max_datetime': None,
 'max_issues_repo_path': 'HandCrafted_features_Evaluation.ipynb',
 'max_issues_repo_name': 'renatosjoao/infotweets',
 'max_issues_repo_head_hexsha': 'e81661ae18591686c656f099fa39bfdf04cd96fd',
 'max_issues_repo_licenses': ['Apache-2.0'],
 'max_issues_count': None,
 'max_issues_repo_issues_event_min_datetime': None,
 'max_issues_repo_issues_event_max_datetime': None,
 'max_forks_repo_path': 'HandCrafted_features_Evaluation.ipynb',
 'max_forks_repo_name': 'renatosjoao/infotweets',
 'max_forks_repo_head_hexsha': 'e81661

In [12]:
# Example list of notebooks, each with list of cells (simplified notebook format).
# A notebook without a top-level 'cells' key (e.g. older nbformat versions that
# nest cells under 'worksheets') contributes an empty cell list instead of
# aborting the whole run with a KeyError.
notebooks = [json.loads(d['content']).get('cells', []) for d in ds.take(5)]

# Create JSON data for notebooks
notebooks_json = create_notebook_json(notebooks)

# Serialize the structure to a JSON string and print it
json_output = json.dumps(notebooks_json, indent=4)
print(json_output)

[
    {
        "notebook_index": 0,
        "cells": []
    },
    {
        "notebook_index": 1,
        "cells": []
    },
    {
        "notebook_index": 2,
        "cells": [
            {
                "cell_index": 12,
                "type": "html",
                "content": "<p style=\"color: red;\">\nThe default version of TensorFlow in Colab will soon switch to TensorFlow 2.x.<br>\nWe recommend you <a href=\"https://www.tensorflow.org/guide/migrate\" target=\"_blank\">upgrade</a> now \nor ensure your notebook will continue to use TensorFlow 1.x via the <code>%tensorflow_version 1.x</code> magic:\n<a href=\"https://colab.research.google.com/notebooks/tensorflow_version.ipynb\" target=\"_blank\">more info</a>.</p>\n"
            }
        ]
    },
    {
        "notebook_index": 3,
        "cells": [
            {
                "cell_index": 1,
                "type": "image",
                "content": "iVBORw0KGgoAAAANSUhEUgAAAdEAAAKOCAYAAAD58yMkAAAABHNCSVQICAgIfAhkiAAA