In [1]:
from datasets import load_dataset

# Load the MBPP dataset ("full" configuration; "sanitized" is the alternative).
dataset = load_dataset("mbpp", "full")

# Longest 'code' entry, measured in characters, across every split returned by
# the loader. Splits are enumerated from the loaded object itself instead of
# being hard-coded, so the scan covers whatever splits this configuration has.
#
# Only the 'code' column is read from each split: iterating whole examples
# would build a dict of every field per row just to measure one string.
#
# `snippet or ""` counts None/empty entries as length 0 (they can never raise
# the maximum), and `default=0` keeps the result at 0 when there are no rows.
max_char_length = max(
    (
        len(snippet or "")
        for split in dataset.keys()
        for snippet in dataset[split]["code"]
    ),
    default=0,
)

print(f"The highest character length in the 'code' column is: {max_char_length}")

  from .autonotebook import tqdm as notebook_tqdm


The highest character length in the 'code' column is: 1331


In [5]:
import pandas as pd
import json
import os
from pathlib import Path

# Inspect the newest generated dataset stored under data/datasets.
data_dir = Path("data/datasets")

# Parquet files ordered by path; the last entry is treated as the latest one.
# NOTE(review): this is a lexicographic sort, so "latest" is only right while
# the filenames share a prefix and embed a sortable timestamp — confirm.
parquet_files = sorted(data_dir.glob("*.parquet"))

if not parquet_files:
    print("No parquet files found in data/datasets/")
else:
    latest_file = parquet_files[-1]
    print(f"Loading latest dataset: {latest_file.name}")

    df = pd.read_parquet(latest_file)

    # Basic structure: (rows, columns) and the column names.
    column_names = list(df.columns)
    print(f"\nDataset shape: {df.shape}")
    print(f"Columns: {column_names}")

    # Pass/fail counts, shown only when the dataset has a 'test_passed' column.
    if 'test_passed' in df.columns:
        print("\nTest results distribution:")
        print(df['test_passed'].value_counts())

    print("\nDataset preview:")
    # Rich table rendering. `display` is not imported in this cell; it relies
    # on the notebook kernel making it available.
    display(df)

    # Column summary.
    # NOTE(review): the "~9" removed-columns figure below is fixed text, not
    # computed from the data — verify it still matches how the file was built.
    print("\nColumn efficiency analysis:")
    print(f"• Essential columns for SAE analysis: {len(column_names)}")
    print("• Total data reduction: Removed ~9 unnecessary columns")
    print(f"• Columns kept: {', '.join(column_names)}")

Loading latest dataset: mbpp_dataset_20250527_203843.parquet

Dataset shape: (3, 3)
Columns: ['task_id', 'generated_code', 'test_passed']

Test results distribution:
test_passed
False    2
True     1
Name: count, dtype: int64

Dataset preview:


Unnamed: 0,task_id,generated_code,test_passed
0,11,"\ndef remove_Occ(string,char):\n string = string.replace(char,"""")\n return string\n",True
1,12,\ndef sort_matrix(matrix):\n for i in range(len(matrix)):\n matrix[i].sort()\n retu...,False
2,13,\ndef count_common(dictionary):\n count = {}\n for word in dictionary:\n if word in...,False



Column efficiency analysis:
• Essential columns for SAE analysis: 3
• Total data reduction: Removed ~9 unnecessary columns
• Columns kept: task_id, generated_code, test_passed
