In [22]:
import pandas as pd

# Read the most recent dataset
import glob
import os

# Find the most recent parquet file.
# "Most recent" is decided by filesystem modification time, not by the
# timestamp embedded in the file name.
parquet_files = glob.glob('data/datasets/mbpp_dataset_*.parquet')
if parquet_files:
    latest_file = max(parquet_files, key=os.path.getmtime)
    print(f"Loading: {latest_file}")
    df = pd.read_parquet(latest_file)
else:
    print("No parquet files found")
    # Sentinel checked below so the summary is skipped when nothing was loaded.
    df = None

if df is not None:
    print(f"Dataset shape: {df.shape}")
    print(f"Columns: {df.columns.tolist()}")

    # Show basic statistics
    if 'complexity_score' in df.columns:
        print("\nComplexity score statistics:")
        print(df['complexity_score'].describe())

        # Show sample data, restricted to the columns this file actually has,
        # so a file lacking 'task_id' or 'test_passed' prints what is
        # available instead of raising KeyError.
        print("\nSample data:")
        display_cols = [
            col
            for col in ('task_id', 'test_passed', 'complexity_score')
            if col in df.columns
        ]
        print(df[display_cols].head())
    else:
        print("\nComplexity score column not found!")
        print("Available columns:", df.columns.tolist())

Loading: data/datasets/mbpp_dataset_20250528_123423.parquet
Dataset shape: (3, 4)
Columns: ['task_id', 'generated_code', 'test_passed', 'complexity_score']

Complexity score statistics:
count    3.000000
mean     2.333333
std      2.309401
min      1.000000
25%      1.000000
50%      1.000000
75%      3.000000
max      5.000000
Name: complexity_score, dtype: float64

Sample data:
   task_id  test_passed  complexity_score
0       11         True                 5
1       12        False                 1
2       13        False                 1


In [30]:
# Force reload the updated DatasetManager module to clear cache
import importlib
import sys

# Remove the module from cache if it exists.
# Dropping both the submodule and its parent package from sys.modules makes
# the import below execute them again instead of reusing the objects this
# kernel already holds. pop(..., None) is a no-op when an entry is absent.
for stale_module in (
    'phase1_dataset_building.dataset_manager',
    'phase1_dataset_building',
):
    sys.modules.pop(stale_module, None)

# Now import the updated DatasetManager
from phase1_dataset_building.dataset_manager import DatasetManager

# Record count this notebook expects from the full MBPP dataset.
EXPECTED_RECORD_COUNT = 974

try:
    # Initialize the dataset manager
    dataset_manager = DatasetManager()

    # Load the full MBPP dataset
    print("Loading MBPP dataset using updated DatasetManager (after module reload)...")
    dataset_manager.load_dataset()

    # Query the size once and reuse it in every message below.
    record_count = dataset_manager.get_size()

    print("✅ Successfully loaded MBPP dataset")
    print(f"Total records: {record_count}")
    print(f"Expected: {EXPECTED_RECORD_COUNT} records")

    if record_count == EXPECTED_RECORD_COUNT:
        print(f"✅ All {EXPECTED_RECORD_COUNT} records loaded successfully!")
    else:
        print(f"❌ Expected {EXPECTED_RECORD_COUNT} records, but got {record_count}")
        print("The module may still be cached. Try restarting the kernel.")

    # Show some basic info about the dataset
    print("\nDataset structure:")
    if dataset_manager.is_loaded():
        sample_record = dataset_manager.get_record(0)
        print(f"Keys in each record: {list(sample_record.keys())}")
        print(f"First task_id: {sample_record.get('task_id', 'unknown')}")
        # Only the first 100 characters of the task text are shown.
        print(f"Text: {sample_record.get('text', '')[:100]}...")

except Exception as e:
    # Report the failure and full traceback in the cell output rather than
    # letting the exception propagate out of the cell.
    print(f"❌ Error loading MBPP dataset: {e}")
    import traceback
    traceback.print_exc()

Loading MBPP dataset using updated DatasetManager (after module reload)...
✅ Successfully loaded MBPP dataset
Total records: 974
Expected: 974 records
✅ All 974 records loaded successfully!

Dataset structure:
Keys in each record: ['task_id', 'text', 'code', 'test_list', 'test_setup_code', 'challenge_test_list']
First task_id: 1
Text: Write a function to find the minimum cost path to reach (m, n) from (0, 0) for the given cost matrix...


In [31]:
# Alternative: Test with direct dataset loading to verify our fix.
# Loads MBPP once straight through `datasets.load_dataset` and once through
# DatasetManager, prints both record counts, and asserts that they agree.
from datasets import load_dataset

# Test the exact same call that DatasetManager should now make
print("Testing direct call to load_dataset('Muennighoff/mbpp', 'full')...")
dataset = load_dataset("Muennighoff/mbpp", "full")
direct_count = len(dataset['test'])
print(f"Direct load result: {direct_count} records")

# Now test DatasetManager
from phase1_dataset_building.dataset_manager import DatasetManager
dm = DatasetManager()
dm.load_dataset()
manager_count = dm.get_size()
print(f"DatasetManager result: {manager_count} records")

# Fail the cell on a mismatch instead of relying on someone comparing the
# two printed numbers by eye.
assert manager_count == direct_count, (
    f"DatasetManager loaded {manager_count} records, "
    f"but the direct load returned {direct_count}"
)

Testing direct call to load_dataset('Muennighoff/mbpp', 'full')...
Direct load result: 974 records
DatasetManager result: 974 records
