In [1]:
import glob
import os
from datetime import datetime
from pathlib import Path

import numpy as np
import pandas as pd

In [ ]:
# Dataset selection.
# Set `file_path` to a specific parquet file, or leave it empty to
# auto-discover the most recently modified MBPP difficulty-mapping file.
# Raises FileNotFoundError if no candidate is found or the chosen path is missing.
file_path = ""  # Replace with your parquet file path, or leave empty for auto-discovery

# Normalise once: a pasted path often carries stray whitespace or a trailing
# newline, and the value checked here must be the same value used for the
# existence check below and by the cells that load the file.
file_path = file_path.strip()

if not file_path:
    # Auto-discovery: search for all MBPP difficulty mapping parquet files
    datasets_dir = "../data/phase0/"
    pattern = os.path.join(datasets_dir, "*mbpp_difficulty_mapping_*.parquet")
    matching_files = glob.glob(pattern)

    if not matching_files:
        raise FileNotFoundError(f"No MBPP difficulty mapping files found in {datasets_dir}")

    # "Latest" means newest filesystem modification time, not any timestamp
    # that may be embedded in the file name.
    latest_file = max(matching_files, key=os.path.getmtime)
    file_path = latest_file
    print(f"🔍 Auto-discovered latest dataset: {Path(file_path).name}")
    print(f"📅 Last modified: {pd.Timestamp.fromtimestamp(os.path.getmtime(file_path)).strftime('%Y-%m-%d %H:%M:%S')}")
else:
    print(f"📁 Using specified file: {Path(file_path).name}")

# Verify file exists (also guards against a file vanishing after discovery)
if not os.path.exists(file_path):
    raise FileNotFoundError(f"File not found: {file_path}")

In [3]:
# Read the selected parquet file into `df`, then report its shape,
# column names and on-disk size.
df = pd.read_parquet(file_path)

record_count, column_count = df.shape
bytes_per_mb = 1024**2

print("Dataset Information:")
print(f"Number of records: {record_count:,}")
print(f"Number of columns: {column_count}")
print(f"Columns: {list(df.columns)}")
print(f"File size: {os.path.getsize(file_path) / bytes_per_mb:.2f} MB")

Dataset Information:
Number of records: 974
Number of columns: 2
Columns: ['task_id', 'cyclomatic_complexity']
File size: 0.01 MB


In [4]:
# Display column dtypes, non-null counts and memory usage
print("Column Information:")
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would emit a stray "None" line.
df.info()
print("\nData types:")
print(df.dtypes)

Column Information:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 974 entries, 0 to 973
Data columns (total 2 columns):
 #   Column                 Non-Null Count  Dtype 
---  ------                 --------------  ----- 
 0   task_id                974 non-null    object
 1   cyclomatic_complexity  974 non-null    int64 
dtypes: int64(1), object(1)
memory usage: 15.3+ KB
None

Data types:
task_id                  object
cyclomatic_complexity     int64
dtype: object


In [5]:
# Preview the top of the frame. The bare trailing expression is left
# unprinted so Jupyter renders it as a rich table.
print("First 10 records:")
df.iloc[:10]

First 10 records:


Unnamed: 0,task_id,cyclomatic_complexity
0,1,7
1,2,1
2,3,3
3,4,1
4,5,2
5,6,2
6,7,1
7,8,1
8,9,3
9,10,1


In [6]:
# Comprehensive Difficulty Summary Analysis
# Prints a dataset overview, summary statistics, percentiles and a per-level
# frequency table for the `cyclomatic_complexity` column of `df`.
# Relies on `np` and `datetime` from the notebook's import cell.

print("=" * 60)
print("MBPP DIFFICULTY MAPPING SUMMARY")
print("=" * 60)

# Basic statistics
total_problems = len(df)
complexity_scores = df['cyclomatic_complexity'].values

print("\n📊 DATASET OVERVIEW")
print(f"Total problems analyzed: {total_problems:,}")
print(f"Data source: {Path(file_path).name}")
print(f"Analysis timestamp: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")

print("\n🧮 COMPLEXITY STATISTICS")
print(f"Minimum complexity: {complexity_scores.min()}")
print(f"Maximum complexity: {complexity_scores.max()}")
print(f"Mean complexity: {complexity_scores.mean():.2f}")
print(f"Median complexity: {np.median(complexity_scores):.1f}")
# `.values` yields a NumPy array, so .std() is the population standard
# deviation (ddof=0), not the sample value pandas' Series.std() would report.
print(f"Standard deviation: {complexity_scores.std():.2f}")

print("\n📈 PERCENTILE DISTRIBUTION")
print(f"25th percentile: {np.percentile(complexity_scores, 25):.1f}")
print(f"75th percentile: {np.percentile(complexity_scores, 75):.1f}")
print(f"90th percentile: {np.percentile(complexity_scores, 90):.1f}")

print("\n📋 COMPLEXITY DISTRIBUTION")
# Show distribution by complexity levels. Sorting by index means the table
# lists the lowest complexity levels first, not the most frequent ones.
complexity_counts = df['cyclomatic_complexity'].value_counts().sort_index()
max_levels_shown = 10  # cap on table rows; the rest are summarised in one line
print("Complexity Level | Count | Percentage")
print("-" * 40)
for complexity, count in complexity_counts.head(max_levels_shown).items():
    percentage = (count / total_problems) * 100
    # Column widths (16 / 5 / 10) match the header labels so the "|"
    # separators line up with the header row.
    print(f"{complexity:>16} | {count:>5} | {percentage:>9.1f}%")

if len(complexity_counts) > max_levels_shown:
    remaining = len(complexity_counts) - max_levels_shown
    print(f"... and {remaining} more complexity levels")

print("\n📝 METHODOLOGY NOTE")
print("Analysis uses cyclomatic complexity as the primary difficulty metric.")
print("This enables interleaved sampling based on complexity scores.")

MBPP DIFFICULTY MAPPING SUMMARY

📊 DATASET OVERVIEW
Total problems analyzed: 974
Data source: mbpp_difficulty_mapping_20250608_175751.parquet
Analysis timestamp: 2025-06-08 18:00:54

🧮 COMPLEXITY STATISTICS
Minimum complexity: 1
Maximum complexity: 16
Mean complexity: 2.84
Median complexity: 2.0
Standard deviation: 2.02

📈 PERCENTILE DISTRIBUTION
25th percentile: 1.0
75th percentile: 4.0
90th percentile: 5.0

📋 COMPLEXITY DISTRIBUTION
Complexity Level | Count | Percentage
----------------------------------------
            1 |   268 |    27.5%
            2 |   250 |    25.7%
            3 |   193 |    19.8%
            4 |   121 |    12.4%
            5 |    67 |     6.9%
            6 |    22 |     2.3%
            7 |    16 |     1.6%
            8 |    13 |     1.3%
            9 |    10 |     1.0%
           10 |     4 |     0.4%
... and 4 more complexity levels

📝 METHODOLOGY NOTE
Analysis uses cyclomatic complexity as the primary difficulty metric.
This enables interleaved samp