In [2]:
import pandas as pd
import os
from pathlib import Path

In [3]:
# Paste the parquet file path here, or leave empty to auto-find latest
file_path = ""  # Replace with your parquet file path, or leave empty for auto-discovery

# Auto-discovery of latest dataset if file_path is empty
if not file_path.strip():
    import glob
    
    # Search for all MBPP enriched dataset parquet files
    datasets_dir = "../data/phase0/"
    pattern = os.path.join(datasets_dir, "mbpp_with_complexity_*.parquet")
    matching_files = glob.glob(pattern)
    
    if matching_files:
        # Sort by modification time to get the latest file
        latest_file = max(matching_files, key=os.path.getmtime)
        file_path = latest_file
        print(f"🔍 Auto-discovered latest enriched dataset: {Path(file_path).name}")
        print(f"📅 Last modified: {pd.Timestamp.fromtimestamp(os.path.getmtime(file_path)).strftime('%Y-%m-%d %H:%M:%S')}")
    else:
        raise FileNotFoundError(f"No MBPP enriched dataset files found in {datasets_dir}")
else:
    print(f"📁 Using specified file: {Path(file_path).name}")

# Verify file exists
if not os.path.exists(file_path):
    raise FileNotFoundError(f"File not found: {file_path}")

🔍 Auto-discovered latest enriched dataset: mbpp_with_complexity_20250622_230039.parquet
📅 Last modified: 2025-06-22 23:00:39


In [4]:
# Load parquet file and display basic info
df = pd.read_parquet(file_path)

print(f"Enriched MBPP Dataset Information:")
print(f"Number of records: {len(df):,}")
print(f"Number of columns: {len(df.columns)}")
print(f"\nColumns:")
for col in df.columns:
    print(f"  - {col}")
print(f"\nFile size: {os.path.getsize(file_path) / (1024**2):.2f} MB")

Enriched MBPP Dataset Information:
Number of records: 974
Number of columns: 5

Columns:
  - task_id
  - text
  - code
  - test_list
  - cyclomatic_complexity

File size: 0.20 MB


In [5]:
# Display column types and basic statistics
print("Column Information:")
print(df.info())
print("\nData types:")
print(df.dtypes)

Column Information:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 974 entries, 0 to 973
Data columns (total 5 columns):
 #   Column                 Non-Null Count  Dtype 
---  ------                 --------------  ----- 
 0   task_id                974 non-null    int64 
 1   text                   974 non-null    object
 2   code                   974 non-null    object
 3   test_list              974 non-null    object
 4   cyclomatic_complexity  974 non-null    int64 
dtypes: int64(2), object(3)
memory usage: 38.2+ KB
None

Data types:
task_id                   int64
text                     object
code                     object
test_list                object
cyclomatic_complexity     int64
dtype: object


In [6]:
# Display first 10 records - full table view
# Set pandas display options to show full content
pd.set_option('display.max_columns', None)  # Show all columns
pd.set_option('display.max_rows', None)     # Show all rows (for head(10))
pd.set_option('display.max_colwidth', None) # Show full textin each cell
pd.set_option('display.width', None)        # Don't wrap to terminal width
print("First 10 records (complete table):")
df.head(3)

First 10 records (complete table):


Unnamed: 0,task_id,text,code,test_list,cyclomatic_complexity
0,1,"Write a function to find the minimum cost path to reach (m, n) from (0, 0) for the given cost matrix cost[][] and a position (m, n) in cost[][].","R = 3\r\nC = 3\r\ndef min_cost(cost, m, n): \r\n\ttc = [[0 for x in range(C)] for x in range(R)] \r\n\ttc[0][0] = cost[0][0] \r\n\tfor i in range(1, m+1): \r\n\t\ttc[i][0] = tc[i-1][0] + cost[i][0] \r\n\tfor j in range(1, n+1): \r\n\t\ttc[0][j] = tc[0][j-1] + cost[0][j] \r\n\tfor i in range(1, m+1): \r\n\t\tfor j in range(1, n+1): \r\n\t\t\ttc[i][j] = min(tc[i-1][j-1], tc[i-1][j], tc[i][j-1]) + cost[i][j] \r\n\treturn tc[m][n]","[assert min_cost([[1, 2, 3], [4, 8, 2], [1, 5, 3]], 2, 2) == 8, assert min_cost([[2, 3, 4], [5, 9, 3], [2, 6, 4]], 2, 2) == 12, assert min_cost([[3, 4, 5], [6, 10, 4], [3, 7, 5]], 2, 2) == 16]",7
1,2,Write a function to find the similar elements from the given two tuple lists.,"def similar_elements(test_tup1, test_tup2):\r\n res = tuple(set(test_tup1) & set(test_tup2))\r\n return (res)","[assert similar_elements((3, 4, 5, 6),(5, 7, 4, 10)) == (4, 5), assert similar_elements((1, 2, 3, 4),(5, 4, 3, 7)) == (3, 4), assert similar_elements((11, 12, 14, 13),(17, 15, 14, 13)) == (13, 14)]",1
2,3,Write a python function to identify non-prime numbers.,"import math\r\ndef is_not_prime(n):\r\n result = False\r\n for i in range(2,int(math.sqrt(n)) + 1):\r\n if n % i == 0:\r\n result = True\r\n return result","[assert is_not_prime(2) == False, assert is_not_prime(10) == True, assert is_not_prime(35) == True]",3


In [7]:
# Comprehensive Enriched Dataset Analysis
import numpy as np
from datetime import datetime

print("=" * 60)
print("MBPP ENRICHED DATASET SUMMARY")
print("=" * 60)

# Basic statistics
total_problems = len(df)
complexity_scores = df['cyclomatic_complexity'].values

print(f"\n📊 DATASET OVERVIEW")
print(f"Total problems analyzed: {total_problems:,}")
print(f"Data source: {Path(file_path).name}")
print(f"Analysis timestamp: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")

print(f"\n📦 ENRICHED DATASET CONTENTS")
print("This dataset contains the complete MBPP test set with:")
print("  • task_id: Unique identifier for each problem")
print("  • text: Problem description")
print("  • code: Reference solution")
print("  • test_list: Test cases for validation")
print("  • cyclomatic_complexity: Computed difficulty metric")

print(f"\n🧮 COMPLEXITY STATISTICS")
print(f"Minimum complexity: {complexity_scores.min()}")
print(f"Maximum complexity: {complexity_scores.max()}")
print(f"Mean complexity: {complexity_scores.mean():.2f}")
print(f"Median complexity: {np.median(complexity_scores):.1f}")
print(f"Standard deviation: {complexity_scores.std():.2f}")

print(f"\n📈 PERCENTILE DISTRIBUTION")
print(f"25th percentile: {np.percentile(complexity_scores, 25):.1f}")
print(f"75th percentile: {np.percentile(complexity_scores, 75):.1f}")
print(f"90th percentile: {np.percentile(complexity_scores, 90):.1f}")

print(f"\n📋 COMPLEXITY DISTRIBUTION")
# Show distribution by complexity levels
complexity_counts = df['cyclomatic_complexity'].value_counts().sort_index()
print("Complexity Level | Count | Percentage")
print("-" * 40)
for complexity, count in complexity_counts.head(10).items():
    percentage = (count / total_problems) * 100
    print(f"{complexity:>13} | {count:>5} | {percentage:>7.1f}%")

if len(complexity_counts) > 10:
    remaining = len(complexity_counts) - 10
    print(f"... and {remaining} more complexity levels")

print(f"\n📝 METHODOLOGY NOTE")
print("This enriched dataset serves as the single source of truth for all")
print("downstream phases. It combines the original MBPP data with computed")
print("cyclomatic complexity metrics, enabling consistent difficulty-based")
print("sampling and analysis throughout the PVA-SAE pipeline.")

MBPP ENRICHED DATASET SUMMARY

📊 DATASET OVERVIEW
Total problems analyzed: 974
Data source: mbpp_with_complexity_20250622_230039.parquet
Analysis timestamp: 2025-08-25 13:12:19

📦 ENRICHED DATASET CONTENTS
This dataset contains the complete MBPP test set with:
  • task_id: Unique identifier for each problem
  • text: Problem description
  • code: Reference solution
  • test_list: Test cases for validation
  • cyclomatic_complexity: Computed difficulty metric

🧮 COMPLEXITY STATISTICS
Minimum complexity: 1
Maximum complexity: 16
Mean complexity: 2.84
Median complexity: 2.0
Standard deviation: 2.02

📈 PERCENTILE DISTRIBUTION
25th percentile: 1.0
75th percentile: 4.0
90th percentile: 5.0

📋 COMPLEXITY DISTRIBUTION
Complexity Level | Count | Percentage
----------------------------------------
            1 |   268 |    27.5%
            2 |   250 |    25.7%
            3 |   193 |    19.8%
            4 |   121 |    12.4%
            5 |    67 |     6.9%
            6 |    22 |     2.3%
   

In [8]:
# Example: Accessing full MBPP data from the enriched dataset
print("💡 Example Usage - Accessing Complete Problem Information:")
print("=" * 60)

# Show a complete example record
example_idx = 0
example = df.iloc[example_idx]

print(f"\nProblem {example['task_id']}:")
print(f"Description: {example['text']}")
print(f"\nReference Solution:")
print(example['code'])
print(f"\nTest Cases ({len(example['test_list'])} total):")
for i, test in enumerate(example['test_list'][:3]):  # Show first 3 tests
    print(f"  {i+1}. {test}")
if len(example['test_list']) > 3:
    print(f"  ... and {len(example['test_list']) - 3} more tests")
print(f"\nComplexity Score: {example['cyclomatic_complexity']}")
print(f"\nThis enriched format provides everything needed for downstream phases!")

💡 Example Usage - Accessing Complete Problem Information:

Problem 1:
Description: Write a function to find the minimum cost path to reach (m, n) from (0, 0) for the given cost matrix cost[][] and a position (m, n) in cost[][].

Reference Solution:
R = 3
C = 3
def min_cost(cost, m, n): 
	tc = [[0 for x in range(C)] for x in range(R)] 
	tc[0][0] = cost[0][0] 
	for i in range(1, m+1): 
		tc[i][0] = tc[i-1][0] + cost[i][0] 
	for j in range(1, n+1): 
		tc[0][j] = tc[0][j-1] + cost[0][j] 
	for i in range(1, m+1): 
		for j in range(1, n+1): 
			tc[i][j] = min(tc[i-1][j-1], tc[i-1][j], tc[i][j-1]) + cost[i][j] 
	return tc[m][n]

Test Cases (3 total):
  1. assert min_cost([[1, 2, 3], [4, 8, 2], [1, 5, 3]], 2, 2) == 8
  2. assert min_cost([[2, 3, 4], [5, 9, 3], [2, 6, 4]], 2, 2) == 12
  3. assert min_cost([[3, 4, 5], [6, 10, 4], [3, 7, 5]], 2, 2) == 16

Complexity Score: 7

This enriched format provides everything needed for downstream phases!


In [9]:
# Test prompt builder with multiple examples
import sys
sys.path.append('..')  # Add parent directory to path
from common.prompt_utils import PromptBuilder
import numpy as np

print("=== Testing Prompt Builder with Multiple Examples ===\n")

for i in range(min(3, len(df))):  # Test first 3 problems
    sample = df.iloc[i]
    
    # Get test cases
    test_cases = sample.get('test_list', [])
    
    # Handle if test_cases is a numpy array or list
    if isinstance(test_cases, np.ndarray):
        test_cases = test_cases.tolist()
    
    if test_cases and len(test_cases) > 0:
        test_cases_str = '\n'.join(test_cases)
    else:
        test_cases_str = "# No test cases provided"
    
    # Build prompt
    prompt = PromptBuilder.build_prompt(
        problem_description=sample['text'],
        test_cases=test_cases_str
    )
    
    print(f"Example {i+1}:")
    print(f"Task ID: {sample['task_id']}")
    print(f"Problem: {sample['text'][:100]}..." if len(sample['text']) > 100 else sample['text'])
    print(f"\nPrompt Generated:")
    print("```")
    print(prompt)
    print("```")
    print("\n" + "="*70 + "\n")

=== Testing Prompt Builder with Multiple Examples ===

Example 1:
Task ID: 1
Problem: Write a function to find the minimum cost path to reach (m, n) from (0, 0) for the given cost matrix...

Prompt Generated:
```
Write a function to find the minimum cost path to reach (m, n) from (0, 0) for the given cost matrix cost[][] and a position (m, n) in cost[][].

assert min_cost([[1, 2, 3], [4, 8, 2], [1, 5, 3]], 2, 2) == 8
assert min_cost([[2, 3, 4], [5, 9, 3], [2, 6, 4]], 2, 2) == 12
assert min_cost([[3, 4, 5], [6, 10, 4], [3, 7, 5]], 2, 2) == 16

# Solution:
```


Example 2:
Task ID: 2
Write a function to find the similar elements from the given two tuple lists.

Prompt Generated:
```
Write a function to find the similar elements from the given two tuple lists.

assert similar_elements((3, 4, 5, 6),(5, 7, 4, 10)) == (4, 5)
assert similar_elements((1, 2, 3, 4),(5, 4, 3, 7)) == (3, 4)
assert similar_elements((11, 12, 14, 13),(17, 15, 14, 13)) == (13, 14)

# Solution:
```


Example 3:
Task ID