# Phase 0.2: HumanEval Conversion Results Inspection

This notebook inspects the converted HumanEval dataset to verify:
- Schema matches MBPP format
- All 164 problems converted
- Assertions properly parsed (candidate → function name)
- Full content of sample problems

In [1]:
import pandas as pd
import os
from pathlib import Path
import json

# Lift every pandas display limit so long prompts, solutions and assertion
# lists are rendered in full instead of being truncated.
#   display.max_colwidth -> no per-cell character limit
#   display.max_columns  -> never hide columns
#   display.width        -> no fixed output width
for option_name in ('display.max_colwidth', 'display.max_columns', 'display.width'):
    pd.set_option(option_name, None)

print("‚úÖ Pandas display options set to show FULL content")

‚úÖ Pandas display options set to show FULL content


In [2]:
# Load the converted HumanEval dataset and confirm the number of problems.
dataset_path = "../data/phase0_2_humaneval/humaneval.parquet"

# Number of problems this notebook expects after conversion. Named once so the
# report and the comparison below can never disagree.
EXPECTED_PROBLEM_COUNT = 164

# Fail fast with a clear message instead of a parquet-reader traceback.
if not Path(dataset_path).exists():
    raise FileNotFoundError(f"‚ùå Dataset not found: {dataset_path}")

df = pd.read_parquet(dataset_path)
problem_count = len(df)

print("=" * 80)
print("PHASE 0.2: HUMANEVAL CONVERSION RESULTS")
print("=" * 80)
print(f"\nüìÅ Dataset: {dataset_path}")
print(f"üìä Total problems: {problem_count}")
print(f"\n‚úÖ Expected: {EXPECTED_PROBLEM_COUNT} problems")
print(f"‚úÖ Actual: {problem_count} problems")

# Report the outcome truthfully: the success sentence is only printed when the
# counts really agree; a mismatch gets its own explicit message.
if problem_count == EXPECTED_PROBLEM_COUNT:
    print("\n‚úì Count matches expected!")
else:
    print(
        f"\n‚úó Count does NOT match expected "
        f"({problem_count} != {EXPECTED_PROBLEM_COUNT})"
    )

PHASE 0.2: HUMANEVAL CONVERSION RESULTS

üìÅ Dataset: ../data/phase0_2_humaneval/humaneval.parquet
üìä Total problems: 164

‚úÖ Expected: 164 problems
‚úÖ Actual: 164 problems

‚úì Count matches expected!


In [3]:
# Display schema information and compare the column set against the MBPP layout.
print("\n" + "=" * 80)
print("SCHEMA VALIDATION")
print("=" * 80)

print("\nüìã Columns:")
for col in df.columns:
    print(f"  - {col}")

print("\nüìä Data Types:")
print(df.dtypes)

# Single source of truth for the expected schema: the printed listing and the
# column-set comparison are both derived from this mapping so they cannot drift.
# The dtype labels are informational; only column names are compared below.
expected_schema = {
    'task_id': 'int64',
    'text': 'object (string)',
    'code': 'object (string)',
    'test_list': 'object (list)',
    'cyclomatic_complexity': 'int64',
}

print("\n‚úÖ Expected MBPP schema:")
for col, dtype_label in expected_schema.items():
    print(f"  - {col}: {dtype_label}")

# These two names are read again by the summary cell at the end of the notebook.
expected_cols = set(expected_schema)
actual_cols = set(df.columns)

if expected_cols == actual_cols:
    print("\n‚úì Schema matches MBPP format!")
else:
    # Say explicitly that the check failed and show which columns differ.
    print("\n‚úó Schema does NOT match MBPP format!")
    print(f"  - Missing columns: {sorted(expected_cols - actual_cols, key=str)}")
    print(f"  - Unexpected columns: {sorted(actual_cols - expected_cols, key=str)}")


SCHEMA VALIDATION

üìã Columns:
  - task_id
  - text
  - code
  - test_list
  - cyclomatic_complexity

üìä Data Types:
task_id                   int64
text                     object
code                     object
test_list                object
cyclomatic_complexity     int64
dtype: object

‚úÖ Expected MBPP schema:
  - task_id: int64
  - text: object (string)
  - code: object (string)
  - test_list: object (list)
  - cyclomatic_complexity: int64

‚úì Schema matches MBPP format!


In [4]:
# Summarise how many assertions each converted problem carries.
section_rule = "=" * 80
print("\n" + section_rule, "TEST LIST STATISTICS", section_rule, sep="\n")

# One length per problem; reused by the summary cell at the end of the notebook.
test_list_lengths = df['test_list'].apply(len)

print("\nüìä Number of assertions per problem:")
print(test_list_lengths.describe())

distribution_lines = [
    f"  - Min: {test_list_lengths.min()} assertions",
    f"  - Max: {test_list_lengths.max()} assertions",
    f"  - Mean: {test_list_lengths.mean():.2f} assertions",
    f"  - Median: {test_list_lengths.median():.0f} assertions",
]
print("\nüìà Distribution:")
print("\n".join(distribution_lines))


TEST LIST STATISTICS

üìä Number of assertions per problem:
count    164.000000
mean       7.201220
std        3.718928
min        1.000000
25%        5.000000
50%        7.000000
75%        9.000000
max       26.000000
Name: test_list, dtype: float64

üìà Distribution:
  - Min: 1 assertions
  - Max: 26 assertions
  - Mean: 7.20 assertions
  - Median: 7 assertions


In [5]:
# Rich-render the first ten converted problems for a visual spot check.
section_rule = "=" * 80
print(
    "\n" + section_rule,
    "FIRST 10 CONVERTED PROBLEMS (TABLE VIEW)",
    section_rule,
    f"\nRecords: {len(df)}",
    f"Columns: {list(df.columns)}",
    "\nFirst 10 records:",
    sep="\n",
)
first_ten = df.head(10)
display(first_ten)


FIRST 10 CONVERTED PROBLEMS (TABLE VIEW)

Records: 164
Columns: ['task_id', 'text', 'code', 'test_list', 'cyclomatic_complexity']

First 10 records:


Unnamed: 0,task_id,text,code,test_list,cyclomatic_complexity
0,0,"from typing import List\n\n\ndef has_close_elements(numbers: List[float], threshold: float) -> bool:\n """""" Check if in given list of numbers, are any two numbers closer to each other than\n given threshold.\n >>> has_close_elements([1.0, 2.0, 3.0], 0.5)\n False\n >>> has_close_elements([1.0, 2.8, 3.0, 4.0, 5.0, 2.0], 0.3)\n True\n """"""\n","for idx, elem in enumerate(numbers):\n for idx2, elem2 in enumerate(numbers):\n if idx != idx2:\n distance = abs(elem - elem2)\n if distance < threshold:\n return True\n\n return False\n","[assert has_close_elements([1.0, 2.0, 3.9, 4.0, 5.0, 2.2], 0.3) == True, assert has_close_elements([1.0, 2.0, 3.9, 4.0, 5.0, 2.2], 0.05) == False, assert has_close_elements([1.0, 2.0, 5.9, 4.0, 5.0], 0.95) == True, assert has_close_elements([1.0, 2.0, 5.9, 4.0, 5.0], 0.8) == False, assert has_close_elements([1.0, 2.0, 3.0, 4.0, 5.0, 2.0], 0.1) == True, assert has_close_elements([1.1, 2.2, 3.1, 4.1, 5.1], 1.0) == True, assert has_close_elements([1.1, 2.2, 3.1, 4.1, 5.1], 0.5) == False]",0
1,1,"from typing import List\n\n\ndef separate_paren_groups(paren_string: str) -> List[str]:\n """""" Input to this function is a string containing multiple groups of nested parentheses. Your goal is to\n separate those group into separate strings and return the list of those.\n Separate groups are balanced (each open brace is properly closed) and not nested within each other\n Ignore any spaces in the input string.\n >>> separate_paren_groups('( ) (( )) (( )( ))')\n ['()', '(())', '(()())']\n """"""\n",result = []\n current_string = []\n current_depth = 0\n\n for c in paren_string:\n if c == '(':\n current_depth += 1\n current_string.append(c)\n elif c == ')':\n current_depth -= 1\n current_string.append(c)\n\n if current_depth == 0:\n result.append(''.join(current_string))\n current_string.clear()\n\n return result\n,"[assert separate_paren_groups('(()()) ((())) () ((())()())') == [, assert separate_paren_groups('() (()) ((())) (((())))') == [, assert separate_paren_groups('(()(())((())))') == [, assert separate_paren_groups('( ) (( )) (( )( ))') == ['()', '(())', '(()())']]",0
2,2,"\n\ndef truncate_number(number: float) -> float:\n """""" Given a positive floating point number, it can be decomposed into\n and integer part (largest integer smaller than given number) and decimals\n (leftover part always smaller than 1).\n\n Return the decimal part of the number.\n >>> truncate_number(3.5)\n 0.5\n """"""\n",return number % 1.0\n,"[assert truncate_number(3.5) == 0.5, assert abs(truncate_number(1.33) - 0.33) < 1e-6, assert abs(truncate_number(123.456) - 0.456) < 1e-6]",0
3,3,"from typing import List\n\n\ndef below_zero(operations: List[int]) -> bool:\n """""" You're given a list of deposit and withdrawal operations on a bank account that starts with\n zero balance. Your task is to detect if at any point the balance of account fallls below zero, and\n at that point function should return True. Otherwise it should return False.\n >>> below_zero([1, 2, 3])\n False\n >>> below_zero([1, 2, -4, 5])\n True\n """"""\n",balance = 0\n\n for op in operations:\n balance += op\n if balance < 0:\n return True\n\n return False\n,"[assert below_zero([]) == False, assert below_zero([1, 2, -3, 1, 2, -3]) == False, assert below_zero([1, 2, -4, 5, 6]) == True, assert below_zero([1, -1, 2, -2, 5, -5, 4, -4]) == False, assert below_zero([1, -1, 2, -2, 5, -5, 4, -5]) == True, assert below_zero([1, -2, 2, -2, 5, -5, 4, -4]) == True]",0
4,4,"from typing import List\n\n\ndef mean_absolute_deviation(numbers: List[float]) -> float:\n """""" For a given list of input numbers, calculate Mean Absolute Deviation\n around the mean of this dataset.\n Mean Absolute Deviation is the average absolute difference between each\n element and a centerpoint (mean in this case):\n MAD = average | x - x_mean |\n >>> mean_absolute_deviation([1.0, 2.0, 3.0, 4.0])\n 1.0\n """"""\n",mean = sum(numbers) / len(numbers)\n return sum(abs(x - mean) for x in numbers) / len(numbers)\n,"[assert abs(mean_absolute_deviation([1.0, 2.0, 3.0]) - 2.0/3.0) < 1e-6, assert abs(mean_absolute_deviation([1.0, 2.0, 3.0, 4.0]) - 1.0) < 1e-6, assert abs(mean_absolute_deviation([1.0, 2.0, 3.0, 4.0, 5.0]) - 6.0/5.0) < 1e-6]",0
5,5,"from typing import List\n\n\ndef intersperse(numbers: List[int], delimeter: int) -> List[int]:\n """""" Insert a number 'delimeter' between every two consecutive elements of input list `numbers'\n >>> intersperse([], 4)\n []\n >>> intersperse([1, 2, 3], 4)\n [1, 4, 2, 4, 3]\n """"""\n",if not numbers:\n return []\n\n result = []\n\n for n in numbers[:-1]:\n result.append(n)\n result.append(delimeter)\n\n result.append(numbers[-1])\n\n return result\n,"[assert intersperse([], 7) == [], assert intersperse([5, 6, 3, 2], 8) == [5, 8, 6, 8, 3, 8, 2], assert intersperse([2, 2, 2], 2) == [2, 2, 2, 2, 2]]",0
6,6,"from typing import List\n\n\ndef parse_nested_parens(paren_string: str) -> List[int]:\n """""" Input to this function is a string represented multiple groups for nested parentheses separated by spaces.\n For each of the group, output the deepest level of nesting of parentheses.\n E.g. (()()) has maximum two levels of nesting while ((())) has three.\n\n >>> parse_nested_parens('(()()) ((())) () ((())()())')\n [2, 3, 1, 3]\n """"""\n","def parse_paren_group(s):\n depth = 0\n max_depth = 0\n for c in s:\n if c == '(':\n depth += 1\n max_depth = max(depth, max_depth)\n else:\n depth -= 1\n\n return max_depth\n\n return [parse_paren_group(x) for x in paren_string.split(' ') if x]\n","[assert parse_nested_parens('(()()) ((())) () ((())()())') == [2, 3, 1, 3], assert parse_nested_parens('() (()) ((())) (((())))') == [1, 2, 3, 4], assert parse_nested_parens('(()(())((())))') == [4]]",0
7,7,"from typing import List\n\n\ndef filter_by_substring(strings: List[str], substring: str) -> List[str]:\n """""" Filter an input list of strings only for ones that contain given substring\n >>> filter_by_substring([], 'a')\n []\n >>> filter_by_substring(['abc', 'bacd', 'cde', 'array'], 'a')\n ['abc', 'bacd', 'array']\n """"""\n",return [x for x in strings if substring in x]\n,"[assert filter_by_substring([], 'john') == [], assert filter_by_substring(['xxx', 'asd', 'xxy', 'john doe', 'xxxAAA', 'xxx'], 'xxx') == ['xxx', 'xxxAAA', 'xxx'], assert filter_by_substring(['xxx', 'asd', 'aaaxxy', 'john doe', 'xxxAAA', 'xxx'], 'xx') == ['xxx', 'aaaxxy', 'xxxAAA', 'xxx'], assert filter_by_substring(['grunt', 'trumpet', 'prune', 'gruesome'], 'run') == ['grunt', 'prune']]",0
8,8,"from typing import List, Tuple\n\n\ndef sum_product(numbers: List[int]) -> Tuple[int, int]:\n """""" For a given list of integers, return a tuple consisting of a sum and a product of all the integers in a list.\n Empty sum should be equal to 0 and empty product should be equal to 1.\n >>> sum_product([])\n (0, 1)\n >>> sum_product([1, 2, 3, 4])\n (10, 24)\n """"""\n","sum_value = 0\n prod_value = 1\n\n for n in numbers:\n sum_value += n\n prod_value *= n\n return sum_value, prod_value\n","[assert sum_product([]) == (0, 1), assert sum_product([1, 1, 1]) == (3, 1), assert sum_product([100, 0]) == (100, 0), assert sum_product([3, 5, 7]) == (3 + 5 + 7, 3 * 5 * 7), assert sum_product([10]) == (10, 10)]",0
9,9,"from typing import List, Tuple\n\n\ndef rolling_max(numbers: List[int]) -> List[int]:\n """""" From a given list of integers, generate a list of rolling maximum element found until given moment\n in the sequence.\n >>> rolling_max([1, 2, 3, 2, 3, 4, 2])\n [1, 2, 3, 3, 3, 4, 4]\n """"""\n","running_max = None\n result = []\n\n for n in numbers:\n if running_max is None:\n running_max = n\n else:\n running_max = max(running_max, n)\n\n result.append(running_max)\n\n return result\n","[assert rolling_max([]) == [], assert rolling_max([1, 2, 3, 4]) == [1, 2, 3, 4], assert rolling_max([4, 3, 2, 1]) == [4, 4, 4, 4], assert rolling_max([3, 2, 3, 100, 3]) == [3, 3, 3, 100, 100]]",0


In [None]:
# Verify assertion parsing:
#   1. no assertion still refers to the placeholder name 'candidate';
#   2. every assertion is a syntactically complete Python statement. A plain
#      substring check cannot notice assertions that were cut off mid-expression
#      (e.g. ending in "== ["), so each one is also passed through compile().
print("\n" + "=" * 80)
print("ASSERTION PARSING VERIFICATION")
print("=" * 80)

has_candidate = False        # read again by the summary cell
malformed_assertions = []    # (task_id, assertion, parser message) triples

# Iterate only the two columns that are needed instead of materialising a
# Series per row with iterrows().
for task_id, assertions in zip(df['task_id'], df['test_list']):
    for assertion in assertions:
        if 'candidate' in assertion:
            has_candidate = True
            print(f"\n‚ö†Ô∏è  Found 'candidate' in task_id {task_id}:")
            print(f"  {assertion}")

        # compile() only parses the statement; nothing is executed.
        try:
            compile(assertion, "<assertion>", "exec")
        except (SyntaxError, ValueError) as exc:
            malformed_assertions.append((task_id, assertion, str(exc)))

if not has_candidate:
    print("\n‚úÖ All assertions correctly parsed - 'candidate' replaced with function names!")
else:
    print("\n‚úó Some assertions still contain 'candidate' - check conversion logic")

if malformed_assertions:
    print(
        f"\n‚úó {len(malformed_assertions)} assertion(s) are not valid Python "
        "(possibly truncated multi-line asserts):"
    )
    for task_id, assertion, reason in malformed_assertions:
        print(f"  task_id {task_id}: {assertion!r} ({reason})")
else:
    print("\n‚úÖ All assertions compile as valid Python statements")

In [None]:
# Summary of the checks performed above. Markers and the closing verdict follow
# the actual results, so a failed check is never reported as a success.
print("\n" + "=" * 80)
print("CONVERSION SUMMARY")
print("=" * 80)

schema_ok = expected_cols == actual_cols
candidate_replaced = not has_candidate

print(f"\n‚úÖ Total problems converted: {len(df)}")
print(f"{'‚úÖ' if schema_ok else '‚úó'} Schema matches MBPP format: {schema_ok}")
print(f"‚úÖ Average assertions per problem: {test_list_lengths.mean():.2f}")
print(f"{'‚úÖ' if candidate_replaced else '‚úó'} 'candidate' replaced in assertions: {candidate_replaced}")

# Only celebrate when every pass/fail check in this summary actually passed.
if schema_ok and candidate_replaced:
    print("\nüéâ Phase 0.2 conversion looks good!")
else:
    print("\n‚úó Phase 0.2 conversion needs attention - see the failed checks above")