# Phase 1 Dataset Checker

This notebook checks the latest Phase 1 parquet file output and displays:
- The first 10 rows of data (as requested)
- Dataset statistics (pass/fail rates)
- Sample correct and incorrect code solutions
- Activation file counts
- Data types and missing values

In [1]:
import pandas as pd
from pathlib import Path
import glob
from datetime import datetime

# Find the data directory
data_dir = Path("../data/phase1_0")
print(f"Looking for parquet files in: {data_dir.absolute()}")

Looking for parquet files in: /Users/krizroycetahimic/Documents/Thesis/Code/pva_sae/phase1_simplified/../data/phase1_0


In [2]:
# Find all parquet files
parquet_files = list(data_dir.glob("*.parquet"))
parquet_files.sort(key=lambda x: x.stat().st_mtime, reverse=True)

if not parquet_files:
    print("No parquet files found!")
else:
    print(f"Found {len(parquet_files)} parquet files:")
    for i, file in enumerate(parquet_files[:5]):  # Show top 5 most recent
        mtime = datetime.fromtimestamp(file.stat().st_mtime)
        print(f"{i+1}. {file.name} (modified: {mtime.strftime('%Y-%m-%d %H:%M:%S')})")
    
    # Use the most recent file
    latest_file = parquet_files[0]
    print(f"\nUsing latest file: {latest_file.name}")

Found 2 parquet files:
1. dataset_sae_20250825_133416.parquet (modified: 2025-08-25 13:34:16)
2. dataset_sae_20250825_131927.parquet (modified: 2025-08-25 13:19:27)

Using latest file: dataset_sae_20250825_133416.parquet


In [3]:
# Load the latest parquet file
pd.set_option('display.max_colwidth', None)  # Show full column content
pd.set_option('display.max_columns', None)   # Show all columns
pd.set_option('display.max_rows', None)      # Show all rows (be careful with large datasets)
pd.set_option('display.width', None)         # Don't wrap to multiple lines
df = pd.read_parquet(latest_file)
print(f"Dataset shape: {df.shape}")
print(f"Columns: {df.columns.tolist()}")
print("\nFirst 10 rows:")
df.head(30)

Dataset shape: (30, 7)
Columns: ['task_id', 'text', 'code', 'test_list', 'cyclomatic_complexity', 'generated_code', 'test_passed']

First 10 rows:


Unnamed: 0,task_id,text,code,test_list,cyclomatic_complexity,generated_code,test_passed
0,2,Write a function to find the similar elements from the given two tuple lists.,"def similar_elements(test_tup1, test_tup2):\r\n res = tuple(set(test_tup1) & set(test_tup2))\r\n return (res)","[assert similar_elements((3, 4, 5, 6),(5, 7, 4, 10)) == (4, 5), assert similar_elements((1, 2, 3, 4),(5, 4, 3, 7)) == (3, 4), assert similar_elements((11, 12, 14, 13),(17, 15, 14, 13)) == (13, 14)]",1,"def similar_elements(list1, list2):\n return [x for x in list1 if x in list2]",False
1,3,Write a python function to identify non-prime numbers.,"import math\r\ndef is_not_prime(n):\r\n result = False\r\n for i in range(2,int(math.sqrt(n)) + 1):\r\n if n % i == 0:\r\n result = True\r\n return result","[assert is_not_prime(2) == False, assert is_not_prime(10) == True, assert is_not_prime(35) == True]",3,"def is_not_prime(n):\n if n == 1:\n return False\n for i in range(2, n):\n if n % i == 0:\n return False\n return True",False
2,4,Write a function to find the largest integers from a given list of numbers using heap queue algorithm.,"import heapq as hq\r\ndef heap_queue_largest(nums,n):\r\n largest_nums = hq.nlargest(n, nums)\r\n return largest_nums","[assert heap_queue_largest( [25, 35, 22, 85, 14, 65, 75, 22, 58],3)==[85, 75, 65] , assert heap_queue_largest( [25, 35, 22, 85, 14, 65, 75, 22, 58],2)==[85, 75] , assert heap_queue_largest( [25, 35, 22, 85, 14, 65, 75, 22, 58],5)==[85, 75, 65, 58, 35]]",1,"def heap_queue_largest(arr,n):\n # Create a heap\n heap = []\n for i in range(n):\n heap.append(arr[i])\n # Build the heap\n for i in range(n//2-1,-1,-1):\n heapify(heap,i)\n # Extract the largest element\n for i in range(n-1):\n arr[i],arr[n-1] = arr[n-1],arr[i]\n heap.pop()\n heapify(heap,0)\n return arr",False
3,6,Write a python function to check whether the two numbers differ at one bit position only or not.,"def is_Power_Of_Two (x): \r\n return x and (not(x & (x - 1))) \r\ndef differ_At_One_Bit_Pos(a,b): \r\n return is_Power_Of_Two(a ^ b)","[assert differ_At_One_Bit_Pos(13,9) == True, assert differ_At_One_Bit_Pos(15,8) == False, assert differ_At_One_Bit_Pos(2,4) == False]",2,"def differ_At_One_Bit_Pos(a,b):\n return (a ^ b) == (a & b)",False
4,7,Write a function to find all words which are at least 4 characters long in a string by using regex.,"import re\r\ndef find_char_long(text):\r\n return (re.findall(r""\b\w{4,}\b"", text))","[assert find_char_long('Please move back to stream') == ['Please', 'move', 'back', 'stream'], assert find_char_long('Jing Eco and Tech') == ['Jing', 'Tech'], assert find_char_long('Jhingai wulu road Zone 3') == ['Jhingai', 'wulu', 'road', 'Zone']]",1,def find_char_long(string):\n return [word for word in string.split() if len(word) >= 4],True
5,9,Write a python function to find the minimum number of rotations required to get the same string.,"def find_Rotations(str): \r\n tmp = str + str\r\n n = len(str) \r\n for i in range(1,n + 1): \r\n substring = tmp[i: i+n] \r\n if (str == substring): \r\n return i \r\n return n","[assert find_Rotations(""aaaa"") == 1, assert find_Rotations(""ab"") == 2, assert find_Rotations(""abc"") == 3]",3,def find_Rotations(s):\n # Write your code here.\n if len(s) == 1:\n return 0\n if len(s) == 2:\n if s[0] == s[1]:\n return 0\n else:\n return 1\n if len(s) == 3:\n if s[0] == s[1] and s[1] == s[2]:\n return 0\n else:\n return 1\n if len(s) == 4:\n if s[0] == s[1] and s[1] == s[2] and s[2] == s[3]:\n return 0\n else:\n return 1\n if len(s) == 5:\n if s[0] == s[1] and s[1] == s[2] and s[2] == s[3] and s[3] == s[4]:\n return 0\n else:\n return 1\n if len(s) == 6:\n if s[0] == s[1] and s[1] == s[2] and s[2] == s[3] and s[3] == s[4] and s[4] == s[5]:\n return 0\n else:\n return 1\n if len(s) == 7:\n if s[0] == s[1] and s[1] == s[2] and s[2] == s[3] and s[3] == s[4] and s[4] == s[5] and s[5] == s[6]:\n return 0\n else:\n return 1\n if len(s) == 8:\n if s[0] == s[1] and s[1] == s[2] and s[2] == s[3] and s[3] == s[4] and s[4] == s[5] and s[5] == s[6] and s[6] == s[7]:\n return 0\n else:\n return 1\n if len(s) == 9:\n if s[0] == s[1] and s[1] == s[2] and s[2] == s[3] and s[3] == s[4] and s[4] == s[5] and s[5] == s[6] and s[6] == s[7] and s[7] == s[8]:\n return 0\n else:\n return 1\n if len(s) == 10:\n if s[0] == s[1] and s[1] == s[2] and s[2] == s[3] and s[3] == s[4] and s[4] == s[5] and s[5] == s[6] and s[6] == s[7] and s[7] == s[8] and s[8] == s[9]:\n return 0\n else:\n return 1\n if len(s) == 11:\n if s[0] == s[1] and s[1] == s[2] and s[2] == s[3] and s[3] == s[4] and s[4] == s[5] and s[5] == s[6] and s[6] == s[7] and s[7] == s[8] and s[8] == s[9] and s[9] == s[10]:\n return 0\n else:\n return 1\n if len(s) == 12:\n if s[0] == s[1] and s[1] == s[2] and s[2] == s[3] and s[3] == s[4] and s[4] == s[5] and s[5] == s[6] and s[6] == s[7] and s[7] == s[8] and s[8] == s[9] and s[9] == s[10] and s[10] == s[11]:\n return 0\n else:\n return 1\n if len(s) == 13:\n if s[0] == s[1] and s[1] == s[2] and s[2] == s[3] and s[3] == s[4] and s[4] == s[5] and s[5] == s[6] and s[6] == s[7] and s[7] == s[8] and s[8] == s[9] and s[9] == s[10] and s[10] == s[11] and s[11] == s[12]:\n return 0\n else:\n return 1\n if len(s) == 14:\n if s[0] == s[1] and s[1] == s[2] and s[2] == s[3] and s[3] == s[4] and s[4] == s[5] and s[5] == s[6] and s[6] == s[7] and s[7] == s[8] and s[8] == s[9] and s[9] == s[10] and s[10] == s[11] and s[11] == s[12] and s[12] == s[13]:\n return 0\n else:\n return 1\n if len(s) == 15:\n if s[0] == s[1] and s[1] == s[2] and s[2] == s[3] and s[3] == s[4] and s[4] == s[5] and s[5] == s[6] and s[6] == s[7] and s[7] == s[8] and s[8] == s[9] and s[9] == s[10] and s[10] == s[11] and s[11] == s[12] and s[12] == s[13] and s[13] == s[14]:\n return 0\n else:\n return 1\n if len(s) == 16:\n if s[0] == s[1] and s[1] == s[2] and s[2] == s[3] and s[3] == s[4] and s[4] == s[5] and s[5] == s[6] and s[6] == s[7] and s[7] == s[8] and s[8] == s[9] and s[9] == s[10] and s[10] == s[11] and s[11] == s[12] and s[12] == s[13] and s[13] == s[14] and s[14] == s[15]:\n return 0\n else:\n return 1\n if len(s) == 17:\n if s[0] == s[1] and s[1] == s[2] and s[2] == s[3] and s[3] == s[4] and s[4] == s[5] and s[5] == s[6] and s[6] == s[7] and s[7] == s[8] and s[8] == s[9] and s[9] == s[10] and s[10] == s[11] and s[11] == s[12] and s[12] == s[13] and s[13] == s[14] and s[14] == s[15] and s[15] == s[16]:\n return 0\n else:\n return 1\n if len(s) == 18:\n if s[0] == s[1] and s[1] == s[2] and s[2] == s[3] and s[3] == s[4] and s[4] == s[5] and s[5] == s[6] and s[6] == s[7] and s[7] == s[8] and s[8] == s[9] and s[9] == s[10] and s[,False
6,11,Write a python function to remove first and last occurrence of a given character from the string.,"def remove_Occ(s,ch): \r\n for i in range(len(s)): \r\n if (s[i] == ch): \r\n s = s[0 : i] + s[i + 1:] \r\n break\r\n for i in range(len(s) - 1,-1,-1): \r\n if (s[i] == ch): \r\n s = s[0 : i] + s[i + 1:] \r\n break\r\n return s","[assert remove_Occ(""hello"",""l"") == ""heo"", assert remove_Occ(""abcda"",""a"") == ""bcd"", assert remove_Occ(""PHP"",""P"") == ""H""]",5,"def remove_Occ(string, character):\n string = string.replace(character, """")\n return string",True
7,12,Write a function to sort a given matrix in ascending order according to the sum of its rows.,"def sort_matrix(M):\r\n result = sorted(M, key=sum)\r\n return result","[assert sort_matrix([[1, 2, 3], [2, 4, 5], [1, 1, 1]])==[[1, 1, 1], [1, 2, 3], [2, 4, 5]], assert sort_matrix([[1, 2, 3], [-2, 4, -5], [1, -1, 1]])==[[-2, 4, -5], [1, -1, 1], [1, 2, 3]], assert sort_matrix([[5,8,9],[6,4,3],[2,1,4]])==[[2, 1, 4], [6, 4, 3], [5, 8, 9]]]",1,def sort_matrix(matrix):\n # Sort the rows\n for i in range(len(matrix)):\n matrix[i].sort()\n # Sort the columns\n for i in range(len(matrix[0])):\n for j in range(len(matrix)):\n matrix[j][i] = matrix[j][i]\n return matrix,False
8,13,Write a function to count the most common words in a dictionary.,from collections import Counter\r\ndef count_common(words):\r\n word_counts = Counter(words)\r\n top_four = word_counts.most_common(4)\r\n return (top_four)\r\n,"[assert count_common(['red','green','black','pink','black','white','black','eyes','white','black','orange','pink','pink','red','red','white','orange','white',""black"",'pink','green','green','pink','green','pink','white','orange',""orange"",'red']) == [('pink', 6), ('black', 5), ('white', 5), ('red', 4)], assert count_common(['one', 'two', 'three', 'four', 'five', 'one', 'two', 'one', 'three', 'one']) == [('one', 4), ('two', 2), ('three', 2), ('four', 1)], assert count_common(['Facebook', 'Apple', 'Amazon', 'Netflix', 'Google', 'Apple', 'Netflix', 'Amazon']) == [('Apple', 2), ('Amazon', 2), ('Netflix', 2), ('Facebook', 1)]]",1,"def count_common(dictionary):\n # Create a dictionary to store the counts of each word\n counts = {}\n # Iterate over the dictionary\n for word in dictionary:\n # If the word is not in the dictionary, add it with a count of 1\n if word not in counts:\n counts[word] = 1\n # If the word is in the dictionary, increment the count by 1\n else:\n counts[word] += 1\n # Return the dictionary with the counts of each word\n return counts",False
9,15,Write a function to split a string at lowercase letters.,"import re\r\ndef split_lowerstring(text):\r\n return (re.findall('[a-z][^a-z]*', text))","[assert split_lowerstring(""AbCd"")==['bC','d'], assert split_lowerstring(""Python"")==['y', 't', 'h', 'o', 'n'], assert split_lowerstring(""Programming"")==['r', 'o', 'g', 'r', 'a', 'm', 'm', 'i', 'n', 'g']]",1,def split_lowerstring(string):\n return [word.lower() for word in string.split()],False


In [4]:
# Basic statistics
if 'test_passed' in df.columns:
    n_correct = df['test_passed'].sum()
    n_incorrect = (~df['test_passed']).sum()
    n_total = len(df)
    pass_rate = n_correct/n_total*100 if n_total > 0 else 0
    
    print("=== Dataset Statistics ===")
    print(f"Total tasks: {n_total}")
    print(f"Correct solutions: {n_correct} ({pass_rate:.1f}%)")
    print(f"Incorrect solutions: {n_incorrect} ({100-pass_rate:.1f}%)")
else:
    print("No 'test_passed' column found in the dataset")

=== Dataset Statistics ===
Total tasks: 30
Correct solutions: 9 (30.0%)
Incorrect solutions: 21 (70.0%)


In [5]:
# Show sample of correct and incorrect solutions
if 'test_passed' in df.columns and 'generated_code' in df.columns:
    print("=== Sample Correct Solution ===")
    correct_sample = df[df['test_passed'] == True].iloc[0] if any(df['test_passed']) else None
    if correct_sample is not None:
        print(f"Task ID: {correct_sample['task_id']}")
        print(f"Problem: {correct_sample['text'][:200]}..." if len(correct_sample['text']) > 200 else correct_sample['text'])
        print(f"\nGenerated Code:\n{correct_sample['generated_code']}")
    
    print("\n=== Sample Incorrect Solution ===")
    incorrect_sample = df[df['test_passed'] == False].iloc[0] if any(~df['test_passed']) else None
    if incorrect_sample is not None:
        print(f"Task ID: {incorrect_sample['task_id']}")
        print(f"Problem: {incorrect_sample['text'][:200]}..." if len(incorrect_sample['text']) > 200 else incorrect_sample['text'])
        print(f"\nGenerated Code:\n{incorrect_sample['generated_code']}")

=== Sample Correct Solution ===
Task ID: 7
Write a function to find all words which are at least 4 characters long in a string by using regex.

Generated Code:
def find_char_long(string):
    return [word for word in string.split() if len(word) >= 4]

=== Sample Incorrect Solution ===
Task ID: 2
Write a function to find the similar elements from the given two tuple lists.

Generated Code:
def similar_elements(list1, list2):
    return [x for x in list1 if x in list2]


In [6]:
# Check activation files
activation_dir = data_dir / "activations"
if activation_dir.exists():
    correct_activations = list((activation_dir / "correct").glob("*.npz"))
    incorrect_activations = list((activation_dir / "incorrect").glob("*.npz"))
    
    print("=== Activation Files ===")
    print(f"Correct activations: {len(correct_activations)} files")
    print(f"Incorrect activations: {len(incorrect_activations)} files")
    
    # Show sample of activation filenames
    if correct_activations:
        print("\nSample correct activation files:")
        for file in correct_activations[:5]:
            print(f"  - {file.name}")
else:
    print("No activations directory found")

=== Activation Files ===
Correct activations: 225 files
Incorrect activations: 525 files

Sample correct activation files:
  - 46_layer_16.npz
  - 34_layer_21.npz
  - 41_layer_16.npz
  - 23_layer_13.npz
  - 49_layer_1.npz


In [7]:
# Data types and missing values
print("=== Column Data Types ===")
print(df.dtypes)
print("\n=== Missing Values ===")
print(df.isnull().sum())

=== Column Data Types ===
task_id                   int64
text                     object
code                     object
test_list                object
cyclomatic_complexity     int64
generated_code           object
test_passed                bool
dtype: object

=== Missing Values ===
task_id                  0
text                     0
code                     0
test_list                0
cyclomatic_complexity    0
generated_code           0
test_passed              0
dtype: int64


In [11]:
# Unit test: Verify the CURRENT prompt builder doesn't add "# Your code here:"
import sys
sys.path.append('..')
from common.prompt_utils import PromptBuilder

print("=== UNIT TEST: Current PromptBuilder ===\n")

# Test case 1: Simple problem
test_problem = "Write a function to add two numbers"
test_cases = "assert add(1, 2) == 3\nassert add(5, 7) == 12"

prompt = PromptBuilder.build_prompt(
    problem_description=test_problem,
    test_cases=test_cases
)

print("Test 1 - Generated prompt:")
print("-" * 50)
print(prompt)
print("-" * 50)

# Check if unwanted text is present
if "# Your code here:" in prompt:
    print("❌ FAIL: Found '# Your code here:' in prompt")
else:
    print("✅ PASS: No '# Your code here:' in prompt")

print("\n" + "="*50 + "\n")

# Test case 2: Using actual MBPP data
if len(df) > 0:
    sample = df.iloc[0]
    test_cases_str = '\n'.join(sample['test_list']) if isinstance(sample['test_list'], list) else str(sample['test_list'])
    
    prompt2 = PromptBuilder.build_prompt(
        problem_description=sample['text'],
        test_cases=test_cases_str
    )
    
    print("Test 2 - MBPP problem prompt:")
    print("-" * 50)
    print(prompt2)
    print("-" * 50)
    
    if "# Your code here:" in prompt2:
        print("❌ FAIL: Found '# Your code here:' in prompt")
    else:
        print("✅ PASS: No '# Your code here:' in prompt")

print("\n=== UNIT TEST COMPLETE ===")
print("The current PromptBuilder is working correctly.")
print("Any '# Your code here:' you see in the generated_code column")
print("is from OLD data that was generated with a previous template version.")

=== UNIT TEST: Current PromptBuilder ===

Test 1 - Generated prompt:
--------------------------------------------------
Write a function to add two numbers

assert add(1, 2) == 3
assert add(5, 7) == 12

# Solution:
--------------------------------------------------
✅ PASS: No '# Your code here:' in prompt


Test 2 - MBPP problem prompt:
--------------------------------------------------
Write a function to find the similar elements from the given two tuple lists.

['assert similar_elements((3, 4, 5, 6),(5, 7, 4, 10)) == (4, 5)'
 'assert similar_elements((1, 2, 3, 4),(5, 4, 3, 7)) == (3, 4)'
 'assert similar_elements((11, 12, 14, 13),(17, 15, 14, 13)) == (13, 14)']

# Solution:
--------------------------------------------------
✅ PASS: No '# Your code here:' in prompt

=== UNIT TEST COMPLETE ===
The current PromptBuilder is working correctly.
Any '# Your code here:' you see in the generated_code column
is from OLD data that was generated with a previous template version.


In [12]:
# Let's check if the data was generated with an old template
# by looking at multiple examples for the pattern

print("=== Checking for '# Your code here:' in generated code ===\n")

pattern_count = 0
total_count = 0

for idx, row in df.iterrows():
    total_count += 1
    if 'generated_code' in row and isinstance(row['generated_code'], str):
        if "# Your code here:" in row['generated_code']:
            pattern_count += 1
            if pattern_count <= 3:  # Show first 3 examples
                print(f"Task {row['task_id']}: Found '# Your code here:' in generated code")
                print(f"Generated code snippet: {row['generated_code'][:200]}...")
                print("-" * 50)

print(f"\n=== Summary ===")
print(f"Total tasks: {total_count}")
print(f"Tasks with '# Your code here:': {pattern_count}")
print(f"Percentage: {pattern_count/total_count*100:.1f}%")

if pattern_count > 0:
    print("\n⚠️  The data appears to have been generated with an old prompt template")
    print("that included '# Your code here:' as a code initiator.")
    print("The model learned to include this in its output.")
    print("\nTo fix this, Phase 1 needs to be re-run with the updated prompt template.")

=== Checking for '# Your code here:' in generated code ===


=== Summary ===
Total tasks: 30
Tasks with '# Your code here:': 0
Percentage: 0.0%


In [13]:
# Test prompt builder with multiple examples
from common.prompt_utils import PromptBuilder
import numpy as np

print("=== Testing Prompt Builder with Multiple Examples ===\n")

for i in range(min(3, len(df))):  # Test first 3 problems
    sample = df.iloc[i]
    
    # Get test cases
    test_cases = sample.get('test_list', [])
    
    # Handle if test_cases is a numpy array or list
    if isinstance(test_cases, np.ndarray):
        test_cases = test_cases.tolist()
    
    if test_cases and len(test_cases) > 0:
        test_cases_str = '\n'.join(test_cases)
    else:
        test_cases_str = "# No test cases provided"
    
    # Build prompt
    prompt = PromptBuilder.build_prompt(
        problem_description=sample['text'],
        test_cases=test_cases_str
    )
    
    print(f"Example {i+1}:")
    print(f"Task ID: {sample['task_id']}")
    print(f"Problem: {sample['text'][:100]}..." if len(sample['text']) > 100 else sample['text'])
    print(f"\nPrompt Generated:")
    print("```")
    print(prompt)
    print("```")
    print("\n" + "="*70 + "\n")

=== Testing Prompt Builder with Multiple Examples ===

Example 1:
Task ID: 2
Write a function to find the similar elements from the given two tuple lists.

Prompt Generated:
```
Write a function to find the similar elements from the given two tuple lists.

assert similar_elements((3, 4, 5, 6),(5, 7, 4, 10)) == (4, 5)
assert similar_elements((1, 2, 3, 4),(5, 4, 3, 7)) == (3, 4)
assert similar_elements((11, 12, 14, 13),(17, 15, 14, 13)) == (13, 14)

# Solution:
```


Example 2:
Task ID: 3
Write a python function to identify non-prime numbers.

Prompt Generated:
```
Write a python function to identify non-prime numbers.

assert is_not_prime(2) == False
assert is_not_prime(10) == True
assert is_not_prime(35) == True

# Solution:
```


Example 3:
Task ID: 4
Problem: Write a function to find the largest integers from a given list of numbers using heap queue algorith...

Prompt Generated:
```
Write a function to find the largest integers from a given list of numbers using heap queue algorithm