# Phase 1 Dataset Checker

This notebook loads the most recent Phase 1 parquet output file and displays:
- The first 10 rows of data
- Dataset statistics (pass/fail rates)
- Sample correct and incorrect code solutions
- Activation file counts
- Data types and missing values

In [1]:
import pandas as pd
from pathlib import Path
import glob
from datetime import datetime

# Phase 1 output directory, given relative to this notebook's working directory
data_dir = Path("..") / "data" / "phase1_0"
print(f"Looking for parquet files in: {data_dir.absolute()}")

Looking for parquet files in: /Users/krizroycetahimic/Documents/Thesis/Code/pva_sae/phase1_simplified/../data/phase1_0


In [2]:
# Find all parquet files, most recently modified first
parquet_files = sorted(
    data_dir.glob("*.parquet"),
    key=lambda path: path.stat().st_mtime,
    reverse=True,
)

# Always (re)bind latest_file. Previously it was only assigned when files were
# found, so re-running this cell against an empty directory left a stale value
# from an earlier run in the kernel, which later cells would use without complaint.
# It is None when no parquet file exists.
latest_file = parquet_files[0] if parquet_files else None

if latest_file is None:
    print("No parquet files found!")
else:
    print(f"Found {len(parquet_files)} parquet files:")
    for i, file in enumerate(parquet_files[:5], start=1):  # Show top 5 most recent
        mtime = datetime.fromtimestamp(file.stat().st_mtime)
        print(f"{i}. {file.name} (modified: {mtime.strftime('%Y-%m-%d %H:%M:%S')})")

    # The list is sorted newest-first, so index 0 is the file used from here on
    print(f"\nUsing latest file: {latest_file.name}")

Found 10 parquet files:
1. dataset_sae_20250629_161228.parquet (modified: 2025-06-29 16:12:28)
2. dataset_sae_20250629_154004.parquet (modified: 2025-06-29 15:40:04)
3. dataset_sae_20250629_152134.parquet (modified: 2025-06-29 15:21:34)
4. dataset_sae_20250629_151007.parquet (modified: 2025-06-29 15:10:07)
5. dataset_sae_20250629_150145.parquet (modified: 2025-06-29 15:01:45)

Using latest file: dataset_sae_20250629_161228.parquet


In [3]:
# Read the newest parquet file and summarise its layout
df = pd.read_parquet(latest_file)
column_names = df.columns.tolist()
print(
    f"Dataset shape: {df.shape}",
    f"Columns: {column_names}",
    "\nFirst 10 rows:",
    sep="\n",
)
# Keep this as the last expression so the notebook renders the table
df.head(10)

Dataset shape: (3, 7)
Columns: ['task_id', 'text', 'code', 'test_list', 'cyclomatic_complexity', 'generated_code', 'test_passed']

First 10 rows:


Unnamed: 0,task_id,text,code,test_list,cyclomatic_complexity,generated_code,test_passed
0,2,Write a function to find the similar elements ...,"def similar_elements(test_tup1, test_tup2):\r\...","[assert similar_elements((3, 4, 5, 6),(5, 7, 4...",1,"def similar_elements(list1, list2):\n # You...",False
1,3,Write a python function to identify non-prime ...,import math\r\ndef is_not_prime(n):\r\n res...,"[assert is_not_prime(2) == False, assert is_no...",3,def is_not_prime(n):\n if n == 1:\n ...,True
2,4,Write a function to find the largest integers ...,import heapq as hq\r\ndef heap_queue_largest(n...,"[assert heap_queue_largest( [25, 35, 22, 85, 1...",1,"def heap_queue_largest(numbers, k):\n # You...",False


In [None]:
# Basic statistics: how many generated solutions passed / failed their tests
if 'test_passed' in df.columns:
    outcomes = df['test_passed']
    n_total = len(df)

    # Compare against True / False explicitly instead of negating with `~`.
    # `~` is only a logical NOT on a genuine bool column; on an object-dtype
    # column (e.g. one that contains missing values) it miscounts or raises.
    n_correct = int(outcomes.eq(True).sum())
    n_incorrect = int(outcomes.eq(False).sum())
    n_missing = n_total - n_correct - n_incorrect

    # Each rate comes from its own count. Deriving the fail rate as
    # `100 - pass_rate` reported 100.0% failures for an empty dataset.
    pass_rate = n_correct / n_total * 100 if n_total > 0 else 0
    fail_rate = n_incorrect / n_total * 100 if n_total > 0 else 0

    print("=== Dataset Statistics ===")
    print(f"Total tasks: {n_total}")
    print(f"Correct solutions: {n_correct} ({pass_rate:.1f}%)")
    print(f"Incorrect solutions: {n_incorrect} ({fail_rate:.1f}%)")
    if n_missing:
        # Only shown when some rows have neither True nor False recorded
        print(f"Missing test results: {n_missing}")
else:
    print("No 'test_passed' column found in the dataset")

In [None]:
# Show sample of correct and incorrect solutions
def _print_sample(sample, max_chars=200):
    """Print one dataset row: task id, problem statement and generated code.

    Args:
        sample: A row of ``df``; its 'task_id', 'text' and 'generated_code'
            entries are read.
        max_chars: Problem statements longer than this are cut to this many
            characters and suffixed with '...'.
    """
    problem = sample['text']
    if len(problem) > max_chars:
        problem = f"{problem[:max_chars]}..."
    print(f"Task ID: {sample['task_id']}")
    # Truncate first, then add the label. The original applied the conditional
    # to the whole print argument, so short problems lost the "Problem: " prefix.
    print(f"Problem: {problem}")
    print(f"\nGenerated Code:\n{sample['generated_code']}")


if 'test_passed' in df.columns and 'generated_code' in df.columns:
    for heading, outcome in (
        ("=== Sample Correct Solution ===", True),
        ("\n=== Sample Incorrect Solution ===", False),
    ):
        print(heading)
        # Select by explicit comparison rather than `~`, which is only a
        # logical NOT on a genuine bool column.
        matching_rows = df[df['test_passed'] == outcome]
        if not matching_rows.empty:
            _print_sample(matching_rows.iloc[0])

In [None]:
# Count the saved activation files for correct and incorrect solutions
activation_dir = data_dir / "activations"
if not activation_dir.exists():
    print("No activations directory found")
else:
    correct_activations = [*activation_dir.joinpath("correct").glob("*.npz")]
    incorrect_activations = [*activation_dir.joinpath("incorrect").glob("*.npz")]

    print("=== Activation Files ===")
    print(f"Correct activations: {len(correct_activations)} files")
    print(f"Incorrect activations: {len(incorrect_activations)} files")

    # List up to five filenames from the "correct" folder as a spot check
    if len(correct_activations) > 0:
        print("\nSample correct activation files:")
        for file in correct_activations[:5]:
            print(f"  - {file.name}")

In [None]:
# Per-column dtype and number of missing entries
dtype_report = df.dtypes
null_counts = df.isna().sum()

print("=== Column Data Types ===", dtype_report, sep="\n")
print("\n=== Missing Values ===", null_counts, sep="\n")