In [3]:
import glob
import os
from pathlib import Path

import pandas as pd

In [4]:
# Paste the parquet file path here, or leave empty to auto-find latest
file_path = ""  # Replace with your parquet file path, or leave empty for auto-discovery

# A pasted path often carries stray surrounding whitespace or a trailing
# newline. Normalise it once so the emptiness check and the existence check
# at the bottom of this cell both operate on the same value.
file_path = file_path.strip()

if file_path:
    print(f"📁 Using specified file: {Path(file_path).name}")
else:
    # Auto-discovery: search for all Phase 1 dataset parquet files
    datasets_dir = "../data/phase1/"
    pattern = os.path.join(datasets_dir, "dataset_*.parquet")
    matching_files = glob.glob(pattern)

    if not matching_files:
        raise FileNotFoundError(f"No Phase 1 dataset files found in {datasets_dir}")

    # "Latest" means most recently modified on disk, not newest by file name
    latest_file = max(matching_files, key=os.path.getmtime)
    file_path = latest_file
    last_modified = pd.Timestamp.fromtimestamp(os.path.getmtime(file_path))
    print(f"🔍 Auto-discovered latest dataset: {Path(file_path).name}")
    print(f"📅 Last modified: {last_modified.strftime('%Y-%m-%d %H:%M:%S')}")

# Verify the chosen path exists before later cells try to read it
if not os.path.exists(file_path):
    raise FileNotFoundError(f"File not found: {file_path}")

🔍 Auto-discovered latest dataset: dataset_gemma-2-9b_2025-06-08_19-59-38.parquet
📅 Last modified: 2025-06-08 19:59:38


In [5]:
# Load parquet file and display basic info
df = pd.read_parquet(file_path)

# Always formatting the size in MB rounds small files down to "0.00 MB",
# which tells the reader nothing; fall back to KB below one megabyte.
file_size_bytes = os.path.getsize(file_path)
if file_size_bytes < 1024**2:
    file_size_text = f"{file_size_bytes / 1024:.2f} KB"
else:
    file_size_text = f"{file_size_bytes / (1024**2):.2f} MB"

print("Dataset Information:")
print(f"Number of records: {len(df):,}")
print(f"Number of columns: {len(df.columns)}")
print(f"Columns: {list(df.columns)}")
print(f"File size: {file_size_text}")

Dataset Information:
Number of records: 10
Number of columns: 4
Columns: ['task_id', 'generated_code', 'test_passed', 'complexity_score']
File size: 0.00 MB


In [6]:
# Display column types and non-null counts
print("Column Information:")
# DataFrame.info() writes its report to stdout itself and returns None, so
# wrapping it in print() would append a stray "None" line to the output.
df.info()
print("\nData types:")
print(df.dtypes)

Column Information:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10 entries, 0 to 9
Data columns (total 4 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   task_id           10 non-null     int64 
 1   generated_code    10 non-null     object
 2   test_passed       10 non-null     bool  
 3   complexity_score  10 non-null     int64 
dtypes: bool(1), int64(2), object(1)
memory usage: 378.0+ bytes
None

Data types:
task_id              int64
generated_code      object
test_passed           bool
complexity_score     int64
dtype: object


In [7]:
# Preview the top of the dataset; the trailing bare expression lets the
# notebook render the frame as a rich table instead of plain text.
print("First 10 records:")
first_records = df.head(10)
first_records

First 10 records:


Unnamed: 0,task_id,generated_code,test_passed,complexity_score
0,1,"\ndef min_cost(cost, m, n):\n # Write your ...",False,7
1,2,"\ndef similar_elements(list1, list2):\n # W...",False,1
2,3,\ndef is_not_prime(n):\n if n == 2:\n ...,True,3
3,4,"\ndef heap_queue_largest(nums, k):\n # Your...",False,1
4,5,\ndef count_ways(n):\n if n == 2:\n ...,True,2
5,6,"\ndef differ_At_One_Bit_Pos(a,b):\n return ...",False,2
6,7,\ndef find_char_long(string):\n # Write you...,False,1
7,8,\ndef square_nums(nums):\n return list(map(...,True,1
8,9,\ndef find_Rotations(s):\n # Write your cod...,False,3
9,10,"\ndef small_nnum(dataset, n):\n # Your code...",False,1


In [8]:
# Phase 1 Dataset Summary
print("=" * 40)
print("PHASE 1 DATASET SUMMARY")
print("=" * 40)

# Basic statistics
total_records = len(df)
# Summing the column counts the truthy entries; int() turns the NumPy scalar
# into a plain Python integer for formatting and arithmetic.
passed_tests = int(df['test_passed'].sum())
# An empty dataset has no meaningful pass rate; report 0.0 rather than
# dividing by zero (which would print "nan%" or raise).
pass_percentage = (passed_tests / total_records) * 100 if total_records else 0.0

print(f"\nTotal records: {total_records:,}")
print(f"Test pass ratio: {passed_tests}/{total_records}")
print(f"Test pass percentage: {pass_percentage:.1f}%")

PHASE 1 DATASET SUMMARY

Total records: 10
Test pass ratio: 3/10
Test pass percentage: 30.0%
