# Phase 0.1 Split Inspection (Simple Version)

This notebook takes a simpler approach: it first changes the working directory to the project root, so that every data path below can be written relative to it.

In [9]:
# Setup and change to project root
import os
import pandas as pd
import json
from pathlib import Path
from IPython.display import display, Markdown

# Change to the project root if the kernel was started inside the
# 'phase0_1_problem_splitting' directory, or anywhere below it.
#
# The marker is matched against whole path components rather than against the
# raw path string, so an unrelated ancestor whose name merely contains the
# marker text no longer triggers a chdir, and a working directory nested more
# than one level deep still ends up in the marker directory's parent instead
# of just one level up.
# NOTE(review): this assumes the project root is the direct parent of
# 'phase0_1_problem_splitting' -- confirm against the repository layout.
current_dir = os.getcwd()
_cwd_path = Path(current_dir)
_notebook_dir = next(
    (
        candidate
        for candidate in (_cwd_path, *_cwd_path.parents)
        if candidate.name == 'phase0_1_problem_splitting'
    ),
    None,
)

if _notebook_dir is not None:
    os.chdir(_notebook_dir.parent)
    print(f"Changed from: {current_dir}")
    print(f"Changed to: {os.getcwd()}")
else:
    # Also reached when the cell is re-run after a successful chdir.
    print(f"Already in: {current_dir}")

# Configure pandas display options in one place: (option name, value) pairs
# are applied in order through pd.set_option.
for _option_name, _option_value in (
    ('display.max_columns', None),
    ('display.max_colwidth', 150),
    ('display.width', None),
):
    pd.set_option(_option_name, _option_value)

# Helper function to truncate text
def truncate_text(text, max_length=150):
    """Shorten an over-long string for display.

    A string longer than ``max_length`` is cut down to its first
    ``max_length`` characters and suffixed with '...', so the returned
    string is ``max_length + 3`` characters long. Any other value (a
    string that already fits, or a non-string object) is returned unchanged.
    """
    # Guard clause: nothing to do for non-strings or strings that already fit.
    if not isinstance(text, str) or len(text) <= max_length:
        return text
    return text[:max_length] + '...'

Already in: /Users/krizroycetahimic/Documents/Thesis/Code/pva_sae


In [10]:
# Load and display splits
#
# For each expected split file this cell shows the record count, the mean/std
# of the 'cyclomatic_complexity' column (when present) and the first three
# rows. Per-split complexity statistics are collected in `summary_data`.
phase0_1_dir = Path('data/phase0_1')
print(f"Looking in: {phase0_1_dir.absolute()}\n")

split_files = {
    'SAE': 'sae_mbpp.parquet',
    'Hyperparameters': 'hyperparams_mbpp.parquet',
    'Validation': 'validation_mbpp.parquet'
}

# Check what files exist
if phase0_1_dir.is_dir():
    print("Files in phase0_1 directory:")
    # iterdir() yields entries in arbitrary order; sort them so that re-runs
    # produce the same listing.
    for f in sorted(phase0_1_dir.iterdir()):
        print(f"  - {f.name}")
    print()
else:
    # Report the missing directory once instead of leaving the reader to infer
    # it from the per-split "not found" messages below.
    print(f"✗ Directory not found: {phase0_1_dir.absolute()}\n")

# Store summary data for comparison
summary_data = []

# Columns whose values are stringified and shortened before display
text_columns = ['text', 'code', 'test_list']

# Load each split
for split_name, filename in split_files.items():
    file_path = phase0_1_dir / filename

    if not file_path.exists():
        print(f"\n✗ {split_name} split not found: {filename}")
        continue

    display(Markdown(f"## {split_name} Split"))

    df = pd.read_parquet(file_path)

    # These lines use Markdown bold syntax, so they are rendered through
    # Markdown() below; print() would show the asterisks literally.
    overview_lines = [
        f"**File:** {filename}",
        f"**Total records:** {len(df)}",
    ]

    # Calculate and store summary statistics
    if 'cyclomatic_complexity' in df.columns:
        complexity_stats = df['cyclomatic_complexity'].describe()
        summary_data.append({
            'Split': split_name,
            'Count': len(df),
            'Mean Complexity': complexity_stats['mean'],
            'Std Complexity': complexity_stats['std'],
            'Min Complexity': complexity_stats['min'],
            'Max Complexity': complexity_stats['max']
        })
        overview_lines.append(
            f"**Complexity:** mean={complexity_stats['mean']:.2f}, std={complexity_stats['std']:.2f}"
        )

    # Two trailing spaces before the newline force a Markdown line break.
    display(Markdown("  \n".join(overview_lines)))

    # Display first 3 records with ALL columns; work on a copy so the loaded
    # frame itself is not modified by the truncation.
    display_df = df.head(3).copy()

    # Truncate long text columns for better display
    for col in text_columns:
        if col in display_df.columns:
            display_df[col] = display_df[col].apply(lambda x: truncate_text(str(x), 150))

    display(Markdown("### First 3 Records (All Columns):"))
    display(display_df)
    print()  # Add spacing

Looking in: /Users/krizroycetahimic/Documents/Thesis/Code/pva_sae/data/phase0_1

Files in phase0_1 directory:
  - split_metadata.json
  - hyperparams_mbpp.parquet
  - timestamp.txt
  - validation_mbpp.parquet
  - sae_mbpp.parquet



## SAE Split

**File:** sae_mbpp.parquet
**Total records:** 489
**Complexity:** mean=2.73, std=1.90



### First 3 Records (All Columns):

Unnamed: 0,task_id,text,code,test_list,cyclomatic_complexity
0,2,Write a function to find the similar elements from the given two tuple lists.,"def similar_elements(test_tup1, test_tup2):\r\n res = tuple(set(test_tup1) & set(test_tup2))\r\n return (res)","['assert similar_elements((3, 4, 5, 6),(5, 7, 4, 10)) == (4, 5)'\n 'assert similar_elements((1, 2, 3, 4),(5, 4, 3, 7)) == (3, 4)'\n 'assert simila...",1
1,3,Write a python function to identify non-prime numbers.,"import math\r\ndef is_not_prime(n):\r\n result = False\r\n for i in range(2,int(math.sqrt(n)) + 1):\r\n if n % i == 0:\r\n ...",['assert is_not_prime(2) == False' 'assert is_not_prime(10) == True'\n 'assert is_not_prime(35) == True'],3
2,4,Write a function to find the largest integers from a given list of numbers using heap queue algorithm.,"import heapq as hq\r\ndef heap_queue_largest(nums,n):\r\n largest_nums = hq.nlargest(n, nums)\r\n return largest_nums","['assert heap_queue_largest( [25, 35, 22, 85, 14, 65, 75, 22, 58],3)==[85, 75, 65] '\n 'assert heap_queue_largest( [25, 35, 22, 85, 14, 65, 75, 22...",1





## Hyperparameters Split

**File:** hyperparams_mbpp.parquet
**Total records:** 97
**Complexity:** mean=3.38, std=2.04



### First 3 Records (All Columns):

Unnamed: 0,task_id,text,code,test_list,cyclomatic_complexity
0,28,Write a python function to find binomial co-efficient.,"def binomial_Coeff(n,k): \r\n if k > n : \r\n return 0\r\n if k==0 or k ==n : \r\n return 1 \r\n return binomial_Coeff(n-1,k...","['assert binomial_Coeff(5,2) == 10' 'assert binomial_Coeff(4,3) == 4'\n 'assert binomial_Coeff(3,2) == 3']",4
1,42,Write a python function to find the sum of repeated elements in a given array.,"def find_Sum(arr,n): \r\n return sum([x for x in arr if arr.count(x) > 1])","['assert find_Sum([1,2,3,1,1,4,5,6],8) == 3'\n 'assert find_Sum([1,2,3,1,1],5) == 3' 'assert find_Sum([1,1,2],3) == 2']",3
2,53,Write a python function to check whether the first and last characters of a given string are equal or not.,"def check_Equality(str):\r\n if (str[0] == str[-1]): \r\n return (""Equal"") \r\n else: \r\n return (""Not Equal"")","['assert check_Equality(""abcda"") == ""Equal""'\n 'assert check_Equality(""ab"") == ""Not Equal""'\n 'assert check_Equality(""mad"") == ""Not Equal""']",2





## Validation Split

**File:** validation_mbpp.parquet
**Total records:** 388
**Complexity:** mean=2.86, std=2.15



### First 3 Records (All Columns):

Unnamed: 0,task_id,text,code,test_list,cyclomatic_complexity
0,1,"Write a function to find the minimum cost path to reach (m, n) from (0, 0) for the given cost matrix cost[][] and a position (m, n) in cost[][].","R = 3\r\nC = 3\r\ndef min_cost(cost, m, n): \r\n\ttc = [[0 for x in range(C)] for x in range(R)] \r\n\ttc[0][0] = cost[0][0] \r\n\tfor i in range(...","['assert min_cost([[1, 2, 3], [4, 8, 2], [1, 5, 3]], 2, 2) == 8'\n 'assert min_cost([[2, 3, 4], [5, 9, 3], [2, 6, 4]], 2, 2) == 12'\n 'assert min_...",7
1,5,Write a function to find the number of ways to fill it with 2 x 1 dominoes for the given 3 x n board.,"def count_ways(n): \r\n\tA = [0] * (n + 1) \r\n\tB = [0] * (n + 1) \r\n\tA[0] = 1\r\n\tA[1] = 0\r\n\tB[0] = 0\r\n\tB[1] = 1\r\n\tfor i in range(2,...",['assert count_ways(2) == 3' 'assert count_ways(8) == 153'\n 'assert count_ways(12) == 2131'],2
2,8,Write a function to find squares of individual elements in a list using lambda function.,"def square_nums(nums):\r\n square_nums = list(map(lambda x: x ** 2, nums))\r\n return square_nums","['assert square_nums([1, 2, 3, 4, 5, 6, 7, 8, 9, 10])==[1, 4, 9, 16, 25, 36, 49, 64, 81, 100]'\n 'assert square_nums([10,20,30])==([100,400,900])'...",1





In [11]:
# Show all available columns from one split
#
# The first split file that exists on disk is loaded and summarised as one
# row per column: name, dtype, non-null count and a shortened sample value
# taken from the first record.
display(Markdown("## Available Columns"))

# Use the first available split to show column information
for split_name, filename in split_files.items():
    file_path = phase0_1_dir / filename
    if not file_path.exists():
        continue

    df = pd.read_parquet(file_path)

    if df.empty:
        # An empty split has no first row; show blank samples instead of
        # letting .iloc[0] raise IndexError.
        sample_values = [''] * len(df.columns)
    else:
        # Reuse the notebook's truncation helper: at most 50 characters,
        # followed by '...' when the value was cut.
        sample_values = [truncate_text(str(df[col].iloc[0]), 50) for col in df.columns]

    # Create column info DataFrame
    col_info = pd.DataFrame({
        'Column': df.columns,
        'Type': [str(df[col].dtype) for col in df.columns],
        'Non-Null Count': [df[col].notna().sum() for col in df.columns],
        'Sample Value': sample_values
    })

    print(f"Columns from {split_name} split:")
    display(col_info)
    break
else:
    # Reached only when the loop never hit `break`, i.e. no split file exists.
    print("No split files found; cannot show column information.")

## Available Columns

Columns from SAE split:


Unnamed: 0,Column,Type,Non-Null Count,Sample Value
0,task_id,int64,489,2
1,text,object,489,Write a function to find the similar elements from...
2,code,object,489,"def similar_elements(test_tup1, test_tup2):\r\n res..."
3,test_list,object,489,"['assert similar_elements((3, 4, 5, 6),(5, 7, 4, 1..."
4,cyclomatic_complexity,int64,489,1


In [8]:
# Load and display metadata
#
# Reads 'split_metadata.json' from the phase 0.1 directory and shows one row
# per split (size, ratio, complexity statistics) plus the dataset totals.
# Keys read from the file: 'split_sizes', 'split_ratios',
# 'split_complexity_stats', 'total_problems', 'complexity_range' and
# 'creation_timestamp'.
metadata_file = phase0_1_dir / 'split_metadata.json'
if metadata_file.exists():
    # Read as UTF-8 explicitly rather than relying on the platform's default
    # text encoding.
    with open(metadata_file, 'r', encoding='utf-8') as f:
        metadata = json.load(f)

    display(Markdown("## Metadata Summary"))

    # Create metadata summary DataFrame
    metadata_rows = []
    for split, size in metadata['split_sizes'].items():
        ratio = metadata['split_ratios'][split]
        stats = metadata['split_complexity_stats'][split]
        metadata_rows.append({
            'Split': split,
            'Size': size,
            # The stored ratio is scaled by 100 for display as a percentage.
            'Ratio (%)': f"{ratio*100:.1f}",
            'Complexity Mean': stats['mean'],
            'Complexity Std': stats['std'],
            'Complexity Min': stats['min'],
            'Complexity Max': stats['max']
        })

    metadata_df = pd.DataFrame(metadata_rows)
    print(f"Total problems: {metadata['total_problems']}")
    print(f"Complexity range: [{metadata['complexity_range'][0]:.2f}, {metadata['complexity_range'][1]:.2f}]")
    print(f"Creation timestamp: {metadata['creation_timestamp']}\n")

    display(metadata_df)
else:
    # Report the missing file explicitly, as the split loader does for
    # missing splits, instead of producing no output at all.
    print(f"\n✗ Metadata file not found: {metadata_file}")

## Metadata Summary

Total problems: 974
Complexity range: [1.00, 16.00]
Creation timestamp: 2025-06-16T11:32:36.163698



Unnamed: 0,Split,Size,Ratio (%),Complexity Mean,Complexity Std,Complexity Min,Complexity Max
0,sae,489,50.2,2.728016,1.89846,1.0,12.0
1,hyperparams,97,10.0,3.381443,2.043502,1.0,12.0
2,validation,388,39.8,2.85567,2.145795,1.0,16.0
