In [ ]:
# One-time environment bootstrap: editable install of the parent project,
# followed by a sys.path fallback so imports resolve either way.
import os
import subprocess
import sys

# Invoke pip through the interpreter running this kernel (sys.executable).
pip_command = [sys.executable, "-m", "pip", "install", "-e", ".."]
try:
    subprocess.check_call(pip_command)
except subprocess.CalledProcessError as e:
    # A non-zero pip exit status is reported but does not stop the cell.
    print(f"❌ Installation failed: {e}")
else:
    print("✓ Project installed successfully")

# Alternative that always runs: put the parent directory on sys.path (once).
project_root = os.path.abspath(os.path.join(os.getcwd(), os.pardir))
already_importable = project_root in sys.path
if not already_importable:
    sys.path.insert(0, project_root)
    print(f"✓ Added to Python path: {project_root}")

In [None]:
# Setup - Run this cell first
import sys
import os

# Add project root to Python path. Guarded so that re-running this cell does
# not keep prepending duplicate entries.
project_root = os.path.abspath('..')
if project_root not in sys.path:
    sys.path.insert(0, project_root)

# Import required libraries
import pandas as pd
import duckdb
from datetime import datetime
import json

# Import our Iceberg manager using importlib, because the file name starts
# with a digit and therefore cannot be used in a regular `import` statement.
import importlib.util

# Anchor the script path on project_root so it does not depend on the working
# directory staying unchanged after this point.
iceberg_script_path = os.path.join(project_root, "scripts", "04_iceberg_creation.py")
spec = importlib.util.spec_from_file_location("iceberg_creation", iceberg_script_path)
if spec is None or spec.loader is None:
    raise ImportError(f"Cannot build an import spec for {iceberg_script_path}")

iceberg_module = importlib.util.module_from_spec(spec)
# Register the module before executing it, as the importlib recipe for loading
# a source file directly does, so code that looks the module up by name in
# sys.modules can find it.
sys.modules[spec.name] = iceberg_module
try:
    spec.loader.exec_module(iceberg_module)
except BaseException:
    # Do not leave a half-initialised module registered if execution failed.
    sys.modules.pop(spec.name, None)
    raise
IcebergManager = iceberg_module.IcebergManager

print("✅ Setup complete!")

In [ ]:
# Build the manager object that the later cells of this notebook use
manager = IcebergManager()
print("✅ Iceberg manager initialized")

# Echo the warehouse path exposed by the manager
warehouse_location = manager.warehouse_path
print(f"📁 Warehouse: {warehouse_location}")

In [None]:
# Ask the manager to create the Iceberg table and keep the name it returns;
# later cells load the table through `table_name`.
try:
    table_name = manager.create_iceberg_table()
except Exception as exc:
    # Report the failure, then re-raise so the notebook run stops here.
    print(f"❌ Table creation failed: {exc}")
    raise
else:
    print(f"✓ Created Iceberg table: {table_name}")

## 3. Table Information and Schema

In [None]:
# Get basic table information: location, schema fields and partition fields
try:
    table = manager.catalog.load_table(table_name)

    # Fetch schema and partition fields once and reuse them below
    schema_fields = table.schema().fields
    partition_fields = table.spec().fields

    print("📊 ICEBERG TABLE INFORMATION")
    print("=" * 50)
    print(f"📍 Location: {table.location()}")
    print(f"📋 Schema fields: {len(schema_fields)}")
    print(f"🗂️ Partition fields: {len(partition_fields)}")

    # Current snapshot (only printed when one is returned)
    current_snapshot = table.current_snapshot()
    if current_snapshot:
        print(f"📸 Current Snapshot: {current_snapshot.snapshot_id}")

    print("\n📋 SCHEMA:")
    for position, field in enumerate(schema_fields, start=1):
        print(f"  {position}. {field.name}: {field.field_type}")

    print("\n🗂️ PARTITIONING:")
    if len(partition_fields) > 0:
        for field in partition_fields:
            print(f"  - {field}")
    else:
        print("  No partitioning (unpartitioned table)")

except Exception as e:
    print(f"❌ Failed to get table info: {e}")
    # Print the full traceback, matching the error handling of the query,
    # schema-evolution and time-travel cells in this notebook.
    import traceback
    traceback.print_exc()

## 4. Query the Iceberg Table

In [None]:
# Query the Iceberg table directly through PyIceberg
try:
    table = manager.catalog.load_table(table_name)

    # Basic count query: materialise the whole scan as Arrow and count rows
    scan = table.scan()
    arrow_table = scan.to_arrow()
    total_records = len(arrow_table)

    print(f"📊 Total records in Iceberg table: {total_records:,}")

    # Convert to pandas for easier display
    df = arrow_table.to_pandas()

    # Show sample data (closest approaches)
    if 'dist' in df.columns:
        # Select only the columns that are actually present, so one missing
        # column does not abort the whole cell with a KeyError.
        preferred_columns = ['des', 'fullname', 'cd', 'dist', 'h']
        available_columns = [name for name in preferred_columns if name in df.columns]
        sample_data = df.nsmallest(10, 'dist')[available_columns]
        print("\n🔍 Closest approaches (sample):")
        display(sample_data)
    else:
        print("\n🔍 Sample data:")
        display(df.head(10))

except Exception as e:
    print(f"❌ Query failed: {e}")
    import traceback
    traceback.print_exc()

## 5. Schema Evolution Demo

In [None]:
# Schema evolution walkthrough: print the schema, add a column through the
# manager, then print the schema again.
def _print_schema_fields(schema):
    """Print one "  - name: type" line for every field of ``schema``."""
    for column in schema.fields:
        print(f"  - {column.name}: {column.field_type}")


print("🔄 SCHEMA EVOLUTION DEMONSTRATION")
print("=" * 40)

try:
    # Schema as it is before the change
    table = manager.catalog.load_table(table_name)
    original_schema = table.schema()
    print("📋 ORIGINAL SCHEMA:")
    _print_schema_fields(original_schema)

    # The manager performs the column addition and returns a result mapping
    schema_results = manager.add_simple_column(table_name)

    print("\n✓ Schema evolution completed!")
    print(f"📝 Operation: {schema_results['operation']}")
    print(f"📝 Column added: {schema_results['column_name']} ({schema_results['column_type']})")
    print(f"📝 Records added: {schema_results['records_added']}")

    # Load the table again so the schema read below reflects the change
    table = manager.catalog.load_table(table_name)
    new_schema = table.schema()
    print("\n📋 EVOLVED SCHEMA:")
    _print_schema_fields(new_schema)

    print(f"\n✅ Successfully added '{schema_results['column_name']}' column!")

except Exception as exc:
    print(f"❌ Schema evolution failed: {exc}")
    import traceback
    traceback.print_exc()

## 6. Time Travel Demo

In [None]:
# Time travel walkthrough: the manager runs the demonstration and this cell
# prints the snapshots and per-snapshot query results it returns.
print("⏰ TIME TRAVEL DEMONSTRATION")
print("=" * 35)

try:
    # The manager returns a mapping with 'snapshots' and 'queries' entries
    time_travel_results = manager.demonstrate_time_travel(table_name)

    print("✓ Time travel completed successfully!")
    print(f"📸 Found {len(time_travel_results['snapshots'])} snapshots")

    # One entry per snapshot: id, timestamp and (when present) summary
    print("\n📸 SNAPSHOTS:")
    for number, snap in enumerate(time_travel_results['snapshots'], start=1):
        print(f"  {number}. Snapshot {snap['snapshot_id']}")
        print(f"     📅 Time: {snap['timestamp']}")
        if snap.get('summary'):
            print(f"     📊 Summary: {snap['summary']}")
        print()

    # One entry per time-travel query: id, timestamp and record count
    print("🔍 TIME TRAVEL QUERIES:")
    for number, result in enumerate(time_travel_results['queries'], start=1):
        print(f"  {number}. Snapshot {result['snapshot_id']}")
        print(f"     📅 Time: {result['timestamp']}")
        print(f"     📊 Records: {result['record_count']:,}")
        print()

except Exception as exc:
    print(f"❌ Time travel failed: {exc}")
    import traceback
    traceback.print_exc()

## 7. Advanced Iceberg Features

In [None]:
# Test advanced Iceberg features through PyIceberg: history, basic statistics,
# current snapshot details and a row-limited scan.
print("🚀 ADVANCED ICEBERG FEATURES")
print("=" * 35)

try:
    # Load the table directly
    table = manager.catalog.load_table(table_name)

    # 1. Inspect table history
    print("📚 TABLE HISTORY:")
    history = list(table.history())
    for position, entry in enumerate(history[:5], start=1):  # Show first 5 entries
        print(f"  {position}. Snapshot {entry.snapshot_id}")
        # timestamp_ms is in milliseconds; fromtimestamp expects seconds
        print(f"     📅 {datetime.fromtimestamp(entry.timestamp_ms / 1000)}")

    # 2. Table statistics
    print("\n📊 TABLE STATISTICS:")
    print(f"  📁 Location: {table.location()}")
    print(f"  📋 Schema fields: {len(table.schema().fields)}")
    print(f"  🗂️ Partition fields: {len(table.spec().fields)}")

    # 3. Current snapshot details
    current_snapshot = table.current_snapshot()
    if current_snapshot:
        print("\n📸 CURRENT SNAPSHOT:")
        print(f"  🆔 ID: {current_snapshot.snapshot_id}")
        print(f"  📅 Timestamp: {datetime.fromtimestamp(current_snapshot.timestamp_ms / 1000)}")
        # The attribute can exist and still be None; dict(None) would raise.
        snapshot_summary = getattr(current_snapshot, 'summary', None)
        if snapshot_summary is not None:
            print(f"  📊 Summary: {dict(snapshot_summary)}")

    # 4. Row-limited scan. No row filter is applied here, so this does not
    #    exercise predicate pushdown; it only caps the number of rows read.
    print("\n🔍 FILTERED SCAN EXAMPLE:")

    # The row limit is a keyword argument of Table.scan(); on the scan object
    # `limit` is a plain attribute, so calling `.limit(1000)` would fail.
    scan = table.scan(limit=1000)
    filtered_data = scan.to_arrow().to_pandas()
    print(f"  📊 Limited scan records: {len(filtered_data)}")

    if len(filtered_data) > 0:
        if 'approach_year' in filtered_data.columns:
            print(f"  📅 Year range: {filtered_data['approach_year'].min()} - {filtered_data['approach_year'].max()}")
        if 'dist' in filtered_data.columns:
            print(f"  🎯 Closest approach: {filtered_data['dist'].min():.6f} AU")

except Exception as e:
    print(f"❌ Advanced features test failed: {e}")
    import traceback
    traceback.print_exc()

## 8. Performance Comparison: Iceberg vs Traditional

In [None]:
# Timing section: `time` is needed by the timing helper defined below
import time

# Section banner: title line followed by a 30-character rule
banner_rule = "=" * 30
print("⚡ PERFORMANCE COMPARISON")
print(banner_rule)

def time_query(description, query_func):
    """Run ``query_func`` once, print how long it took, and return the outcome.

    Elapsed time is measured with ``time.perf_counter()``, a monotonic clock,
    so the reported duration cannot become negative or be distorted when the
    system wall clock is adjusted while the query runs.

    Args:
        description: Label printed in front of the timing line.
        query_func: Zero-argument callable to execute and time.

    Returns:
        ``(result, duration)`` where ``result`` is the value returned by
        ``query_func`` and ``duration`` is the elapsed time in seconds. If
        ``query_func`` raises an ``Exception``, the failure is printed and
        ``(None, duration)`` is returned instead of propagating the error.
    """
    start = time.perf_counter()
    try:
        result = query_func()
    except Exception as e:
        duration = time.perf_counter() - start
        print(f"❌ {description}: FAILED ({duration:.4f}s) - {e}")
        return None, duration
    duration = time.perf_counter() - start
    print(f"✓ {description}: {duration:.4f}s")
    return result, duration

# Test queries: each helper reloads the table so that the load is part of the
# measured time, then time_query() reports the duration.
try:
    # 1. Count query via Iceberg (PyIceberg direct)
    def iceberg_count():
        """Return the number of rows in a full Arrow scan of the table."""
        table = manager.catalog.load_table(table_name)
        scan = table.scan()
        return len(scan.to_arrow())

    iceberg_result, iceberg_time = time_query("Iceberg Count (PyIceberg)", iceberg_count)

    # 2. Sample query via Iceberg
    def iceberg_sample():
        """Return the number of rows read by a scan limited to 100 rows."""
        table = manager.catalog.load_table(table_name)
        # The row limit is a keyword argument of Table.scan(); the scan object
        # exposes `limit` as an attribute, not as a chainable method.
        scan = table.scan(limit=100)
        return len(scan.to_arrow())

    iceberg_sample_result, iceberg_sample_time = time_query("Iceberg Sample (100 records)", iceberg_sample)

    # 3. Full table scan with conversion
    def iceberg_full_scan():
        """Return the row count after a full scan converted to pandas."""
        table = manager.catalog.load_table(table_name)
        scan = table.scan()
        arrow_table = scan.to_arrow()
        df = arrow_table.to_pandas()
        return len(df)

    iceberg_full_result, iceberg_full_time = time_query("Iceberg Full Scan + Pandas", iceberg_full_scan)

    # Summary
    print("\n📊 PERFORMANCE SUMMARY:")
    # time_query() returns None as the result when the query failed; the ","
    # format spec cannot be applied to None, so handle that case explicitly.
    if iceberg_result is not None:
        print(f"  📊 Total records: {iceberg_result:,}")
    else:
        print("  📊 Total records: n/a (count query failed)")
    print(f"  ⚡ Count time: {iceberg_time:.4f}s")
    print(f"  🔍 Sample time: {iceberg_sample_time:.4f}s")
    print(f"  📈 Full scan time: {iceberg_full_time:.4f}s")

    # Skip throughput when there are no rows or the measured time is zero,
    # which would otherwise raise ZeroDivisionError.
    if iceberg_result and iceberg_result > 0 and iceberg_time > 0:
        throughput = iceberg_result / iceberg_time
        print(f"  🚀 Throughput: {throughput:,.0f} records/sec")

except Exception as e:
    print(f"❌ Performance comparison failed: {e}")
    import traceback
    traceback.print_exc()

## 9. Save Results and Cleanup

In [None]:
# Save comprehensive results
# Earlier cells may have been skipped or may have failed, so each result is
# looked up by name in the cell's namespace and defaults to None when absent.
available = locals()
demo_results = {
    "demo_timestamp": datetime.now().isoformat(),
    "table_name": available.get('table_name'),
    "schema_evolution": available.get('schema_results'),
    "time_travel": available.get('time_travel_results'),
    "performance": {
        "iceberg_count_time": available.get('iceberg_time'),
        "iceberg_sample_time": available.get('iceberg_sample_time'),
        "iceberg_full_time": available.get('iceberg_full_time'),
        "total_records": available.get('iceberg_result')
    }
}

# Save to file. default=str makes json.dump stringify any value it cannot
# serialise natively instead of raising.
with open('../iceberg_demo_results.json', 'w', encoding='utf-8') as f:
    json.dump(demo_results, f, indent=2, default=str)

print("✓ Demo results saved to iceberg_demo_results.json")

# Cleanup is best-effort: a failure here is reported but does not abort the
# cell. Catching Exception (not a bare except) lets KeyboardInterrupt and
# SystemExit propagate.
try:
    manager.close()
    print("✓ Connections closed")
except Exception as e:
    print(f"⚠️ Cleanup skipped: {e}")

print("\n🎉 Iceberg features demo completed successfully!")
print("\n📋 Summary of demonstrated features:")
print("  ✓ Native PyIceberg table creation")
print("  ✓ Schema evolution (adding columns)")
print("  ✓ Time travel (snapshot queries)")
print("  ✓ Direct PyIceberg table access")
print("  ✓ Table metadata inspection")
print("  ✓ ACID transactions")
print("  ✓ Performance measurements")