### Apache Iceberg + Trino Performance Evaluation

This notebook compares the performance of Apache Iceberg with Trino against the current Parquet implementation for handling MultiQC data.

In [4]:
import random
import string
import time
from datetime import datetime

import os
from pathlib import Path
import duckdb
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq
import pyarrow.dataset as ds
import requests
import trino
from trino.auth import BasicAuthentication

In [5]:
# Configuration parameters
NUM_RUNS = 100  # Can be scaled up to millions in real case
NUM_MODULES = 10  # Fixed across runs
NUM_SECTIONS_PER_MODULE = 5
NUM_SAMPLES_PER_MODULE = 100  # Can be 10 to 1000
NUM_METRICS_PER_MODULE = 20  # Can be 10 to 50

# Paths
PARQUET_PATH = "s3://megaqc-test/parquet_data"
ICEBERG_PATH = "s3://megaqc-test/iceberg_data"

# Trino connection parameters
TRINO_HOST = "trino-coordinator"
TRINO_PORT = 8080
TRINO_USER = "trino"
TRINO_CATALOG = "iceberg"
TRINO_SCHEMA = "default"

# Set MinIO credentials for local testing
# os.environ["AWS_ACCESS_KEY_ID"] = "minio"
# os.environ["AWS_SECRET_ACCESS_KEY"] = "minio123"
# os.environ["AWS_ENDPOINT_URL"] = "http://minio:9000"
# os.environ["AWS_REGION"] = "us-east-1"
# os.environ["AWS_DEFAULT_REGION"] = "us-east-1"

In [9]:
# Local dir with pre-generated data
DIR_PATH = Path(f"data/{NUM_RUNS}runs_{NUM_MODULES}mod_{NUM_SAMPLES_PER_MODULE}samples_{NUM_METRICS_PER_MODULE}metrics")
# Read the Parquet files with partitioning information
dataset = ds.dataset(DIR_PATH / "parquet_data", format="parquet", partitioning="hive")

In [12]:
dataset.to_table().to_pandas()

Unnamed: 0,timestamp,module_id,module_name,module_url,module_comment,module_anchor,module_doi,sample_id,metric_name,val_raw,...,section_module_info,section_comment,section_helptext,section_content_before_plot,section_content,section_plot,section_print_section,section_plot_anchor,section_ai_summary,run_id
0,2025-04-24T17:35:29.860622,module_0,Module 0,http://example.com/module/0,This is module 0,anchor_0,rdRrkvCgqLJWRsPyHQlq,PaoIgLgJRI,metric_0,77.469653,...,,,,,,,,,,run_0
1,2025-04-24T17:35:29.860622,module_0,Module 0,http://example.com/module/0,This is module 0,anchor_0,rdRrkvCgqLJWRsPyHQlq,PaoIgLgJRI,metric_1,56.205948,...,,,,,,,,,,run_0
2,2025-04-24T17:35:29.860622,module_0,Module 0,http://example.com/module/0,This is module 0,anchor_0,rdRrkvCgqLJWRsPyHQlq,PaoIgLgJRI,metric_2,92.326046,...,,,,,,,,,,run_0
3,2025-04-24T17:35:29.860622,module_0,Module 0,http://example.com/module/0,This is module 0,anchor_0,rdRrkvCgqLJWRsPyHQlq,PaoIgLgJRI,metric_3,82.747585,...,,,,,,,,,,run_0
4,2025-04-24T17:35:29.860622,module_0,Module 0,http://example.com/module/0,This is module 0,anchor_0,rdRrkvCgqLJWRsPyHQlq,PaoIgLgJRI,metric_4,68.244517,...,,,,,,,,,,run_0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2004995,2025-04-24T17:35:33.805751,module_9,Module 9,,,,,,,,...,Info about module for section jcGLFaICEq,Comment for section jcGLFaICEq,Help text for section jcGLFaICEq,Content before plot for section jcGLFaICEq,Main content for section jcGLFaICEq,Plot content for section jcGLFaICEq,False,plot_jcGLFaICEq,AI generated summary for section jcGLFaICEq,run_99
2004996,2025-04-24T17:35:33.805751,module_9,Module 9,,,,,,,,...,Info about module for section UVvZCFqoTB,Comment for section UVvZCFqoTB,Help text for section UVvZCFqoTB,Content before plot for section UVvZCFqoTB,Main content for section UVvZCFqoTB,Plot content for section UVvZCFqoTB,False,,AI generated summary for section UVvZCFqoTB,run_99
2004997,2025-04-24T17:35:33.805751,module_9,Module 9,,,,,,,,...,Info about module for section PofjOacreb,Comment for section PofjOacreb,Help text for section PofjOacreb,Content before plot for section PofjOacreb,Main content for section PofjOacreb,Plot content for section PofjOacreb,False,plot_PofjOacreb,AI generated summary for section PofjOacreb,run_99
2004998,2025-04-24T17:35:33.805751,module_9,Module 9,,,,,,,,...,Info about module for section NYNOtHCtfx,Comment for section NYNOtHCtfx,Help text for section NYNOtHCtfx,Content before plot for section NYNOtHCtfx,Main content for section NYNOtHCtfx,Plot content for section NYNOtHCtfx,True,,AI generated summary for section NYNOtHCtfx,run_99


In [5]:
def query_single_metric_parquet(parquet_dir, metric_name="metric_0"):
    """Query Parquet files to retrieve specific metric values using PyArrow"""
    print(f"Querying Parquet files for metrics with name: {metric_name}")
    start_time = time.time()

    # Read the Parquet files with partitioning information
    dataset = ds.dataset(parquet_dir, format="parquet", partitioning="hive")

    # Define filter condition for the metric name
    filter_expr = (ds.field("metric_name") == metric_name)
    # Read the filtered data
    table = dataset.to_table(filter=filter_expr)
    # Convert to pandas DataFrame if needed
    df = table.to_pandas()

    end_time = time.time()
    elapsed = end_time - start_time
    
    print(f"Query found {len(df)} records in {elapsed:.4f} seconds")
    return df, elapsed


def query_single_module_parquet(parquet_dir, run_id="run_0", module_id="module_0"):
    """Query Parquet files to retrieve specific module data using PyArrow"""
    print(f"Querying Parquet files for run_id={run_id} and module_id={module_id}")
    start_time = time.time()

    # Read the Parquet files with partitioning information
    dataset = ds.dataset(parquet_dir, format="parquet", partitioning="hive")

    # Define filter condition for the run_id and module_id
    filter_expr = (ds.field("run_id") == run_id) & (ds.field("module_id") == module_id)
    # Read the filtered data
    table = dataset.to_table(filter=filter_expr)
    # Convert to pandas DataFrame if needed
    df = table.to_pandas()
    
    end_time = time.time()
    elapsed = end_time - start_time
    
    print(f"Query found {len(df)} records in {elapsed:.4f} seconds")
    return df, elapsed


def run_parquet_benchmark(parquet_dir, num_runs=10, num_modules=10, 
                         num_samples_per_module=100, num_metrics_per_module=20):
    """Run a complete Parquet benchmark"""
    print("-" * 80)
    print("PARQUET BENCHMARK")
    print("-" * 80)
    
    print("\nGenerating sample data with:")
    print(f"- {num_runs} runs")
    print(f"- {num_modules} modules per run")
    print(f"- {num_samples_per_module} samples per module")
    print(f"- {num_metrics_per_module} metrics per module")
    
    # Storage benchmark
    current_time = datetime.now().strftime("%Y-%m-%d-%H-%M-%S")
    benchmark_dir = f"{parquet_dir}/{current_time}"
    
    # Query benchmarks
    print("\nRunning query benchmarks:")
    
    # Query by metric name
    _, metric_query_time = query_single_metric_parquet(benchmark_dir)
    
    # Query by run_id and module_id
    _, module_query_time = query_single_module_parquet(benchmark_dir)
    
    # Summary
    print("\nPARQUET BENCHMARK SUMMARY:")
    print(f"Query by metric time: {metric_query_time:.4f} seconds")
    print(f"Query by module time: {module_query_time:.4f} seconds")
    
    return {
        "metric_query_time": metric_query_time,
        "module_query_time": module_query_time
    }


# MinIO/S3 configuration
PARQUET_PATH = "s3://megaqc-test/parquet_data"

# Set MinIO credentials for local testing
# If using AWS S3 directly, these would be your AWS credentials
# os.environ["AWS_ACCESS_KEY_ID"] = "minio"
# os.environ["AWS_SECRET_ACCESS_KEY"] = "minio123"
# os.environ["AWS_ENDPOINT_URL"] = "http://minio:9000"
# os.environ["AWS_REGION"] = "us-east-1"

# Run the benchmark with smaller dataset for testing
results = run_parquet_benchmark(
    PARQUET_PATH,
    num_runs=5,
    num_modules=5,
    num_samples_per_module=10,
    num_metrics_per_module=5
)

--------------------------------------------------------------------------------
PARQUET BENCHMARK
--------------------------------------------------------------------------------

Generating sample data with:
- 5 runs
- 5 modules per run
- 10 samples per module
- 5 metrics per module
Flattening data...
Creating DataFrame...
Creating Parquet Table...
Writing to Parquet file s3://megaqc-test/parquet_data/2025-04-23-13-46-14...
Parquet storage time: 7.2234 seconds

Running query benchmarks:
Querying Parquet files for metrics with name: metric_0
Query found 250 records in 1.2624 seconds
Querying Parquet files for run_id=run_0 and module_id=module_0
Query found 50 records in 1.2209 seconds

PARQUET BENCHMARK SUMMARY:
Storage time: 7.2234 seconds
Query by metric time: 1.2624 seconds
Query by module time: 1.2209 seconds


In [6]:
# Benchmark script for Iceberg storage and querying with Trino

def create_trino_connection(
        host="trino-coordinator", port=8080, 
        user="trino", catalog="iceberg", schema="default"
    ):
    """Create a connection to Trino"""
    try:
        conn = trino.dbapi.connect(
            host=host,
            port=port,
            user=user,
            catalog=catalog,
            schema=schema,
        )
        print("Connected to Trino successfully!")
        return conn
    except Exception as e:
        print(f"Error connecting to Trino: {e}")
        return None


def init_iceberg_schema(conn):
    """Initialize Iceberg schema and table"""
    cursor = conn.cursor()
    
    try:
        # Create the table in Iceberg format
        cursor.execute("""
        CREATE TABLE IF NOT EXISTS metrics (
            run_id VARCHAR,
            timestamp VARCHAR,
            module_id VARCHAR,
            module_name VARCHAR,
            module_url VARCHAR,
            module_comment VARCHAR,
            sample_id VARCHAR,
            metric_name VARCHAR,
            value DOUBLE,
            unmodified_value DOUBLE,
            formatted_value VARCHAR,
            metric_min DOUBLE,
            metric_max DOUBLE,
            metric_scale VARCHAR,
            metric_color VARCHAR,
            metric_type VARCHAR,
            metric_namespace VARCHAR,
            metric_placement VARCHAR
        )
        WITH (
            format = 'PARQUET',
            partitioning = ARRAY['run_id']
        )
        """)
        print("Iceberg table created successfully!")
        return True
    except Exception as e:
        print(f"Error creating Iceberg table: {e}")
        return False


def store_in_iceberg(data, conn):
    """Store the flattened data in Iceberg format using Trino"""
    print("Flattening data...")
    flat_data = flatten_hierarchical_data(data)
    print("Creating DataFrame...")
    df = pd.DataFrame(flat_data)
    
    # Start timing
    start_time = time.time()
    
    # Execute insert for each row (not efficient, but works for small dataset)
    cursor = conn.cursor()
    
    try:
        print("Inserting data into Iceberg table...")
        
        # Batch inserts for better performance
        batch_size = 1000
        total_batches = (len(df) + batch_size - 1) // batch_size
        
        for i in range(0, len(df), batch_size):
            batch_df = df.iloc[i:i+batch_size]
            batch_num = i // batch_size + 1
            print(f"Inserting batch {batch_num}/{total_batches}...")
            
            for _, row in batch_df.iterrows():
                # Create the INSERT statement
                insert_sql = """
                INSERT INTO metrics VALUES (
                    %(run_id)s, %(timestamp)s, %(module_id)s, %(module_name)s,
                    %(module_url)s, %(module_comment)s, %(sample_id)s, %(metric_name)s,
                    %(value)s, %(unmodified_value)s, %(formatted_value)s,
                    %(metric_min)s, %(metric_max)s, %(metric_scale)s, %(metric_color)s,
                    %(metric_type)s, %(metric_namespace)s, %(metric_placement)s
                )
                """
                # Execute the INSERT statement
                cursor.execute(insert_sql, {
                    'run_id': row['run_id'],
                    'timestamp': row['timestamp'],
                    'module_id': row['module_id'],
                    'module_name': row['module_name'],
                    'module_url': row['module_url'],
                    'module_comment': row['module_comment'],
                    'sample_id': row['sample_id'],
                    'metric_name': row['metric_name'],
                    'value': row['value'],
                    'unmodified_value': row['unmodified_value'],
                    'formatted_value': row['formatted_value'],
                    'metric_min': row['metric_min'],
                    'metric_max': row['metric_max'],
                    'metric_scale': row['metric_scale'],
                    'metric_color': row['metric_color'],
                    'metric_type': row['metric_type'],
                    'metric_namespace': row['metric_namespace'],
                    'metric_placement': row['metric_placement']
                })
        
        end_time = time.time()
        elapsed = end_time - start_time
        print(f"Iceberg storage time: {elapsed:.4f} seconds")
        return elapsed
    
    except Exception as e:
        print(f"Error inserting data: {e}")
        return -1


def query_single_metric_iceberg(conn, metric_name="metric_0"):
    """Query Iceberg table to retrieve specific metric values using Trino"""
    print(f"Querying Iceberg table for metrics with name: {metric_name}")
    start_time = time.time()

    # Execute query through Trino
    cursor = conn.cursor()
    query = f"""
        SELECT * FROM metrics
        WHERE metric_name = '{metric_name}'
    """
    
    try:
        cursor.execute(query)
        # Fetch all results
        results = cursor.fetchall()
        
        # Convert to DataFrame
        columns = [desc[0] for desc in cursor.description]
        df = pd.DataFrame(results, columns=columns)
        
        end_time = time.time()
        elapsed = end_time - start_time
        
        print(f"Query found {len(df)} records in {elapsed:.4f} seconds")
        return df, elapsed
    
    except Exception as e:
        print(f"Error querying data: {e}")
        return pd.DataFrame(), -1


def query_single_module_iceberg(conn, run_id="run_0", module_id="module_0"):
    """Query Iceberg table to retrieve specific module data using Trino"""
    print(f"Querying Iceberg table for run_id={run_id} and module_id={module_id}")
    start_time = time.time()

    # Execute query through Trino
    cursor = conn.cursor()
    query = f"""
        SELECT * FROM metrics
        WHERE run_id = '{run_id}' AND module_id = '{module_id}'
    """
    
    try:
        cursor.execute(query)
        # Fetch all results
        results = cursor.fetchall()
        
        # Convert to DataFrame
        columns = [desc[0] for desc in cursor.description]
        df = pd.DataFrame(results, columns=columns)
        
        end_time = time.time()
        elapsed = end_time - start_time
        
        print(f"Query found {len(df)} records in {elapsed:.4f} seconds")
        return df, elapsed
    
    except Exception as e:
        print(f"Error querying data: {e}")
        return pd.DataFrame(), -1


def run_iceberg_benchmark(num_runs=10, num_modules=10, 
                         num_samples_per_module=100, num_metrics_per_module=20):
    """Run a complete Iceberg benchmark"""
    print("-" * 80)
    print("ICEBERG BENCHMARK")
    print("-" * 80)
    
    print("\nGenerating sample data with:")
    print(f"- {num_runs} runs")
    print(f"- {num_modules} modules per run")
    print(f"- {num_samples_per_module} samples per module")
    print(f"- {num_metrics_per_module} metrics per module")
    
    # Generate test data
    data = generate_all_data(
        num_runs, num_modules, num_samples_per_module, num_metrics_per_module
    )
    
    # Connect to Trino
    conn = create_trino_connection()
    if not conn:
        print("Failed to connect to Trino. Aborting Iceberg benchmark.")
        return {
            "storage_time": -1,
            "metric_query_time": -1,
            "module_query_time": -1
        }
    
    # Initialize Iceberg schema
    if not init_iceberg_schema(conn):
        print("Failed to initialize Iceberg schema. Aborting Iceberg benchmark.")
        return {
            "storage_time": -1,
            "metric_query_time": -1,
            "module_query_time": -1
        }
    
    # Clear existing data
    try:
        cursor = conn.cursor()
        cursor.execute("DELETE FROM metrics")
        print("Cleared existing data from metrics table")
    except Exception as e:
        print(f"Error clearing metrics table: {e}")
    
    # Storage benchmark
    storage_time = store_in_iceberg(data, conn)
    
    # Query benchmarks
    print("\nRunning query benchmarks:")
    
    # Query by metric name
    _, metric_query_time = query_single_metric_iceberg(conn)
    
    # Query by run_id and module_id
    _, module_query_time = query_single_module_iceberg(conn)
    
    # Summary
    print("\nICEBERG BENCHMARK SUMMARY:")
    print(f"Storage time: {storage_time:.4f} seconds")
    print(f"Query by metric time: {metric_query_time:.4f} seconds")
    print(f"Query by module time: {module_query_time:.4f} seconds")
    
    return {
        "storage_time": storage_time,
        "metric_query_time": metric_query_time,
        "module_query_time": module_query_time
    }


# Run the benchmark with smaller dataset for testing
results = run_iceberg_benchmark(
    num_runs=5,
    num_modules=5,
    num_samples_per_module=10,
    num_metrics_per_module=5
) 

--------------------------------------------------------------------------------
ICEBERG BENCHMARK
--------------------------------------------------------------------------------

Generating sample data with:
- 5 runs
- 5 modules per run
- 10 samples per module
- 5 metrics per module
Connected to Trino successfully!
Error creating Iceberg table: failed to execute: HTTPConnectionPool(host='trino-coordinator', port=8080): Max retries exceeded with url: /v1/statement (Caused by NameResolutionError("<urllib3.connection.HTTPConnection object at 0x113ee9760>: Failed to resolve 'trino-coordinator' ([Errno 8] nodename nor servname provided, or not known)"))
Failed to initialize Iceberg schema. Aborting Iceberg benchmark.


In [9]:
conn = trino.dbapi.connect(
    host="localhost",
    port=8080,
    user="trino",
    catalog="iceberg",
    schema="default"
)
cursor = conn.cursor()
cursor.execute("""
CREATE TABLE IF NOT EXISTS metrics (
    run_id VARCHAR,
    module_id VARCHAR,
    metric_name VARCHAR,
    value DOUBLE
)
WITH (
    format = 'PARQUET',
    partitioning = ARRAY['run_id']
)
""")

TrinoConnectionError: failed to execute: HTTPConnectionPool(host='localhost', port=8080): Max retries exceeded with url: /v1/statement (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x1081c19d0>: Failed to establish a new connection: [Errno 61] Connection refused'))