### Exploring Parquet-on-S3 performance and partitioning


In [1]:
# Base S3 URI under which every storage run creates its own timestamp-named
# dataset prefix. Despite the name, this is a full "s3://bucket/prefix" URI,
# not a bare bucket name.
BUCKET_NAME = "s3://megaqc-test/datasets"

# Configuration parameters
# Together these fix the size of the synthetic dataset: the flattened table has
# NUM_RUNS * NUM_MODULES * NUM_SAMPLES_PER_MODULE * NUM_METRICS_PER_MODULE rows
# (one row per run / module / sample / metric combination).
NUM_RUNS = 10  # Can be scaled up to millions in real case
NUM_MODULES = 10  # Fixed across runs
NUM_SAMPLES_PER_MODULE = 100  # Can be 10 to 1000
NUM_METRICS_PER_MODULE = 20  # Can be 10 to 50

In [49]:
import random
import string
import time
from datetime import datetime

import duckdb
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq
import pyarrow.dataset as ds


def generate_random_string(length=10):
    """Return a string of `length` random ASCII letters (upper- and lowercase)."""
    alphabet = string.ascii_letters
    picked = random.choices(alphabet, k=length)
    return "".join(picked)


def generate_metric_metadata():
    """Build a randomly populated metadata record describing one metric.

    The random draws happen in the same order as the keys of the returned
    dict, so results are reproducible under a fixed `random` seed.
    """
    lower_bound = random.uniform(0, 10)
    upper_bound = random.uniform(90, 100)
    scale = random.choice(["linear", "log"])
    rgb = random.randint(0, 0xFFFFFF)  # 24-bit colour, rendered below as "#rrggbb"
    kind = random.choice(["numeric", "categorical", "percentage"])
    namespace = random.choice(["performance", "quality", "resource"])
    placement = random.choice(["primary", "secondary", "tertiary"])

    record = {}
    record["min"] = lower_bound
    record["max"] = upper_bound
    record["scale"] = scale
    record["color"] = f"#{rgb:06x}"
    record["type"] = kind
    record["namespace"] = namespace
    record["placement"] = placement
    return record


def generate_value_metadata(value):
    """Pair a value with its display string.

    Floats are rendered with two decimal places; any other type is rendered
    with `str()`. The value itself is passed through unchanged.
    """
    if isinstance(value, float):
        rendered = f"{value:.2f}"
    else:
        rendered = str(value)

    return {"unmodified_value": value, "formatted_value": rendered}


def generate_sample_data(num_metrics):
    """Create one sample: a random id plus `num_metrics` random readings.

    Each reading is a uniform random float in [0, 100] stored under the key
    "metric_<i>" together with its value metadata.
    """
    # The id is drawn before any reading so the RNG call order stays fixed.
    record = {"sample_id": generate_random_string(), "metrics": {}}
    readings = record["metrics"]

    for name in (f"metric_{idx}" for idx in range(num_metrics)):
        reading = random.uniform(0, 100)
        readings[name] = {
            "value": reading,
            "metadata": generate_value_metadata(reading),
        }

    return record


def generate_module_data(module_index, num_samples, num_metrics):
    """Create one module: descriptive fields, per-metric metadata and samples.

    Every sample carries the same metric names ("metric_0" ... ) that the
    module-level `metrics_metadata` mapping describes.
    """
    # Samples are generated before the metric metadata; keeping this order
    # keeps the sequence of RNG draws unchanged.
    samples = []
    for _ in range(num_samples):
        samples.append(generate_sample_data(num_metrics))

    metrics_metadata = {
        f"metric_{idx}": generate_metric_metadata() for idx in range(num_metrics)
    }

    module = {
        "module_id": f"module_{module_index}",
        "name": f"Module {module_index}",
        "url": f"http://example.com/module/{module_index}",
        "comment": f"This is module {module_index}",
    }
    module["metrics_metadata"] = metrics_metadata
    module["samples"] = samples
    return module


def generate_run_data(
    run_index, num_modules, num_samples_per_module, num_metrics_per_module
):
    """Create one run: an id, a creation timestamp and its list of modules."""
    modules = []
    for module_index in range(num_modules):
        modules.append(
            generate_module_data(
                module_index, num_samples_per_module, num_metrics_per_module
            )
        )

    # The timestamp is taken after the modules have been generated and is
    # stored as an ISO-8601 string from the local (naive) clock.
    created_at = datetime.now().isoformat()

    return {
        "run_id": f"run_{run_index}",
        "timestamp": created_at,
        "modules": modules,
    }


def generate_all_data(
    num_runs, num_modules, num_samples_per_module, num_metrics_per_module
):
    """Create `num_runs` runs, indexed 0..num_runs-1, and return them as a list."""
    runs = []
    for run_index in range(num_runs):
        run = generate_run_data(
            run_index, num_modules, num_samples_per_module, num_metrics_per_module
        )
        runs.append(run)
    return runs


def flatten_hierarchical_data(data):
    """Turn nested run -> module -> sample -> metric data into flat row dicts.

    One row is produced per (run, module, sample, metric) combination. Run and
    module fields are repeated on every row; metric-level metadata is looked up
    by metric name in the module's `metrics_metadata` and falls back to None
    for each metadata column when the metric has no entry there.
    """
    rows = []

    for run in data:
        run_fields = {"run_id": run["run_id"], "timestamp": run["timestamp"]}

        for module in run["modules"]:
            module_fields = {
                "module_id": module["module_id"],
                "module_name": module["name"],
                "module_url": module["url"],
                "module_comment": module["comment"],
            }

            for sample in module["samples"]:
                sample_id = sample["sample_id"]

                for metric_name, reading in sample["metrics"].items():
                    measured = reading["value"]
                    value_info = reading["metadata"]
                    raw_value = value_info["unmodified_value"]
                    display_value = value_info["formatted_value"]

                    # Metric-level metadata lives on the module, keyed by metric name
                    descriptor = module["metrics_metadata"].get(metric_name, {})

                    # Key order below defines the column order of the final table
                    row = {**run_fields, **module_fields}
                    row["sample_id"] = sample_id
                    row["metric_name"] = metric_name
                    row["value"] = measured
                    row["unmodified_value"] = raw_value
                    row["formatted_value"] = display_value
                    row["metric_min"] = descriptor.get("min")
                    row["metric_max"] = descriptor.get("max")
                    row["metric_scale"] = descriptor.get("scale")
                    row["metric_color"] = descriptor.get("color")
                    row["metric_type"] = descriptor.get("type")
                    row["metric_namespace"] = descriptor.get("namespace")
                    row["metric_placement"] = descriptor.get("placement")
                    rows.append(row)

    return rows


def store_in_parquet(data, parquet_dir):
    """Flatten `data` and write it as a Parquet dataset partitioned by run_id.

    Args:
        data: List of hierarchical run dicts, in the structure expected by
            flatten_hierarchical_data.
        parquet_dir: Root path of the dataset; passed unchanged to
            pyarrow.parquet.write_to_dataset as `root_path`.

    Returns:
        float: Seconds spent in the write_to_dataset call only. Flattening and
        the DataFrame / Arrow Table conversions are not part of the measurement.
    """
    print("Flattining data...")
    flat_data = flatten_hierarchical_data(data)
    print("Creating DataFrame...")
    df = pd.DataFrame(flat_data)
    print("Creating Parquet Table...")
    table = pa.Table.from_pandas(df)

    # Write to Parquet file with partitioning.
    # perf_counter() is monotonic and high-resolution; time.time() follows the
    # wall clock and can jump if the system clock is adjusted mid-measurement.
    start_time = time.perf_counter()
    print(f"Writing to Parquet file {parquet_dir}...")
    pq.write_to_dataset(
        table,
        root_path=parquet_dir,
        partition_cols=["run_id"],
    )
    end_time = time.perf_counter()

    return end_time - start_time


def query_single_metric_parquet_duckdb(parquet_dir, metric_name="metric_0"):
    """Query Parquet files to retrieve specific metric values using DuckDB.

    Args:
        parquet_dir: Root path of the Parquet dataset. It is interpolated into
            the SQL text, so it must come from a trusted source.
        metric_name: Metric whose rows are selected. Defaults to "metric_0".

    Returns:
        tuple: (pandas DataFrame with the matching rows, elapsed seconds).
        The elapsed time covers opening the connection, running the query and
        fetching the result into pandas.
    """
    # Monotonic clock: safe for measuring an interval, unlike time.time()
    start_time = time.perf_counter()

    # hive_partitioning is requested explicitly so the directory-encoded
    # partition column (run_id) is always exposed as a regular column.
    # The metric name is bound as a parameter rather than spliced into the SQL.
    query = f"""
        SELECT * FROM read_parquet('{parquet_dir}/**/*.parquet', hive_partitioning = true)
        WHERE metric_name = ?
    """

    # Use DuckDB to query the Parquet files directly; the context manager
    # closes the in-memory connection instead of leaking it.
    with duckdb.connect(database=":memory:") as con:
        filtered_df = con.execute(query, [metric_name]).fetchdf()
        elapsed = time.perf_counter() - start_time

    return filtered_df, elapsed


def query_single_metric_parquet_pyarrow(parquet_dir, metric_name="metric_0"):
    """Query Parquet files to retrieve specific metric values using pyarrow.

    Args:
        parquet_dir: Root path of the Hive-partitioned Parquet dataset.
        metric_name: Metric whose rows are selected. Defaults to "metric_0".

    Returns:
        tuple: (pandas DataFrame with the matching rows, elapsed seconds).
        The elapsed time covers dataset discovery, the filtered read and the
        conversion to pandas.
    """
    # Monotonic clock: safe for measuring an interval, unlike time.time()
    start_time = time.perf_counter()

    # Read the Parquet files with partitioning information
    dataset = ds.dataset(parquet_dir, format="parquet", partitioning="hive")

    # Define filter condition for the metric name
    filter_expr = ds.field("metric_name") == metric_name
    # Read the filtered data
    table = dataset.to_table(filter=filter_expr)
    # Convert to a pandas DataFrame so both engines return the same type
    df = table.to_pandas()

    elapsed = time.perf_counter() - start_time

    return df, elapsed


def query_single_item_parquet_duckdb(parquet_dir, run_id="run_0", module_id="module_0"):
    """Query Parquet files to retrieve specific items using DuckDB.

    Args:
        parquet_dir: Root path of the Parquet dataset. It is interpolated into
            the SQL text, so it must come from a trusted source.
        run_id: Run to select. Defaults to "run_0".
        module_id: Module within that run to select. Defaults to "module_0".

    Returns:
        tuple: (pandas DataFrame with the matching rows, elapsed seconds).
        The elapsed time covers opening the connection, running the query and
        fetching the result into pandas.
    """
    # Monotonic clock: safe for measuring an interval, unlike time.time()
    start_time = time.perf_counter()

    # Filter by run_id and module_id. run_id is the dataset's partition column,
    # i.e. it is encoded in the directory names, so Hive partitioning is
    # requested explicitly to guarantee the column exists for the WHERE clause.
    # Both values are bound as parameters rather than spliced into the SQL.
    query = f"""
        SELECT * FROM read_parquet('{parquet_dir}/**/*.parquet', hive_partitioning = true)
        WHERE run_id = ? AND module_id = ?
    """

    # Use DuckDB to query the Parquet files directly; the context manager
    # closes the in-memory connection instead of leaking it.
    with duckdb.connect(database=":memory:") as con:
        filtered_df = con.execute(query, [run_id, module_id]).fetchdf()
        elapsed = time.perf_counter() - start_time

    return filtered_df, elapsed


def query_single_item_parquet_pyarrow(parquet_dir, run_id="run_0", module_id="module_0"):
    """Query Parquet files to retrieve specific items using pyarrow.

    Args:
        parquet_dir: Root path of the Hive-partitioned Parquet dataset.
        run_id: Run to select. Defaults to "run_0".
        module_id: Module within that run to select. Defaults to "module_0".

    Returns:
        tuple: (pandas DataFrame with the matching rows, elapsed seconds).
        The elapsed time covers dataset discovery, the filtered read and the
        conversion to pandas.
    """
    # Monotonic clock: safe for measuring an interval, unlike time.time()
    start_time = time.perf_counter()

    # Read the Parquet files with partitioning information
    dataset = ds.dataset(parquet_dir, format="parquet", partitioning="hive")

    # Define filter condition: one run (the partition column) and one module
    filter_expr = (ds.field("run_id") == run_id) & (ds.field("module_id") == module_id)
    # Read the filtered data
    table = dataset.to_table(filter=filter_expr)
    # Convert to a pandas DataFrame so both engines return the same type
    df = table.to_pandas()

    elapsed = time.perf_counter() - start_time

    return df, elapsed


In [45]:
# Seed the stdlib RNG before generating anything, so that Restart & Run All
# reproduces the same sample ids, metric values and metric metadata.
# (Run timestamps still reflect the time of execution.)
RANDOM_SEED = 42
random.seed(RANDOM_SEED)

print("Generating sample data...")
data = generate_all_data(
    NUM_RUNS, NUM_MODULES, NUM_SAMPLES_PER_MODULE, NUM_METRICS_PER_MODULE
)
print(f"Generated {NUM_RUNS} runs with {NUM_MODULES} modules each")
print(
    f"Each module has {NUM_SAMPLES_PER_MODULE} samples with {NUM_METRICS_PER_MODULE} metrics"
)

Generating sample data...
Generated 10 runs with 10 modules each
Each module has 100 samples with 20 metrics


In [47]:
# Each execution writes to a fresh prefix named after the current local time.
# To re-query a dataset stored by an earlier execution, pin the prefix instead:
# bucket_path = BUCKET_NAME + "/" + "2025-04-23-10-18-02"
bucket_path = "/".join(
    (BUCKET_NAME, datetime.now().strftime("%Y-%m-%d-%H-%M-%S"))
)
print(f"\n--- Storing objects to the bucket {bucket_path} ---")
parquet_store_time = store_in_parquet(data, bucket_path)
print(f"Parquet storage time: {parquet_store_time:.4f} seconds")


--- Storing objects to the bucket s3://megaqc-test/datasets/2025-04-23-10-59-58 ---
Flattining data...
Creating DataFrame...
Creating Parquet Table...
Writing to Parquet file s3://megaqc-test/datasets/2025-04-23-10-59-58...
Parquet storage time: 12.7854 seconds


In [50]:
# Query 1: every row of a single metric, across all runs (pyarrow dataset API).
# NOTE(review): the dataset is partitioned by run_id only, so a metric_name
# filter presumably cannot skip any partition -- likely why this is the slower
# of the two queries; confirm with a repeated / warm run.
parquet_results, parquet_query_time = query_single_metric_parquet_pyarrow(bucket_path)
print(f"Parquet query single metric time (with pyarrow): {parquet_query_time:.4f} seconds")
print(f"Parquet results count: {len(parquet_results)}")
# Query 2: all rows of one module within one run. This also goes through
# pyarrow, although the printed label does not say so. The results and timing
# of query 1 are overwritten here.
parquet_results, parquet_query_time = query_single_item_parquet_pyarrow(bucket_path)
print(f"Parquet query single module time: {parquet_query_time:.4f} seconds")
print(f"Parquet results count: {len(parquet_results)}")

Parquet query single metric time (with pyarrow): 61.1109 seconds
Parquet results count: 10000
Parquet query single module time: 6.0639 seconds
Parquet results count: 2000
