### Compare performance of querying

In [227]:
import trino
import polars as pl

# Settings for the Trino endpoint described by the docker-compose setup:
# reached from the host machine on the port mapped there, with the user
# and catalog taken from its environment variables.
connection_settings = {
    "host": "localhost",
    "port": 8081,
    "user": "admin",
    "catalog": "iceberg",
}
conn = trino.dbapi.connect(**connection_settings)

# Single cursor shared by the query helpers in this notebook
cursor = conn.cursor()

In [175]:
def run_query(name: str | None, query: str, as_df: bool = False, verbose: bool = False):
    """
    Execute a SQL query against the Trino/DuckDB cursor that is already open.

    Parameters
    ----------
    name   (str)  – logical run-set; when given we substitute
                    iceberg."<name>"."<name>" for {table} in the query.
    query  (str)  – SQL with an optional {table} placeholder.
    as_df (bool)  – if True, return a pandas.DataFrame instead of printing rows.

    Returns
    -------
    • pandas.DataFrame when as_df is True
    • list[tuple]      when as_df is False (same as before)
    """
    # Resolve the {table} placeholder first
    if name:
        table = f'iceberg."{name}"."{name}"'
        query = query.format(table=table)

    if verbose:
        print(query)                   # always show the final SQL sent
    cursor.execute(query)

    # Fetch all rows once, then decide what to do with them
    rows = cursor.fetchall()
    row_count = len(rows)

    if verbose:
        print(f"{row_count} rows")
        print("\n".join(" | ".join(str(cell) for cell in row) for row in rows))

    if as_df:
        # Build column labels from cursor.description
        columns = [d[0] for d in cursor.description]
        df = pl.DataFrame(rows, schema=columns, orient="row")
        return df

    print(f"{row_count} rows")
    print("\n".join(" | ".join(str(cell) for cell in row) for row in rows))
    return rows

In [None]:
# Print the schemas available in the iceberg catalog
run_query(None, "SHOW SCHEMAS FROM iceberg")
pass  # presumably here so the cell does not also echo run_query's returned rows

21 rows
information_schema
reallife-bsstuff
reallife-long_bsstuff
reallife-small-test
reallife-small-test-wide
reallife-wide_bsstuff
simulated-long_10000runs_10mod_5sec_100samples_50metrics
simulated-long_10000runs_10mod_5sec_100samples_50metrics_subset10
simulated-long_10000runs_10mod_5sec_100samples_50metrics_subset100
simulated-long_10000runs_10mod_5sec_100samples_50metrics_subset1000
simulated-long_100runs_10mod_5sec_100samples_50metrics
simulated-long_10runs_10mod_5sec_100samples_50metrics
simulated-wide_10000runs_10mod_5sec_100samples_50metrics
simulated-wide_10000runs_10mod_5sec_100samples_50metrics_subset100
simulated-wide_10000runs_10mod_5sec_100samples_50metrics_subset1000
simulated-wide_10000runs_10mod_5sec_100samples_50metrics_subset2000
simulated-wide_1000runs_10mod_5sec_100samples_50metrics
simulated-wide_1000runs_10mod_5sec_100samples_50metrics_subset1000
simulated-wide_100runs_10mod_5sec_100samples_50metrics
simulated-wide_10runs_10mod_5sec_100samples_50metrics
system


In [230]:
# Print columns for each schema in the iceberg catalog
def print_columns_for_all_schemas():
    """
    Print every table of every schema in the iceberg catalog together with
    its columns, using the shared module-level ``cursor``.

    The ``information_schema`` schema is skipped. For each table the columns
    are printed on a single line as "name: type" pairs separated by " | ".
    """
    cursor.execute("SHOW SCHEMAS FROM iceberg")
    schema_names = [record[0] for record in cursor.fetchall()]

    for schema in schema_names:
        if schema == "information_schema":
            continue
        print(f"\nSchema: {schema}")

        # Tables that live in this schema
        cursor.execute(f'SHOW TABLES FROM iceberg."{schema}"')
        table_names = [record[0] for record in cursor.fetchall()]

        for table in table_names:
            print(f"  Table: {table}")

            # The first two fields of each SHOW COLUMNS row are printed as
            # "name: type"
            cursor.execute(f'SHOW COLUMNS FROM iceberg."{schema}"."{table}"')
            described = " | ".join(
                f'{field[0]}: {field[1]}' for field in cursor.fetchall()
            )
            print(f"    Columns: {described}")

# Execute the function
print_columns_for_all_schemas()


Schema: reallife-bsstuff
  Table: multiqc
    Columns: anchor: varchar | type: varchar | creation_date: timestamp(6) | plot_type: varchar | plot_input_data: varchar | sample: varchar | bismark-methylation-dp / percent_cpg_meth: double | bismark-methylation-dp / percent_chg_meth: double | bismark-methylation-dp / percent_chh_meth: double | general_stats_table / qualimap: bamqc / avg_gc: double | general_stats_table / qualimap: bamqc / 1_x_pc: double | general_stats_table / qualimap: bamqc / 5_x_pc: double | general_stats_table / qualimap: bamqc / 10_x_pc: double | general_stats_table / qualimap: bamqc / 30_x_pc: double | general_stats_table / qualimap: bamqc / 50_x_pc: double | general_stats_table / qualimap: bamqc / median_coverage: double | general_stats_table / qualimap: bamqc / mean_coverage: double | general_stats_table / qualimap: bamqc / general_error_rate: double | general_stats_table / qualimap: bamqc / percentage_aligned: double | general_stats_table / qualimap: bamqc / mappe

In [231]:
def print_columns_and_few_rows(name):
    """
    Print the columns of iceberg."<name>"."<name>" followed by up to five of
    its rows, using the shared module-level ``cursor``.

    The first printed line lists the columns as "name: type" pairs; every
    following line is one row. Fields are separated by " | ".
    """
    cursor.execute(f'SHOW COLUMNS FROM iceberg."{name}"."{name}"')
    header = " | ".join(f'{field[0]}: {field[1]}' for field in cursor.fetchall())

    cursor.execute(f'SELECT * FROM iceberg."{name}"."{name}" LIMIT 5')
    sample_rows = cursor.fetchall()

    # Both queries run before anything is printed
    lines = [header]
    lines.extend(" | ".join(map(str, record)) for record in sample_rows)
    for line in lines:
        print(line)

print_columns_and_few_rows("reallife-small-test-wide")

anchor: varchar | type: varchar | creation_date: timestamp(6) | plot_type: varchar | plot_input_data: varchar | sample: varchar | samtools-coverage-table / numreads: double | samtools-coverage-table / covbases: double | samtools-coverage-table / coverage: double | samtools-coverage-table / meandepth: double | samtools-coverage-table / meanbaseq: double | samtools-coverage-table / meanmapq: double | general_stats_table / samtools: coverage / numreads: double | general_stats_table / samtools: coverage / covbases: double | general_stats_table / samtools: coverage / coverage: double | general_stats_table / samtools: coverage / meandepth: double | general_stats_table / samtools: coverage / meanbaseq: double | general_stats_table / samtools: coverage / meanmapq: double | config: varchar | data_sources: varchar | multiqc_version: varchar | modules: varchar
 | table_row | 2025-05-21 16:26:29.505259 | None | None | NA12878_sorted_chrM_chr20_rehead_60pdown | 0.750591 | 8.983488 | 13.936327510616

### Find average of a metric across all samples and projects

In [232]:
def average_of_metrics(name, metric, format):
    """
    Print the average of the requested metric.

    Parameters
    ----------
    name   : str – schema/table name, passed to run_query to fill {table};
                   names containing "simulated" use the metric_name column,
                   all others the metric column (long layout only)
    metric : str – metric of interest. For the long layout it may be written
                   as "anchor / ... / metric": the first segment filters the
                   anchor column and the last segment is the metric name.
                   For the wide layout it is the full column name.
    format : str – "long" selects the long layout; any other value is
                   treated as wide

    The result is printed by run_query; nothing is returned.
    """
    is_simulated = "simulated" in name
    if format == "long":
        # Handle the optional “anchor / metric” syntax,
        # e.g. general_stats_table / samtools: coverage / numreads
        if " / " in metric:
            # Split once so the anchor comes from the full string, not from
            # the already-shortened metric name
            segments = metric.split(" / ")
            anchor = segments[0]
            metric_name = segments[-1]
            anchor_and = f"AND anchor = '{anchor}'"
        else:
            metric_name = metric
            anchor_and = ""
        metric_col = "metric_name" if is_simulated else "metric"
        value_col  = "val_raw"
        run_query(name, f"""
        SELECT AVG({value_col}) AS mean
        FROM {{table}}
        WHERE {metric_col} = '{metric_name}' {anchor_and}
        """)
    else:
        # Quote the column: wide metric names may contain spaces/punctuation
        run_query(name, f"""
        SELECT AVG("{metric}") FROM {{table}}
        """)


In [233]:
def metric_dates_samples_stats(name, metric, format, samples,
                               start_date="2025-03-15", end_date="2025-12-31"):
    """
    Print min, max, mean and median for a metric over the requested samples
    and date window.

    Parameters
    ----------
    name       : str   – logical “run-set” name (used by run_query to pick a DB)
    metric     : str   – metric of interest. For "long" it may be written as
                         "anchor / ... / metric": the first segment filters the
                         anchor column, the last segment is the metric name.
                         For "wide" it is the full column name.
    format     : str   – "long" or "wide" (anything other than "long" is
                         handled as wide)
    samples    : list  – non-empty list of sample identifiers
    start_date : str   – lower bound of the creation_date BETWEEN filter,
                         inserted into a TIMESTAMP '...' literal
    end_date   : str   – upper bound of the creation_date BETWEEN filter,
                         inserted into a TIMESTAMP '...' literal

    Raises
    ------
    ValueError – when samples is empty, since "IN ()" is not valid SQL.
    """
    if not samples:
        raise ValueError("samples must contain at least one sample identifier")

    # Convert `samples` into the SQL IN (…) list
    samples_sql = ", ".join(f"'{s}'" for s in samples)
    is_simulated = "simulated" in name
    sample_col = "sample_name" if is_simulated else "sample"

    conditions = []
    if format == "long":
        # Handle the optional “anchor / metric” syntax,
        # e.g. general_stats_table / samtools: coverage / numreads
        metric_col = "metric_name" if is_simulated else "metric"
        if " / " in metric:
            # Split once; `metric` itself stays intact for the report header
            segments = metric.split(" / ")
            conditions.append(f"{metric_col} = '{segments[-1]}'")
            conditions.append(f"anchor = '{segments[0]}'")
        else:
            conditions.append(f"{metric_col} = '{metric}'")
        value_col = "val_raw"
    else:  # wide format
        value_col = f'"{metric}"'  # quote because metric names may contain punctuation

    conditions.append(
        f"creation_date BETWEEN TIMESTAMP '{start_date}' AND TIMESTAMP '{end_date}'"
    )
    conditions.append(f"{sample_col} IN ({samples_sql})")
    where_sql = "\n          AND ".join(conditions)

    query = f"""
        SELECT
            MIN({value_col})    AS min,
            MAX({value_col})    AS max,
            AVG({value_col})    AS mean,
            approx_percentile({value_col}, 0.5)    AS median
        FROM {{table}}
        WHERE {where_sql}
        """

    # Expect run_query to return a DataFrame when as_df=True
    stats_df = run_query(name, query, as_df=True, verbose=False)
    if stats_df.is_empty():
        print("No rows matched the given filters.")
        return

    row = stats_df.row(0, named=True)
    # An aggregate without GROUP BY still yields one row when nothing matches;
    # every statistic in it is then NULL (None).
    if all(value is None for value in row.values()):
        print("No rows matched the given filters.")
        return

    print(
        f"Statistics for “{metric}” (samples={samples}, {start_date} → {end_date}):\n"
        f"  min   : {row['min']}\n"
        f"  max   : {row['max']}\n"
        f"  mean  : {row['mean']}\n"
        f"  median: {row['median']}"
    )

In [234]:
def metric_outliers_conditional(name, metric_filter, condition, metric_outlier, threshold=1.5, limit=10, verbose=False):
    """
    Find samples that satisfy a condition on one metric, then identify outliers on another metric using IQR.

    Parameters
    ----------
    name           : str   – schema/table name, passed to run_query to fill
                             {table}. A name containing "wide" selects the wide
                             layout, otherwise the long layout is used; a name
                             containing "simulated" switches to the
                             sample_name / metric_name columns.
    metric_filter  : str   – metric the condition is applied to. In the long
                             layout "anchor / ... / metric" is split into an
                             anchor filter (first segment) and the metric name
                             (last segment); in the wide layout it is the
                             column name.
    condition      : str   – raw SQL comparison appended after the filter
                             value, e.g. "> 0.5"
    metric_outlier : str   – metric checked for outliers; same syntax as
                             metric_filter
    threshold      : float – IQR multiplier: values below q1 - threshold * iqr
                             or above q3 + threshold * iqr are reported
    limit          : int   – maximum number of rows returned (highest values
                             first)
    verbose        : bool  – forwarded to run_query

    The matching rows are printed by run_query; nothing is returned.
    """
    is_simulated = "simulated" in name
    layout = "long" if "wide" not in name else "wide"
    sample_col = "sample_name" if is_simulated else "sample"

    # Only the first two CTEs depend on the table layout. Both expose the
    # value under the same `metric_value` column so the IQR part is shared.
    if layout == "long":
        metric_col = "metric_name" if is_simulated else "metric"

        anchor_filter_clause = ""
        if " / " in metric_filter:
            segments = metric_filter.split(" / ")
            metric_filter = segments[-1]
            anchor_filter_clause = f"AND anchor = '{segments[0]}'"

        anchor_outlier_clause = ""
        if " / " in metric_outlier:
            segments = metric_outlier.split(" / ")
            metric_outlier = segments[-1]
            anchor_outlier_clause = f"AND anchor = '{segments[0]}'"

        source_ctes = f"""
        filtered_samples AS (
            SELECT DISTINCT {sample_col}
            FROM {{table}}
            WHERE {metric_col} = '{metric_filter}' {anchor_filter_clause}
              AND val_raw {condition}
        ),
        outlier_data AS (
            SELECT {sample_col}, val_raw AS metric_value
            FROM {{table}}
            WHERE {metric_col} = '{metric_outlier}' {anchor_outlier_clause}
              AND {sample_col} IN (SELECT {sample_col} FROM filtered_samples)
        ),"""
    else:
        source_ctes = f"""
        filtered_samples AS (
            SELECT {sample_col}
            FROM {{table}}
            WHERE "{metric_filter}" {condition}
        ),
        outlier_data AS (
            SELECT {sample_col}, "{metric_outlier}" AS metric_value
            FROM {{table}}
            WHERE {sample_col} IN (SELECT {sample_col} FROM filtered_samples)
        ),"""

    # source_ctes is inserted verbatim, so its {table} placeholders survive
    # for run_query to fill in.
    query = f"""
        WITH{source_ctes}
        stats AS (
            SELECT
                approx_percentile(metric_value, 0.25) AS q1,
                approx_percentile(metric_value, 0.75) AS q3
            FROM outlier_data
        ),
        with_bounds AS (
            SELECT
                q1,
                q3,
                (q3 - q1) AS iqr,
                q1 - (q3 - q1) * {threshold} AS lower_bound,
                q3 + (q3 - q1) * {threshold} AS upper_bound
            FROM stats
        )
        SELECT
            od.{sample_col},
            od.metric_value AS "{metric_outlier}_value",
            wb.q1,
            wb.q3,
            wb.iqr * {threshold} AS iqr_range,
            CASE
                WHEN od.metric_value < wb.lower_bound THEN 'low_outlier'
                WHEN od.metric_value > wb.upper_bound THEN 'high_outlier'
                ELSE 'normal'
            END AS outlier_status
        FROM outlier_data od
        CROSS JOIN with_bounds wb
        WHERE od.metric_value < wb.lower_bound OR od.metric_value > wb.upper_bound
        ORDER BY od.metric_value DESC
        LIMIT {limit}
        """
    run_query(name, query, verbose=verbose)

In [None]:
metric_dates_samples_stats(
    "reallife-small-test-wide",
    metric="general_stats_table / samtools: coverage / numreads",
    format="wide",
    samples=["NA12878_sorted_chrM_chr20_rehead_60pdown", "test.paired_end"]
)

Statistics for “general_stats_table / samtools: coverage / numreads” (samples=['NA12878_sorted_chrM_chr20_rehead_60pdown', 'test.paired_end'], 2025-03-15 → 2025-12-31):
  min   : 0.141
  max   : 0.750591
  mean  : 0.4457955
  median: 0.750591


In [264]:
metric_outliers_conditional(
    "reallife-small-test-wide", 
    metric_filter="general_stats_table / samtools: coverage / coverage", 
    condition="> 2", 
    metric_outlier="general_stats_table / samtools: coverage / meanmapq", 
)

0 rows



In [None]:
print_columns_and_few_rows("reallife-wide_bsstuff")

anchor: varchar | type: varchar | creation_date: timestamp(6) | plot_type: varchar | plot_input_data: varchar | sample: varchar | bismark-methylation-dp / percent_cpg_meth: double | bismark-methylation-dp / percent_chg_meth: double | bismark-methylation-dp / percent_chh_meth: double | general_stats_table / qualimap: bamqc / avg_gc: double | general_stats_table / qualimap: bamqc / 1_x_pc: double | general_stats_table / qualimap: bamqc / 5_x_pc: double | general_stats_table / qualimap: bamqc / 10_x_pc: double | general_stats_table / qualimap: bamqc / 30_x_pc: double | general_stats_table / qualimap: bamqc / 50_x_pc: double | general_stats_table / qualimap: bamqc / median_coverage: double | general_stats_table / qualimap: bamqc / mean_coverage: double | general_stats_table / qualimap: bamqc / general_error_rate: double | general_stats_table / qualimap: bamqc / percentage_aligned: double | general_stats_table / qualimap: bamqc / mapped_reads: double | general_stats_table / qualimap: bamqc 

In [166]:
metric_dates_samples_stats(
    "reallife-wide_bsstuff", 
    metric="general_stats_table / qualimap: bamqc / avg_gc", 
    format="wide",
    samples=["Bisulfite_2_S2_DNA_H12_R1", "Helicase_1_S3_DNA_J22_R1"]
)


Statistics for “general_stats_table / qualimap: bamqc / avg_gc” (samples=['Bisulfite_2_S2_DNA_H12_R1', 'Helicase_1_S3_DNA_J22_R1'], 2025-03-15 → 2025-12-31):
  min   : 24.318481848184817
  max   : 24.371985157699445
  mean  : 24.34523350294213
  median: 24.371985157699445


In [None]:
metric_dates_samples_stats(
    "reallife-bsstuff", 
    metric="general_stats_table / qualimap: bamqc / avg_gc", 
    format="wide",
    samples=["Bisulfite_2_S2_DNA_H12_R1", "Helicase_1_S3_DNA_J22_R1"]
)

Statistics for “general_stats_table / qualimap: bamqc / avg_gc” (samples=['Bisulfite_2_S2_DNA_H12_R1', 'Helicase_1_S3_DNA_J22_R1'], 2025-03-15 → 2025-12-31):
  min   : 24.318481848184817
  max   : 24.371985157699445
  mean  : 24.34523350294213
  median: 24.371985157699445


In [38]:
print_columns_and_few_rows("simulated-long_10runs_10mod_5sec_100samples_50metrics")

run_id: varchar | creation_date: timestamp(6) | module_name: varchar | sample_name: varchar | metric_name: varchar | metric_min: double | metric_max: double | metric_dmin: double | metric_dmax: double | metric_scale: varchar | metric_color: varchar | val_raw: double | val_raw_type: varchar
run_1 | 2025-04-06 17:12:32.456908 | module_0 | sample_0 | metric_0 | 5.495419114785129 | 96.75975901945004 | 1.6238252032547913 | 9.031953154242153 | Accent | #61fe79 | 74.83896928398039 | float
run_1 | 2025-04-06 17:12:32.456908 | module_0 | sample_0 | metric_1 | 5.69906284939199 | 92.00361465604668 | 2.2255552424325087 | 9.113154542081853 | Paired | #dd7c16 | 66.98567783316085 | float
run_1 | 2025-04-06 17:12:32.456908 | module_0 | sample_0 | metric_2 | 8.959603245979986 | 90.2454446445848 | 2.801095339031841 | 9.269297642148208 | Set3 | #6c31bb | 30.19826258471592 | float
run_1 | 2025-04-06 17:12:32.456908 | module_0 | sample_0 | metric_3 | 9.707969533101707 | 94.3023425378321 | 2.766972474235412

In [14]:
print_columns_and_few_rows("simulated-long_10000runs_10mod_5sec_100samples_50metrics")

run_id: varchar | creation_date: timestamp(6) | module_name: varchar | sample_name: varchar | metric_name: varchar | metric_min: double | metric_max: double | metric_dmin: double | metric_dmax: double | metric_scale: varchar | metric_color: varchar | val_raw: double | val_raw_type: varchar
run_1 | 2025-02-27 08:30:29.421394 | module_0 | sample_0 | metric_0 | 8.187422717238181 | 99.09294188036411 | 2.800081586567719 | 9.686040573432331 | Pastel2 | #718754 | 53.227891884914506 | float
run_1 | 2025-02-27 08:30:29.421394 | module_0 | sample_0 | metric_1 | 0.6189464903850472 | 95.00990616596145 | 2.504969401613949 | 9.75638499643777 | Accent | #d62c2b | 57.879120040092 | float
run_1 | 2025-02-27 08:30:29.421394 | module_0 | sample_0 | metric_2 | 0.8117001920682132 | 91.84371073523403 | 1.5076375825884356 | 9.1234107926833 | Accent | #263ead | 51.424642994452576 | float
run_1 | 2025-02-27 08:30:29.421394 | module_0 | sample_0 | metric_3 | 6.110040575627345 | 90.39865100056717 | 1.08012283931

In [100]:
print_columns_and_few_rows("simulated-wide_10000runs_10mod_5sec_100samples_50metrics")

run_id: bigint | creation_date: timestamp(6) | type: varchar | sample_name: varchar | metric_0: double | metric_1: double | metric_2: double | metric_3: double | metric_4: double | metric_5: double | metric_6: double | metric_7: double | metric_8: double | metric_9: double | metric_10: double | metric_11: double | metric_12: double | metric_13: double | metric_14: double | metric_15: double | metric_16: double | metric_17: double | metric_18: double | metric_19: double | metric_20: double | metric_21: double | metric_22: double | metric_23: double | metric_24: double | metric_25: double | metric_26: double | metric_27: double | metric_28: double | metric_29: double | metric_30: double | metric_31: double | metric_32: double | metric_33: double | metric_34: double | metric_35: double | metric_36: double | metric_37: double | metric_38: double | metric_39: double | metric_40: double | metric_41: double | metric_42: double | metric_43: double | metric_44: double | metric_45: double | metr

In [267]:
average_of_metrics(
    name="simulated-long_10000runs_10mod_5sec_100samples_50metrics_subset10",
    metric="metric_0",
    format="long",  
)

1 rows
50.117647237753


In [269]:
average_of_metrics(
    name="simulated-wide_10runs_10mod_5sec_100samples_50metrics",
    metric="metric_0",
    format="wide",
)

1 rows
2.477719367334907


In [272]:
average_of_metrics(
    name="simulated-long_10000runs_10mod_5sec_100samples_50metrics_subset100",
    metric="metric_0",
    format="long",
)

1 rows
49.97822153589666


In [273]:
average_of_metrics(
    name="simulated-wide_10000runs_10mod_5sec_100samples_50metrics_subset100",
    metric="metric_0",
    format="wide",
)

1 rows
2.5130486027913346


In [None]:
average_of_metrics(
    name="simulated-long_10000runs_10mod_5sec_100samples_50metrics_subset1000",
    metric="metric_0",
    format="long",
)

In [236]:
average_of_metrics(
    name="simulated-wide_10000runs_10mod_5sec_100samples_50metrics_subset1000",
    metric="metric_0",
    format="wide",
)

1 rows
2.500375932166651


In [None]:
average_of_metrics(
    name="simulated-long_10000runs_10mod_5sec_100samples_50metrics",
    metric="metric_0",
    format="long",
)

In [None]:
average_of_metrics(
    name="simulated-wide_10000runs_10mod_5sec_100samples_50metrics",
    metric="metric_0",
    format="wide",
)

In [241]:
metric_dates_samples_stats(
    name="simulated-long_10000runs_10mod_5sec_100samples_50metrics_subset10",
    format="long",
    metric="metric_0",
    samples=["sample_0"],
)   

Statistics for “metric_0” (samples=['sample_0'], 2025-03-15 → 2025-12-31):
  min   : 14.872785000133085
  max   : 85.1203476176099
  mean  : 50.816444740413814
  median: 51.47962297335995


In [239]:
metric_dates_samples_stats(
    name="simulated-wide_10runs_10mod_5sec_100samples_50metrics",
    format="wide",
    metric="metric_1",
    samples=["sample_1"],
)   

Statistics for “metric_1” (samples=['sample_1'], 2025-03-15 → 2025-12-31):
  min   : 0.1814643355246165
  max   : 4.902889942215793
  mean  : 2.649888129891732
  median: 2.874458946670262


In [276]:
metric_dates_samples_stats(
    name="simulated-long_10000runs_10mod_5sec_100samples_50metrics_subset100",
    format="long",
    metric="metric_0",
    samples=["sample_0"],
)   

Statistics for “metric_0” (samples=['sample_0'], 2025-03-15 → 2025-12-31):
  min   : 4.288868364124605
  max   : 95.53434172573604
  mean  : 49.732834710205886
  median: 49.976846307803605


In [277]:
metric_dates_samples_stats(
    name="simulated-wide_10000runs_10mod_5sec_100samples_50metrics_subset100",
    format="wide",
    metric="metric_1",
    samples=["sample_1"],
)   

Statistics for “metric_1” (samples=['sample_1'], 2025-03-15 → 2025-12-31):
  min   : 0.011531067737503564
  max   : 4.989384517252
  mean  : 2.5464541927857933
  median: 2.501499692276699


In [246]:
metric_dates_samples_stats(
    name="simulated-long_10000runs_10mod_5sec_100samples_50metrics_subset1000",
    format="long",
    metric="metric_0",
    samples=["sample_0"],
)

Statistics for “metric_0” (samples=['sample_0'], 2025-03-15 → 2025-12-31):
  min   : 2.6183105835895892
  max   : 99.70785639689917
  mean  : 49.91859277046798
  median: 49.95359825113672


In [247]:
metric_dates_samples_stats(
    name="simulated-wide_10000runs_10mod_5sec_100samples_50metrics_subset1000",
    format="wide",
    metric="metric_0",
    samples=["sample_0"],
)

Statistics for “metric_0” (samples=['sample_0'], 2025-03-15 → 2025-12-31):
  min   : 0.014003305227338614
  max   : 4.997923072131517
  mean  : 2.4877507658731592
  median: 2.494292917220527


In [None]:
metric_dates_samples_stats(
    name="simulated-wide_10000runs_10mod_5sec_100samples_50metrics",
    format="wide",
    metric="metric_0",
    samples=["sample_0"],
)

In [None]:
metric_dates_samples_stats(
    name="simulated-long_10000runs_10mod_5sec_100samples_50metrics",
    format="long",
    metric="metric_0",
    samples=["sample_0"],
)

In [None]:
metric_dates_samples_stats(
    name="simulated-long_10000runs_10mod_5sec_100samples_50metrics",
    format="long",
    metric="metric_0",
    samples=["sample_0"],
)

In [249]:
metric_outliers_conditional(
    name="simulated-long_10000runs_10mod_5sec_100samples_50metrics_subset10",
    metric_filter="metric_0",
    condition="> 0.5",
    metric_outlier="metric_1",
    threshold=1.5,
    limit=10,
)

10 rows
sample_75 | 99.35935019190657 | 38.87009592970488 | 61.26697657092866 | 33.59532096183567 | high_outlier
sample_88 | 99.27853988936971 | 38.87009592970488 | 61.26697657092866 | 33.59532096183567 | high_outlier
sample_67 | 99.27853988936971 | 38.87009592970488 | 61.26697657092866 | 33.59532096183567 | high_outlier
sample_98 | 98.88736028181299 | 38.87009592970488 | 61.26697657092866 | 33.59532096183567 | high_outlier
sample_98 | 98.39355327915406 | 38.87009592970488 | 61.26697657092866 | 33.59532096183567 | high_outlier
sample_38 | 98.32300170115244 | 38.87009592970488 | 61.26697657092866 | 33.59532096183567 | high_outlier
sample_8 | 98.2522302087748 | 38.87009592970488 | 61.26697657092866 | 33.59532096183567 | high_outlier
sample_50 | 98.03025315827539 | 38.87009592970488 | 61.26697657092866 | 33.59532096183567 | high_outlier
sample_68 | 98.03025315827539 | 38.87009592970488 | 61.26697657092866 | 33.59532096183567 | high_outlier
sample_53 | 98.03025315827539 | 38.87009592970488

In [255]:
metric_outliers_conditional(
    name="simulated-wide_10runs_10mod_5sec_100samples_50metrics",
    metric_filter="metric_1",
    condition="> 0.5",
    metric_outlier="metric_2",
    threshold=1.5,
    limit=10,
)

0 rows



In [254]:
metric_outliers_conditional(
    name="simulated-long_10000runs_10mod_5sec_100samples_50metrics_subset100",
    metric_filter="metric_1",
    condition="> 0.5",
    metric_outlier="metric_2",
    threshold=1.5,
    limit=10,
)

10 rows
sample_63 | 99.99709198523344 | 38.8383455816897 | 61.172596498771014 | 33.50137637562197 | high_outlier
sample_66 | 99.99709198523344 | 38.8383455816897 | 61.172596498771014 | 33.50137637562197 | high_outlier
sample_25 | 99.99498849385381 | 38.8383455816897 | 61.172596498771014 | 33.50137637562197 | high_outlier
sample_0 | 99.96127448326476 | 38.8383455816897 | 61.172596498771014 | 33.50137637562197 | high_outlier
sample_34 | 99.96127448326476 | 38.8383455816897 | 61.172596498771014 | 33.50137637562197 | high_outlier
sample_58 | 99.49329734144365 | 38.8383455816897 | 61.172596498771014 | 33.50137637562197 | high_outlier
sample_96 | 99.44400105245796 | 38.8383455816897 | 61.172596498771014 | 33.50137637562197 | high_outlier
sample_35 | 99.41214072883862 | 38.8383455816897 | 61.172596498771014 | 33.50137637562197 | high_outlier
sample_48 | 99.41214072883862 | 38.8383455816897 | 61.172596498771014 | 33.50137637562197 | high_outlier
sample_26 | 99.41214072883862 | 38.8383455816897

In [260]:
metric_outliers_conditional(
    name="simulated-wide_10000runs_10mod_5sec_100samples_50metrics_subset100",
    metric_filter="metric_1",
    condition="> 0.5",
    metric_outlier="metric_2",
    threshold=1.5,
    limit=10,
)

0 rows



In [262]:
metric_outliers_conditional(
    name="simulated-long_10000runs_10mod_5sec_100samples_50metrics_subset100",
    metric_filter="metric_1",
    condition="> 0.5",
    metric_outlier="metric_2"
)

10 rows
sample_66 | 99.99709198523344 | 38.90067971338683 | 61.16945821994164 | 33.403167759832215 | high_outlier
sample_63 | 99.99709198523344 | 38.90067971338683 | 61.16945821994164 | 33.403167759832215 | high_outlier
sample_25 | 99.99498849385381 | 38.90067971338683 | 61.16945821994164 | 33.403167759832215 | high_outlier
sample_34 | 99.96127448326476 | 38.90067971338683 | 61.16945821994164 | 33.403167759832215 | high_outlier
sample_0 | 99.96127448326476 | 38.90067971338683 | 61.16945821994164 | 33.403167759832215 | high_outlier
sample_58 | 99.49329734144365 | 38.90067971338683 | 61.16945821994164 | 33.403167759832215 | high_outlier
sample_96 | 99.44400105245796 | 38.90067971338683 | 61.16945821994164 | 33.403167759832215 | high_outlier
sample_26 | 99.41214072883862 | 38.90067971338683 | 61.16945821994164 | 33.403167759832215 | high_outlier
sample_83 | 99.41214072883862 | 38.90067971338683 | 61.16945821994164 | 33.403167759832215 | high_outlier
sample_48 | 99.41214072883862 | 38.9006

In [257]:
metric_outliers_conditional(
    name="simulated-wide_10000runs_10mod_5sec_100samples_50metrics_subset1000",
    metric_filter="metric_0",
    condition="> 0.5",
    metric_outlier="metric_1",
)

0 rows



In [256]:
metric_outliers_conditional(
    name="simulated-long_10000runs_10mod_5sec_100samples_50metrics_subset1000",
    metric_filter="metric_0",
    condition="> 0.5",
    metric_outlier="metric_1",
)

10 rows
sample_67 | 99.99466402226406 | 38.964785638904424 | 61.06644332141097 | 33.15248652375982 | high_outlier
sample_19 | 99.99466402226406 | 38.964785638904424 | 61.06644332141097 | 33.15248652375982 | high_outlier
sample_63 | 99.99466402226406 | 38.964785638904424 | 61.06644332141097 | 33.15248652375982 | high_outlier
sample_55 | 99.98303423102604 | 38.964785638904424 | 61.06644332141097 | 33.15248652375982 | high_outlier
sample_2 | 99.98303423102604 | 38.964785638904424 | 61.06644332141097 | 33.15248652375982 | high_outlier
sample_41 | 99.98303423102604 | 38.964785638904424 | 61.06644332141097 | 33.15248652375982 | high_outlier
sample_52 | 99.96232595873349 | 38.964785638904424 | 61.06644332141097 | 33.15248652375982 | high_outlier
sample_99 | 99.9452570538043 | 38.964785638904424 | 61.06644332141097 | 33.15248652375982 | high_outlier
sample_95 | 99.9452570538043 | 38.964785638904424 | 61.06644332141097 | 33.15248652375982 | high_outlier
sample_89 | 99.93489602918602 | 38.964785

In [263]:
metric_outliers_conditional(
    name="simulated-wide_10000runs_10mod_5sec_100samples_50metrics",
    metric_filter="metric_0",
    condition="> 0.5",
    metric_outlier="metric_1",
)

KeyboardInterrupt: 

In [None]:
metric_outliers_conditional(
    name="simulated-long_10000runs_10mod_5sec_100samples_50metrics",
    metric_filter="metric_0",
    condition="> 0.5",
    metric_outlier="metric_1",
)