In [1]:
import argparse
import csv
import json
import os

import numpy as np
import pandas as pd

from benchmark import Benchmark

# Workload definition files, one per benchmark domain.
# Each entry needs its own trailing comma: adjacent string literals are
# concatenated by Python, which would silently merge two file names into one.
workload_names = [
    "archeology.json",
    "astronomy.json",
    "biomedical.json",
    "environment.json",
    "legal.json",
    "wildfire.json",
]


# Maps each system-under-test identifier to the short label that replaces it
# in the generated LaTeX tables. The Naive / OneShot / FewShot variants of a
# model intentionally share one label.
# NOTE(review): 'Llama3-3Intruct' looks like a typo for 'Llama3-3Instruct';
# it is left unchanged here because it is emitted verbatim into the tables.
sys_names = {
    'BaselineLLMSystemGPT4oNaive': 'GPT-4o',
    'BaselineLLMSystemGPTo3Naive': 'GPT-o3',
    'BaselineLLMSystemLlama3_3InstructNaive': 'Llama3-3Intruct',
    'BaselineLLMSystemDeepseekR1Naive': 'DeepSeek-R1',
    'BaselineLLMSystemQwen2_5CoderNaive': 'Qwen2-5Coder',
    'BaselineLLMSystemGPT4oOneShot': 'GPT-4o',
    'BaselineLLMSystemGPTo3OneShot': 'GPT-o3',
    'BaselineLLMSystemLlama3_3InstructOneShot': 'Llama3-3Intruct',
    'BaselineLLMSystemDeepseekR1OneShot': 'DeepSeek-R1',
    'BaselineLLMSystemQwen2_5CoderOneShot': 'Qwen2-5Coder',
    'BaselineLLMSystemGPT4oFewShot': 'GPT-4o',
    'BaselineLLMSystemGPTo3FewShot': 'GPT-o3',
    'BaselineLLMSystemGPTo3FewShot_subset': 'GPT-o3-subset',
    'BaselineLLMSystemLlama3_3InstructFewShot': 'Llama3-3Intruct',
    'BaselineLLMSystemDeepseekR1FewShot': 'DeepSeek-R1',
    'BaselineLLMSystemQwen2_5CoderFewShot': 'Qwen2-5Coder',
}

In [3]:
# Build one row per system under test (SUT) holding the mean of `value_mean`
# for every metric, then render the result as a LaTeX table.
aggregated_result_filepath = "./results/aggregated_results.csv"

# Read and group the results once; neither step depends on the SUT being
# processed, so there is no reason to repeat them per system.
df = pd.read_csv(aggregated_result_filepath)

# Every known SUT gets an entry (possibly empty) so the key order follows
# `sys_names` regardless of which systems appear in the CSV.
sut_metrics = {sut_name: {} for sut_name in sys_names}
for (sut, metric), group in df.groupby(["sut", "metric"]):
    if sut in sut_metrics:
        # Series.mean() skips NaN entries of `value_mean` by default.
        sut_metrics[sut][metric] = group["value_mean"].mean()

metrics_df = pd.DataFrame.from_dict(sut_metrics, orient="index")
metrics = ['bleu', 'llm_code_eval', 'f1', 'mean_absolute_error', 'precision', 'recall', 'rouge', 'success', 'runtime']
# Scale the listed metrics by 100; runtime is left untouched. Metrics that are
# absent from the results are skipped instead of raising KeyError.
percent_columns = [m for m in metrics if m != 'runtime' and m in metrics_df.columns]
for m in percent_columns:
    metrics_df[m] = metrics_df[m] * 100

display(metrics_df)
ltx_table = metrics_df.to_latex(
    index=True,
    label="tab:metrics",
    caption="Metrics for different systems.",
    float_format="%.2f",
    column_format="l" + "c" * len(metrics_df.columns),
)

# Substitute longer identifiers first: some identifiers are prefixes of others
# (e.g. '...GPTo3FewShot' vs '...GPTo3FewShot_subset'), and replacing the
# shorter one first would corrupt the longer one before it can be matched.
for sut_name in sorted(sys_names, key=len, reverse=True):
    ltx_table = ltx_table.replace(sut_name, "& " + sys_names[sut_name])

print(ltx_table)

Unnamed: 0,bleu,f1,f1_approximate,llm_code_eval,llm_paraphrase,mean_absolute_error,mean_relative_absolute_error,mean_squared_error,precision,rae_score,recall,rouge,string_bootstrap,success,runtime
BaselineLLMSystemGPT4oNaive,12.171064,15.315745,0.0,32.443249,0.079365,12.3,26520050000.0,0.015129,6.152178,0.45,6.038297,12.053496,0.079365,3.546527,
BaselineLLMSystemGPTo3Naive,3.955646,19.038749,0.0,29.291005,0.0,,1.0,,2.720607,0.5,2.631186,7.083333,0.0,0.0,
BaselineLLMSystemLlama3_3InstructNaive,9.932932,14.944,0.0,27.601616,0.034722,,1.0,,6.034999,0.5,5.351114,9.022173,0.066667,1.739857,
BaselineLLMSystemDeepseekR1Naive,5.835241,19.191955,0.0,13.615251,0.031746,14050.0,1.0,28112.5,4.137997,0.5,4.463457,7.129926,0.038549,1.874494,
BaselineLLMSystemQwen2_5CoderNaive,4.28525,12.741364,0.0,34.479681,0.022222,,1.0,,2.833365,0.5,3.681342,4.353161,0.022222,0.372369,
BaselineLLMSystemGPT4oOneShot,7.160961,20.299605,0.0,25.704468,0.031746,594.64,0.9701427,68.0369,9.021415,0.509839,9.738329,6.147737,0.036508,5.335355,
BaselineLLMSystemGPTo3OneShot,5.441542,24.293086,0.0,3.408163,0.04,580609.91,0.9527001,33710790.0,2.113997,0.516511,0.992117,4.4,0.04,1.269841,
BaselineLLMSystemLlama3_3InstructOneShot,4.560274,14.183887,0.0,19.260143,0.013889,151352.868769,0.9754751,2290769.0,5.632101,0.509337,6.151734,4.384218,0.013889,2.716115,
BaselineLLMSystemDeepseekR1OneShot,8.022083,13.947355,0.0,10.696598,0.049603,,1.0,,4.318183,0.5,4.99918,11.444559,0.0672,2.116127,
BaselineLLMSystemQwen2_5CoderOneShot,3.13977,15.542826,0.0,26.456845,0.024074,,1.0,,4.338621,0.5,5.527292,5.854729,0.026125,1.902755,


\begin{table}
\caption{Metrics for different systems.}
\label{tab:metrics}
\begin{tabular}{lccccccccccccccc}
\toprule
 & bleu & f1 & f1_approximate & llm_code_eval & llm_paraphrase & mean_absolute_error & mean_relative_absolute_error & mean_squared_error & precision & rae_score & recall & rouge & string_bootstrap & success & runtime \\
\midrule
& GPT-4o & 12.17 & 15.32 & 0.00 & 32.44 & 0.08 & 12.30 & 26520051747.24 & 0.02 & 6.15 & 0.45 & 6.04 & 12.05 & 0.08 & 3.55 & NaN \\
& GPT-o3 & 3.96 & 19.04 & 0.00 & 29.29 & 0.00 & NaN & 1.00 & NaN & 2.72 & 0.50 & 2.63 & 7.08 & 0.00 & 0.00 & NaN \\
& Llama3-3Intruct & 9.93 & 14.94 & 0.00 & 27.60 & 0.03 & NaN & 1.00 & NaN & 6.03 & 0.50 & 5.35 & 9.02 & 0.07 & 1.74 & NaN \\
& DeepSeek-R1 & 5.84 & 19.19 & 0.00 & 13.62 & 0.03 & 14050.00 & 1.00 & 28112.50 & 4.14 & 0.50 & 4.46 & 7.13 & 0.04 & 1.87 & NaN \\
& Qwen2-5Coder & 4.29 & 12.74 & 0.00 & 34.48 & 0.02 & NaN & 1.00 & NaN & 2.83 & 0.50 & 3.68 & 4.35 & 0.02 & 0.37 & NaN \\
& GPT-4o & 7.16 & 20.30 & 0.

In [8]:
# Show the system identifiers collected in `sut_metrics`, in insertion order.
list(sut_metrics)

['BaselineLLMSystemGPT4oNaive',
 'BaselineLLMSystemGPTo3Naive',
 'BaselineLLMSystemLlama3_3InstructNaive',
 'BaselineLLMSystemDeepseekR1Naive',
 'BaselineLLMSystemQwen2_5CoderNaive',
 'BaselineLLMSystemGPT4oOneShot',
 'BaselineLLMSystemGPTo3OneShot',
 'BaselineLLMSystemLlama3_3InstructOneShot',
 'BaselineLLMSystemDeepseekR1OneShot',
 'BaselineLLMSystemQwen2_5CoderOneShot',
 'BaselineLLMSystemGPT4oFewShot',
 'BaselineLLMSystemGPTo3FewShot',
 'BaselineLLMSystemLlama3_3InstructFewShot',
 'BaselineLLMSystemDeepseekR1FewShot',
 'BaselineLLMSystemQwen2_5CoderFewShot']

In [None]:
print("Per-domain aggregation:")
# Support-weighted mean of the selected metrics, per system and per domain,
# plus an overall column (all domains pooled) and a runtime column.
# Relies on `df` (the aggregated results frame) loaded by an earlier cell.
domains = ['archeology', 'astronomy', 'biomedical', 'environment', 'legal', 'wildfire']
metrics = ['success', 'llm_paraphrase', 'rae_score', 'f1']
suts = list(sys_names.keys())

# Numerator term of the weighted mean: each row's mean times its support.
df['meansupp'] = df['value_mean'] * df['value_support']


def _weighted_mean_by_sut(rows):
    """Return sum(value_mean * value_support) / sum(value_support) per SUT.

    Only the two numeric columns that are needed get summed, so string
    columns in `rows` are never aggregated.

    NOTE(review): a row whose `value_mean` is NaN adds nothing to the
    numerator (sum skips NaN) while its `value_support` still counts in the
    denominator -- confirm this is the intended treatment of missing values.
    """
    totals = rows.groupby('sut')[['meansupp', 'value_support']].sum()
    return totals['meansupp'] / totals['value_support']


is_known_sut = df['sut'].isin(suts)
is_selected_metric = df['metric'].isin(metrics)

results = {}
for domain in domains:
    in_domain = df['workload'] == f"{domain}.json"
    results[domain] = _weighted_mean_by_sut(df[is_known_sut & is_selected_metric & in_domain])
results['overall'] = _weighted_mean_by_sut(df[is_known_sut & is_selected_metric])
results['runtime'] = _weighted_mean_by_sut(df[is_known_sut & (df['metric'] == 'runtime')])

domain_df = pd.DataFrame(results).reindex(suts)
# Scale the score columns by 100; runtime is not a ratio and is left as-is,
# matching how the per-metric table treats runtime.
score_columns = [col for col in domain_df.columns if col != 'runtime']
domain_df[score_columns] = domain_df[score_columns] * 100

display(domain_df)
# NOTE(review): this label is identical to the one used for the per-metric
# table; if both tables end up in the same document LaTeX will report a
# multiply-defined label -- confirm whether a distinct label is wanted.
ltx_table = domain_df.to_latex(
    index=True,
    label="tab:metrics",
    caption="Metrics for different domains.",
    float_format="%.2f",
    # Size the column spec from this table, not from the per-metric table.
    column_format="l" + "c" * len(domain_df.columns),
)
# Substitute longer identifiers first: some identifiers are prefixes of others
# (e.g. '...GPTo3FewShot' vs '...GPTo3FewShot_subset'), and replacing the
# shorter one first would corrupt the longer one before it can be matched.
for sys_name in sorted(sys_names, key=len, reverse=True):
    ltx_table = ltx_table.replace(sys_name, "& " + sys_names[sys_name])

print(ltx_table)


Per-domain aggregation:


Unnamed: 0_level_0,archeology,astronomy,biomedical,environment,legal,wildfire,overall,runtime
sut,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
BaselineLLMSystemGPT4oNaive,15.303708,186760900000.0,5.788301,10.833985,8.857099,12.905159,21233030000.0,
BaselineLLMSystemGPTo3Naive,,,,,9.42785,,9.42785,
BaselineLLMSystemLlama3_3InstructNaive,10.16683,7.618729,7.485324,6.781062,9.24969,11.474962,8.714968,
BaselineLLMSystemDeepseekR1Naive,10.475279,6.04444,6.355795,8.370288,11.1904,25.366534,12.11501,
BaselineLLMSystemQwen2_5CoderNaive,6.016142,6.854128,4.98294,6.927643,8.34391,22.877891,9.564852,
BaselineLLMSystemGPT4oOneShot,15.054116,12.21093,11.166194,12.399345,10.836533,23.704486,13.26521,
BaselineLLMSystemGPTo3OneShot,16.242718,12.27747,8.321737,12.399345,14.88644,13.203366,13.62116,
BaselineLLMSystemLlama3_3InstructOneShot,9.557678,6.647481,6.866257,6.958456,10.032132,15.727115,9.305058,
BaselineLLMSystemDeepseekR1OneShot,10.276769,5.75562,5.113828,9.098545,10.303284,25.673494,11.95479,
BaselineLLMSystemQwen2_5CoderOneShot,11.331994,6.785391,5.741985,10.212372,9.80957,24.92722,11.64145,


\begin{table}
\caption{Metrics for different domains.}
\label{tab:metrics}
\begin{tabular}{lccccccccccccccc}
\toprule
 & archeology & astronomy & biomedical & environment & legal & wildfire & overall & runtime \\
sut &  &  &  &  &  &  &  &  \\
\midrule
& GPT-4o & 15.30 & 186760927800.92 & 5.79 & 10.83 & 8.86 & 12.91 & 21233027829.81 & NaN \\
& GPT-o3 & NaN & NaN & NaN & NaN & 9.43 & NaN & 9.43 & NaN \\
& Llama3-3Intruct & 10.17 & 7.62 & 7.49 & 6.78 & 9.25 & 11.47 & 8.71 & NaN \\
& DeepSeek-R1 & 10.48 & 6.04 & 6.36 & 8.37 & 11.19 & 25.37 & 12.12 & NaN \\
& Qwen2-5Coder & 6.02 & 6.85 & 4.98 & 6.93 & 8.34 & 22.88 & 9.56 & NaN \\
& GPT-4o & 15.05 & 12.21 & 11.17 & 12.40 & 10.84 & 23.70 & 13.27 & NaN \\
& GPT-o3 & 16.24 & 12.28 & 8.32 & 12.40 & 14.89 & 13.20 & 13.62 & NaN \\
& Llama3-3Intruct & 9.56 & 6.65 & 6.87 & 6.96 & 10.03 & 15.73 & 9.31 & NaN \\
& DeepSeek-R1 & 10.28 & 5.76 & 5.11 & 9.10 & 10.30 & 25.67 & 11.95 & NaN \\
& Qwen2-5Coder & 11.33 & 6.79 & 5.74 & 10.21 & 9.81 & 24.93 & 11.