In [30]:
import os
import json
import pandas as pd


def process_json(file_path):
    """
    Load one evaluation-results JSON file into a DataFrame.

    The file is expected to contain ``data["config_general"]["model_name"]``
    and a ``data["results"]`` mapping of task key -> mapping of metric values.

    Parameters
    ----------
    file_path : str or path-like
        Path of the JSON file to read. The file is decoded as UTF-8.

    Returns
    -------
    pandas.DataFrame
        Rows: MultiIndex ``(model_name, metric)`` holding the four metrics
        ``acc``, ``acc_stderr``, ``acc_norm`` and ``acc_norm_stderr``.
        Columns: one per entry of ``results``. A trailing ``|<digits>``
        suffix is removed from the key. Metrics missing for a task are left
        empty (``None``/NaN). A column named ``all`` is renamed to
        ``all_community`` when a ``community|mmlu:stem`` column is present,
        otherwise to ``all_lighteval``.

    Raises
    ------
    OSError
        If the file cannot be opened.
    ValueError
        If the content is not valid UTF-8 JSON (``json.JSONDecodeError`` and
        ``UnicodeDecodeError`` are both subclasses).
    KeyError
        If ``config_general``, ``model_name`` or ``results`` is missing.
    """
    # Decode explicitly as UTF-8 rather than relying on the platform's
    # locale-dependent default encoding.
    with open(file_path, 'r', encoding='utf-8') as f:
        data = json.load(f)

    model_name = data["config_general"]["model_name"]
    results = data["results"]

    # Metric rows, in the order they appear in the output index.
    metrics = ["acc", "acc_stderr", "acc_norm", "acc_norm_stderr"]

    # Row index: (model_name, metric)
    index = pd.MultiIndex.from_product(
        [[model_name], metrics], names=["model_name", "metric"]
    )

    # One column per result key, with a trailing '|<digits>' suffix removed.
    cols = {}
    for full_key, vals in results.items():
        head, _, tail = full_key.rpartition("|")
        # Only strip when something is left in front of the numeric suffix;
        # a key such as "7" must not collapse to an empty column name.
        col = head if head and tail.isdigit() else full_key
        # Values in the same order as `metrics`; absent metrics become None.
        cols[col] = [vals.get(m) for m in metrics]

    df = pd.DataFrame(cols, index=index)

    # Tag the aggregate 'all' column with the suite it came from, decided by
    # whether a 'community|mmlu:stem' column is present in this file.
    if 'all' in df.columns:
        suite = 'community' if 'community|mmlu:stem' in df.columns else 'lighteval'
        df = df.rename(columns={'all': f'all_{suite}'})

    return df


In [31]:
# Discover every *.json file below the current directory.
# The list is sorted because os.walk yields entries in a filesystem-dependent
# order, and the merge below keeps the FIRST non-null value it sees for a
# duplicated column -- without sorting, the result could vary between runs.
json_files = sorted(
    os.path.join(root, filename)
    for root, _dirs, files in os.walk('.')
    for filename in files
    if filename.lower().endswith('.json')
)

# Parse each file into a (model_name, metric)-indexed DataFrame.
# This is deliberately best-effort: any file that process_json cannot handle
# is reported and skipped.
dfs = []
for jf in json_files:
    try:
        dfs.append(process_json(jf))
    except Exception as e:
        print(f"Skipping {jf}: {e}")

# pd.concat raises an opaque "No objects to concatenate" on an empty list;
# fail with an actionable message instead (same exception type).
if not dfs:
    raise ValueError(
        f"No usable JSON result files found under {os.path.abspath('.')!r} "
        f"({len(json_files)} candidate file(s) were examined)"
    )

# Join the per-file frames side by side (axis=1) on the union of their
# (model_name, metric) rows. The same column name can now occur several times,
# once per file that reported it.
combined_df = pd.concat(dfs, axis=1, join='outer')

# Collapse duplicated column names into one column each.
combined_df = (
    combined_df
    .T
    .groupby(level=0)    # group by the original column names (now the index)
    .first()             # take the first non-null entry in each group
    .T                   # transpose back
)

# Drop every column whose name starts with "all_" (this includes the
# 'all_community' / 'all_lighteval' aggregates produced by process_json).
combined_df = combined_df.loc[:, ~combined_df.columns.str.startswith("all_")]

# write it out
combined_df.to_csv("accuracy_results.csv")

In [36]:
# Build a human-readable table: one row per model, one column per dataset,
# each cell formatted as "acc ± acc_stderr" in percent with two decimals.
df = combined_df

# 1) drop every metric row whose name contains 'norm' by masking on the
#    'metric' level of the index:
metric_level = df.index.get_level_values('metric')
mask = ~metric_level.str.contains('norm')
df2 = df[mask]

# 2) unstack the metric level so acc & acc_stderr become sub-columns:
df_un = df2.unstack(level='metric')
# now df_un.columns is a MultiIndex of (dataset_name, metric_name)

# 3) for each dataset, round and format "acc ± acc_stderr" into one string
#    column. The columns are collected in a dict and the frame is built once,
#    which avoids fragmenting `out` with one insertion per dataset.
formatted = {}
for dataset in df_un.columns.levels[0]:
    acc_pct = (df_un[(dataset, 'acc')] * 100).round(2)
    err_pct = (df_un[(dataset, 'acc_stderr')] * 100).round(2)
    acc = acc_pct.map("{:.2f}".format)
    stderr = err_pct.map("{:.2f}".format)
    # Do not emit the literal text "nan": with no stderr show the accuracy
    # alone, and with no accuracy leave the cell empty.
    formatted[dataset] = (
        (acc + ' ± ' + stderr)
        .where(err_pct.notna(), acc)
        .where(acc_pct.notna())
    )
out = pd.DataFrame(formatted, index=df_un.index)

# 4) bring model_name back as a regular column and export:
out = out.reset_index()
out.to_csv("pretty_accuracy_results.csv", index=False)