# In [1]:
import os
import json
import pandas as pd
from pathlib import Path

def process_json(file_path):
    """
    Load one results JSON file and return its metrics as a DataFrame.

    The model name is built from the two directory components directly above
    the file (``<grandparent>/<parent>``).

    Parameters
    ----------
    file_path : str or os.PathLike
        Path to a JSON file with a top-level ``"results"`` mapping of
        task key -> {metric name: value}.

    Returns
    -------
    pandas.DataFrame or None
        ``None`` when ``.ipynb_checkpoints`` appears in either of the two
        directory names directly above the file; such files are never opened.
        Otherwise a frame whose rows are a MultiIndex ``(model_name, metric)``
        for the metrics ``acc``, ``acc_stderr``, ``acc_norm`` and
        ``acc_norm_stderr``, with one column per task key (a trailing
        ``|<digits>`` segment is removed from the key). Metrics absent from a
        task entry show up as missing values.

        The ``"all"`` column is handled conditionally: if the column
        ``"original|mmlu:global_facts"`` exists, every ``"original|mmlu:*"``
        column is dropped and ``"all"`` is renamed to ``"original|mmlu"``;
        otherwise ``"all"`` is dropped.

    Raises
    ------
    OSError
        If the file cannot be opened.
    json.JSONDecodeError
        If the file is not valid JSON.
    KeyError
        If the JSON document has no ``"results"`` key.
    """
    # The name comes from the directory layout; the JSON's own
    # config_general.model_name field is not used.
    path = Path(file_path)
    model_name = '/'.join(path.parts[-3:-1])

    # Decide from the path alone, before any I/O, so that checkpoint copies
    # (which may be truncated or invalid) are never opened or parsed.
    if '.ipynb_checkpoints' in model_name:
        return None

    # JSON is UTF-8 by specification; do not depend on the locale default.
    with open(file_path, 'r', encoding='utf-8') as f:
        data = json.load(f)
    results = data["results"]

    # Metric rows, in the order they appear in the output
    metrics = ["acc", "acc_stderr", "acc_norm", "acc_norm_stderr"]

    # Row index: (model_name, metric)
    index = pd.MultiIndex.from_product(
        [[model_name], metrics], names=["model_name", "metric"]
    )

    # One column per task key, with a trailing '|<digits>' segment removed.
    # If two keys collapse to the same name, the later one wins.
    cols = {}
    for full_key, vals in results.items():
        parts = full_key.split("|")
        col = "|".join(parts[:-1]) if parts[-1].isdigit() else full_key
        # Values in the same order as `metrics`; missing metrics become None
        cols[col] = [vals.get(m) for m in metrics]

    df = pd.DataFrame(cols, index=index)

    if "original|mmlu:global_facts" in df.columns:
        # Drop the per-subject "original|mmlu:*" columns and keep "all"
        # (if present) as the single "original|mmlu" column.
        cols_to_drop = [col for col in df.columns if col.startswith("original|mmlu:")]
        df.drop(columns=cols_to_drop, inplace=True)
        if "all" in df.columns:
            df.rename(columns={"all": "original|mmlu"}, inplace=True)
    elif "all" in df.columns:
        # Without the MMLU subject columns, "all" is discarded.
        df.drop(columns=["all"], inplace=True)

    return df

import os
import pandas as pd

def get_mtime(path):
    """Return the last-modification time of *path* in seconds since the epoch.

    Raises OSError if the path does not exist or cannot be accessed.
    """
    return os.stat(path).st_mtime

base_dir = 'outputs_mcqa/results'

# 1. Collect every *.json file below base_dir (extension matched
#    case-insensitively).
json_files = []
for root, _dirs, files in os.walk(base_dir):
    json_files.extend(
        os.path.join(root, filename)
        for filename in files
        if filename.lower().endswith('.json')
    )
# Oldest first: when two files provide the same cell, the newer file's value
# is the one kept by the de-duplication in step 4.
json_files.sort(key=get_mtime)

# 2. Turn each JSON file into a DataFrame. A file that cannot be processed is
#    reported and skipped so that one bad file does not abort the whole run.
dfs = []
for jf in json_files:
    try:
        df = process_json(jf)
    except Exception as e:
        print(f"Skipping {jf}: {e}")
        continue
    # process_json returns None for files it ignores on purpose
    # (notebook checkpoint copies); keep only real frames.
    if df is not None:
        dfs.append(df)

# Fail with an actionable message instead of pandas' generic
# "No objects to concatenate" when nothing usable was found.
if not dfs:
    raise ValueError(f"No usable result JSON files found under {base_dir!r}")

# 3. Horizontally concatenate all DataFrames: rows are the union of all
#    (model_name, metric) pairs; column names may repeat across files.
combined_df = pd.concat(dfs, axis=1, join='outer')

# 4. Collapse columns that share a name: for every cell keep the last
#    non-missing value, i.e. the one from the newest file that has it.
combined_df = (
    combined_df
    .T
    .groupby(level=0)
    .last()
    .T
)

# 5. Raw per-task weights. A weight of 0 keeps the task out of both means.
raw_weights = {
    "community|MNLP_M3_mcqa_dataset": 0,
    "original|mmlu": 0.0,
    "community|mmlu:stem": 9,
    "community|mnlp_mcqa_evals": 0,
    "community|mnlp_mcqa_evals_legacy": 0,
    "helm|commonsenseqa": 1,
    "helm|med_qa": 2.5,
    "lighteval|agieval:aqua-rat": 0.25,
    "lighteval|openbookqa": 1,
    "lighteval|race:high": 1,
    "lighteval|sciq": 1,
    "original|arc:c:letters": 1,
}

# Normalise over all listed tasks; if the raw total is 0 every weight is 0.
total_weight_raw = sum(raw_weights.values())
if total_weight_raw:
    d = {task: weight / total_weight_raw for task, weight in raw_weights.items()}
else:
    d = {task: 0 for task in raw_weights}

# Only tasks that exist as columns in combined_df can contribute.
weight_cols = [task for task in d if task in combined_df.columns]
weight_vals = [d[task] for task in weight_cols]

# 6. Weighted mean per row, re-normalised by the weight of the tasks that are
#    actually present as columns.
# NOTE(review): DataFrame.sum skips missing values by default, so a task with
# no value in a given row contributes 0 while its weight stays in the
# denominator -- confirm that this is the intended treatment.
total_weight = sum(weight_vals)
weighted_sum = combined_df[weight_cols].mul(weight_vals, axis=1).sum(axis=1)
combined_df["mean_weighted"] = weighted_sum / total_weight

# 7. Unweighted mean per row over the tasks with a strictly positive weight
#    (each such task counts as 1, every other task as 0).
binary_weights = {task: (1 if weight > 0 else 0) for task, weight in d.items()}
bin_vals = [binary_weights[task] for task in weight_cols]
norm = sum(bin_vals)
selected_sum = combined_df[weight_cols].mul(bin_vals, axis=1).sum(axis=1)
combined_df["mean_raw"] = selected_sum / norm

# 8. Final column layout: the two aggregate means first, then the individual
#    tasks. Columns not listed here are removed from combined_df; that
#    includes community|MNLP_M3_mcqa_dataset, original|mmlu,
#    community|mnlp_mcqa_evals and community|mnlp_mcqa_evals_legacy.
desired_cols = [
    "mean_weighted",
    "mean_raw",
    "community|mmlu:stem",
    "helm|commonsenseqa",
    "helm|med_qa",
    "lighteval|agieval:aqua-rat",
    "lighteval|openbookqa",
    "lighteval|race:high",
    "lighteval|sciq",
    "original|arc:c:letters",
]
# Requested columns that are absent from the DataFrame are skipped rather
# than raising; the requested order is preserved for the rest.
ordered = [name for name in desired_cols if name in combined_df.columns]
combined_df = combined_df.loc[:, ordered]

df = combined_df

# 1) Keep only the rows whose metric name does not contain "norm".
metric_level = df.index.get_level_values('metric')
mask = ~metric_level.str.contains('norm')
df2 = df.loc[mask]

# 2) Move the metric level into the columns: one row per model_name and a
#    column MultiIndex of (dataset_name, metric_name).
df_un = df2.unstack(level='metric')

# 3) One output column per dataset: the accuracy as a percentage, formatted
#    as a string with two decimals. The standard error is not included.
out = pd.DataFrame(index=df_un.index)
for dataset in df_un.columns.levels[0]:
    scaled = df_un[(dataset, 'acc')] * 100
    acc = scaled.round(2).map("{:.2f}".format)
    out[dataset] = acc

# 4) Turn the model_name index into an ordinary column so it can be edited.
out = out.reset_index()

# 5) Clean up the model names: strip a leading "NicoHelemon/" prefix, then
#    map selected names (exact matches only) to their explicit variant names.
rename_map = {
    "MNLP_M3_mcqa_model": "MNLP_M3_mcqa_model_cot00_e1",
    "MNLP_M2_mcqa_model_cot00": "MNLP_M2_mcqa_model_cot00_e1",
    "MNLP_M2_mcqa_model_cot10": "MNLP_M2_mcqa_model_cot10_e1",
}
stripped_names = out["model_name"].str.replace(r'^NicoHelemon/', '', regex=True)
out["model_name"] = stripped_names.replace(rename_map)

# 6) Write the table; model_name is already a column, so no index is written.
out.to_csv("accuracy_results.csv", index=False)