In [1]:
import os
import re
import ast
from pathlib import Path

import numpy as np
import pandas as pd

In [2]:
results = []

for filepath in Path("weight_averaging/polaris").glob("*.md"):
    model_name = os.path.splitext(os.path.basename(filepath))[0]
    with open(filepath, "r") as f:
        content = f.read()
    dicts = re.findall(r"results_dict\s*=\s*({.*?})", content, re.DOTALL)
    for d in dicts:
        parsed = ast.literal_eval(d)
        for benchmark, score in parsed.items():
            results.append({
                "model": model_name,
                "benchmark": benchmark,
                "score": score
            })

polaris_df = pd.DataFrame(results)
polaris_df

Unnamed: 0,model,benchmark,score
0,CheMeleon,polaris/pkis2-ret-wt-cls-v2,0.531761
1,CheMeleon,polaris/pkis2-ret-wt-reg-v2,710.026300
2,CheMeleon,polaris/pkis2-kit-wt-cls-v2,0.709325
3,CheMeleon,polaris/pkis2-kit-wt-reg-v2,763.733810
4,CheMeleon,polaris/pkis2-egfr-wt-reg-v2,485.443445
...,...,...,...
555,CheMeleon_SWA,tdcommons/caco2-wang,0.348626
556,CheMeleon_SWA,tdcommons/herg,0.827541
557,CheMeleon_SWA,tdcommons/bbb-martins,0.906621
558,CheMeleon_SWA,tdcommons/ames,0.839578


In [3]:
import polaris as po

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
metrics = {}

In [5]:
%%capture
for benchmark_name in polaris_df["benchmark"].unique():
    benchmark = po.load_benchmark(benchmark_name)
    metrics[benchmark_name] = benchmark.main_metric

In [6]:
def extract_dicts(filepath):
    with open(filepath, "r") as f:
        text = f.read()
    
    dicts = []
    start = 0
    while True:
        start_idx = text.find("results_dict =", start)
        if start_idx == -1:
            break
        brace_idx = text.find("{", start_idx)
        if brace_idx == -1:
            break

        brace_count = 0
        end_idx = brace_idx
        while end_idx < len(text):
            if text[end_idx] == "{":
                brace_count += 1
            elif text[end_idx] == "}":
                brace_count -= 1
                if brace_count == 0:
                    break
            end_idx += 1
        
        dict_text = text[brace_idx:end_idx+1]
        parsed_dict = ast.literal_eval(dict_text)
        dicts.append(parsed_dict)
        start = end_idx + 1

    return dicts

# Read all markdown files
results = []

for filepath in Path("weight_averaging/moleculeace").glob("*.md"):
    model_name = os.path.splitext(os.path.basename(filepath))[0]
    dicts = extract_dicts(filepath)
    for d in dicts:
        for benchmark, subtasks in d.items():
            for subtask, score in subtasks.items():
                results.append({
                    "model": model_name,
                    "benchmark": benchmark,
                    "subtask": subtask,
                    "score": score
                })

# Create DataFrame
mace_df = pd.DataFrame(results)

size_dict = {}
for benchmark_name in mace_df["benchmark"].unique():
    bench_df = pd.read_csv(f"https://raw.githubusercontent.com/molML/MoleculeACE/7e6de0bd2968c56589c580f2a397f01c531ede26/MoleculeACE/Data/benchmark_data/{benchmark_name}.csv")
    test_df = bench_df[bench_df["split"] == "test"]
    size_dict[benchmark_name] = dict(
        cliff=len(test_df[test_df["cliff_mol"] == 1]),
        noncliff=len(test_df[test_df["cliff_mol"] == 0]),
    )

mace_df["n_mols"] = mace_df.apply(lambda row: size_dict[row['benchmark']][row['subtask']], axis=1)
mace_df["benchmark"] = mace_df["benchmark"].map(lambda s: s.replace("CHEMBL", ""))

records = []
for i, (model, model_df) in enumerate(mace_df.groupby("model", sort=False)):
    for j, (benchmark, bench_df) in enumerate(model_df.groupby("benchmark", sort=False)):
        for k in range(0, len(bench_df), 2):
            records.append(
                dict(
                    model=model,
                    benchmark=benchmark,
                    score=(bench_df["n_mols"].iloc[k] * bench_df["score"].iloc[k] + bench_df["n_mols"].iloc[k + 1] * bench_df["score"].iloc[k+1]) / (bench_df["n_mols"].iloc[k] + bench_df["n_mols"].iloc[k + 1]),
                )
            )
mace_df = pd.DataFrame.from_dict(records)
mace_df

Unnamed: 0,model,benchmark,score
0,CheMeleon,1862_Ki,0.728708
1,CheMeleon,1862_Ki,0.773882
2,CheMeleon,1862_Ki,0.767012
3,CheMeleon,1862_Ki,0.709411
4,CheMeleon,1862_Ki,0.777132
...,...,...,...
595,CheMeleon_SWA,4792_Ki,0.830627
596,CheMeleon_SWA,4792_Ki,0.718113
597,CheMeleon_SWA,4792_Ki,0.791928
598,CheMeleon_SWA,4792_Ki,0.733022


In [7]:
df = pd.concat((mace_df, polaris_df))
df

Unnamed: 0,model,benchmark,score
0,CheMeleon,1862_Ki,0.728708
1,CheMeleon,1862_Ki,0.773882
2,CheMeleon,1862_Ki,0.767012
3,CheMeleon,1862_Ki,0.709411
4,CheMeleon,1862_Ki,0.777132
...,...,...,...
555,CheMeleon_SWA,tdcommons/caco2-wang,0.348626
556,CheMeleon_SWA,tdcommons/herg,0.827541
557,CheMeleon_SWA,tdcommons/bbb-martins,0.906621
558,CheMeleon_SWA,tdcommons/ames,0.839578


In [8]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from statsmodels.stats.multicomp import pairwise_tukeyhsd
from collections import defaultdict

# Prepare to collect ranks
rank_tracker = defaultdict(int)

# Step 2: Group by benchmark, perform Tukey HSD, and collect plotting info
benchmark_groups = df.groupby("benchmark", sort=False)
n_benchmarks = len(benchmark_groups)


for i, (benchmark, bench_df) in enumerate(benchmark_groups):
    if "_" in benchmark or metrics[benchmark].label in {'mean_squared_error', 'mean_absolute_error'}:
        bench_df["score"] = (bench_df["score"].max() - bench_df["score"]) / (bench_df["score"].max() - bench_df["score"].min())
    
    # Run Tukey HSD
    tukey = pairwise_tukeyhsd(bench_df["score"], bench_df["model"])
    
    # Plot Tukey HSD
    comparison_name = bench_df.groupby("model")["score"].mean().idxmax()
    # increment win counter for best group of models
    rank_tracker[comparison_name] += 1
    for model_name in bench_df["model"].unique():
        if model_name == comparison_name:
            continue
        else:
            tukey_df = pd.DataFrame(tukey.summary()[1:], columns=[str(i) for i in tukey.summary()[0]])
            tukey_df["group1"] = tukey_df["group1"].map(str)
            tukey_df["group2"] = tukey_df["group2"].map(str)
            if tukey_df[
                ((tukey_df["group1"] == comparison_name) & (tukey_df["group2"] == model_name)) | 
                ((tukey_df["group1"] == model_name) & (tukey_df["group2"] == comparison_name))
            ]["reject"].values[0].data == False:
                rank_tracker[model_name] += 1
                continue  # no need to check the rest

results = pd.DataFrame.from_records(
    [{
        "model": model.replace("_", " "),
        f"win count (total={n_benchmarks})": win_count,
        "win rate (%)": 100 * win_count / n_benchmarks,
    } for model, win_count in rank_tracker.items()
    ]).sort_values(by="win rate (%)", axis=0, ascending=False)
results.round(1)
results

Unnamed: 0,model,win count (total=58),win rate (%)
0,CheMeleon ASWA,58,100.0
1,CheMeleon,58,100.0
2,CheMeleon EMA,27,46.551724
3,CheMeleon SWA,27,46.551724
