In [10]:
import os
from pathlib import Path
import re
import ast

import pandas as pd
import numpy as np
from scipy.stats import ttest_ind

In [2]:
# Collect one record per (model, benchmark, score) from every "*polaris.md" report.
results = []

for filepath in Path("pretraining_size").glob("*polaris.md"):
    # File name without its extension; the model tag is the part before the first "_".
    model_name = filepath.stem
    model_tag = model_name.split("_")[0]
    content = filepath.read_text()
    # The non-greedy match stops at the first closing brace, so this only
    # captures un-nested `results_dict = {...}` literals.
    dicts = re.findall(r"results_dict\s*=\s*({.*?})", content, re.DOTALL)
    for d in dicts:
        parsed = ast.literal_eval(d)
        results.extend(
            {"model": model_tag, "benchmark": benchmark, "score": score}
            for benchmark, score in parsed.items()
        )

polaris_df = pd.DataFrame(results)

In [3]:
# Preview the parsed Polaris records (columns: model, benchmark, score).
polaris_df

Unnamed: 0,model,benchmark,score
0,10MM,polaris/pkis2-ret-wt-cls-v2,0.583787
1,10MM,polaris/pkis2-ret-wt-reg-v2,849.739197
2,10MM,polaris/pkis2-kit-wt-cls-v2,0.463547
3,10MM,polaris/pkis2-kit-wt-reg-v2,1145.444715
4,10MM,polaris/pkis2-egfr-wt-reg-v2,512.773594
...,...,...,...
275,1MM,tdcommons/caco2-wang,0.277218
276,1MM,tdcommons/herg,0.843741
277,1MM,tdcommons/bbb-martins,0.922803
278,1MM,tdcommons/ames,0.840900


In [4]:
def extract_dicts(filepath):
    """Parse every ``results_dict = {...}`` literal found in a text file.

    The file is scanned for ``results_dict`` assignments. For each one, the
    text from the next ``{`` up to its matching ``}`` is cut out by counting
    braces, then evaluated with ``ast.literal_eval``. Unlike a non-greedy
    regex, brace counting copes with nested dictionaries.

    Whitespace around ``=`` is optional (``results_dict={...}`` is accepted),
    which matches the ``results_dict\\s*=\\s*`` pattern used for the Polaris
    reports in this notebook.

    Limitation: every ``{`` / ``}`` character is counted, including ones that
    appear inside string literals.

    Args:
        filepath: Path of the text file to read.

    Returns:
        A list with one parsed object per assignment, in file order.

    Raises:
        Whatever ``ast.literal_eval`` raises when the extracted text is not a
        valid literal (for example when the braces are never closed).
    """
    with open(filepath, "r") as f:
        text = f.read()

    assignment = re.compile(r"results_dict\s*=")
    dicts = []
    start = 0
    while True:
        match = assignment.search(text, start)
        if match is None:
            break
        brace_idx = text.find("{", match.end())
        if brace_idx == -1:
            break

        # Walk forward until the opening brace at brace_idx is balanced.
        brace_count = 0
        end_idx = brace_idx
        while end_idx < len(text):
            char = text[end_idx]
            if char == "{":
                brace_count += 1
            elif char == "}":
                brace_count -= 1
                if brace_count == 0:
                    break
            end_idx += 1

        dict_text = text[brace_idx:end_idx + 1]
        dicts.append(ast.literal_eval(dict_text))
        # Resume the search right after the literal that was just consumed.
        start = end_idx + 1

    return dicts

# Flatten every "*mace.md" report into one record per (model, benchmark, subtask).
results = []

for filepath in Path("pretraining_size").glob("*mace.md"):
    # File name without its extension; the model tag is the part before the first "_".
    model_name = filepath.stem
    model_tag = model_name.split("_")[0]
    dicts = extract_dicts(filepath)
    for d in dicts:
        # Each parsed dict maps benchmark -> {subtask: score}.
        for benchmark, subtasks in d.items():
            results.extend(
                {
                    "model": model_tag,
                    "benchmark": benchmark,
                    "subtask": subtask,
                    "score": score,
                }
                for subtask, score in subtasks.items()
            )

# Build the long-format frame in one go
mace_df = pd.DataFrame(results)


In [5]:
# Preview the long-format records (columns: model, benchmark, subtask, score).
mace_df

Unnamed: 0,model,benchmark,subtask,score
0,10MM,CHEMBL1862_Ki,cliff,0.811278
1,10MM,CHEMBL1862_Ki,noncliff,0.808528
2,10MM,CHEMBL1871_Ki,cliff,0.843151
3,10MM,CHEMBL1871_Ki,noncliff,0.590621
4,10MM,CHEMBL2034_Ki,cliff,0.936306
...,...,...,...,...
535,1MM,CHEMBL4203_Ki,noncliff,0.886460
536,1MM,CHEMBL4616_EC50,cliff,0.759195
537,1MM,CHEMBL4616_EC50,noncliff,0.687618
538,1MM,CHEMBL4792_Ki,cliff,0.715063


In [6]:
# Count the cliff / non-cliff test molecules of every benchmark, read from the
# pinned MoleculeACE benchmark CSVs (one download per benchmark).
size_dict = {}
for benchmark_name in mace_df["benchmark"].unique():
    bench_df = pd.read_csv(f"https://raw.githubusercontent.com/molML/MoleculeACE/7e6de0bd2968c56589c580f2a397f01c531ede26/MoleculeACE/Data/benchmark_data/{benchmark_name}.csv")
    test_df = bench_df.loc[bench_df["split"] == "test"]
    cliff_flags = test_df["cliff_mol"]
    size_dict[benchmark_name] = {
        "cliff": int((cliff_flags == 1).sum()),
        "noncliff": int((cliff_flags == 0).sum()),
    }

# Attach to every row the molecule count of its own (benchmark, subtask).
mace_df["n_mols"] = [
    size_dict[benchmark][subtask]
    for benchmark, subtask in zip(mace_df["benchmark"], mace_df["subtask"])
]

In [7]:
# Collapse each consecutive (cliff, noncliff) pair of rows into a single score:
# the average of the two subtask scores weighted by their molecule counts.
#
# NOTE: this cell rebinds `mace_df` to the collapsed frame (columns: model,
# benchmark, score). It therefore cannot be re-run on its own output; rebuild
# `mace_df` together with its `n_mols` column before executing it again.
records = []
for model, model_df in mace_df.groupby("model", sort=False):
    for benchmark, bench_df in model_df.groupby("benchmark", sort=False):
        # Pairing is positional, so fail loudly instead of mis-pairing rows.
        if len(bench_df) % 2 != 0:
            raise ValueError(
                f"Expected an even number of rows for model {model!r}, "
                f"benchmark {benchmark!r}; got {len(bench_df)}"
            )
        for k in range(0, len(bench_df), 2):
            pair = bench_df.iloc[k:k + 2]
            if set(pair["subtask"]) != {"cliff", "noncliff"}:
                raise ValueError(
                    f"Rows {k} and {k + 1} of model {model!r}, benchmark "
                    f"{benchmark!r} are not one cliff/noncliff pair: "
                    f"{list(pair['subtask'])}"
                )
            n_first, n_second = pair["n_mols"].iloc[0], pair["n_mols"].iloc[1]
            s_first, s_second = pair["score"].iloc[0], pair["score"].iloc[1]
            records.append(
                dict(
                    model=model,
                    benchmark=benchmark,
                    score=(n_first * s_first + n_second * s_second) / (n_first + n_second),
                )
            )

# Explicit columns keep the frame well-formed even when there are no records.
mace_df = pd.DataFrame(records, columns=["model", "benchmark", "score"])

In [8]:
# Preview the collapsed frame: one size-weighted score per (cliff, noncliff) pair.
mace_df

Unnamed: 0,model,benchmark,score
0,10MM,CHEMBL1862_Ki,0.809672
1,10MM,CHEMBL1862_Ki,0.817892
2,10MM,CHEMBL1862_Ki,0.823395
3,10MM,CHEMBL1862_Ki,0.800546
4,10MM,CHEMBL1862_Ki,0.800058
...,...,...,...
265,1MM,CHEMBL4616_EC50,0.725209
266,1MM,CHEMBL4792_Ki,0.722674
267,1MM,CHEMBL4792_Ki,0.758554
268,1MM,CHEMBL4792_Ki,0.761374


In [9]:
# Stack both benchmark suites into one frame. The original row labels are kept,
# so index values repeat between the two parts.
df = pd.concat([polaris_df, mace_df], axis=0)
df

Unnamed: 0,model,benchmark,score
0,10MM,polaris/pkis2-ret-wt-cls-v2,0.583787
1,10MM,polaris/pkis2-ret-wt-reg-v2,849.739197
2,10MM,polaris/pkis2-kit-wt-cls-v2,0.463547
3,10MM,polaris/pkis2-kit-wt-reg-v2,1145.444715
4,10MM,polaris/pkis2-egfr-wt-reg-v2,512.773594
...,...,...,...
265,1MM,CHEMBL4616_EC50,0.725209
266,1MM,CHEMBL4792_Ki,0.722674
267,1MM,CHEMBL4792_Ki,0.758554
268,1MM,CHEMBL4792_Ki,0.761374


In [13]:
# Effect size helper: Cohen's d with a pooled standard deviation
def cohens_d(mean1, mean2, std1, std2, n1, n2):
    """Return Cohen's d for two samples described by summary statistics.

    Args:
        mean1, mean2: Sample means; the sign of the result follows ``mean1 - mean2``.
        std1, std2: Sample standard deviations.
        n1, n2: Sample sizes.

    Returns:
        ``(mean1 - mean2)`` divided by the pooled standard deviation, where the
        pooled variance weights each sample variance by ``n - 1`` and divides
        by ``n1 + n2 - 2``.
    """
    weighted_variance_sum = (n1 - 1) * std1**2 + (n2 - 1) * std2**2
    degrees_of_freedom = n1 + n2 - 2
    pooled_std = np.sqrt(weighted_variance_sum / degrees_of_freedom)
    mean_gap = mean1 - mean2
    return mean_gap / pooled_std

# Thresholds of the comparison, named so they are easy to audit and tune.
N_RUNS = 5          # scores required per model for a benchmark to be compared
ALPHA = 0.05        # significance level applied to the t-test p-value
MIN_COHENS_D = 0.5  # smallest effect size counted as a practical improvement

# Fixed column layout, so `results_df` keeps its columns even when no benchmark
# qualifies (otherwise the column lookups below raise KeyError on an empty frame).
RESULT_COLUMNS = ['benchmark', 'mean_10MM', 'mean_1MM', 'p_value', 'cohens_d', 'practical_improvement']

# Initialize lists to store results
results = []

# Group data by benchmark
benchmarks = df['benchmark'].unique()

for benchmark in benchmarks:
    # Get scores for each model for the current benchmark
    bench_rows = df[df['benchmark'] == benchmark]
    scores_10MM = bench_rows.loc[bench_rows['model'] == '10MM', 'score']
    scores_1MM = bench_rows.loc[bench_rows['model'] == '1MM', 'score']

    # Benchmarks without exactly N_RUNS scores for both models are skipped silently.
    if len(scores_10MM) != N_RUNS or len(scores_1MM) != N_RUNS:
        continue

    # Two-sample t-test assuming equal variances
    t_stat, p_value = ttest_ind(scores_10MM, scores_1MM, equal_var=True)

    # Calculate means and sample standard deviations
    mean_10MM = scores_10MM.mean()
    mean_1MM = scores_1MM.mean()
    std_10MM = scores_10MM.std(ddof=1)
    std_1MM = scores_1MM.std(ddof=1)

    # Calculate Cohen's d (positive when the 10MM mean is larger)
    d = cohens_d(mean_10MM, mean_1MM, std_10MM, std_1MM, N_RUNS, N_RUNS)

    # Determine if 10MM is practically better
    # Criteria: p-value < ALPHA and Cohen's d > MIN_COHENS_D (10MM mean > 1MM mean)
    # NOTE(review): this treats a larger score as better on every benchmark.
    # The "-reg-" benchmarks have scores in the hundreds, which looks like an
    # error metric where lower would be better -- confirm the metric direction.
    is_practical_improvement = (p_value < ALPHA) and (d > MIN_COHENS_D) and (mean_10MM > mean_1MM)

    results.append({
        'benchmark': benchmark,
        'mean_10MM': mean_10MM,
        'mean_1MM': mean_1MM,
        'p_value': p_value,
        'cohens_d': d,
        'practical_improvement': is_practical_improvement
    })

# Convert results to DataFrame
results_df = pd.DataFrame(results, columns=RESULT_COLUMNS)

# Calculate the rate of practical improvement
total_benchmarks = len(results_df)
practical_improvements = results_df['practical_improvement'].sum()
rate = practical_improvements / total_benchmarks if total_benchmarks > 0 else 0

print(f"Rate of practical improvement (10MM over 1MM): {rate:.2%}")
print("\nDetailed Results:")
print(results_df[['benchmark', 'mean_10MM', 'mean_1MM', 'p_value', 'cohens_d', 'practical_improvement']])

Rate of practical improvement (10MM over 1MM): 0.00%

Detailed Results:
                                   benchmark    mean_10MM     mean_1MM  \
0                polaris/pkis2-ret-wt-cls-v2     0.575418     0.557221   
1                polaris/pkis2-ret-wt-reg-v2   805.722766   836.117596   
2                polaris/pkis2-kit-wt-cls-v2     0.441465     0.467138   
3                polaris/pkis2-kit-wt-reg-v2  1146.601561  1125.700446   
4               polaris/pkis2-egfr-wt-reg-v2   514.918566   501.984610   
5                   polaris/adme-fang-solu-1     0.547233     0.552396   
6                   polaris/adme-fang-rppb-1     0.842584     0.832804   
7                   polaris/adme-fang-hppb-1     0.779766     0.820743   
8                   polaris/adme-fang-perm-1     0.757212     0.768211   
9                 polaris/adme-fang-rclint-1     0.696740     0.694446   
10                polaris/adme-fang-hclint-1     0.667459     0.661216   
11       tdcommons/lipophilicity-astraze