In [None]:
import numpy as np
import pandas as pd
from scipy.stats import distributions
from scipy.stats._hypotests import TukeyHSDResult

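Adapted from SciPy's `tukey_hsd` (https://github.com/scipy/scipy/blob/v1.14.1/scipy/stats/_hypotests.py#L1841-L2027) to work with already-calculated means and standard deviations instead of the raw samples. The adaptation compares exactly three groups and assumes every group has the same number of samples (five in this notebook).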

In [None]:
def tukey(mean_1, stdev_1, mean_2, stdev_2, mean_3, stdev_3, nsamples=5):
    """Run Tukey's HSD test on three groups given only summary statistics.

    This follows the equal-group-size branch of SciPy's ``tukey_hsd`` but
    starts from per-group means and standard deviations instead of the raw
    observations.

    Parameters
    ----------
    mean_1, mean_2, mean_3 : float
        Mean of each group.
    stdev_1, stdev_2, stdev_3 : float
        Standard deviation of each group. The pooled variance below weights
        each variance by ``nsamples - 1``, so these are presumably *sample*
        standard deviations (ddof=1) -- confirm against how the inputs were
        produced.
    nsamples : int, optional
        Number of observations behind every mean/stdev pair. All three groups
        are assumed to share this size. Defaults to 5, the value that was
        previously hard-coded.

    Returns
    -------
    TukeyHSDResult
        ``statistic[i, j]`` is ``mean_i - mean_j`` (zero-based group index in
        argument order) and ``pvalue[i, j]`` is the matching p-value.

    Raises
    ------
    ValueError
        If ``nsamples`` is smaller than 2, which would leave zero degrees of
        freedom for the error term.
    """
    if nsamples < 2:
        raise ValueError("nsamples must be at least 2 to estimate the "
                         "within-group variance")

    ntreatments = 3
    means = np.array([mean_1, mean_2, mean_3])
    variances = np.array([stdev_1**2, stdev_2**2, stdev_3**2])
    nsamples_treatments = np.full(ntreatments, nsamples)
    nobs = ntreatments * nsamples

    # Mean square error (a.k.a. mean square error within): the pooled
    # within-group variance, each group weighted by its degrees of freedom.
    mse = (np.sum(variances * (nsamples_treatments - 1))
           / (nobs - ntreatments))

    # All groups have the same size, so a single normalisation factor covers
    # every pairwise comparison.
    normalize = 2 / nsamples_treatments[0]

    # The standard error is used both for the p-values here and by the result
    # object when it computes confidence intervals.
    stand_err = np.sqrt(normalize * mse / 2)

    # The mean difference is the test statistic; broadcasting a column against
    # a row yields the full 3x3 matrix of pairwise differences.
    mean_differences = means[None].T - means

    # Studentized-range statistic whose survival function gives the p-value.
    t_stat = np.abs(mean_differences) / stand_err

    params = t_stat, ntreatments, nobs - ntreatments
    pvalues = distributions.studentized_range.sf(*params)

    # Positional order kept exactly as in the original notebook code.
    return TukeyHSDResult(mean_differences, pvalues, ntreatments,
                          nobs, stand_err)

In [None]:
# Load the results table from the working directory. The loop further down
# reads a "Dataset" column plus one "mean+/-stdev" string column per model
# ("Chemprop", "transformer-cnn", "fastprop").
df = pd.read_csv("results.csv")
# Bare trailing expression so the notebook renders the frame as cell output.
df

In [None]:
# For every dataset row, parse the "mean+/-stdev" cell of each model and print
# the Tukey HSD comparison. The column order below fixes the group indices
# shown in the printed table: 0 = Chemprop, 1 = transformer-cnn, 2 = fastprop.
for _, row in df.iterrows():
    summary_texts = []
    for model_column in ("Chemprop", "transformer-cnn", "fastprop"):
        mean_text, stdev_text = row[model_column].split("+/-")
        summary_texts += [mean_text, stdev_text]
    print(f"""
    {'-':-^{50}}
    {row["Dataset"]:^{50}}
    {'-':-^{50}}
    """)
    # Strings are converted to floats only here, after the banner is printed.
    print(tukey(*map(float, summary_texts)))


    --------------------------------------------------
                           HIV                        
    --------------------------------------------------
    
Tukey's HSD Pairwise Group Comparisons (95.0% Confidence Interval)
Comparison  Statistic  p-value  Lower CI  Upper CI
 (0 - 1)      0.267     0.000     0.138     0.396
 (0 - 2)      0.043     0.656    -0.086     0.172
 (1 - 0)     -0.267     0.000    -0.396    -0.138
 (1 - 2)     -0.224     0.002    -0.353    -0.095
 (2 - 0)     -0.043     0.656    -0.172     0.086
 (2 - 1)      0.224     0.002     0.095     0.353


    --------------------------------------------------
                           QM8                        
    --------------------------------------------------
    
Tukey's HSD Pairwise Group Comparisons (95.0% Confidence Interval)
Comparison  Statistic  p-value  Lower CI  Upper CI
 (0 - 1)     -0.008     0.000    -0.008    -0.008
 (0 - 2)     -0.011     0.000    -0.011    -0.010
 (1 - 0)      0.008  