In [1]:
import pandas as pd

# Column name of the metric in the CSV; it is also the stem of the CSV file name.
METRIC = 'ndcg_at_10'
# Display label derived from the column name: 'ndcg_at_10' -> 'NDCG@10'.
METRIC_ALIAS = METRIC.replace('_at_', '@').upper()

# Load the metric CSV and expose the metric column under its display label.
df = (
    pd.read_csv(f'../../data/{METRIC}.csv')
    .rename(columns={METRIC: METRIC_ALIAS})
)

In [2]:
import numpy as np
from scipy.stats import wilcoxon
from statsmodels.stats.multitest import multipletests

# Significance level passed to the Holm correction (and reused by later cells).
ALPHA = 0.05

# One paired comparison (cubic vs. linear interpolation) per
# (dataset, model, context) combination. Rows with context == 512 are left
# out -- presumably the baseline context where interpolation does not apply;
# TODO confirm.
results = []
grouped = df[df['context'] != 512].groupby(['dataset', 'model', 'context'])
for (dataset_name, model_name, ctx), ctx_group in grouped:
    # Align both interpolation variants on query_id so that every difference
    # compares the same query. Subtracting two independently filtered arrays
    # by position mis-pairs silently (or fails on a shape mismatch) as soon as
    # one variant lacks a query. `pivot` additionally raises if a
    # (query_id, interpolation) combination occurs more than once, which
    # would make the pairing ambiguous.
    paired = (
        ctx_group
        .pivot(index='query_id', columns='interpolation', values=METRIC_ALIAS)
        .reindex(columns=['cubic', 'linear'])
        .dropna()
    )
    if paired.empty:
        raise ValueError(
            'No query has both cubic and linear scores for '
            f'dataset={dataset_name!r}, model={model_name!r}, context={ctx!r}'
        )
    data1 = paired['cubic'].to_numpy()
    data2 = paired['linear'].to_numpy()

    # paired differences
    diff = data1 - data2
    mean_diff = diff.mean()

    if np.any(diff != 0):
        # Cohen's d for paired samples: mean difference over the *sample*
        # standard deviation of the differences. NumPy's default is the
        # population SD (ddof=0), hence the explicit ddof=1.
        cohen_d = mean_diff / diff.std(ddof=1)

        # Wilcoxon signed-rank test (two-sided)
        _, p_raw = wilcoxon(data1, data2, alternative='two-sided')
    else:
        # Every pair is tied: the effect size would be 0/0 and the
        # signed-rank test is undefined (SciPy raises or returns NaN
        # depending on version). Report "no effect" explicitly so that no NaN
        # reaches the Holm correction below.
        cohen_d, p_raw = 0.0, 1.0

    results.append({
        'Dataset': dataset_name,
        'Model': model_name,
        'Ctx': ctx,
        'Mean \\Delta': mean_diff,
        'Cohen\'s d': cohen_d,
        'p (Wilcoxon)': p_raw
    })

# adjust p-values (Holm–Bonferroni) across all comparisons at once
p_values = [r['p (Wilcoxon)'] for r in results]
_, p_adj, _, _ = multipletests(p_values, alpha=ALPHA, method='holm')

for r, p_corr in zip(results, p_adj):
    r['adj. p (Holm)'] = p_corr

# Persist the full table next to the input data and display it.
results_df = pd.DataFrame(results)
results_df.to_csv(f'../../data/{METRIC}.stats.csv', index=False)
results_df

Unnamed: 0,Dataset,Model,Ctx,Mean \Delta,Cohen's d,p (Wilcoxon),adj. p (Holm)
0,NQA,e5-large-v2,1024,0.228635,0.029642,0.001653198,0.05951514
1,NQA,e5-large-v2,1536,0.665537,0.052271,2.317279e-08,1.112294e-06
2,NQA,e5-large-v2,2048,4.863518,0.224635,7.636001e-108,4.5816e-106
3,NQA,e5-large-v2,2560,3.632711,0.181879,7.260351999999999e-70,4.211004000000001e-68
4,NQA,e5-large-v2,3072,3.625646,0.188856,1.002485e-79,5.914662e-78
5,NQA,roberta-large,1024,0.052828,0.009496,0.3317434,1.0
6,NQA,roberta-large,1536,0.187856,0.040641,2.010865e-05,0.0008244546
7,NQA,roberta-large,2048,0.126629,0.019226,0.01134471,0.3630308
8,NQA,roberta-large,2560,0.138485,0.022454,0.02315499,0.6946496
9,NQA,roberta-large,3072,0.001937,0.000371,0.8930346,1.0


In [3]:
# Summarise the comparisons: a row is "significant" when its Holm-adjusted
# p-value is below ALPHA; the sign of the mean difference (cubic - linear)
# then decides whether it counts as better or worse.
significant_mask = results_df['adj. p (Holm)'] < ALPHA
mean_delta = results_df['Mean \\Delta']

better = int((significant_mask & (mean_delta > 0)).sum())
worse = int((significant_mask & (mean_delta < 0)).sum())
insignificant = int((~significant_mask).sum())

total = len(results_df)
# Guard the percentage denominator so an empty results table prints 0.0%
# instead of raising ZeroDivisionError.
denom = max(total, 1)

print(f'Significantly better: {100 * better / denom:.1f}% ({better}/{total})')
print(f'Significantly worse:   {100 * worse / denom:.1f}% ({worse}/{total})')
print(f'No difference:        {100 * insignificant / denom:.1f}% ({insignificant}/{total})')

Significanly better: 38.3% (23/60)
Significanly worse:   1.7% (1/60)
No difference:       60.0% (36/60)
