In [4]:
# 1) Setup imports & path ─────────────────────────────────────────────────────
import sys
from pathlib import Path
import json
import pandas as pd

project_root = Path.cwd().parent
sys.path.insert(0, str(project_root))
%load_ext autoreload
%autoreload 2

from evaluation.evaluate_loss import summarize_with_context

# 3) Run & sort ────────────────────────────────────────────────────────────────
base_dir = project_root / 'evaluation' / 'output'
df = summarize_with_context(str(base_dir))
df = df.sort_values(['model', 'task'], ignore_index=True)
df.head(100)

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


Unnamed: 0,model,task,mean_train_loss,mean_val_loss,mean_gap,std_gap,gap_ratio
0,RESNET18_MOCO_average_adaptive_pooling_embeddi...,biomass_mean,0.004511,0.009969,0.005458,0.003199,0.547466
1,RESNET18_MOCO_average_adaptive_pooling_embeddi...,biomass_std,0.007293,0.014719,0.007426,0.003762,0.504509
2,RESNET18_MOCO_average_adaptive_pooling_embeddi...,cdl,0.019598,0.028521,0.008923,0.003857,0.312865
3,RESNET18_MOCO_average_adaptive_pooling_embeddi...,corrine_agriculture,0.013740,0.020340,0.006601,0.003728,0.324516
4,RESNET18_MOCO_average_adaptive_pooling_embeddi...,corrine_forest,0.012749,0.019404,0.006655,0.004243,0.342988
...,...,...,...,...,...,...,...
95,RESNET50_MOCO_max_pooling_embeddings_0.001_202...,corrine_forest,0.013939,0.030553,0.016614,0.004282,0.543777
96,RESNET50_MOCO_max_pooling_embeddings_0.001_202...,heatisland_mean,0.008392,0.023367,0.014975,0.003947,0.640871
97,RESNET50_MOCO_max_pooling_embeddings_0.001_202...,heatisland_std,0.009234,0.026348,0.017114,0.003898,0.649540
98,RESNET50_MOCO_mean_pooling_embeddings_0.001_20...,biomass_mean,0.004857,0.014458,0.009601,0.003521,0.664078


Among those top-val models (or across all of them), you want a small gap_ratio (or small mean_gap):

- val: This tells you, for each task, which model got the lowest val loss

- gap_ratio: mean_gap ÷ mean_val_loss gives you “what fraction of your val loss is just due to overfitting.”

- std_gap: tells you consistency: lower is more stable across folds.

- composite score: Sometimes the absolute val-loss winner overfits more than a close second. Tweaking alpha (e.g. 0.7–0.9) lets you decide how much you care about raw performance vs. overfitting.

In [11]:
# assume df is your DataFrame from summarize_with_context
import pandas as pd

df[["backbone", "method", "resize"]] = df["model"].str.extract(r"(RESNET18|RESNET50|VIT16)_(MOCO|DECUR|DINO)_(average|max|mean|min)")
df = df.drop(columns={'model'})

# 1) Best by validation loss
best_by_val   = df.sort_values(['task','mean_val_loss']).groupby('task').first().reset_index()

# 2) Best by gap ratio (generalization)
best_by_gap   = df.sort_values(['task','gap_ratio']).groupby('task').first().reset_index()

# 3) Best by stability
best_by_std   = df.sort_values(['task','std_gap']).groupby('task').first().reset_index()

# 4) Composite score
# alpha defines raw performance vs. overfitting - 1.0 means only caring about performance and not gap
alpha = 0.7
df['composite_score'] = alpha * df['mean_val_loss'] + (1-alpha) * df['gap_ratio']
best_by_score = df.sort_values(['task','composite_score']).groupby('task').first().reset_index()

# display all of them
print("=== Best by val loss ===")
display(best_by_val)

print("=== Best by gap ratio ===")
display(best_by_gap)

print("=== Best by std gap  ===")
display(best_by_std)

print("=== Best by composite score ===")
display(best_by_score)


=== Best by val loss ===


Unnamed: 0,task,mean_train_loss,mean_val_loss,mean_gap,std_gap,gap_ratio,composite_score,backbone,method,resize
0,biomass_mean,0.00689,0.009669,0.00278,0.00322,0.287457,0.093006,VIT16,MOCO,average
1,biomass_std,0.009453,0.012949,0.003496,0.003738,0.269979,0.090058,VIT16,MOCO,average
2,cdl,0.01769,0.022206,0.004515,0.003146,0.203339,0.076545,RESNET50,DINO,mean
3,corrine_agriculture,0.011933,0.01741,0.005477,0.002766,0.314586,0.106563,RESNET50,DINO,average
4,corrine_forest,0.011634,0.017582,0.005948,0.002507,0.338308,0.1138,RESNET50,DINO,average
5,heatisland_mean,0.007646,0.01322,0.005575,0.002072,0.42167,0.135755,RESNET18,MOCO,average
6,heatisland_std,0.012007,0.015541,0.003534,0.003109,0.227395,0.079097,VIT16,MOCO,average


=== Best by gap ratio ===


Unnamed: 0,task,mean_train_loss,mean_val_loss,mean_gap,std_gap,gap_ratio,composite_score,backbone,method,resize
0,biomass_mean,0.00689,0.009669,0.00278,0.00322,0.287457,0.093006,VIT16,MOCO,average
1,biomass_std,0.009701,0.01314,0.00344,0.002852,0.261756,0.087725,VIT16,MOCO,mean
2,cdl,0.01769,0.022206,0.004515,0.003146,0.203339,0.076545,RESNET50,DINO,mean
3,corrine_agriculture,0.024164,0.029975,0.005811,0.006519,0.193864,0.079142,VIT16,MOCO,average
4,corrine_forest,0.022433,0.027211,0.004779,0.005559,0.175612,0.071731,VIT16,MOCO,mean
5,heatisland_mean,0.010209,0.013718,0.003509,0.002816,0.255774,0.086335,VIT16,MOCO,mean
6,heatisland_std,0.012007,0.015541,0.003534,0.003109,0.227395,0.079097,VIT16,MOCO,average


=== Best by std gap  ===


Unnamed: 0,task,mean_train_loss,mean_val_loss,mean_gap,std_gap,gap_ratio,composite_score,backbone,method,resize
0,biomass_mean,0.005883,0.010693,0.00481,0.002299,0.449832,0.142434,RESNET50,DINO,average
1,biomass_std,0.009701,0.01314,0.00344,0.002852,0.261756,0.087725,VIT16,MOCO,mean
2,cdl,0.01769,0.022206,0.004515,0.003146,0.203339,0.076545,RESNET50,DINO,mean
3,corrine_agriculture,0.011846,0.020194,0.008348,0.002455,0.413401,0.138156,RESNET50,MOCO,average
4,corrine_forest,0.011634,0.017582,0.005948,0.002507,0.338308,0.1138,RESNET50,DINO,average
5,heatisland_mean,0.007166,0.013397,0.006231,0.001766,0.465087,0.148904,VIT16,DINO,mean
6,heatisland_std,0.009939,0.017776,0.007837,0.002916,0.440892,0.144711,VIT16,DINO,average


=== Best by composite score ===


Unnamed: 0,task,mean_train_loss,mean_val_loss,mean_gap,std_gap,gap_ratio,composite_score,backbone,method,resize
0,biomass_mean,0.00689,0.009669,0.00278,0.00322,0.287457,0.093006,VIT16,MOCO,average
1,biomass_std,0.009701,0.01314,0.00344,0.002852,0.261756,0.087725,VIT16,MOCO,mean
2,cdl,0.01769,0.022206,0.004515,0.003146,0.203339,0.076545,RESNET50,DINO,mean
3,corrine_agriculture,0.024164,0.029975,0.005811,0.006519,0.193864,0.079142,VIT16,MOCO,average
4,corrine_forest,0.022433,0.027211,0.004779,0.005559,0.175612,0.071731,VIT16,MOCO,mean
5,heatisland_mean,0.010209,0.013718,0.003509,0.002816,0.255774,0.086335,VIT16,MOCO,mean
6,heatisland_std,0.012007,0.015541,0.003534,0.003109,0.227395,0.079097,VIT16,MOCO,average
