In [26]:
import pandas as pd
from pathlib import Path

pd.set_option('display.max_colwidth', None)

base_models_path = Path("./model_checkpoints/")

model_paths = [pth for pth in base_models_path.iterdir()]

keep_cols = ["model", "F1 Score", "Accuracy", "Track1 Metric", "Track2 Metric"]

performance_dfs = []
for model_path in model_paths:
    score_result_path = model_path / "competition_metrics_scores.json"
    if (score_result_path).exists():
        df = pd.read_json(score_result_path)
        df["model"] = score_result_path.parent.name
        performance_dfs.append(df[keep_cols])

performance_comparison_df = pd.concat(performance_dfs, ignore_index=True)
performance_comparison_df.drop_duplicates(inplace=True)
performance_comparison_df.reset_index(inplace=True, drop=True)

In [27]:
performance_comparison_df.sort_values("Track1 Metric", ascending=False)

Unnamed: 0,model,F1 Score,Accuracy,Track1 Metric,Track2 Metric
5,caformer_s18_fine_tune_val_test,69.93,82.4,92.21,1805
0,2024-05-05 22:41:41.115323,54.21,70.48,86.33,3183
2,2024-05-08-caformer_s18-weighted_venom_loss,53.65,68.51,85.64,3398
4,2024-05-08-caformer_s18-focal-balanced-sampling-paused,51.56,65.57,84.31,3735
3,2024-05-08-caformer_s18-focal-balanced-sampling-higher-do,52.5,65.12,84.2,3802
1,2024-05-07-metaformer_0,48.13,64.74,83.91,3867


In [28]:
leaderboard_performances = [
    {"Track1": 78.14, "Track2": 1134, "F1": 27.71, "Accuracy": 59.38, "model": "2024-05-05 22:41:41.115323"},
    {"Track1": 74.53, "Track2": 1358, "F1": 21.38, "Accuracy": 55.11, "model": "2024-05-07-metaformer_0"},
    {"Track1": 74.66, "Track2": 1353, "F1": 21.92, "Accuracy": 52.39, "model": "2024-05-08-caformer_s18-focal-balanced-sampling-higher-do"},
    {"Track1": 76.28, "Track2": 1248, "F1": 24.21, "Accuracy": 57.17, "model": "2024-05-08-caformer_s18-weighted_venom_loss"},
]
dfs = []
for lbp in leaderboard_performances:
    dfs.append(pd.DataFrame(lbp, index=[0]))
leaderboard_df = pd.concat(dfs, ignore_index=True)
leaderboard_df.rename({"Track1": "LB Track1", "Track2": "LB Track2", "F1": "LB F1", "Accuracy": "LB Accuracy"}, axis=1, inplace=True)

combined_df = performance_comparison_df.merge(leaderboard_df, on="model", how="outer")
combined_df.fillna(0, inplace=True)
combined_df["LB Track2"] = combined_df["LB Track2"].astype(int)

In [29]:
combined_df.sort_values("LB Track1", ascending=False)

Unnamed: 0,model,F1 Score,Accuracy,Track1 Metric,Track2 Metric,LB Track1,LB Track2,LB F1,LB Accuracy
0,2024-05-05 22:41:41.115323,54.21,70.48,86.33,3183,78.14,1134,27.71,59.38
4,2024-05-08-caformer_s18-weighted_venom_loss,53.65,68.51,85.64,3398,76.28,1248,24.21,57.17
2,2024-05-08-caformer_s18-focal-balanced-sampling-higher-do,52.5,65.12,84.2,3802,74.66,1353,21.92,52.39
1,2024-05-07-metaformer_0,48.13,64.74,83.91,3867,74.53,1358,21.38,55.11
3,2024-05-08-caformer_s18-focal-balanced-sampling-paused,51.56,65.57,84.31,3735,0.0,0,0.0,0.0
5,caformer_s18_fine_tune_val_test,69.93,82.4,92.21,1805,0.0,0,0.0,0.0


### Notes:
- higher dropout of 0.4 is better for CAFormer_S18 (and presumably larger variants as well) relative to 0.2
- balanced sampling + focal loss performs well, but seesaw loss performs better