In [1]:
# Enable IPython's autoreload extension. Mode 2 reloads imported modules before each
# cell executes, so edits to module source files take effect without a kernel restart.
%load_ext autoreload
%autoreload 2

# Evaluation

In [2]:
from pathlib import Path

import cdmetadl

# Directory two levels above the cdmetadl module file
# (presumably the repository checkout root — TODO confirm).
root_dir = Path(cdmetadl.__file__).parent.parent
# Absolute, machine-specific directory that is searched recursively for
# evaluation.pkl files when building the plots below.
eval_dir = Path("/fastdata/vilab24/output/final/eval")

In [3]:
import pandas as pd
import pickle

# df = pd.read_pickle(root_dir / "output/tmp/eval/train_cfg_baseline/eval_cfg_generative_augment_all/finetuning/cross-domain/evaluation.pkl")
# print(df)

# predictions = pd.read_pickle(
#     "/fastdata/vilab24/output/full/eval/train_cfg_baseline_k_5/eval_cfg_standard_augment_mc_k_5/finetuning/cross-domain/predictions.pkl"
# )
# for prediction in predictions:
#     print(prediction["Number of Shots per Class"], prediction["Confidence Scores"])

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [5]:
import pandas as pd
import plotly.express as px
from pathlib import Path


def get_model_name(train_config_name, eval_config_name, model_name):
    """Build the legend label for one (train config, eval config, model) combination.

    The label is ``model_name`` followed by a parenthesised description that is
    selected from the prefixes of the two configuration names.

    Args:
        train_config_name: Name of the training configuration directory.
        eval_config_name: Name of the evaluation configuration directory.
        model_name: Name of the model; used verbatim as the start of the label.

    Returns:
        The label string, or ``None`` when the combination is not one of the
        recognised pairs.
    """
    train_cfg = train_config_name
    eval_cfg = eval_config_name

    # Baseline-trained models are checked first, so they win over the dropout
    # rules below even if the train config name also contains "dropout".
    if train_cfg.startswith("train_cfg_baseline_k"):
        if eval_cfg.startswith("eval_cfg_baseline_k"):
            return f"{model_name} (baseline)"
        if eval_cfg.startswith("eval_cfg_standard_augment_all_k"):
            return f"{model_name} (all, standard augmentation)"

    # Every remaining label requires a dropout training configuration.
    if "dropout" not in train_cfg:
        return None

    # Note the trailing underscore: the dropout baseline prefix is stricter
    # than the one used for baseline-trained models above.
    if eval_cfg.startswith("eval_cfg_baseline_k_"):
        return f"{model_name} (baseline, dropout)"
    if eval_cfg.startswith("eval_cfg_standard_augment_all_k"):
        return f"{model_name} (all, standard augmentation, dropout)"
    if eval_cfg.startswith("eval_cfg_standard_augment_pc_k"):
        return f"{model_name} (pc, standard augmentation, dropout)"
    if eval_cfg.startswith("eval_cfg_generative_augment_pc_k"):
        return f"{model_name} (pc, generative augmentation, dropout)"

    return None

def read_df(path: Path) -> pd.DataFrame | None:
    """Load one pickled evaluation frame and tag it with its model label and mode.

    The trailing components of ``path`` are interpreted positionally as
    ``<train_cfg>/<eval_cfg>/<model>/<mode>/.../<file>``: the last six components
    are used when the third-from-last one is ``"within-domain"``, otherwise the
    last five.

    Args:
        path: Location of the pickled evaluation DataFrame.

    Returns:
        The loaded frame with ``Mode`` and ``Model`` inserted as its first two
        columns, or ``None`` when ``get_model_name`` does not recognise the
        train/eval configuration pair. Unrecognised files are skipped without
        being read.
    """
    # Within-domain results sit one directory level deeper than the other modes.
    if path.parts[-3] == "within-domain":
        path_parts = path.parts[-6:]
    else:
        path_parts = path.parts[-5:]

    # Decide whether this result is wanted *before* touching the file, so
    # configurations that are filtered out cost no I/O.
    model_name = get_model_name(*path_parts[0:3])
    if model_name is None:
        return None

    # NOTE: unpickling can execute arbitrary code; only load files from a trusted source.
    df = pd.read_pickle(path)

    # Correct number of shots for pseudo confidence estimation
    # (presumably 10 shots per class are used up by that step — TODO confirm).
    if "pc" in path_parts[1]:
        df["Number of Shots"] -= 10

    # Inserting at position 0 twice leaves the column order as: Mode, Model, ...
    df.insert(0, 'Model', model_name)
    df.insert(0, 'Mode', path_parts[3])
    return df

# Directories that are searched recursively for evaluation results.
# Optional extra source: root_dir / "output/tmp/eval/train_cfg_baseline/eval_cfg_generative_augment_all"
glob_dirs = [eval_dir]

# read_df returns None for configurations without a legend label; drop those
# explicitly so that an empty result produces a clear error message.
frames = [
    frame
    for glob_dir in glob_dirs
    for filepath in glob_dir.glob('**/evaluation.pkl')
    if (frame := read_df(filepath)) is not None
]
if not frames:
    raise ValueError(f"No usable evaluation.pkl files found under: {[str(d) for d in glob_dirs]}")
full_df = pd.concat(frames)

# Group by the scalar key 'Mode' (not the list ['Mode']): with a one-element list,
# pandas >= 2.0 yields 1-tuples as group keys, which would render the plot title
# as "Mode: ('cross-domain',)" instead of "Mode: cross-domain".
for mode_name, df in full_df.groupby('Mode'):
    # Average the accuracy over all rows sharing a shot count and model label.
    df_mean = df.groupby(['Number of Shots', 'Model'])['Accuracy'].mean().reset_index()

    fig = px.line(
        df_mean,
        x='Number of Shots',
        y='Accuracy',
        color='Model',
        title=f'Accuracy vs. Number of Shots for Mode: {mode_name}',
        markers=True,
    )

    fig.update_layout(
        xaxis_title="Number of Shots",
        yaxis_title="Accuracy",
        legend_title="Model"
    )

    # One tick per shot count that actually occurs; the x axis starts at 1 shot.
    fig.update_xaxes(
        tickvals=df_mean['Number of Shots'].unique(),
        range=[1, max(df_mean['Number of Shots'])]
    )

    fig.show()







