# Evaluation demo
This notebook shows how to evaluate intent recognition, including the macro F1 scores for each subset.
Download the predicted scores from our baseline models at [this link](https://cornell.box.com/s/5g5q7tnak1le5cxa3nv69xep6o7e7uwi) and save them to `ROOT`.

In [9]:
import os
from collections import defaultdict

import numpy as np
import pandas as pd
import torch

from eval_utils import eval_all_metrics, SUBSET2IDS

ROOT = ""  # the folder where you placed the downloaded scores

In [12]:
def _format_mean_std(values) -> str:
    """Summarize ``values`` as a ``"mean +- std"`` string with two decimals."""
    # np.std is used with its default (population) normalization.
    return "{:.2f} +- {:.2f}".format(np.mean(values), np.std(values))


def get_allresults_df(
    root: str,
    model_types=("image", "image_cam", "image_hs_cam"),
    num_runs: int = 5,
) -> pd.DataFrame:
    """Organize the evaluation results in a table.

    For every model type, the saved scores of each run are loaded from
    ``<root>/<model_type>_<run_num>.pth`` and handed to ``eval_all_metrics``.
    Each returned metric is scaled by 100 and summarized over the runs as a
    ``"mean +- std"`` string.

    Args:
        root: folder that holds the downloaded ``.pth`` score files. An empty
            string refers to the current working directory.
        model_types: names of the models to evaluate, one table row each.
            Defaults to the three baseline model ablations.
        num_runs: number of runs stored per model; files with run numbers
            ``0 .. num_runs - 1`` are read.

    Returns:
        A DataFrame with a ``model`` column, one ``val-<subset>`` column per
        entry of ``SUBSET2IDS``, and one column per metric key returned by
        ``eval_all_metrics`` that does not end with ``"none"``.

    Raises:
        ValueError: if ``num_runs`` is smaller than 1.
    """
    if num_runs < 1:
        raise ValueError("num_runs must be at least 1")

    data_dict = defaultdict(list)
    for model_type in model_types:
        data_dict["model"].append(model_type)
        all_f1s = defaultdict(list)

        # get results for each run
        for run_num in range(num_runs):
            # os.path.join keeps the path relative when root is "" (a plain
            # f"{root}/..." would point at the filesystem root instead).
            score_path = os.path.join(root, f"{model_type}_{run_num}.pth")
            # NOTE(security): torch.load unpickles the file, which can execute
            # arbitrary code -- only load score files from a trusted source.
            d_dict = torch.load(score_path)
            f1_dict = eval_all_metrics(
                d_dict["val_scores"], d_dict["test_scores"],
                d_dict["val_targets"], d_dict["test_targets"]
            )
            for k, v in f1_dict.items():
                if np.ndim(v) == 0:
                    # any scalar (python float/int, numpy scalar, 0-d array)
                    all_f1s[k].append(float(v) * 100)
                else:
                    # keep array-valued metrics as one row per run
                    all_f1s[k].append(np.asarray(v)[np.newaxis, :] * 100)

        # one row per run; the original notebook noted this as "5 x 28"
        # (presumably runs x classes -- confirm against eval_utils)
        val_f1s = np.vstack(all_f1s["val_none"])

        for e_type, c_ids in SUBSET2IDS.items():
            # per-run average over the columns that belong to this subset
            e_f1s = val_f1s[:, list(c_ids)].mean(axis=1)
            data_dict[f"val-{e_type}"].append(_format_mean_std(e_f1s))

        for k, values in all_f1s.items():
            # "*none" entries are array-valued and were already summarized
            # per subset above (for "val_none"); they get no column.
            if not k.endswith("none"):
                data_dict[k].append(_format_mean_std(values))
    df = pd.DataFrame(data_dict)
    return df

In [13]:
# These are the same results as reported in README.md.
# The trailing expression makes the notebook render the returned DataFrame.
get_allresults_df(ROOT)

Unnamed: 0,model,val-easy,val-medium,val-hard,val-object,val-context,val-other,val_micro,val_samples,val_macro,test_micro,test_samples,test_macro
0,image,54.64 +- 2.54,24.92 +- 1.18,10.71 +- 1.33,25.58 +- 2.51,30.16 +- 2.97,21.34 +- 0.74,31.36 +- 1.16,29.91 +- 1.73,23.03 +- 0.79,30.23 +- 0.73,28.45 +- 1.71,22.77 +- 0.59
1,image_cam,57.10 +- 1.84,25.68 +- 1.24,12.72 +- 2.31,28.15 +- 1.94,28.62 +- 2.13,22.60 +- 1.40,32.87 +- 1.13,32.46 +- 1.18,24.42 +- 0.95,32.07 +- 0.84,30.91 +- 1.27,24.37 +- 0.65
2,image_hs_cam,58.86 +- 2.56,26.30 +- 1.42,13.11 +- 2.15,29.66 +- 2.19,32.48 +- 1.34,22.61 +- 0.48,32.94 +- 1.16,33.61 +- 0.92,25.07 +- 0.52,31.28 +- 0.36,31.39 +- 0.78,23.98 +- 0.85
