In [1]:
import matplotlib.pyplot as plt
import sys, os, json
import seaborn as sns
import numpy as np

from sklearn.metrics import confusion_matrix, matthews_corrcoef
from pathlib import Path

# os.path.abspath("") resolves to the current working directory, and `.parent`
# steps one level up from it - so, despite its name, cur_dir is the PARENT of
# the directory the notebook is run from.
cur_dir = Path(os.path.abspath("")).parent
# Two more levels up (three above the working directory).
# NOTE(review): presumably the repository root - confirm against the repo layout.
project_root = cur_dir.parent.parent
# Put project_root on the import path; the `definitions` and `src` imports
# below presumably rely on this.
sys.path.append(str(project_root))

from definitions.overcooked import OverCookedDefinitions
from src.eval_utils import get_precision_per_class, get_recall_per_class, get_f1_per_class, get_macro_precision, get_macro_recall, get_macro_f1, calculate_tp_fp_fn_counts, get_micro_precision_from_counts, get_micro_recall_from_counts, get_micro_f1


In [4]:
def get_classwise_metrics(preds, gt, num_classes):
    """Compute per-class precision, recall and F1 for class ids 0..num_classes-1.

    Args:
        preds: Predicted class labels; forwarded unchanged to the eval helpers.
        gt: Ground-truth class labels; forwarded unchanged to the eval helpers.
        num_classes (int): Number of classes. The evaluated class ids are
            ``list(range(num_classes))``.

    Returns:
        tuple: ``(precisions, recalls, f1s)`` exactly as returned by
        ``get_precision_per_class``, ``get_recall_per_class`` and
        ``get_f1_per_class`` respectively.
    """
    per_class_precision = get_precision_per_class(preds, gt, list(range(num_classes)))
    per_class_recall = get_recall_per_class(preds, gt, list(range(num_classes)))
    # F1 is derived from the two per-class results rather than from the raw labels.
    per_class_f1 = get_f1_per_class(per_class_precision, per_class_recall)
    return per_class_precision, per_class_recall, per_class_f1

In [6]:
# Confusion matrix + Matthews correlation coefficient
def get_confusion_matrix(all_gt, all_preds, num_classes):
    """Build a confusion matrix over class ids 0..num_classes-1 and compute the MCC.

    Args:
        all_gt: Ground-truth labels.
        all_preds: Predicted labels, aligned with ``all_gt``.
        num_classes (int): Number of classes; fixes the matrix row/column order
            to ``list(range(num_classes))``.

    Returns:
        tuple: ``(confusion_matrix, mcc, labels)`` where ``labels`` is the list
        of class ids passed to ``confusion_matrix``.
    """
    class_ids = list(range(num_classes))
    # The MCC is computed from the raw label sequences; class_ids is not passed to it.
    mcc_score = matthews_corrcoef(all_gt, all_preds)
    matrix = confusion_matrix(all_gt, all_preds, labels=class_ids)
    return matrix, mcc_score, class_ids

In [None]:
def get_individual_player_label(joint_action_label):
    """Translate a joint-action label into one individual label per player.

    The joint label is looked up in
    ``OverCookedDefinitions.PLAYER_ACTION_SPACE_TUPLES`` to obtain a
    (player 0 action, player 1 action) pair, and each action is then looked up
    in ``OverCookedDefinitions.INDIVIDUAL_ACTION_SPACE``.

    Args:
        joint_action_label: Key into ``PLAYER_ACTION_SPACE_TUPLES``.

    Returns:
        tuple: ``(player0_label, player1_label)``. A label is ``-1`` when the
        joint label is unknown or the action is missing from the individual
        action space.
    """
    action_to_label = OverCookedDefinitions.INDIVIDUAL_ACTION_SPACE
    joint_to_pair = OverCookedDefinitions.PLAYER_ACTION_SPACE_TUPLES

    # An unknown joint label yields (None, None), which then falls through to -1 below.
    p0_action, p1_action = joint_to_pair.get(joint_action_label, (None, None))
    p0_label = action_to_label.get(p0_action, -1)
    p1_label = action_to_label.get(p1_action, -1)
    return p0_label, p1_label

def extract_per_player_metrics(preds, gt):
    """Split joint-action predictions and ground truths into per-player labels.

    Every element of ``preds`` and ``gt`` is passed through
    ``get_individual_player_label``.

    Args:
        preds: Iterable of predicted joint-action labels.
        gt: Iterable of ground-truth joint-action labels.

    Returns:
        tuple: ``(player0_preds, player1_preds, player0_gts, player1_gts)``,
        four lists in the same order as the inputs.
    """
    pred_pairs = [get_individual_player_label(joint_pred) for joint_pred in preds]
    gt_pairs = [get_individual_player_label(joint_gt) for joint_gt in gt]

    player0_preds = [p0 for p0, _ in pred_pairs]
    player1_preds = [p1 for _, p1 in pred_pairs]
    player0_gts = [p0 for p0, _ in gt_pairs]
    player1_gts = [p1 for _, p1 in gt_pairs]
    return player0_preds, player1_preds, player0_gts, player1_gts

In [8]:
def plot_and_save_confusion_matrix(cm, model, labels, mcc, title, fig_size=(24, 20), cell_font_size=12):
    """Plot a row-normalized confusion matrix as a heatmap and save it as a PNG.

    The image is written to ``plots/<model>/<title>_<model>.png`` (spaces in
    ``title`` replaced by underscores), resolved against the current working
    directory. The directory is created if needed.

    Args:
        cm: 2-D confusion matrix (array-like); rows are true classes, columns
            are predicted classes, matching the axis labels drawn below.
        model (str): Model name; used in the plot title, the output
            sub-directory and the file name.
        labels: Class ids used as tick labels on both axes.
        mcc (float): Matthews correlation coefficient shown in the title.
        title (str): Plot title prefix; also the stem of the file name.
        fig_size (tuple): Figure size passed to ``plt.figure``.
        cell_font_size (int): Font size of the per-cell annotations.

    Returns:
        None. The figure is always closed, even if plotting or saving fails.
    """
    # Normalize before any figure exists so a bad input cannot leave one open.
    cm = np.asarray(cm)
    row_sums = cm.sum(axis=1)[:, np.newaxis]
    # Rows with no samples would divide by zero; dividing by 1 keeps them all-zero.
    row_sums[row_sums == 0] = 1
    cm_normalized = cm.astype('float') / row_sums

    filename = f'{title.replace(" ", "_")}_{model}.png'
    output_path = os.path.abspath(os.path.join("plots", model, filename))

    fig = plt.figure(figsize=fig_size)
    try:
        tick_labels = [str(a) for a in labels]
        sns.heatmap(cm_normalized, annot=True, fmt='.2f', cmap='YlOrRd',
                    xticklabels=tick_labels,
                    yticklabels=tick_labels,
                    annot_kws={'size': cell_font_size})
        plt.title(f'{title} for {model}\nMCC: {mcc:.3f}', fontsize=20)
        plt.xlabel('Predicted', fontsize=20)
        plt.ylabel('True', fontsize=20)
        plt.xticks(fontsize=20)
        plt.yticks(fontsize=20)
        plt.tight_layout()

        os.makedirs(os.path.dirname(output_path), exist_ok=True)
        plt.savefig(output_path)
    finally:
        # Close this specific figure on every path so failed runs do not
        # accumulate open figures in the kernel.
        plt.close(fig)

In [14]:
def plot_and_save_joint_action_heatmaps(model, class_precisions, class_recalls, class_f1s):
    """Save one joint-action heatmap per metric (F1-Score, Recall, Precision).

    Each heatmap is a ``num_actions x num_actions`` grid, where ``num_actions``
    is the size of ``OverCookedDefinitions.INDIVIDUAL_ACTION_SPACE``: rows are
    the Player 1 action, columns are the Player 0 action, and each cell holds
    the metric value of the corresponding joint action id from
    ``OverCookedDefinitions.PLAYER_ACTION_SPACE_TUPLES``.

    Images are written to
    ``plots/<model>/overcooked_joint_action_<metric>_heatmap_<model>.png``,
    resolved against the current working directory.

    Args:
        model (str): Model name; used in titles, the output sub-directory and
            the file names.
        class_precisions: Mapping from joint action id to precision.
        class_recalls: Mapping from joint action id to recall.
        class_f1s: Mapping from joint action id to F1.

    For all three mappings the id is looked up first as ``str(joint_id)`` and
    then as ``int(joint_id)``; ids found under neither key are plotted as 0.0.

    Returns:
        None. Every figure is closed, even if plotting or saving fails.
    """
    # Axis order follows the key order of the individual action space.
    actions = list(OverCookedDefinitions.INDIVIDUAL_ACTION_SPACE.keys())
    action_to_index = {action: idx for idx, action in enumerate(actions)}
    num_actions = len(actions)

    # Joint action id -> (Player 0 action, Player 1 action); same for every metric.
    joint_mapping = OverCookedDefinitions.PLAYER_ACTION_SPACE_TUPLES

    metrics = [('F1-Score', class_f1s), ('Recall', class_recalls), ('Precision', class_precisions)]
    for label, metric_results in metrics:
        # Fresh matrix per metric so no plot can inherit cells from a previous one.
        heatmap_matrix = np.zeros((num_actions, num_actions))

        for joint_id, (p0_action, p1_action) in joint_mapping.items():
            # row = Player 1 action, col = Player 0 action
            row_idx = action_to_index[p1_action]
            col_idx = action_to_index[p0_action]
            # Keys may be strings (e.g. after a JSON round-trip) or ints.
            metric_value = metric_results.get(str(joint_id), metric_results.get(int(joint_id), 0.0))
            heatmap_matrix[row_idx, col_idx] = metric_value

        title = f"Overcooked Joint Action {label} Heatmap {model}"
        filename = f"overcooked_joint_action_{label}_heatmap_{model}.png"
        output_path = os.path.abspath(os.path.join("plots", model, filename))

        fig = plt.figure(figsize=(12, 10))
        try:
            sns.heatmap(
                heatmap_matrix,
                cmap='YlOrRd',
                annot=True,
                fmt='.2f',
                xticklabels=actions,
                yticklabels=actions,
                cbar_kws={'label': label}
            )
            plt.title(title, fontsize=18)
            plt.xlabel('Player 0 Action', fontsize=14)
            plt.ylabel('Player 1 Action', fontsize=14)
            plt.xticks(rotation=45, ha='right')
            plt.yticks(rotation=0)
            plt.tight_layout()

            os.makedirs(os.path.dirname(output_path), exist_ok=True)
            plt.savefig(output_path, dpi=300)
        finally:
            # Close this specific figure on every path so a failure part-way
            # through the loop does not leave figures open in the kernel.
            plt.close(fig)

# GPT 5

In [2]:
# Directory holding the GPT-5 (low reasoning) Overcooked results, built from project_root.
results_dir = f"{project_root}/src/v1/results/genesis/gpt_5/low_reasoning/overcooked_ai/"
# Name of the results JSON inside results_dir.
results_file = "overcooked_ai_results.json"

In [3]:
# Parse the GPT-5 results JSON into `results`.
results_path = Path(results_dir) / results_file
with open(results_path, 'r') as f:
    results = json.load(f)

In [5]:
# Predictions and ground truths stored under the "overcooked_ai" entry.
# NOTE(review): presumably joint-action ids, since later cells evaluate them
# against 36 classes - confirm against the results schema.
all_preds = results["overcooked_ai"]["preds"]
all_gt = results["overcooked_ai"]["gt_actions"]


In [7]:
# Joint-action confusion matrix over 36 class ids for GPT-5.
model = 'gpt5'
full_cm, mcc, labels = get_confusion_matrix(all_gt, all_preds, num_classes=36)
plot_and_save_confusion_matrix(
    full_cm,
    model,
    labels,
    mcc,
    title="Overcooked Confusion Matrix",
)

In [9]:
# Split the current all_preds / all_gt into per-player label lists. The
# per-player and pooled confusion-matrix cells below read these four variables.
player0_preds, player1_preds, player0_gts, player1_gts = extract_per_player_metrics(all_preds, all_gt)

In [10]:
# Player 0 confusion matrix over the 6 individual action labels.
model = 'gpt5'
player0_cm, player0_mcc, player0_labels = get_confusion_matrix(player0_gts, player0_preds, num_classes=6)
plot_and_save_confusion_matrix(
    player0_cm,
    model,
    player0_labels,
    player0_mcc,
    title="Player 0 Overcooked Confusion Matrix",
    fig_size=(12, 10),
)


In [12]:
# Player 1 confusion matrix over the 6 individual action labels.
model = 'gpt5'
player1_cm, player1_mcc, player1_labels = get_confusion_matrix(player1_gts, player1_preds, num_classes=6)
plot_and_save_confusion_matrix(
    player1_cm,
    model,
    player1_labels,
    player1_mcc,
    title="Player 1 Overcooked Confusion Matrix",
    fig_size=(12, 10),
)


In [13]:
# Pool both players' labels (player 0 first, then player 1) into one
# individual-action confusion matrix.
model = 'gpt5'
all_individual_gts = player0_gts + player1_gts
all_individual_preds = player0_preds + player1_preds
all_individual_cm, all_individual_mcc, all_individual_labels = get_confusion_matrix(
    all_individual_gts,
    all_individual_preds,
    num_classes=6,
)
plot_and_save_confusion_matrix(
    all_individual_cm,
    model,
    all_individual_labels,
    all_individual_mcc,
    title="Individual Actions Overcooked Confusion Matrix",
    fig_size=(12, 10),
)

In [16]:
# Joint-action heatmaps from the class-wise metrics stored in the results file.
model = 'gpt5'
gpt5_metrics = results["overcooked_ai"]
plot_and_save_joint_action_heatmaps(
    model,
    gpt5_metrics["class_precisions"],
    gpt5_metrics["class_recalls"],
    gpt5_metrics["class_f1s"],
)

# Pi0

In [5]:
# Point at the Pi0 results file and parse it; this replaces the previously
# loaded `results`.
results_dir = f"{project_root}/src/v1/results/pi0"
results_file = "pi0_base_overcooked_results.json"
results_path = Path(results_dir) / results_file
with open(results_path, 'r') as f:
    results = json.load(f)

In [None]:
# Predictions and ground truths from the Pi0 results. The key names here
# ("overcooked", "all_preds", "all_gt") differ from the ones used for the
# GPT-5 file.
all_preds = results["overcooked"]["all_preds"]
all_gt = results["overcooked"]["all_gt"]

In [None]:
# Create the joint, per-player and pooled confusion matrices for pi0
model = 'pi0'
full_cm, mcc, labels = get_confusion_matrix(all_gt, all_preds, 36)
plot_and_save_confusion_matrix(full_cm, model, labels, mcc, title="Overcooked Confusion Matrix")

# Re-derive the per-player labels from the pi0 predictions. Without this the
# player0_* / player1_* lists are whatever an earlier cell left in the kernel
# (the GPT-5 split), so the plots below would be saved under "pi0" while
# showing another model's data.
player0_preds, player1_preds, player0_gts, player1_gts = extract_per_player_metrics(all_preds, all_gt)

player0_cm, player0_mcc, player0_labels = get_confusion_matrix(player0_gts, player0_preds, 6)
plot_and_save_confusion_matrix(player0_cm, model, player0_labels, player0_mcc, title="Player 0 Overcooked Confusion Matrix", fig_size=(12, 10))
player1_cm, player1_mcc, player1_labels = get_confusion_matrix(player1_gts, player1_preds, 6)
plot_and_save_confusion_matrix(player1_cm, model, player1_labels, player1_mcc, title="Player 1 Overcooked Confusion Matrix", fig_size=(12, 10))

# Pool both players' labels (player 0 first, then player 1).
all_individual_gts = player0_gts + player1_gts
all_individual_preds = player0_preds + player1_preds
all_individual_cm, all_individual_mcc, all_individual_labels = get_confusion_matrix(all_individual_gts, all_individual_preds, 6)
plot_and_save_confusion_matrix(all_individual_cm, model, all_individual_labels, all_individual_mcc, title="Individual Actions Overcooked Confusion Matrix", fig_size=(12, 10))


In [None]:
# Class-wise metrics are computed here from the predictions rather than read
# from the results file. Relies on `model`, `all_preds` and `all_gt` set by
# the preceding Pi0 cells.
precs, recs, f1s = get_classwise_metrics(all_preds, all_gt, 36)
plot_and_save_joint_action_heatmaps(model, precs, recs, f1s)