# Evaluation

In [None]:
import random
from pathlib import Path
from typing import Optional

import pandas as pd
import matplotlib.pyplot as plt; plt.style.use('seaborn-v0_8')
from loguru import logger

from fax.config import get_cfg_defaults
from fax.dataprep.slp_reader import EvalReplayRecord, parse_eval_replay

# Configuration object from fax.config; the cells below read
# cfg.base.seed and cfg.paths.replays from it.
cfg = get_cfg_defaults()

After running `uv run fax/eval/run_eval.py` for all eight agents in the FOX, FALCO, and MARTH dittos, we can now parse the resulting replays and compute each agent's win rate and K/D ratio.

In [None]:
def clean_replay_dir(dir: Path) -> None:
    """Delete unusable ``*.slp`` files found directly inside ``dir``.

    A file is deleted when ``parse_eval_replay`` raises for it or returns
    ``None``. Each deletion is logged as a warning, and a summary is logged
    at info level when at least one file was deleted.

    Args:
        dir: Directory whose ``*.slp`` files are checked (non-recursive).
    """
    removed = 0
    # Snapshot the listing up front so deletions never interact with the glob.
    for slp_file in list(dir.glob('*.slp')):
        try:
            record = parse_eval_replay(slp_file)
        except Exception as err:
            # Best effort: any parser failure marks the file as unusable.
            logger.warning(f'Failed to parse replay {slp_file}: {err}')
        else:
            if record is not None:
                continue  # parsed fine, keep the file
            logger.warning(f'Invalid replay: {slp_file}, removing')
        # Reached only for files that failed to parse or parsed to None.
        slp_file.unlink(missing_ok=True)
        removed += 1
    if removed:
        logger.info(f'Removed {removed} invalid replays from {dir}')

In [None]:
# Agents under evaluation; each base agent is followed by its "-ft" variant.
agents = ['XvX', 'XvX-ft', 'XvF', 'XvF-ft', 'FvX', 'FvX-ft', 'FvF', 'FvF-ft']
columns = ['win_rate', 'kd_ratio']

# Number of replays sampled for every (character, agent) pair.
N_GAMES = 100
# Remaining stocks are subtracted from this to get stocks taken/lost
# (presumably the number of stocks each player starts with).
STARTING_STOCKS = 4

# One result table per character, indexed by agent. Created as float so the
# columns are numeric rather than generic ``object`` columns.
df_fox = pd.DataFrame(index=agents, columns=columns, dtype=float)
df_falco = pd.DataFrame(index=agents, columns=columns, dtype=float)
df_marth = pd.DataFrame(index=agents, columns=columns, dtype=float)

# Seed once so the replay subsets drawn below are repeatable.
random.seed(cfg.base.seed)

for agent in agents:
    for char, df in zip(['FOX', 'FALCO', 'MARTH'], [df_fox, df_falco, df_marth]):
        replay_dir = Path(cfg.paths.replays) / 'eval' / char / f'1k-{agent}_vs_cpu'
        # Uncomment to delete unparseable replays before sampling:
        # clean_replay_dir(replay_dir)

        # glob() yields paths in a filesystem-dependent order; sort them so the
        # seeded sample picks the same replays on every run and machine.
        replay_paths = sorted(replay_dir.glob('*.slp'))
        assert len(replay_paths) >= N_GAMES, f'Expected at least {N_GAMES} replays for {char} {agent}, found {len(replay_paths)}'
        wins = 0
        stocks_taken = 0
        stocks_lost = 0
        for replay_path in random.sample(replay_paths, k=N_GAMES):
            replay: Optional[EvalReplayRecord] = parse_eval_replay(replay_path)
            assert replay is not None, f'Failed to parse replay {replay_path}'
            # A game counts as a win when player 1 still has stocks left.
            if replay.p1stocks > 0:
                wins += 1
            stocks_taken += STARTING_STOCKS - replay.p2stocks
            stocks_lost += STARTING_STOCKS - replay.p1stocks
        win_rate = wins / N_GAMES
        # Clamp the denominator so an agent that never lost a stock does not
        # trigger a division by zero.
        kd_ratio = stocks_taken / max(stocks_lost, 1)
        df.loc[agent] = [win_rate, kd_ratio]

df_fox

In [None]:
# One panel per character, sharing the y axis so bars are comparable.
fig, (axf, axc, axm) = plt.subplots(1, 3, figsize=(10, 5), sharey=True)

# Four (win_rate, kd_ratio) colour pairs. Each pair is shared by two
# consecutive rows of the frame (an agent and its "-ft" variant).
colors = [
    ("#1f77b4", "#aec7e8"),  # blues for XvX
    ("#2ca02c", "#98df8a"),  # greens for XvF
    ("#ff7f0e", "#ffbb78"),  # oranges for FvX
    ("#d62728", "#ff9896"),  # reds for FvF
]

# K/D ratios are divided by the largest ratio across all three characters so
# they fit on the same [0, 1] axis as the win rates.
max_kd = max(df_fox['kd_ratio'].max(), df_falco['kd_ratio'].max(), df_marth['kd_ratio'].max())
if max_kd == 0:
    # Every ratio is 0: avoid dividing by zero; all K/D bars get height 0.
    max_kd = 1
bar_width = 0.4

for df, char, ax in zip([df_fox, df_falco, df_marth], ['FOX', 'FALCO', 'MARTH'], [axf, axc, axm]):
    ax: plt.Axes
    for i, idx in enumerate(df.index):
        win_col, kd_col = colors[i // 2 % len(colors)]
        kd_ratio = df.loc[idx, 'kd_ratio']
        kd_height = kd_ratio / max_kd

        # win_rate bar: negative width with align='edge' puts it to the left
        # of the tick. Only the first bar is labelled so the legend has a
        # single entry per metric.
        ax.bar(idx, df.loc[idx, 'win_rate'],
               label='win rate' if i == 0 else "",
               align='edge', width=-bar_width,
               color=win_col, edgecolor='black', linewidth=0.5
        )
        # kd_ratio bar (normalized), drawn to the right of the tick
        ax.bar(idx, kd_height,
               label='K/D ratio (normalized)' if i == 0 else "",
               align='edge', width=bar_width,
               color=kd_col, edgecolor='black', linewidth=0.5
        )
        # Annotate the K/D bar with the un-normalized value. Categorical
        # labels sit at integer positions, so the bar spans
        # [i, i + bar_width] and its centre is i + bar_width / 2.
        ax.text(
            i + bar_width/2,
            kd_height + 0.01,
            f"{kd_ratio:.2f}",
            ha="center",
            va="bottom",
        )

    # set limits, legend, title (headroom above 1.0 for the annotations)
    ax.set_ylim(0, 1.1)
    ax.legend()
    ax.set_title(f'{char} vs {char}')

fig.suptitle('Agent Evaluation vs Level 9 CPU (100 games each)')
fig.tight_layout()
fig.savefig('../../figs/eval_results.png', dpi=500)