# Setup

In [None]:
%matplotlib
import os
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

from dataclasses import dataclass
from typing import Dict, List, Tuple

# Theming: activate seaborn's whitegrid theme first, then override the
# matplotlib defaults for typography in a single rcParams update.
sns.set_theme(style="whitegrid")
plt.rcParams.update({
    # Font family: serif text, Computer Modern for math text.
    'font.family': 'serif',
    'font.serif': ['Computer Modern', 'DejaVu Serif', 'serif'],
    'mathtext.fontset': 'cm',
    'axes.formatter.use_mathtext': True,
    # Font sizes (points): body/labels/titles at 8, ticks and legend at 7.
    "font.size": 8,
    "axes.labelsize": 8,
    "axes.titlesize": 8,
    "xtick.labelsize": 7,
    "ytick.labelsize": 7,
    "legend.fontsize": 7,
})

In [None]:
# Figure widths; DOUBLE_COLUMN_WIDTH is passed to `figsize`, i.e. inches.
SINGLE_COLUMN_WIDTH = 3.25063
DOUBLE_COLUMN_WIDTH = 6.75133

# Shared text / grid styling used by every plot cell.
FONT_SIZE = 8
LABEL_SIZE_MULTIPLIER = 1
GRID_ALPHA = 0.4

# Display order of the hue levels (acquisition functions) and of the datasets.
ACQUISITION_ORDER = [
    'Random',
    'UltraFeedback',
    'MaxMin',
    'DeltaQwen',
    'DeltaUCB',
    'DRTS',
    'InfoMax',
    'DTS',
    'MaxMinLCB',
]
DATASET_ORDER = [
    "UltraFeedback",
    "Skywork",
    "Combined",
    "Tulu 3",
]

# Benchmark column names: downstream benchmarks, reward-model benchmarks, and
# their concatenation (downstream first).
DOWNSTREAM_BENCHMARKS = ['gsm8k', 'ifeval', 'truthfulqa', 'alpacaeval_2']
RM_BENCHMARKS = ["rewardbench_2"]
BENCHMARKS = DOWNSTREAM_BENCHMARKS + RM_BENCHMARKS

@dataclass
class AcquisitionStyle:
    """Visual encoding used to draw one acquisition function across the plots."""
    marker: str  # matplotlib marker code used by the line and radar plots
    hatch: str  # hatch pattern applied to bar patches ('' means no hatch)
    color: str  # hex colour string used as the palette entry
    dashes: Tuple[int, ...] | None  # dash sequence for line plots; None is mapped to "" before plotting

# Style table, one row per acquisition function: (name, marker, hatch, colour, dashes).
# Rows are grouped by colour family (reds, blues, greens); within a family the
# entries differ by marker, hatch and dash pattern.
ACQUISITION_STYLES = {
    name: AcquisitionStyle(marker=marker, hatch=hatch, color=color, dashes=dashes)
    for name, marker, hatch, color, dashes in (
        ('Random', 'o', '', '#a63f3f', None),
        ('UltraFeedback', 's', '/', '#cb4d4d', (5, 2)),
        ('MaxMin', '^', '\\', '#e06c6c', (2, 2)),
        ('DeltaQwen', 'D', 'x', '#ef8f8f', (5, 2, 2, 2)),
        ('DeltaUCB', 'o', '', '#3f3fa6', None),
        ('DRTS', 's', '/', '#4d4dcb', (5, 2)),
        ('InfoMax', 'o', '', '#3fa63f', None),
        ('DTS', 's', '/', '#4dcb4d', (5, 2)),
        ('MaxMinLCB', '^', '\\', '#6ce06c', (2, 2)),
    )
}

# Load Data

In [None]:
# Load the results table used by every plot below. A cached full_results.csv
# is used when it exists; otherwise the table is rebuilt from results.csv plus
# the two UltraFeedback sample-efficiency CSVs and written to full_results.csv.
if os.path.exists('full_results.csv'):
    print("Loaded full results")
    data = pd.read_csv('full_results.csv', sep=',')
else:
    # Identifier parsed out of the `Method` column -> display name used in the plots.
    acquisition_function_mapping = {
        "random": "Random",
        "ultrafeedback": "UltraFeedback",
        "maxmin": "MaxMin",
        "delta_qwen": "DeltaQwen",
        "DeltaUCB": "DeltaUCB",
        "DRTS": "DRTS",
        "InfoMax": "InfoMax",
        "DTS": "DTS",
        "MaxMinLCB": "MaxMinLCB",
    }

    # NOTE(review): base_model_scores is not referenced anywhere else in this
    # cell -- confirm whether it is still needed.
    base_model_scores = {
        "gsm8k": 0.758,
        "ifeval": 0.713,
        "truthfulqa": 0.468,
        "alpacaeval_2": 0.083,
        "rewardbench_2": 0.290
    }

    data = pd.read_csv('results.csv', sep=',')

    uf_dpo_sample_efficiency = pd.read_csv("ultrafeedback_dpo_sample_efficiency.csv")
    uf_rm_sample_efficiency = pd.read_csv("ultrafeedback_rm_sample_efficiency.csv")

    # Join the DPO and RM tables on `Method`; column names present in both
    # tables get the `_dpo` / `_rm` suffix.
    uf_sample_efficiency = pd.merge(
        uf_dpo_sample_efficiency,
        uf_rm_sample_efficiency,
        on='Method',
        suffixes=('_dpo', '_rm')
    )

    # Drop the base-model row before `Method` is parsed below.
    uf_sample_efficiency = uf_sample_efficiency[uf_sample_efficiency["Method"] != "SFT Base Model"].copy().reset_index(drop=True)

    # Rename the benchmark columns to the names listed in BENCHMARKS.
    uf_sample_efficiency = uf_sample_efficiency.rename(columns={
        'Mean_rm': 'rewardbench_2',
        'GSM8K': 'gsm8k',
        'IF Eval': 'ifeval',
        'Truthful QA': 'truthfulqa',
        'Alpaca Eval': 'alpacaeval_2',
    })

    # `Method` is parsed as "<prefix>-<identifier>_<n>": the integer after the
    # last '_' is the sample count, and the text between the last '-' and that
    # '_' is looked up in acquisition_function_mapping (KeyError if unknown).
    uf_sample_efficiency['num_train_samples'] = uf_sample_efficiency['Method'].apply(lambda x: int(x.split('_')[-1]))
    uf_sample_efficiency['acquisition_function'] = uf_sample_efficiency['Method'].apply(lambda x: acquisition_function_mapping["_".join(x.split('_')[:-1]).split('-')[-1]])
    uf_sample_efficiency['po_algorithm'] = "DPO"
    uf_sample_efficiency['judge'] = "Qwen 3 235B"
    uf_sample_efficiency['dataset'] = "UltraFeedback"

    # Append one all-zero row at num_train_samples = 0 per acquisition function
    # so every sample-efficiency curve has a starting point.
    # NOTE(review): the scores are 0, not base_model_scores -- presumably the
    # benchmark columns hold deltas relative to the base model; confirm.
    for acq_name in acquisition_function_mapping.values():
        uf_sample_efficiency.loc[len(uf_sample_efficiency)] = {
            'dataset': 'UltraFeedback',
            'judge': 'Qwen 3 235B',
            'acquisition_function': acq_name,
            'po_algorithm': 'DPO',
            'num_train_samples': 0,
            'gsm8k': 0,
            'ifeval': 0,
            'truthfulqa': 0,
            'alpacaeval_2': 0,
            'rewardbench_2': 0
        }

    # Remove the columns that are not needed downstream, then select and order
    # the columns exactly as in `data` (read from results.csv) so the two
    # frames can be concatenated.
    uf_sample_efficiency = uf_sample_efficiency.drop(columns=['Type_dpo', 'Mean_dpo', 'Type_rm', 'Factuality', 'Focus', 'Math', 'Precise IF', 'Safety', 'Ties', 'Method'])
    uf_sample_efficiency = uf_sample_efficiency[data.columns]
    # Sort by the mapping's declaration order, then by sample count; the helper
    # column is dropped again right after sorting.
    acq_order = list(acquisition_function_mapping.values())
    uf_sample_efficiency['acq_func_order'] = uf_sample_efficiency['acquisition_function'].apply(lambda x: acq_order.index(x) if x in acq_order else -1)
    uf_sample_efficiency = uf_sample_efficiency.sort_values(by=['acq_func_order', 'num_train_samples']).drop(columns=['acq_func_order']).reset_index(drop=True)
    uf_sample_efficiency.to_csv("ultrafeedback_sample_efficiency.csv", index=False)

    data = pd.concat([data, uf_sample_efficiency], ignore_index=True)
    data = data.drop_duplicates().reset_index(drop=True)

    # Final ordering: rows without num_train_samples first (null flag sorted
    # descending), then DATASET_ORDER, po_algorithm, ACQUISITION_ORDER and
    # sample count. Values missing from the order lists sort last. The three
    # helper columns are dropped after sorting.
    data = data.assign(
        num_train_samples_null=data['num_train_samples'].isna(),
        dataset_order_idx=data['dataset'].apply(lambda x: DATASET_ORDER.index(x) if x in DATASET_ORDER else len(DATASET_ORDER)),
        acquisition_order_idx=data['acquisition_function'].apply(
            lambda x: ACQUISITION_ORDER.index(x) if x in ACQUISITION_ORDER else len(ACQUISITION_ORDER))
    ).sort_values(
        by=['num_train_samples_null', 'dataset_order_idx', 'po_algorithm', 'acquisition_order_idx', 'num_train_samples'],
        ascending=[False, True, True, True, True]
    ).drop(columns=['num_train_samples_null', 'dataset_order_idx', 'acquisition_order_idx']).reset_index(drop=True)

    # Cache the combined table so later runs take the fast path above.
    data.to_csv("full_results.csv", index=False)

In [None]:
# Row-wise mean score per benchmark family, stored as new columns on `data`.
data["rm_mean_score"] = data.loc[:, RM_BENCHMARKS].mean(axis="columns")
data["downstream_mean_score"] = data.loc[:, DOWNSTREAM_BENCHMARKS].mean(axis="columns")

# Boolean row masks shared by the per-figure subsets below.
has_sample_count = data['num_train_samples'].notna()
no_sample_count = data['num_train_samples'].isna()
is_ultrafeedback = data['dataset'] == 'UltraFeedback'
is_dpo = data['po_algorithm'] == 'DPO'

# PO-algorithm ablation: UltraFeedback rows without a sample count, minus the
# rewardbench_2 column.
po_algo_ablation_data = (
    data.loc[is_ultrafeedback & no_sample_count]
    .copy()
    .drop(columns=['rewardbench_2'])
)

# Dataset ablation and teaser: DPO rows without a sample count.
dataset_ablation_data = data.loc[is_dpo & no_sample_count]

teaser_data = data.loc[is_dpo & no_sample_count]

# Sample efficiency: UltraFeedback rows that do have a sample count.
sample_efficiency_ultrafeedback_data = data.loc[is_ultrafeedback & has_sample_count].copy()

# Plots

In [None]:
# ==============================================================================
# Dataset Ablation Plot
# ==============================================================================

# --- Style Setup ---
acquisition_colors = {name: style.color for name, style in ACQUISITION_STYLES.items()}
acquisition_hatches = {name: style.hatch for name, style in ACQUISITION_STYLES.items()}
label_fontsize = FONT_SIZE * LABEL_SIZE_MULTIPLIER

# --- Figure Setup ---
fig, (ax_left, ax_right) = plt.subplots(1, 2, figsize=(DOUBLE_COLUMN_WIDTH, 2))
# One entry per panel, left to right: (axes, y column, x label, y label).
panels = (
    (ax_left, 'downstream_mean_score', '\n(a) Downstream Models', 'Downstream Score $\\Delta$'),
    (ax_right, 'rm_mean_score', '\n(b) Reward Models', 'Reward Model Score $\\Delta$'),
)
panel_axes = [panel[0] for panel in panels]

# --- Plot Data ---
# Both panels share every barplot setting except the y column and target axes.
for panel_ax, score_col, x_label, y_label in panels:
    sns.barplot(
        data=dataset_ablation_data,
        x='dataset',
        y=score_col,
        hue='acquisition_function',
        palette=acquisition_colors,
        width=0.7,
        linewidth=1,
        edgecolor="white",
        order=DATASET_ORDER,
        hue_order=ACQUISITION_ORDER,
        ax=panel_ax,
    )

# --- Apply Hatches ---
# Assumes ax.patches lists the bars hue-major, n_groups bars per hue level
# (i.e. every hue x dataset combination is drawn); patches past the last hue
# level are left untouched.
n_hues = len(ACQUISITION_ORDER)
n_groups = dataset_ablation_data['dataset'].nunique(dropna=False)
for panel_ax in panel_axes:
    for patch_idx, patch in enumerate(panel_ax.patches):
        hue_idx = patch_idx // n_groups
        if hue_idx >= n_hues:
            continue
        patch.set_hatch(acquisition_hatches[ACQUISITION_ORDER[hue_idx]])

# --- Legend ---
# Replace the per-axes legends with one figure-level legend; the hatch is
# doubled on the legend handles only.
for panel_ax in panel_axes:
    panel_ax.get_legend().remove()
handles, labels = ax_left.get_legend_handles_labels()
for acq_func, handle in zip(ACQUISITION_ORDER, handles):
    handle.set_hatch(acquisition_hatches[acq_func] * 2)
fig.legend(
    handles,
    labels,
    loc='upper center',
    bbox_to_anchor=(0.515, 1.06),
    ncol=(len(acquisition_colors) - 1) // 2 + 1,
    frameon=False,
    fontsize=FONT_SIZE,
)

# --- Axis Labels, Ticks & Grid ---
for panel_ax, score_col, x_label, y_label in panels:
    panel_ax.set_xlabel(x_label, fontsize=label_fontsize)
    panel_ax.set_ylabel(y_label, fontsize=label_fontsize)
    panel_ax.tick_params(axis='x', labelsize=label_fontsize)
    # Grid on with the shared alpha, then make the x grid fully transparent.
    panel_ax.grid(alpha=GRID_ALPHA)
    panel_ax.grid(axis='x', alpha=0.0)

# --- Axis Limits ---
ax_right.set_ylim(0, 0.4)

# --- Save & Show ---
plt.tight_layout()
fig.savefig("dataset_ablation.pdf", format="pdf", bbox_inches="tight")
plt.show()

In [None]:
# ==============================================================================
# PO Algorithm Ablation Plot (with broken y-axis)
# ==============================================================================

# --- Style Setup ---
acquisition_colors = {name: style.color for name, style in ACQUISITION_STYLES.items()}
acquisition_hatches = {name: style.hatch for name, style in ACQUISITION_STYLES.items()}
label_fontsize = FONT_SIZE * LABEL_SIZE_MULTIPLIER

# --- Figure Setup ---
# Two stacked axes sharing x: a tall upper panel and a short lower panel.
fig, (ax_top, ax_bottom) = plt.subplots(
    2,
    1,
    sharex=True,
    figsize=(10, 4.1),
    gridspec_kw={'height_ratios': [8, 1], 'hspace': 0.1},
)
broken_axes = (ax_top, ax_bottom)

# --- Plot Data ---
# The same bars are drawn on both axes; each axes later shows its own y-window.
barplot_kwargs = dict(
    data=po_algo_ablation_data,
    x='po_algorithm',
    y='downstream_mean_score',
    hue='acquisition_function',
    palette=acquisition_colors,
    width=0.7,
    linewidth=1,
    edgecolor="white",
    order=['DPO', 'IPO', 'SimPO'],
    hue_order=ACQUISITION_ORDER,
)
for panel_ax in broken_axes:
    sns.barplot(ax=panel_ax, **barplot_kwargs)
    panel_ax.get_legend().remove()

# --- Apply Hatches ---
# Assumes ax.patches lists the bars hue-major, n_groups bars per hue level;
# patches past the last hue level are left untouched.
n_hues = len(ACQUISITION_ORDER)
n_groups = po_algo_ablation_data['po_algorithm'].nunique(dropna=False)
for panel_ax in broken_axes:
    for patch_idx, patch in enumerate(panel_ax.patches):
        hue_idx = patch_idx // n_groups
        if hue_idx >= n_hues:
            continue
        patch.set_hatch(acquisition_hatches[ACQUISITION_ORDER[hue_idx]])

# --- Legend ---
# Figure-level legend built from the upper axes; hatch doubled on handles only.
handles, labels = ax_top.get_legend_handles_labels()
for acq_func, handle in zip(ACQUISITION_ORDER, handles):
    handle.set_hatch(acquisition_hatches[acq_func] * 2)
fig.legend(
    handles,
    labels,
    loc='upper center',
    bbox_to_anchor=(0.51, 1.18),
    ncol=(len(acquisition_colors) - 1) // 3 + 1,
    frameon=False,
    fontsize=FONT_SIZE,
)

# --- Axis Labels & Titles ---
ax_top.set_ylabel('Downstream Score $\\Delta$', fontsize=label_fontsize, y=0.4)
ax_bottom.set_ylabel('')
ax_bottom.set_xlabel('', fontsize=label_fontsize)

# --- Axis Ticks ---
ax_top.set_yticks(np.arange(0, 0.25, 0.05))
ax_bottom.set_yticks([-0.25])
ax_bottom.tick_params(axis='x', labelsize=label_fontsize)

# --- Grid ---
for panel_ax in broken_axes:
    panel_ax.grid(axis='y', alpha=GRID_ALPHA)
    panel_ax.grid(axis='x', alpha=0.0)

# --- Axis Limits ---
# Upper window covers -0.01..0.22, lower window covers -0.30..-0.20.
ax_top.set_ylim(-0.01, 0.22)
ax_bottom.set_ylim(-0.30, -0.20)

# --- Broken Axis Styling ---
# Hide the facing spines and draw slanted marks at the four corners where the
# two axes meet.
ax_top.spines['bottom'].set_visible(False)
ax_bottom.spines['top'].set_visible(False)
ax_top.tick_params(bottom=False)
break_kwargs = dict(
    marker=[(-1, -0.5), (1, 0.5)],
    markersize=12,
    linestyle='none',
    color='0.8',
    clip_on=False,
)
ax_top.plot([0, 1], [0, 0], transform=ax_top.transAxes, **break_kwargs)
ax_bottom.plot([0, 1], [1, 1], transform=ax_bottom.transAxes, **break_kwargs)

# --- Save & Show ---
plt.tight_layout()
fig.savefig("po_algo_ablation.pdf", format="pdf", bbox_inches="tight")
plt.show()

In [None]:
# ==============================================================================
# Sample Efficiency Plot (UltraFeedback)
# ==============================================================================

def _padded_limits(values, pad=0.1):
    """Return ``(lower, upper)`` axis limits that extend past the data range.

    Each bound is moved away from the data by ``pad`` times its own magnitude,
    so the padding points outwards whatever the sign of the bound. A plain
    ``min * 1.1`` moves a positive minimum *into* the data and clips points;
    ``max * 1.1`` does the same for a negative maximum. A bound that is
    exactly zero is left unpadded, which matches the previous behaviour.

    Args:
        values: pandas Series (or anything with ``.min()`` / ``.max()``).
        pad: fraction of each bound's magnitude to add as padding.

    Returns:
        Tuple ``(lower, upper)`` suitable for ``set_xlim`` / ``set_ylim``.
    """
    lower, upper = values.min(), values.max()
    return lower - pad * abs(lower), upper + pad * abs(upper)


# --- Style Setup ---
acquisition_colors = {k: v.color for k, v in ACQUISITION_STYLES.items()}
acquisition_markers = {k: v.marker for k, v in ACQUISITION_STYLES.items()}
# seaborn expects "" (not None) for a solid line.
acquisition_dashes = {k: v.dashes if v.dashes is not None else "" for k, v in ACQUISITION_STYLES.items()}
label_fontsize = FONT_SIZE * LABEL_SIZE_MULTIPLIER

# --- Figure Setup ---
fig, (ax_left, ax_right) = plt.subplots(1, 2, figsize=(14, 4), sharey=False)
# One entry per panel, left to right: (axes, y column, x label, y label).
panels = (
    (ax_left, 'downstream_mean_score',
     'Consumed Samples\n\n(a) Downstream Models', 'Downstream Score $\\Delta$'),
    (ax_right, 'rm_mean_score',
     'Consumed Samples\n\n(b) Reward Models', 'Reward Model Score $\\Delta$'),
)

# --- Plot Data ---
# Both panels share every lineplot setting except the y column and target axes.
for panel_ax, score_col, x_label, y_label in panels:
    sns.lineplot(
        data=sample_efficiency_ultrafeedback_data,
        x='num_train_samples',
        y=score_col,
        hue='acquisition_function',
        style='acquisition_function',
        hue_order=ACQUISITION_ORDER,
        style_order=ACQUISITION_ORDER,
        palette=acquisition_colors,
        markers=acquisition_markers,
        dashes=acquisition_dashes,
        markersize=10,
        ax=panel_ax,
    )

# --- Legend ---
# Replace the per-axes legends with one figure-level legend.
ax_left.get_legend().remove()
ax_right.get_legend().remove()
handles, labels = ax_left.get_legend_handles_labels()
fig.legend(
    handles,
    labels,
    loc='upper center',
    bbox_to_anchor=(0.5, 1.15),
    ncol=(len(acquisition_colors) - 1) // 2 + 1,
    frameon=False,
    fontsize=FONT_SIZE
)

# --- Axis Labels, Limits & Grid ---
x_limits = _padded_limits(sample_efficiency_ultrafeedback_data['num_train_samples'])
for panel_ax, score_col, x_label, y_label in panels:
    panel_ax.set_xlabel(x_label, fontsize=label_fontsize)
    panel_ax.set_ylabel(y_label, fontsize=label_fontsize)
    panel_ax.set_xlim(*x_limits)
    panel_ax.set_ylim(*_padded_limits(sample_efficiency_ultrafeedback_data[score_col]))
    # A single call covers both axes; the earlier y-only grid call was
    # overridden by this one and has been dropped.
    panel_ax.grid(alpha=GRID_ALPHA)

# --- Save & Show ---
plt.tight_layout()
fig.savefig("sample_efficiency_ultrafeedback.pdf", format="pdf", bbox_inches="tight")
plt.show()

In [None]:
# ==============================================================================
# Teaser Radar Plot (Normalized)
# ==============================================================================

# --- Style Setup ---
acquisition_colors = {k: v.color for k, v in ACQUISITION_STYLES.items()}
acquisition_markers = {k: v.marker for k, v in ACQUISITION_STYLES.items()}

# --- Data Preparation ---
# `labels` and `benchmark_cols` are parallel: one spoke per benchmark.
labels = ['IFEval', 'GSM8K', 'TruthfulQA', 'AlpacaEval 2', 'RewardBench 2']
benchmark_cols = ['ifeval', 'gsm8k', 'truthfulqa', 'alpacaeval_2', 'rewardbench_2']
num_labels = len(labels)

# Only these acquisition functions are shown in the teaser.
teaser_data = teaser_data[teaser_data['acquisition_function'].isin([
    'DeltaUCB',
    'UltraFeedback',
    'DTS',
    'DRTS',
    'DeltaQwen',
])]

# Per-benchmark normalisation bounds, padded outwards by ~10% of each bound's
# magnitude. For positive values this equals the former `min / 1.1` and
# `max * 1.1`; for negative values those expressions moved the bounds *inside*
# the data (normalised radii below 0 or above 1), so the padding is sign-aware.
raw_mins = teaser_data[benchmark_cols].min()
raw_maxs = teaser_data[benchmark_cols].max()
benchmark_mins = raw_mins - raw_mins.abs() * (1 - 1 / 1.1)
benchmark_maxs = raw_maxs + raw_maxs.abs() * 0.1
# Guard against a zero-width range (e.g. a column that is all zeros), which
# would otherwise divide by zero during normalisation.
benchmark_maxs = benchmark_maxs.where(benchmark_maxs > benchmark_mins, benchmark_mins + 1.0)
y_max = 1.05

# Spoke angles: evenly spaced, rotated so that the second label sits at the
# top (pi / 2); the first angle is repeated to close the polygon.
angle_offset = np.pi / 2 - (2 * np.pi / num_labels) * 1
angles = (np.linspace(0, 2 * np.pi, num_labels, endpoint=False) + angle_offset).tolist()
angles += angles[:1]

# --- Figure Setup ---
fig, ax = plt.subplots(figsize=(8, 8), subplot_kw=dict(polar=True))

# --- Plot Data ---
for acq_func in ACQUISITION_ORDER:
    acq_data = teaser_data[teaser_data['acquisition_function'] == acq_func]
    if acq_data.empty:
        continue

    # Mean score per benchmark, mapped to [0, 1] with the per-benchmark bounds.
    values = acq_data[benchmark_cols].mean()
    values_normalized = ((values - benchmark_mins) / (benchmark_maxs - benchmark_mins)).tolist()

    values_closed = values_normalized + values_normalized[:1]
    color = acquisition_colors[acq_func]
    marker = acquisition_markers[acq_func]

    ax.plot(angles, values_closed, color=color, linewidth=2, marker=marker, markersize=10, label=acq_func, zorder=10)

# --- Legend ---
ax.legend(
    loc='upper center',
    bbox_to_anchor=(0.5, -0.05),
    ncol=3,
    frameon=False,
    fontsize=FONT_SIZE
)

# --- Axis Labels & Titles ---
# The built-in angular tick labels are hidden; each label is drawn manually
# outside the plot with its own rotation.
ax.set_xticks(angles[:-1])
ax.set_xticklabels([])
label_rotations = {
    'GSM8K': 0,
    'IFEval': -72,
    'RewardBench 2': 36,
    'AlpacaEval 2': -36,
    'TruthfulQA': 72
}
for angle, label in zip(angles[:-1], labels):
    rotation = label_rotations.get(label, 0)
    ax.text(angle, 1.18, label, fontsize=FONT_SIZE, fontweight='bold',
            ha='center', va='center', rotation=rotation)

# --- Axis Ticks ---
# Radial rings sit at fixed normalised positions; each spoke is annotated with
# the un-normalised value that the ring corresponds to for that benchmark.
tick_positions = [0.25, 0.5, 0.75, 1.0]
ax.set_yticks(tick_positions)
ax.set_yticklabels([])
for angle, col in zip(angles[:-1], benchmark_cols):
    min_val, max_val = benchmark_mins[col], benchmark_maxs[col]
    for tick_pos in tick_positions:
        actual_val = min_val + tick_pos * (max_val - min_val)
        label_angle = angle + 0.06
        ax.text(label_angle, tick_pos, f'{actual_val:.2f}', fontsize=12, color='gray',
                ha='left', va='center', zorder=1)

# --- Grid ---
ax.yaxis.grid(True, linestyle='--', color='gray', alpha=0.5)
ax.xaxis.grid(True, linestyle='--', color='gray', alpha=0.3)

# --- Axis Limits ---
# On polar axes the radial limits are the y limits, so one call is enough.
ax.set_ylim(0, y_max)
ax.spines['polar'].set_visible(False)

# --- Save & Show ---
plt.tight_layout()
fig.savefig("teaser_radar_normalized.pdf", format="pdf", bbox_inches="tight")
plt.show()