In [None]:
%matplotlib
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

from dataclasses import dataclass
from typing import Dict, List, Tuple

# Global figure appearance: seaborn's white-grid theme first, then serif fonts
# and Computer Modern math text layered on top through matplotlib's rcParams.
sns.set_theme(style="whitegrid")

plt.rcParams['font.family'] = 'serif'
plt.rcParams['font.serif'] = ['Computer Modern', 'DejaVu Serif', 'serif']
plt.rcParams['mathtext.fontset'] = 'cm'
plt.rcParams['axes.formatter.use_mathtext'] = True

# Reference scores of the base model, keyed by benchmark column name.
base_model_scores = dict(
    gsm8k=0.758,
    ifeval=0.713,
    truthfulqa=0.468,
    alpacaeval_2=0.083,
    rewardbench_2=0.290,
)

# Load the main results table (comma-separated, the read_csv default); the
# trailing expression previews the first rows in the cell output.
data = pd.read_csv('results.csv')
data.head()

In [None]:
# Base font size; the figure cells scale axis labels from this value.
FONT_SIZE = 12

# Display order of acquisition functions and datasets in tables and figures.
ACQUISITION_ORDER = [
    'Random', 'UltraFeedback', 'MaxMin', 'DeltaQwen',
    'DeltaUCB', 'DRTS',
    'InfoMax', 'DTS', 'MaxMinLCB',
]
DATASET_ORDER = ["UltraFeedback", "Skywork", "Combined", "Tulu 3"]

# Benchmark column names: the downstream benchmarks, the reward-model
# benchmark, and their concatenation.
DOWNSTREAM_BENCHMARKS = ['gsm8k', 'ifeval', 'truthfulqa', 'alpacaeval_2']
RM_BENCHMARKS = ["rewardbench_2"]
BENCHMARKS = DOWNSTREAM_BENCHMARKS + RM_BENCHMARKS

In [None]:
# Translates acquisition-function identifiers into display names. The first
# four are renamed; the remaining identifiers are already in display form and
# map to themselves. Insertion order matters: later cells iterate `.values()`.
acquisition_function_mapping = {
    **{
        "random": "Random",
        "ultrafeedback": "UltraFeedback",
        "maxmin": "MaxMin",
        "delta_qwen": "DeltaQwen",
    },
    **{name: name for name in ("DeltaUCB", "DRTS", "InfoMax", "DTS", "MaxMinLCB")},
}

# UltraFeedback Sample Efficiency Data for Baselines:
# if True:
#     all_results = pd.read_csv("./all_results.csv")

#     all_results['dataset'] = "UltraFeedback"
#     all_results['po_algorithm'] = "DPO"
#     all_results['judge'] = "Qwen 3 235B"

#     # Rename columns
#     all_results = all_results.rename(columns={
#         "gsm8k_tulu": "gsm8k",
#         "ifeval_tulu": "ifeval",
#         "truthfulqa_tulu": "truthfulqa"
#     })

#     # Extract num_train_steps and acquisition function from model name
#     all_results['num_train_steps'] = all_results['model'].apply(lambda x: int(x.split('_')[-1]))
#     all_results['acquisition_function'] = all_results['model'].apply(lambda x: acquisition_function_mapping["_".join(x.split('_')[:-1]).split('-')[-1]])
#     all_results['alpacaeval_2'] = np.nan
#     all_results['rewardbench_2'] = np.nan

#     # Drop unnecessary columns
#     all_results = all_results.drop(columns=['average', 'model']).copy()
#     all_results['num_train_steps'] = all_results['num_train_steps'].astype(int)

#     # Sort results by acquisition function order
#     all_results = all_results[data.columns]
#     acq_order = list(acquisition_function_mapping.values())
#     all_results['acq_func_order'] = all_results['acquisition_function'].apply(lambda x: acq_order.index(x) if x in acq_order else -1)
#     all_results = all_results.sort_values(by=['acq_func_order', 'num_train_steps']).drop(columns=['acq_func_order']).reset_index(drop=True)

#     # Add base model scores at num_train_steps = 0 for sample efficiency plots
#     for acq_name in acquisition_function_mapping.values():
#         row = {
#             'dataset': 'UltraFeedback',
#             'judge': 'Qwen 3 235B',
#             'acquisition_function': acq_name,
#             'po_algorithm': 'DPO',
#             'num_train_steps': 0,
#             'gsm8k': base_model_scores.get('gsm8k'),
#             'ifeval': base_model_scores.get('ifeval'),
#             'truthfulqa': base_model_scores.get('truthfulqa'),
#             'alpacaeval_2': base_model_scores.get('alpacaeval_2'),
#             'rewardbench_2': base_model_scores.get('rewardbench_2'),
#         }
#         all_results.loc[len(all_results)] = row

# UltraFeedback Sample Efficiency Data for Active Learning Methods:
# Reads the DPO and reward-model sample-efficiency tables, joins them per
# method, reshapes the result to the column layout of `data`, and writes it to
# "ultrafeedback_sample_efficiency.csv".
if True:
    uf_dpo_sample_efficiency = pd.read_csv("ultrafeedback_dpo_sample_efficiency.csv")
    uf_rm_sample_efficiency = pd.read_csv("ultrafeedback_rm_sample_efficiency.csv")

    # Inner join (the pd.merge default) on 'Method'; any other column name shared
    # by both tables receives the '_dpo' / '_rm' suffix.
    uf_sample_efficiency = pd.merge(
        uf_dpo_sample_efficiency,
        uf_rm_sample_efficiency,
        on='Method',
        suffixes=('_dpo', '_rm')
    )

    # Drop the "SFT Base Model" row and renumber the index from 0, so that
    # `len(uf_sample_efficiency)` below is always an unused row label.
    uf_sample_efficiency = uf_sample_efficiency[uf_sample_efficiency["Method"] != "SFT Base Model"].copy().reset_index(drop=True)

    # Rename the score columns to the benchmark column names used in `data`.
    # 'Mean_rm' is the suffixed 'Mean' column of the reward-model table.
    uf_sample_efficiency = uf_sample_efficiency.rename(columns={
        'Mean_rm': 'rewardbench_2',
        'GSM8K': 'gsm8k',
        'IF Eval': 'ifeval',
        'Truthful QA': 'truthfulqa',
        'Alpaca Eval': 'alpacaeval_2',
    })

    # The text after the last '_' of 'Method' is parsed as the integer step count.
    # The remaining prefix, taken after its last '-', is looked up in
    # acquisition_function_mapping (an unknown name raises KeyError).
    uf_sample_efficiency['num_train_steps'] = uf_sample_efficiency['Method'].apply(lambda x: int(x.split('_')[-1]))
    uf_sample_efficiency['acquisition_function'] = uf_sample_efficiency['Method'].apply(lambda x: acquisition_function_mapping["_".join(x.split('_')[:-1]).split('-')[-1]])
    uf_sample_efficiency['po_algorithm'] = "DPO"
    uf_sample_efficiency['judge'] = "Qwen 3 235B"
    uf_sample_efficiency['dataset'] = "UltraFeedback"

    # Add base model scores at num_train_steps = 0 for sample efficiency plots
    # One row is appended per display name in acquisition_function_mapping.
    # NOTE(review): every score is written as 0 instead of the base_model_scores
    # value (kept in the trailing comments) — presumably because the tables hold
    # deltas relative to the base model; confirm.
    # Columns missing from the dict (e.g. 'Method') are left empty in the new row.
    for acq_name in acquisition_function_mapping.values():
        uf_sample_efficiency.loc[len(uf_sample_efficiency)] = {
            'dataset': 'UltraFeedback',
            'judge': 'Qwen 3 235B',
            'acquisition_function': acq_name,
            'po_algorithm': 'DPO',
            'num_train_steps': 0,
            'gsm8k': 0, # base_model_scores.get('gsm8k'),
            'ifeval': 0, # base_model_scores.get('ifeval'),
            'truthfulqa': 0, # base_model_scores.get('truthfulqa'),
            'alpacaeval_2': 0, # base_model_scores.get('alpacaeval_2'),
            'rewardbench_2': 0 # base_model_scores.get('rewardbench_2'),
        }

    # Remove the columns that have no counterpart in `data` (raises KeyError if
    # any of them is absent), then select and order the columns exactly like `data`.
    uf_sample_efficiency = uf_sample_efficiency.drop(columns=['Type_dpo', 'Mean_dpo', 'Type_rm', 'Factuality', 'Focus', 'Math', 'Precise IF', 'Safety', 'Ties', 'Method'])
    uf_sample_efficiency = uf_sample_efficiency[data.columns]
    # Sort by the mapping's display-name order, then by step count; the helper
    # column only exists for the duration of the sort.
    acq_order = list(acquisition_function_mapping.values())
    uf_sample_efficiency['acq_func_order'] = uf_sample_efficiency['acquisition_function'].apply(lambda x: acq_order.index(x) if x in acq_order else -1)
    uf_sample_efficiency = uf_sample_efficiency.sort_values(by=['acq_func_order', 'num_train_steps']).drop(columns=['acq_func_order']).reset_index(drop=True)
    uf_sample_efficiency.to_csv("ultrafeedback_sample_efficiency.csv", index=False)

# Append the UltraFeedback sample-efficiency rows to the main table and remove
# rows that are exact duplicates.
data = (
    pd.concat([data, uf_sample_efficiency], ignore_index=True)
    .drop_duplicates()
    .reset_index(drop=True)
)

# Temporary sort keys: whether the step count is missing, plus each row's
# position in DATASET_ORDER / ACQUISITION_ORDER (labels not listed sort last).
_sort_key_columns = ['num_train_steps_null', 'dataset_order_idx', 'acquisition_order_idx']
_sort_keys = {
    'num_train_steps_null': data['num_train_steps'].isna(),
    'dataset_order_idx': data['dataset'].apply(
        lambda name: DATASET_ORDER.index(name) if name in DATASET_ORDER else len(DATASET_ORDER)
    ),
    'acquisition_order_idx': data['acquisition_function'].apply(
        lambda name: ACQUISITION_ORDER.index(name) if name in ACQUISITION_ORDER else len(ACQUISITION_ORDER)
    ),
}

# Rows without a step count come first (descending on the null flag), then
# dataset, algorithm, acquisition function and step count in ascending order.
data = data.assign(**_sort_keys)
data = data.sort_values(
    by=['num_train_steps_null', 'dataset_order_idx', 'po_algorithm', 'acquisition_order_idx', 'num_train_steps'],
    ascending=[False, True, True, True, True],
)
data = data.drop(columns=_sort_key_columns).reset_index(drop=True)

data.to_csv("merged.csv", index=False)


In [None]:
@dataclass
class AcquisitionStyle:
    """Visual encoding of one acquisition function, shared by all figures."""
    # Marker symbol, used by the line and radar plots.
    marker: str
    # Hatch pattern applied to bars; '' leaves the bar unhatched.
    hatch: str
    # Color given as a hex string.
    color: str
    # Dash pattern for line plots; the plotting cells translate None into a solid line.
    dashes: Tuple[int, ...] | None

# Style table, one row per acquisition function. Colors fall into three
# families (reds, blues, greens); within a family the marker, hatch and dash
# pattern tell the members apart.
_ACQUISITION_STYLE_ROWS = [
    # name,           marker, hatch, color,     dashes
    ('Random',        'o',    '',    '#a63f3f', None),
    ('UltraFeedback', 's',    '/',   '#cb4d4d', (5, 2)),
    ('MaxMin',        '^',    '\\',  '#e06c6c', (2, 2)),
    ('DeltaQwen',     'D',    'x',   '#ef8f8f', (5, 2, 2, 2)),
    ('DeltaUCB',      'o',    '',    '#3f3fa6', None),
    ('DRTS',          's',    '/',   '#4d4dcb', (5, 2)),
    ('InfoMax',       'o',    '',    '#3fa63f', None),
    ('DTS',           's',    '/',   '#4dcb4d', (5, 2)),
    ('MaxMinLCB',     '^',    '\\',  '#6ce06c', (2, 2)),
]

acquisition_styles = {
    name: AcquisitionStyle(marker=marker, hatch=hatch, color=color, dashes=dashes)
    for name, marker, hatch, color, dashes in _ACQUISITION_STYLE_ROWS
}

In [None]:
# Row-wise averages over the reward-model and downstream benchmark columns,
# stored on `data` itself.
data["rm_mean_score"] = data[RM_BENCHMARKS].mean(axis=1)
data["downstream_mean_score"] = data[DOWNSTREAM_BENCHMARKS].mean(axis=1)

# Row filters shared by the slices below.
has_no_step_count = data['num_train_steps'].isna()
is_ultrafeedback = data['dataset'] == 'UltraFeedback'
is_dpo = data['po_algorithm'] == 'DPO'

# UltraFeedback rows without a step count, minus the reward-model column.
po_algo_ablation_data = data[is_ultrafeedback & has_no_step_count].drop(columns=['rewardbench_2'])

# DPO rows without a step count (the same selection under two names).
dataset_ablation_data = data[is_dpo & has_no_step_count]
teaser_data = data[is_dpo & has_no_step_count]

# UltraFeedback rows that do carry a step count.
sample_efficiency_ultrafeedback_data = data[is_ultrafeedback & ~has_no_step_count].copy()

In [None]:
# Grouped bar chart of the downstream mean score per preference-optimization
# algorithm, with one bar per acquisition function.
po_algo_ablation_copy = po_algo_ablation_data.copy()
# Negative scores are clipped to zero so that no bar extends below the baseline.
po_algo_ablation_copy['downstream_mean_score'] = po_algo_ablation_copy['downstream_mean_score'].clip(lower=0)

acquisition_colors = {k: v.color for k, v in acquisition_styles.items()}
acquisition_hatches = {k: v.hatch for k, v in acquisition_styles.items()}

# A single ordering drives the bars, their hatches and the legend. Using two
# different orders (bars with DRTS before DeltaUCB, hatches looked up in
# `acquisition_styles` key order) gave DRTS and DeltaUCB each other's hatch.
hue_order = list(ACQUISITION_ORDER)

fig, ax = plt.subplots(figsize=(17, 4))
sns.barplot(
    data=po_algo_ablation_copy,
    x='po_algorithm',
    y='downstream_mean_score',
    hue='acquisition_function',
    palette=acquisition_colors,
    width=0.7,
    linewidth=1,
    edgecolor="white",
    order=['DPO', 'IPO', 'SimPO'],
    hue_order=hue_order,
    ax=ax
)

# Apply hatches per hue level. seaborn draws each hue level with its own bar
# call, so `ax.containers` holds one bar container per entry of `hue_order`,
# in that order. Working per container (instead of dividing a running patch
# index by a group count) stays correct when an (algorithm, acquisition)
# combination is missing or the data contains algorithms that are not plotted.
bar_containers = [c for c in ax.containers if hasattr(c, 'patches')]
for container, acq_func in zip(bar_containers, hue_order):
    for bar in container.patches:
        bar.set_hatch(acquisition_hatches[acq_func])

ax.legend(loc='upper center', bbox_to_anchor=(0.5, 1.15),
          ncol=len(acquisition_colors), frameon=False, fontsize=FONT_SIZE)

# Update legend to show hatches (pattern doubled so it is visible in the small swatch).
for patch, acq_func in zip(ax.legend_.get_patches(), hue_order):
    patch.set_hatch(acquisition_hatches[acq_func] * 2)

plt.grid(axis='y', alpha=0.75)
plt.xlabel('', fontsize=FONT_SIZE * 1.5, weight='bold')
plt.xticks(fontweight='bold', fontsize=FONT_SIZE * 1.5)
plt.ylabel('Mean Score Delta', fontsize=FONT_SIZE * 1.5, weight='bold')
# Ticks every 0.05 from the smallest to the largest (clipped) score.
ax.set_yticks(np.arange(po_algo_ablation_copy['downstream_mean_score'].min(), po_algo_ablation_copy['downstream_mean_score'].max()+0.05, 0.05))
# plt.tight_layout()
# NOTE(review): the broken-axis version of this figure in the following cell
# saves to the same file name and therefore overwrites this output.
fig.savefig("po_algo_ablation.pdf", format="pdf", bbox_inches="tight")
plt.show()

In [None]:
# Broken-axis version of the algorithm ablation bar chart: the same bars are
# drawn on two stacked axes with different y-limits, so that a strongly
# negative bar can be shown without compressing the positive range.
acquisition_colors = {k: v.color for k, v in acquisition_styles.items()}
acquisition_hatches = {k: v.hatch for k, v in acquisition_styles.items()}

fig, (ax_top, ax_bottom) = plt.subplots(2, 1, sharex=True, figsize=(17, 4.1), gridspec_kw={
    'height_ratios': [6, 1],
    'hspace': 0.1
})

for ax in [ax_top, ax_bottom]:
    sns.barplot(
        data=po_algo_ablation_data,
        x='po_algorithm',
        y='downstream_mean_score',
        hue='acquisition_function',
        palette=acquisition_colors,
        width=0.7,
        linewidth=1,
        edgecolor="white",
        order=['DPO', 'IPO', 'SimPO'],
        hue_order=ACQUISITION_ORDER,
        ax=ax
    )

    # Remove in favor of custom legend (seaborn adds none when there is
    # nothing to label, hence the guard).
    auto_legend = ax.get_legend()
    if auto_legend is not None:
        auto_legend.remove()

# Add legend to top axis
ax_top.legend(
    loc='upper center',
    bbox_to_anchor=(0.5, 1.18),
    ncol=len(acquisition_colors),
    frameon=False,
    fontsize=FONT_SIZE,
    columnspacing=1.5
)

# Apply hatches per hue level. seaborn draws each hue level with its own bar
# call, so `ax.containers` holds one bar container per entry of
# ACQUISITION_ORDER, in that order. Working per container (instead of dividing
# a running patch index by the number of algorithms found in the data) stays
# correct when an (algorithm, acquisition) combination is missing or the data
# contains algorithms that are not plotted.
for ax in [ax_top, ax_bottom]:
    bar_containers = [c for c in ax.containers if hasattr(c, 'patches')]
    for container, acq_func in zip(bar_containers, ACQUISITION_ORDER):
        for bar in container.patches:
            bar.set_hatch(acquisition_hatches[acq_func])

# Update legend to show hatches (pattern doubled so it is visible in the small swatch).
legend = ax_top.get_legend()
if legend is not None:
    for patch, acq_func in zip(legend.get_patches(), ACQUISITION_ORDER):
        patch.set_hatch(acquisition_hatches[acq_func] * 2)

# Hide the spines facing the gap between the two axes.
ax_top.spines['bottom'].set_visible(False)
ax_bottom.spines['top'].set_visible(False)
ax_top.tick_params(bottom=False)

# The two visible y-ranges; everything between -0.22 and -0.025 is cut out.
ax_top.set_ylim(-0.025, 0.22)
ax_bottom.set_ylim(-0.30, -0.22)

# Grid and labels
ax_top.grid(axis='y', alpha=0.75)
ax_bottom.grid(axis='y', alpha=0.75)
ax_bottom.set_xlabel('', fontsize=FONT_SIZE * 1.5, weight='bold')
ax_bottom.tick_params(axis='x', labelsize=FONT_SIZE * 1.5)
for label in ax_bottom.get_xticklabels():
    label.set_fontweight('bold')

# Set y-ticks for both axes
ax_top.set_yticks(np.arange(0, 0.25, 0.05))
ax_bottom.set_yticks([-0.25])

# Shared y-axis label
ax_top.set_ylabel('Mean Score Delta', fontsize=FONT_SIZE * 1.5, weight='bold', y=0.4)
ax_bottom.set_ylabel('')

# Add break lines: slanted tick marks at the left and right ends of the cut,
# drawn in axes coordinates and allowed to extend past the axes box.
width = .5
kwargs = {
    'marker': [(-1, -width), (1, width)],
    'markersize': 12,
    'linestyle': "none",
    'color': '0.8',
    'clip_on': False
}
ax_top.plot([0, 1], [0, 0], transform=ax_top.transAxes, **kwargs)
ax_bottom.plot([0, 1], [1, 1], transform=ax_bottom.transAxes, **kwargs)

fig.savefig("po_algo_ablation.pdf", format="pdf", bbox_inches="tight")
plt.show()

In [None]:
# Sample-efficiency curves on UltraFeedback: downstream mean score against the
# step count, one line per acquisition function.
acquisition_colors = {name: style.color for name, style in acquisition_styles.items()}
acquisition_markers = {name: style.marker for name, style in acquisition_styles.items()}
# seaborn draws a solid line for an empty dash spec, so None is translated to "".
acquisition_dashes = {
    name: ("" if style.dashes is None else style.dashes)
    for name, style in acquisition_styles.items()
}

fig, ax = plt.subplots(figsize=(6,5))
sns.lineplot(
    data=sample_efficiency_ultrafeedback_data,
    x='num_train_steps',
    y='downstream_mean_score',
    # Hue and style both follow the acquisition function, so each line gets
    # its own color, marker and dash pattern.
    hue='acquisition_function',
    style='acquisition_function',
    palette=acquisition_colors,
    markers=acquisition_markers,
    dashes=acquisition_dashes,
    markersize=10,
    ax=ax,
)

ax.legend(
    loc='upper center',
    bbox_to_anchor=(0.5, 1.3),
    ncol=3,
    frameon=False,
    fontsize=FONT_SIZE,
)

# Both axes start at zero and leave 10% headroom above the largest value.
largest_step_count = sample_efficiency_ultrafeedback_data['num_train_steps'].max()
largest_mean_score = sample_efficiency_ultrafeedback_data['downstream_mean_score'].max()
ax.set_xlim(0, largest_step_count * 1.1)
ax.set_ylim(0, largest_mean_score * 1.1)

ax.set_xlabel('Number of Training Samples', fontsize=FONT_SIZE * 1.5, weight='bold')
ax.set_ylabel('Mean Score', fontsize=FONT_SIZE * 1.5, weight='bold')
fig.tight_layout()
fig.savefig("sample_efficiency_ultrafeedback.pdf", format="pdf", bbox_inches="tight")
plt.show()

In [None]:
# Normalize downstream_mean_score per acquisition function to [0, 1]
# Work on a copy so the frame used for the un-normalized plot stays unchanged.
sample_efficiency_normalized = sample_efficiency_ultrafeedback_data.copy()

def normalize_group(group):
    """Min-max scale the 'downstream_mean_score' column of one group to [0, 1].

    Args:
        group: DataFrame holding a 'downstream_mean_score' column.

    Returns:
        A Series on the group's index with the scaled scores. When every
        score in the group is identical (zero range) each row gets 0.5.
    """
    scores = group['downstream_mean_score']
    lowest, highest = scores.min(), scores.max()
    spread = highest - lowest
    if spread == 0:
        # No variation to scale: place every point mid-range.
        return pd.Series([0.5] * len(group), index=group.index)
    return pd.Series((scores - lowest) / spread, index=group.index)

# Min-max normalize group by group and stitch the pieces back together. Each
# piece keeps its original row labels, so the column assignment aligns
# row-for-row. groupby(...).apply(..., include_groups=False) is avoided
# because the `include_groups` argument only exists in pandas >= 2.2, and
# because apply can return a wide DataFrame rather than a Series when there is
# only a single group.
normalized_parts = [
    normalize_group(group)
    for _, group in sample_efficiency_normalized.groupby('acquisition_function')
]
sample_efficiency_normalized['downstream_mean_score_normalized'] = (
    pd.concat(normalized_parts) if normalized_parts else pd.Series(dtype=float)
)

acquisition_colors = {k: v.color for k, v in acquisition_styles.items()}
acquisition_markers = {k: v.marker for k, v in acquisition_styles.items()}
# seaborn draws a solid line for an empty dash spec, so None is translated to "".
acquisition_dashes = {k: v.dashes if v.dashes is not None else "" for k, v in acquisition_styles.items()}

fig, ax = plt.subplots(figsize=(6,5))
sns.lineplot(
    data=sample_efficiency_normalized,
    x='num_train_steps',
    y='downstream_mean_score_normalized',
    hue='acquisition_function',
    style='acquisition_function',
    palette=acquisition_colors,
    markers=acquisition_markers,
    dashes=acquisition_dashes,
    markersize=10,
    ax=ax
)

ax.legend(loc='upper center', bbox_to_anchor=(0.5, 1.3),
          ncol=3, frameon=False, fontsize=FONT_SIZE)

# Fixed axis ranges; the y-range leaves headroom above the normalized maximum of 1.
plt.xlim(0, 65000)
plt.ylim(0, 1.1)

plt.xlabel('Number of Training Samples', fontsize=FONT_SIZE * 1.5, weight='bold')
plt.ylabel('Normalized Mean Score Delta', fontsize=FONT_SIZE * 1.5, weight='bold')
plt.tight_layout()
fig.savefig("sample_efficiency_ultrafeedback_normalized.pdf", format="pdf", bbox_inches="tight")
plt.show()

In [None]:
# Radar plot with per-benchmark normalization
labels = ['GSM8K', 'IFEval', 'TruthfulQA', 'AlpacaEval 2', 'RewardBench 2']
benchmark_cols = ['gsm8k', 'ifeval', 'truthfulqa', 'alpacaeval_2', 'rewardbench_2']
num_labels = len(labels)

acquisition_colors = {name: style.color for name, style in acquisition_styles.items()}
acquisition_markers = {name: style.marker for name, style in acquisition_styles.items()}

# Column-wise extremes over all teaser rows; they define each axis' own scale.
benchmark_mins = teaser_data[benchmark_cols].min()
benchmark_maxs = teaser_data[benchmark_cols].max()


def _scale_to_unit(value, low, high):
    """Min-max scale `value` from [low, high] to [0, 1]; 0.5 if the range is not positive."""
    spread = high - low
    return (value - low) / spread if spread > 0 else 0.5


def _label_alignment(label, angle_deg):
    """Return the (ha, va) text alignment for an axis label at `angle_deg` degrees."""
    # The two bottom labels are pinned explicitly to avoid clipping.
    if label == 'AlpacaEval 2':
        return 'right', 'top'
    if label == 'RewardBench 2':
        return 'left', 'top'
    # Right side (angle_deg is already reduced modulo 360, so it is never negative).
    if angle_deg <= 45 or angle_deg >= 315:
        return 'left', 'center'
    # Top
    if 45 < angle_deg < 135:
        return 'center', 'bottom'
    # Left side
    if 135 <= angle_deg <= 225:
        return 'right', 'center'
    # Bottom
    return 'center', 'top'


# One angle per benchmark, evenly spaced around the circle and rotated so that
# IFEval (index 1) points straight up (π/2).
angle_offset = np.pi / 2 - (2 * np.pi / num_labels) * 1
angles = (np.linspace(0, 2 * np.pi, num_labels, endpoint=False) + angle_offset).tolist()
angles.append(angles[0])  # repeat the first angle to close the polygon

# Create the plot
fig, ax = plt.subplots(figsize=(8, 8), subplot_kw=dict(polar=True))

# One closed polygon per acquisition function that has rows in teaser_data.
for acq_func in ACQUISITION_ORDER:
    acq_data = teaser_data[teaser_data['acquisition_function'] == acq_func]
    if acq_data.empty:
        continue

    # Average over rows when an acquisition function appears more than once.
    values = acq_data[benchmark_cols].mean()

    # Each benchmark is scaled with its own min/max.
    values_normalized = [
        _scale_to_unit(values[col], benchmark_mins[col], benchmark_maxs[col])
        for col in benchmark_cols
    ]
    values_closed = values_normalized + [values_normalized[0]]  # close the polygon

    ax.plot(
        angles,
        values_closed,
        color=acquisition_colors.get(acq_func, '#888888'),
        linewidth=2,
        marker=acquisition_markers.get(acq_func, 'o'),
        markersize=10,
        label=acq_func,
    )

# Keep the spokes but drop the default tick labels; custom labels follow.
ax.set_xticks(angles[:-1])
ax.set_xticklabels([])

# Place the benchmark names just outside the outer ring.
for angle, label in zip(angles[:-1], labels):
    ha, va = _label_alignment(label, np.degrees(angle) % 360)
    ax.text(angle, 1.12, label, fontsize=FONT_SIZE, fontweight='bold',
            ha=ha, va=va)

# Configure radial axis (normalized scale 0-1)
ax.set_ylim(0, 1.05)
ax.set_yticks([0.25, 0.5, 0.75, 1.0])
ax.set_yticklabels(['0.25', '0.5', '0.75', '1.0'], color='gray', fontsize=9)
ax.yaxis.grid(True, linestyle='--', alpha=0.5)

# Legend below the plot
ax.legend(loc='upper center', bbox_to_anchor=(0.5, -0.05),
          ncol=3, frameon=False, fontsize=FONT_SIZE)

plt.tight_layout()
fig.savefig("teaser_radar_normalized.pdf", format="pdf", bbox_inches="tight")
plt.show()