In [2]:
import sys
sys.path.append("../")

from pathlib import Path
import json
from src.ensembles import MODEL_GROUPS
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.colors import LinearSegmentedColormap
import seaborn as sns
import numpy as np
from typing import List

In [3]:
# Extract models in each trial.
# For each of the 10 pick_random_{i}.json files, the first record's
# "models_in_trial" field is parsed into a Python object and appended to TRIALS.
# NOTE(review): the field is presumably the repr of a list of model-name
# strings -- confirm against the files that produce it.
import ast  # cell-local: only needed to parse the stored literal below

TRIALS = []
for i in range(10):
    fpath = Path(f"../smoothie_data/alpaca/algorithm_outputs/pick_random_{i}.json")
    # Named `trial_records` (not `results`) so this cell does not share a
    # name with the leaderboard DataFrame loaded later in the notebook.
    trial_records = json.loads(fpath.read_text())
    # ast.literal_eval accepts only Python literals, so unlike eval() it
    # cannot execute code that happens to be embedded in the data file.
    TRIALS.append(ast.literal_eval(trial_records[0]["models_in_trial"]))

# Unique models across all trials; sorted so the list order is the same on
# every run (iteration order of a set of strings is not stable across runs).
ALL_MODELS = sorted({model for sublist in TRIALS for model in sublist})

In [4]:
# Load the alpaca leaderboard CSV and give the 'Unnamed: 0' column the
# name 'method'; later cells look rows up through results["method"].
leaderboard_path = "../smoothie_data/alpaca/leaderboard.csv"
results = pd.read_csv(leaderboard_path).rename(columns={'Unnamed: 0': 'method'})
results

Unnamed: 0,method,win_rate,standard_error,mode,avg_length,n_wins,n_wins_base,n_draws,n_total,discrete_win_rate,length_controlled_winrate
0,yi-large-preview,55.652174,1.747646,verified,2317,446,355,4,805,55.652174,51.152662
1,smoothie_independent_all-mpnet-base-v2_trial_n...,55.652174,1.747646,verified,2317,446,355,4,805,55.652174,51.152662
2,smoothie_independent_all-mpnet-base-v2_trial_n...,55.652174,1.747646,verified,2317,446,355,4,805,55.652174,51.152662
3,smoothie_independent_all-mpnet-base-v2_trial_n...,55.652174,1.747646,verified,2317,446,355,4,805,55.652174,51.152662
4,smoothie_independent_all-mpnet-base-v2_trial_n...,55.652174,1.747646,verified,2317,446,355,4,805,55.652174,51.152662
5,smoothie_independent_all-mpnet-base-v2_trial_n...,55.652174,1.747646,verified,2317,446,355,4,805,55.652174,51.152662
6,smoothie_independent_all-mpnet-base-v2_trial_n...,55.652174,1.747646,verified,2317,446,355,4,805,55.652174,51.152662
7,smoothie_independent_all-mpnet-base-v2_trial_n...,55.652174,1.747646,verified,2317,446,355,4,805,55.652174,51.152662
8,smoothie_all-mpnet-base-v2_n_neighbors=1_trial...,51.872659,1.765422,verified,2323,415,385,1,801,51.872659,48.470315
9,Storm-7B,51.943005,1.799346,verified,2788,401,371,0,772,51.943005,47.431415


In [5]:
# Build two lookup tables keyed by method name, restricted to the methods
# that appear in ALL_MODELS:
#   model_win_rates    -> value of the "win_rate" column
#   model_lc_win_rates -> value of the "length_controlled_winrate" column
# Rows are visited in frame order, so a later duplicate method overwrites
# an earlier one.
model_win_rates = {}
model_lc_win_rates = {}
for method, win_rate, lc_win_rate in zip(
    results["method"],
    results["win_rate"],
    results["length_controlled_winrate"],
):
    if method not in ALL_MODELS:
        continue
    model_win_rates[method] = win_rate
    model_lc_win_rates[method] = lc_win_rate

In [6]:
# Count in how many trials the model recorded as "selected_model" in the
# trial's smoothie_independent output file is also the ensemble's top model,
# judged separately by win rate and by length-controlled win rate.
n_trials = len(TRIALS)  # denominator for the report; avoids a hard-coded 10
selected_best_by_win_rate = 0
selected_best_by_lc_win_rate = 0
for i, ensemble in enumerate(TRIALS):
    # max() returns the first maximal element, which is the same model a
    # stable descending sort would place at index 0 -- without sorting.
    best_by_wr = max(ensemble, key=lambda x: model_win_rates[x])
    best_by_lcwr = max(ensemble, key=lambda x: model_lc_win_rates[x])

    fpath = Path(f"../smoothie_data/alpaca/algorithm_outputs/smoothie_independent_all-mpnet-base-v2_{i}_n_neighbors=1_trial_num={i}.json")
    # Kept under its own name: assigning to `results` here would overwrite
    # the leaderboard DataFrame and break re-running the lookup-table cell.
    selection_records = json.loads(fpath.read_text())
    model_selected = selection_records[0]["selected_model"]

    if best_by_wr == model_selected:
        selected_best_by_win_rate += 1

    if best_by_lcwr == model_selected:
        selected_best_by_lc_win_rate += 1

print(f"Selected best by win rate: {selected_best_by_win_rate}/{n_trials}")
print(f"Selected best by LC win rate: {selected_best_by_lc_win_rate}/{n_trials}")

Selected best by win rate: 8/10
Selected best by LC win rate: 7/10


In [14]:
# Per-trial comparison of three ways of picking a model from an ensemble,
# reported for both win rate and length-controlled (LC) win rate:
#   Oracle   -- the highest score among the ensemble's models
#   Selected -- the score of the model recorded as "selected_model"
#   Random   -- the mean score over the ensemble's models
# "Max Diff" is the largest per-trial (Selected - Random) gap.
best_win_rate_scores = []
selected_win_rate_scores = []
avg_win_rate_scores = []
best_lc_win_rate_scores = []
selected_lc_win_rate_scores = []
avg_lc_win_rate_scores = []

# Track per-trial differences for max difference calculation
selected_vs_random_diffs = []
selected_vs_random_lc_diffs = []

# Iterate the trials themselves rather than a hard-coded range(10), so the
# loop always matches however many trials were loaded.
for i, ensemble in enumerate(TRIALS):
    # Leaderboard scores of every model in this ensemble
    ensemble_win_rates = [model_win_rates[model] for model in ensemble]
    ensemble_lc_win_rates = [model_lc_win_rates[model] for model in ensemble]

    # Oracle: best score available in the ensemble
    best_win_rate_scores.append(max(ensemble_win_rates))
    best_lc_win_rate_scores.append(max(ensemble_lc_win_rates))

    # Random: mean score over the ensemble's models
    avg_win_rate = np.mean(ensemble_win_rates)
    avg_lc_win_rate = np.mean(ensemble_lc_win_rates)
    avg_win_rate_scores.append(avg_win_rate)
    avg_lc_win_rate_scores.append(avg_lc_win_rate)

    # Selected: score of the model named in this trial's output file
    fpath = Path(f"../smoothie_data/alpaca/algorithm_outputs/smoothie_independent_all-mpnet-base-v2_{i}_n_neighbors=1_trial_num={i}.json")
    # Kept under its own name: assigning to `results` here would overwrite
    # the leaderboard DataFrame and break re-running the lookup-table cell.
    selection_records = json.loads(fpath.read_text())
    model_selected = selection_records[0]["selected_model"]

    selected_score = model_win_rates[model_selected]
    selected_lc_score = model_lc_win_rates[model_selected]
    selected_win_rate_scores.append(selected_score)
    selected_lc_win_rate_scores.append(selected_lc_score)

    # Per-trial gap between the selected model and the random baseline
    selected_vs_random_diffs.append(selected_score - avg_win_rate)
    selected_vs_random_lc_diffs.append(selected_lc_score - avg_lc_win_rate)

print("Standard Win Rate:")
print(f"Oracle:   {np.mean(best_win_rate_scores):.2f}")
print(f"Selected: {np.mean(selected_win_rate_scores):.2f}")
print(f"Random:   {np.mean(avg_win_rate_scores):.2f}")
print(f"Max Diff: {max(selected_vs_random_diffs):.2f}")
print()
print("Length-Controlled Win Rate:")
print(f"Oracle:   {np.mean(best_lc_win_rate_scores):.2f}")
print(f"Selected: {np.mean(selected_lc_win_rate_scores):.2f}")
print(f"Random:   {np.mean(avg_lc_win_rate_scores):.2f}")
print(f"Max Diff: {max(selected_vs_random_lc_diffs):.2f}")

Standard Win Rate:
Oracle:   55.47
Selected: 50.56
Random:   37.68
Max Diff: 22.99

Length-Controlled Win Rate:
Oracle:   49.42
Selected: 47.42
Random:   40.25
Max Diff: 13.34


# MixInstruct