In [89]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
import matplotlib.pyplot as plt
from package.RankAMIP.logistic import run_logistic_regression
from package.RankAMIP.data_script import make_BT_design_matrix
from package.RankAMIP.logistic import LogisticAMIP
from package.RankAMIP.logistic import find_closest_matchups
from package.RankAMIP.logistic import isRankingRobust
from package.RankAMIP.data_script import *

### How Robust is the Multi-turn Benchmark to Data-Dropping?

The MT-Bench (Multi-Turn Benchmark) is a curated set of 80 multi-turn dialogue prompts designed to evaluate the conversational and instruction-following capabilities of large language models (LLMs). Each prompt simulates realistic, multi-turn interactions that test a model's ability to maintain context, reason logically, and provide coherent responses across various domains, including general knowledge, reasoning, programming, and open-ended tasks.

### Load Data

In [130]:
# Import datasets from 
# https://huggingface.co/datasets/lmsys/mt_bench_human_judgments
from datasets import load_dataset
ds = load_dataset("lmsys/mt_bench_human_judgments")

In [None]:
# inspect the available splits
print(ds)  
# MT-bench has both human and a gpt4-judge data.
gpt4_pair = ds["gpt4_pair"] 
human = ds["human"] 
# look at the first example
print(gpt4_pair[0])

DatasetDict({
    gpt4_pair: Dataset({
        features: ['question_id', 'model_a', 'model_b', 'winner', 'judge', 'conversation_a', 'conversation_b', 'turn'],
        num_rows: 2400
    })
    human: Dataset({
        features: ['question_id', 'model_a', 'model_b', 'winner', 'judge', 'conversation_a', 'conversation_b', 'turn'],
        num_rows: 3355
    })
})
{'question_id': 81, 'model_a': 'alpaca-13b', 'model_b': 'claude-v1', 'winner': 'model_b', 'judge': 'gpt4_pair', 'conversation_a': [{'content': 'Compose an engaging travel blog post about a recent trip to Hawaii, highlighting cultural experiences and must-see attractions.', 'role': 'user'}, {'content': 'I recently had the pleasure of visiting Hawaii and it quickly became one of my favorite places. From the stunning beaches to the lush mountains, this place has it all. The people are incredibly friendly and the culture is alive and well. One of the highlights of my trip was visiting the Polynesian Cultural Center. Here, I was able 

In [132]:
df = human.to_pandas()
df.shape

(3355, 8)

In [133]:
# create a column winner_model_a, which is 1 if model_a is preferred, 0 if model_b is preferred
df['winner_model_a'] = df['winner'].apply(lambda x: 1 if x == 'model_a' else 0)
# create a column called winner_tie that is 1 if the winner is 'tie', else 0
df['winner_tie'] = df['winner'].apply(lambda x: 1 if x == 'tie' else 0)
df.head()

Unnamed: 0,question_id,model_a,model_b,winner,judge,conversation_a,conversation_b,turn,winner_model_a,winner_tie
0,81,alpaca-13b,gpt-3.5-turbo,model_b,author_2,[{'content': 'Compose an engaging travel blog ...,[{'content': 'Compose an engaging travel blog ...,1,0,0
1,81,alpaca-13b,gpt-3.5-turbo,model_b,author_2,[{'content': 'Compose an engaging travel blog ...,[{'content': 'Compose an engaging travel blog ...,2,0,0
2,81,alpaca-13b,gpt-3.5-turbo,model_b,expert_17,[{'content': 'Compose an engaging travel blog ...,[{'content': 'Compose an engaging travel blog ...,1,0,0
3,81,alpaca-13b,gpt-3.5-turbo,model_b,expert_17,[{'content': 'Compose an engaging travel blog ...,[{'content': 'Compose an engaging travel blog ...,2,0,0
4,81,alpaca-13b,vicuna-13b-v1.2,model_b,expert_0,[{'content': 'Compose an engaging travel blog ...,[{'content': 'Compose an engaging travel blog ...,1,0,0


In [134]:
ties = df[df['winner_tie'] == 1]
print(f"Number of ties: {len(ties)}")
# proportion of ties.
print(f"Proportion of ties: {len(ties) / len(df):.2%}")
# note, the proportion of ties is 9.17% for the LLM-as-judge data and 23.25% for the human-as-judge data.

Number of ties: 780
Proportion of ties: 23.25%


In [135]:
# drop rows in df with df['winner_tie'] == 1
df_ties_dropped = df[df['winner_tie'] == 0]
df_ties_dropped.shape
rawBT_noTies = df_ties_dropped[['model_a', 'model_b', 'winner_model_a']]
rawBT_noTies.head() # (2575, 3)
rawBT_noTies.shape

(2575, 3)

In [136]:
# how to get the unique names in both columns
model_a_names = df['model_a'].unique()
model_b_names = df['model_b'].unique()
# combine the two arrays and get the unique names
model_names = np.unique(np.concatenate((model_a_names, model_b_names)))
# print the number of unique model names
print(f"Number of unique model names: {len(model_names)}")

Number of unique model names: 6


In [137]:
for model in model_names:
    filtered = df_ties_dropped[
        (df_ties_dropped['model_a'] == model) | 
        (df_ties_dropped['model_b'] == model)
    ]
    print(f"{model}: {filtered.shape[0]}")

alpaca-13b: 788
claude-v1: 761
gpt-3.5-turbo: 1081
gpt-4: 807
llama-13b: 901
vicuna-13b-v1.2: 812


In [None]:
# make the BT design matrix.
X, y, player_to_id = make_BT_design_matrix(rawBT_noTies)
X.shape, y.shape

((2575, 5), (2575,))

#### Run Top-k Robustness Check.

In [None]:
ks = [5]
results = {}
for k in ks:
    alphaN = 1
    chatbotA = -1
    while chatbotA == -1:
        chatbotA, chatbotB, chatbotOriginalBetaDiff, chatNewBetaDiff, chatIndices = isRankingRobust(k, alphaN, X, y)
        results[(k, alphaN)] = (chatbotA, chatbotB, chatbotOriginalBetaDiff, chatNewBetaDiff, chatIndices)
        alphaN += 1

testing new matchup:  4 None
testing new matchup:  3 4
testing new matchup:  1 4
testing new matchup:  0 4
testing new matchup:  2 4
testing new matchup:  4 None
testing new matchup:  3 4
testing new matchup:  1 4
testing new matchup:  0 4
testing new matchup:  2 4
testing new matchup:  4 None
testing new matchup:  3 4
testing new matchup:  1 4
testing new matchup:  0 4
testing new matchup:  2 4
testing new matchup:  4 None
testing new matchup:  3 4
testing new matchup:  1 4
testing new matchup:  0 4
testing new matchup:  2 4
testing new matchup:  4 None
testing new matchup:  3 4
testing new matchup:  1 4
testing new matchup:  0 4
testing new matchup:  2 4
testing new matchup:  4 None
testing new matchup:  3 4
testing new matchup:  1 4
testing new matchup:  0 4
testing new matchup:  2 4
testing new matchup:  4 None
testing new matchup:  3 4
testing new matchup:  1 4
testing new matchup:  0 4
testing new matchup:  2 4
testing new matchup:  4 None
testing new matchup:  3 4
testing new ma

In [140]:
# find the (k, alpha N) pairs that are non-robust.
results_nonrobust = {k: v for k, v in results.items() if v[0] != -1}
results_nonrobust

{(5, 111): (4,
  None,
  -1.0035212194110734,
  0.013504736635853335,
  array([ 917, 2446,  235,  397,   85,   84,  878, 1498, 1243,  685,  686,
         1298, 1105,  866,  547,  908,  250,  678,  679,  680,  681,  885,
          906, 1335,  259, 1307,  423, 1796, 1809, 1808,  381,  322,  697,
          683,  684,  696, 1583,  873,  907,  911, 1109,  321, 1765,   37,
          141, 2173,  115,   38, 2168,   36,  143,  142, 2169,  775,  774,
         2461, 2462, 1598, 1486, 2464, 1487, 1597, 1518, 1573,  129,  777,
          776,  856, 1574,  764, 1976,  106,  868,  273, 2431, 1246, 1245,
         1300, 1301,   25, 1106, 2189, 1321, 1073, 1337, 2409, 1485, 2188,
         2187, 2561, 2186, 2407, 2406, 1599, 2480, 2481,  548,  538, 2129,
         1799, 2102, 2014,  237, 2013, 1895,  327,  326, 2012, 2011,  305,
          304]))}

In [None]:
for model in model_names:
    filtered = df_ties_dropped[
        (df_ties_dropped['model_a'] == model) | 
        (df_ties_dropped['model_b'] == model)
    ]
    print(f"{model}: {filtered.shape[0]}")

alpaca-13b: 788
claude-v1: 761
gpt-3.5-turbo: 1081
gpt-4: 807
llama-13b: 901
vicuna-13b-v1.2: 812


In [None]:
from package.RankAMIP.plot_util import *
rankings = return_rankings_list(X, y, results, 1, 27, player_to_id)

<module 'package.RankAMIP.plot_util' from '/Users/JennyH/Desktop/IsRankingRobust/package/RankAMIP/plot_util.py'>

In [None]:
# plot the rankings on the original arena
filename_to_save = 'fig/top6_mtb_llm.png'
plot_title = 'Model Rankings in MT-Bench'
plot_bt_scores(X, y, rankings, alphaN, 6, plot_title, filename_to_save)

MT-Bench is robust to small-fraction $<1\%$ data-dropping.

In [None]:
# 83 evals were dropped to change models 2 and 0.
# player_to_id # (2 + 1 (adding in the none index): 'gpt-4', 1: 'claude-v1')
# load in mt_bench_llm_noTies.csv
mt_bench_llm_noTies = pd.read_csv("data/mt_bench_llm_noTies.csv")
mt_bench_llm_noTies.shape

(2180, 3)

#### Below, we inspect the ranking flip between the first- and second-place models.

In [None]:
# find the two models that changed ranking.
## Count number of games between the first- and second-place models played in total.
is_gpt_claude = (
    (mt_bench_llm_noTies['model_a'].str.contains('gpt-4') & mt_bench_llm_noTies['model_b'].str.contains('claude-v1')) |
    (mt_bench_llm_noTies['model_a'].str.contains('claude-v1') & mt_bench_llm_noTies['model_b'].str.contains('gpt-4'))
)

num_gpt_claude = mt_bench_llm_noTies[is_gpt_claude].shape[0]
print("Number of games between 'gpt-4' and 'claude-v1': ", num_gpt_claude)

Number of games between 'gpt-4' and 'claude-v1':  145


In [None]:
# model pairs
mt_bench_llm_noTies['model_pair'] = mt_bench_llm_noTies.apply(lambda row: tuple(sorted([row['model_a'], row['model_b']])), axis=1)
mt_bench_llm_noTies['model_pair']

0                (alpaca-13b, claude-v1)
1                (alpaca-13b, claude-v1)
2            (alpaca-13b, gpt-3.5-turbo)
3            (alpaca-13b, gpt-3.5-turbo)
4                    (alpaca-13b, gpt-4)
                      ...               
2175        (claude-v1, vicuna-13b-v1.2)
2176    (gpt-3.5-turbo, vicuna-13b-v1.2)
2177    (gpt-3.5-turbo, vicuna-13b-v1.2)
2178            (gpt-4, vicuna-13b-v1.2)
2179            (gpt-4, vicuna-13b-v1.2)
Name: model_pair, Length: 2180, dtype: object

In [None]:
# Compute average games per model pair across the arena.
pair_counts = mt_bench_llm_noTies['model_pair'].value_counts()
average_games_per_pair = pair_counts.mean()
print("Average number of games per model pair:", average_games_per_pair)

Average number of games per model pair: 145.33333333333334


#### Find the win margin between 'gpt-4-0125-preview' and 'gpt-4-1106-preview'


In [129]:
# that is, find all games that are between the two models.
dfFlippedRanking = mt_bench_llm_noTies[is_gpt_claude]
# find the win margin
## Count number of games between that 'gpt-4' won.
gpt4_wins = (
    ((dfFlippedRanking['model_a']=='gpt-4') & (dfFlippedRanking['winner_model_a'] == 1)) |
    ((dfFlippedRanking['model_b']=='gpt-4') & (dfFlippedRanking['winner_model_a'] == 0))
)
num_gpt4_wins = dfFlippedRanking[gpt4_wins].shape[0]
print("Proportion of games that GPT-4 won: ", num_gpt4_wins / dfFlippedRanking.shape[0])

Proportion of games that GPT-4 won:  0.5172413793103449


In [128]:
claude_wins = (
    ((dfFlippedRanking['model_a'] == 'claude-v1') & (dfFlippedRanking['winner_model_a'] == 1)) |
    ((dfFlippedRanking['model_b'] == 'claude-v1') & (dfFlippedRanking['winner_model_a'] == 0))
)
num_claude_wins = dfFlippedRanking[claude_wins].shape[0]
print("Proportion of games that claude won: ", num_claude_wins / dfFlippedRanking.shape[0])

Proportion of games that claude won:  0.4827586206896552


In [72]:
claude_wins = (
    (dfFlippedRanking['model_a'].str.contains('claude-v1') & dfFlippedRanking['winner_model_a'] == 1) |
    (dfFlippedRanking['model_b'].str.contains('claude-v1') & dfFlippedRanking['winner_model_a'] == 0)
)
num_claude_wins = dfFlippedRanking[claude_wins].shape[0]
print("Proportion of games that GPT-4-0125 won: ", num_claude_wins / dfFlippedRanking.shape[0])

Proportion of games that GPT-4-0125 won:  0.6413793103448275


In [73]:
num_gpt4_wins, num_claude_wins

(121, 93)