In [19]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
import matplotlib.pyplot as plt
from package.RankAMIP.logistic import run_logistic_regression
from package.RankAMIP.data_script import make_BT_design_matrix
from package.RankAMIP.logistic import LogisticAMIP
from package.RankAMIP.logistic import find_closest_matchups
from package.RankAMIP.logistic import isRankingRobust
from package.RankAMIP.data_script import *

### Is Multi-turn Benchmark Chatbot Arena Data-Dropping Robust?

The MT-Bench (Multi-Turn Benchmark) is a curated set of 80 multi-turn dialogue prompts designed to evaluate the conversational and instruction-following capabilities of large language models (LLMs). Each prompt simulates realistic, multi-turn interactions that test a model's ability to maintain context, reason logically, and provide coherent responses across various domains, including general knowledge, reasoning, programming, and open-ended tasks.

### Data Pre-processing

In [2]:
# Load the MT-Bench judgment dataset from the Hugging Face Hub:
# https://huggingface.co/datasets/lmsys/mt_bench_human_judgments
# The recorded output shows the parquet files being downloaded, so the first
# run needs network access.
from datasets import load_dataset
ds = load_dataset("lmsys/mt_bench_human_judgments")

README.md:   0%|          | 0.00/2.00k [00:00<?, ?B/s]

(…)-00000-of-00001-c0b431264a82ddc0.parquet:   0%|          | 0.00/650k [00:00<?, ?B/s]

(…)-00000-of-00001-25f4910818759289.parquet:   0%|          | 0.00/739k [00:00<?, ?B/s]

Generating gpt4_pair split:   0%|          | 0/2400 [00:00<?, ? examples/s]

Generating human split:   0%|          | 0/3355 [00:00<?, ? examples/s]

In [None]:
# Inspect the available splits.
print(ds)
# Keep a handle on each split by name (the dataset has no 'train' split;
# the recorded output lists 'gpt4_pair' and 'human').
gpt4_pair = ds["gpt4_pair"]
human = ds["human"]
# Look at the first example of the gpt4_pair split.
print(gpt4_pair[0])

DatasetDict({
    gpt4_pair: Dataset({
        features: ['question_id', 'model_a', 'model_b', 'winner', 'judge', 'conversation_a', 'conversation_b', 'turn'],
        num_rows: 2400
    })
    human: Dataset({
        features: ['question_id', 'model_a', 'model_b', 'winner', 'judge', 'conversation_a', 'conversation_b', 'turn'],
        num_rows: 3355
    })
})
{'question_id': 81, 'model_a': 'alpaca-13b', 'model_b': 'gpt-3.5-turbo', 'winner': 'model_b', 'judge': 'author_2', 'conversation_a': [{'content': 'Compose an engaging travel blog post about a recent trip to Hawaii, highlighting cultural experiences and must-see attractions.', 'role': 'user'}, {'content': 'I recently had the pleasure of visiting Hawaii and it quickly became one of my favorite places. From the stunning beaches to the lush mountains, this place has it all. The people are incredibly friendly and the culture is alive and well. One of the highlights of my trip was visiting the Polynesian Cultural Center. Here, I was ab

In [87]:
# Convert the gpt4_pair split to a pandas DataFrame; every later cell in this
# notebook works on `df`, so the analysis below is for this split only.
df = gpt4_pair.to_pandas()
df.head()

Unnamed: 0,question_id,model_a,model_b,winner,judge,conversation_a,conversation_b,turn
0,81,alpaca-13b,claude-v1,model_b,gpt4_pair,[{'content': 'Compose an engaging travel blog ...,[{'content': 'Compose an engaging travel blog ...,1
1,81,alpaca-13b,claude-v1,model_b,gpt4_pair,[{'content': 'Compose an engaging travel blog ...,[{'content': 'Compose an engaging travel blog ...,2
2,81,alpaca-13b,gpt-3.5-turbo,model_b,gpt4_pair,[{'content': 'Compose an engaging travel blog ...,[{'content': 'Compose an engaging travel blog ...,1
3,81,alpaca-13b,gpt-3.5-turbo,model_b,gpt4_pair,[{'content': 'Compose an engaging travel blog ...,[{'content': 'Compose an engaging travel blog ...,2
4,81,alpaca-13b,gpt-4,model_b,gpt4_pair,[{'content': 'Compose an engaging travel blog ...,[{'content': 'Compose an engaging travel blog ...,1


In [90]:
# Binary outcome indicators derived from the `winner` label.
# winner_model_a: 1 when the label is 'model_a', 0 for anything else
# (model_b wins and ties both map to 0 here).
df['winner_model_a'] = df['winner'].eq('model_a').astype('int64')
# winner_tie: 1 when the label is 'tie', 0 otherwise.
df['winner_tie'] = df['winner'].eq('tie').astype('int64')
df.head()

Unnamed: 0,question_id,model_a,model_b,winner,judge,conversation_a,conversation_b,turn,winner_model_a,winner_tie
0,81,alpaca-13b,claude-v1,model_b,gpt4_pair,[{'content': 'Compose an engaging travel blog ...,[{'content': 'Compose an engaging travel blog ...,1,0,0
1,81,alpaca-13b,claude-v1,model_b,gpt4_pair,[{'content': 'Compose an engaging travel blog ...,[{'content': 'Compose an engaging travel blog ...,2,0,0
2,81,alpaca-13b,gpt-3.5-turbo,model_b,gpt4_pair,[{'content': 'Compose an engaging travel blog ...,[{'content': 'Compose an engaging travel blog ...,1,0,0
3,81,alpaca-13b,gpt-3.5-turbo,model_b,gpt4_pair,[{'content': 'Compose an engaging travel blog ...,[{'content': 'Compose an engaging travel blog ...,2,0,0
4,81,alpaca-13b,gpt-4,model_b,gpt4_pair,[{'content': 'Compose an engaging travel blog ...,[{'content': 'Compose an engaging travel blog ...,1,0,0


In [91]:
# Rows whose outcome was recorded as a tie.
ties = df[df['winner_tie'] == 1]
print(f"Number of ties: {len(ties)}")
# Proportion of ties among all comparisons.
print(f"Proportion of ties: {len(ties) / len(df):.2%}")
# Note: the proportion of ties is 9.17% for the LLM-as-judge (gpt4_pair) data.
# TODO: the original note was cut off here; add the corresponding figure for
# the other split it was meant to compare against.

Number of ties: 220
Proportion of ties: 9.17%


In [92]:
# Distinct model names appearing on each side of the comparisons.
model_a_names = df['model_a'].unique()
model_b_names = df['model_b'].unique()
# Sorted union of the two sets of names (each model listed once).
model_names = np.union1d(model_a_names, model_b_names)
# Report how many distinct models take part in the comparisons.
print(f"Number of unique model names: {len(model_names)}")

Number of unique model names: 6


In [93]:
def break_ties_randomly(df, seed=6):
    """Return a copy of ``df`` in which every tie is assigned a random winner.

    Rows with ``winner_tie == 1`` get ``winner_model_a`` drawn uniformly from
    {0, 1} and ``winner_model_b`` set to the complement. Rows that are not
    ties keep their ``winner_model_a`` value.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the integer columns ``winner_model_a`` and
        ``winner_tie``. It is NOT modified; a copy is returned.
    seed : int, optional
        Seed for the tie-breaking draws. The draws are identical to those of
        ``np.random.seed(seed); np.random.choice([0, 1], size=n_ties)``.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with ties broken. ``winner_model_b`` is populated on
        every row: if the column did not exist it is initialised to
        ``1 - winner_model_a`` instead of being left as NaN on non-tie rows.
    """
    # Work on a copy so repeated calls with different seeds never alter the
    # caller's frame (the `winner_tie` markers must survive for the next call).
    out = df.copy()
    tie_mask = out['winner_tie'] == 1

    # Private generator: same stream as the legacy global seeding, but it does
    # not reset NumPy's global random state for the rest of the notebook.
    rng = np.random.RandomState(seed)
    # 1 means assign the tie to model_a, 0 means assign it to model_b.
    assign_to_a = rng.choice([0, 1], size=int(tie_mask.sum()))

    # Make sure the complementary indicator exists for all rows, not only ties.
    if 'winner_model_b' not in out.columns:
        out['winner_model_b'] = 1 - out['winner_model_a']

    if tie_mask.any():
        # Boolean-mask assignment follows row order, matching the draw order.
        out.loc[tie_mask, 'winner_model_a'] = assign_to_a
        out.loc[tie_mask, 'winner_model_b'] = 1 - assign_to_a

    return out

In [None]:
# Break the ties once with a fixed seed and check the shape of the result.
df_no_ties = break_ties_randomly(df,seed=49)
df_no_ties.shape

(2400, 11)

In [95]:
# Keep only what the design-matrix helper is given below: the two models in
# each comparison and the binary outcome for model_a.
rawBT = df_no_ties[['model_a', 'model_b', 'winner_model_a']]
rawBT.head()

Unnamed: 0,model_a,model_b,winner_model_a
0,alpaca-13b,claude-v1,0
1,alpaca-13b,claude-v1,0
2,alpaca-13b,gpt-3.5-turbo,0
3,alpaca-13b,gpt-3.5-turbo,0
4,alpaca-13b,gpt-4,0


In [96]:
# Build the design matrix, the outcome vector and the model-to-id mapping.
# NOTE(review): in the recorded output X has 5 columns for 6 models, so one
# model is presumably the reference level; confirm in make_BT_design_matrix.
X, y, player_to_id = make_BT_design_matrix(rawBT)
X.shape, y.shape

((2400, 5), (2400,))

#### Robustness results are dependent on how ties are broken 
Changing the random seed used for tie-breaking changes the robustness results.

Alternatively, we could simply exclude the games that resulted in ties.

In [122]:
# Sensitivity of the robustness check to tie-breaking: for each seed, re-break
# the ties, rebuild the design matrix and re-run isRankingRobust. The per-seed
# results are collected in `rs_results` (one entry per seed, in seed order).
rs_results = []
k = 1
# NOTE(review): the original comment described this as "all models, 1% of the
# data", but 50 of the 2400 rows is about 2%. Confirm the intended value.
alphaN = 50
for rand_seed in range(200):
    df_no_ties = break_ties_randomly(df, seed=rand_seed)

    # Columns consumed by the design-matrix helper.
    rawBT = df_no_ties[['model_a', 'model_b', 'winner_model_a']]
    X, y, player_to_id = make_BT_design_matrix(rawBT)

    # Run the robustness check for this tie-breaking seed.
    results_curr_seed = isRankingRobust(k, alphaN, X, y)
    rs_results.append(results_curr_seed)

testing new matchup:  1 4
testing new matchup:  1 4
testing new matchup:  1 4
testing new matchup:  1 4
testing new matchup:  1 4
testing new matchup:  1 4
testing new matchup:  1 4
testing new matchup:  1 4
testing new matchup:  1 4
testing new matchup:  1 4
testing new matchup:  1 4
testing new matchup:  1 4
testing new matchup:  1 4
testing new matchup:  1 4
testing new matchup:  1 4
testing new matchup:  1 4
testing new matchup:  1 4
testing new matchup:  1 4
testing new matchup:  1 4
testing new matchup:  1 4
testing new matchup:  1 0
testing new matchup:  1 3
testing new matchup:  1 None
testing new matchup:  1 2
testing new matchup:  1 4
testing new matchup:  1 4
testing new matchup:  1 4
testing new matchup:  1 4
testing new matchup:  1 4
testing new matchup:  1 4
testing new matchup:  1 4
testing new matchup:  1 4
testing new matchup:  1 4
testing new matchup:  1 4
testing new matchup:  1 4
testing new matchup:  1 4
testing new matchup:  1 4
testing new matchup:  1 4
testing n

In [126]:
# Result tuple this cell treats as "ranking stayed robust"; any seed whose
# result differs from it is counted as non-robust.
ROBUST_SENTINEL = (-1, -1, -1, -1, [-1])
# Seeds (positions in rs_results) whose result differs from the sentinel.
not_robust_indices = [i for i, result in enumerate(rs_results) if result != ROBUST_SENTINEL]
# Report the count the seed sweep was run for, without dumping every result.
print(f"Non-robust tie-breaking seeds: {len(not_robust_indices)} / {len(rs_results)}")

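This arena seems robust to dropping 1% of the data. Perhaps it is because the human annotators are 'experts'? (Note: as written, `df` is built from the `gpt4_pair` split, i.e. GPT-4 judgments, so confirm which split this remark refers to.)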

In [127]:
# Re-run the pipeline for one tie-breaking seed (2) and keep the full result,
# so the cells below can rank and plot it.
df_no_ties = break_ties_randomly(df, seed=2)

# Columns consumed by the design-matrix helper.
rawBT = df_no_ties[['model_a', 'model_b', 'winner_model_a']]
X, y, player_to_id = make_BT_design_matrix(rawBT)

# Run the robustness check. Going by the names, the five values are the two
# models tested, their score difference before and after dropping data, and
# the dropped row indices. TODO: confirm against isRankingRobust.
(chatbotA, chatbotB, chatbotOriginalBetaDiff,
 chatNewBetaDiff, chatIndices) = isRankingRobust(k, alphaN, X, y)
# Results are keyed by the (k, alphaN) setting that produced them.
results = {
    (k, alphaN): (chatbotA, chatbotB, chatbotOriginalBetaDiff, chatNewBetaDiff, chatIndices),
}

testing new matchup:  1 4


In [128]:
# Show the stored robustness result, keyed by (k, alphaN).
results

{(1, 50): (1,
  4,
  0.34440709377443257,
  -0.07839674209253555,
  array([1306,  617,  587,  646, 1126,  796,  826,  917,  736,  916,  856,
         1290,  720,  780,  570,  631,  630,  571, 1741,  853,  553,  192,
          193, 2022, 2023,  552, 1363,   72, 1362,  732, 1303, 2052, 2053,
         1302, 1752,  733, 1722,  703, 1422, 1423,  613,  103,  102, 1812,
         1602, 1872, 1873,  583, 1692,  582]))}

In [140]:
# Pick up local edits to plot_util without restarting the kernel.
import importlib
import package.RankAMIP.plot_util
# Reload FIRST, then re-run the wildcard import: importlib.reload does not
# rebind names that were already copied into the notebook namespace by
# `from ... import *`, so importing before the reload keeps the stale
# definitions (which is why this cell previously had to be run twice).
importlib.reload(package.RankAMIP.plot_util)
from package.RankAMIP.plot_util import *

<module 'package.RankAMIP.plot_util' from '/Users/JennyH/Desktop/IsRankingRobust/package/RankAMIP/plot_util.py'>

In [141]:
from package.RankAMIP.plot_util import *
# Build the ranking table for the result stored in `results`. Pass the same
# `k` and `alphaN` that key `results` instead of repeating the literals 1 and
# 50, so this cell cannot drift out of sync if either setting is changed.
rankings = return_rankings_list(X, y, results, k, alphaN, player_to_id)
rankings

[['gpt-4', 2, 4.049409656077162, 4.354695978917397],
 ['claude-v1', 5, 3.7050025623027296, 4.43309272100994],
 ['gpt-3.5-turbo', 1, 2.523829859145824, 2.875793406431142],
 ['vicuna-13b-v1.2', 4, 1.4534455170535812, 1.6929604791958186],
 ['alpaca-13b', 0, 0.0, 0.0],
 ['llama-13b', 3, -0.8244371999198913, -0.9316622576088127]]

In [142]:
### Plot Results.
# Output path is relative to the notebook's working directory.
filename_to_save = 'fig/mtbench_llm.png'
plot_title = 'Model Rankings on Multi-turn Chatbot Arena (GPT4 Judge)'
# NOTE(review): the literal 10 is an undocumented positional argument of
# plot_bt_scores; confirm its meaning and consider giving it a name.
plot_bt_scores(X, y, rankings, alphaN, 10, plot_title, filename_to_save)