In [1]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
import matplotlib.pyplot as plt
from package.RankAMIP.logistic import run_logistic_regression
from package.RankAMIP.data_script import make_BT_design_matrix
from package.RankAMIP.logistic import LogisticAMIP
from package.RankAMIP.logistic import find_closest_matchups
from package.RankAMIP.logistic import isRankingRobust

### Is LLM-as-the-Judge Chatbot Arena Data-Dropping Robust?

The potsawee/chatbot-arena-llm-judges dataset is a collection of evaluations where large language models (LLMs) act as judges, assessing the quality of responses generated by other LLMs.

Studies have shown that strong LLM judges, like GPT-4, can align with human preferences over 80% of the time, suggesting the viability of this approach for large-scale evaluations.

### Data Pre-processing

In [2]:
# Load the LLM-as-judge Chatbot Arena dataset from the Hugging Face Hub:
# https://huggingface.co/datasets/potsawee/chatbot-arena-llm-judges
from datasets import load_dataset
# `ds` is a DatasetDict (see the printout in the next cell).
ds = load_dataset("potsawee/chatbot-arena-llm-judges")

Repo card metadata block was not found. Setting CardData to empty.


In [3]:
# Inspect the available splits and their features.
print(ds)  
# Grab the 'train' split (the only split listed in the printout of `ds`).
train = ds["train"]
# Look at the first example; each record is a dict keyed by feature name.
print(train[0])

DatasetDict({
    train: Dataset({
        features: ['id', 'model_a', 'model_b', 'winner_model_a', 'winner_model_b', 'winner_tie', 'question', 'answer_a', 'answer_b'],
        num_rows: 49938
    })
})
{'id': 65089, 'model_a': 'gpt-3.5-turbo-0613', 'model_b': 'mistral-medium', 'winner_model_a': 0, 'winner_model_b': 0, 'winner_tie': 1, 'question': 'explain function calling. how would you call a function?', 'answer_a': 'Function calling is the process of invoking or executing a function in a programming language. To call a function, you need to use the function\'s name followed by parentheses "()". Within the parentheses, you can pass arguments if the function requires any.\n\nHere is an example of how to call a function in Python:\n\n```python\n# Define a function\ndef greet(name):\n    print("Hello, " + name + "!")\n\n# Call the function\ngreet("John")\n```\n\nIn this example, the function "greet" is defined with a single parameter "name" which is used to print a greeting message. To 

In [4]:
# Convert the 'train' split to a pandas DataFrame; `df` is the working frame
# that the following cells inspect and modify.
df = train.to_pandas()
df.head()

Unnamed: 0,id,model_a,model_b,winner_model_a,winner_model_b,winner_tie,question,answer_a,answer_b
0,65089,gpt-3.5-turbo-0613,mistral-medium,0,0,1,explain function calling. how would you call a...,Function calling is the process of invoking or...,Function calling is the process of invoking a ...
1,96401,llama-2-13b-chat,mistral-7b-instruct,1,0,0,How can I create a test set for a very rare ca...,Creating a test set for a very rare category c...,When building a classifier for a very rare cat...
2,198779,koala-13b,gpt-3.5-turbo-0314,0,1,0,What is the best way to travel from Tel-Aviv t...,The best way to travel from Tel Aviv to Jerusa...,The best way to travel from Tel-Aviv to Jerusa...
3,292873,vicuna-13b,gpt-4-0314,0,1,0,"Construct a rap battle, in the style of Epic R...","[Zeus]\nYo, it's the king of the gods on the m...","(Verse 1 - Zeus)\n\nI'm the king of the gods, ..."
4,313413,mixtral-8x7b-instruct-v0.1,vicuna-13b,1,0,0,Why water is not used in bath tub?,Water is actually used in a bath tub. A bath t...,Water is not used in a bath tub because it is ...


In [5]:
# Distinct model names on each side of a matchup.
model_a_names, model_b_names = (
    df[side].unique() for side in ('model_a', 'model_b')
)
# Sorted union of the two sides, i.e. every model that appears anywhere.
model_names = np.union1d(model_a_names, model_b_names)
# Report how many distinct models take part.
print(f"Number of unique model names: {len(model_names)}")

Number of unique model names: 64


In [6]:
# Matchups that were scored as a tie (winner_tie == 1).
ties = df.loc[df['winner_tie'].eq(1)]
n_ties = len(ties)
print(f"Number of ties: {n_ties}")
# Share of all matchups that ended in a tie.
print(f"Proportion of ties: {n_ties / len(df):.2%}")

Number of ties: 15641
Proportion of ties: 31.32%


There are 15641 ties in the data set.

In [7]:
def break_ties_randomly(df, seed):
    """Resolve tied matchups by assigning each one a random winner, in place.

    Every row with ``winner_tie == 1`` gets exactly one of ``winner_model_a`` /
    ``winner_model_b`` set to 1 and the other set to 0, chosen uniformly at
    random. Rows that are not ties are left untouched. The ``winner_tie``
    column itself is never modified, so calling the function again with a
    different seed re-breaks the same set of ties.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``winner_tie``, ``winner_model_a`` and
        ``winner_model_b``. **Mutated in place** -- pass ``df.copy()`` if the
        original frame must be preserved.
    seed : int
        Seed passed to ``np.random.seed``. Note that this resets NumPy's
        global legacy random state as a side effect.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in (not a copy).
    """
    np.random.seed(seed)
    # Positional boolean mask, so the writes below do not depend on the index
    # labels being unique.
    tie_mask = (df['winner_tie'] == 1).to_numpy()
    # Random choices: 1 means assign to model_a, 0 to model_b
    assign_to_a = np.random.choice([0, 1], size=int(tie_mask.sum()))

    if tie_mask.any():
        # Vectorized write instead of a per-row Python loop over df.at.
        df.loc[tie_mask, 'winner_model_a'] = assign_to_a
        df.loc[tie_mask, 'winner_model_b'] = 1 - assign_to_a

    return df

In [8]:
# Break every tie with a fixed seed. break_ties_randomly writes into the frame
# it is given, so `df` and `df_no_ties` refer to the same, updated object.
df_no_ties = break_ties_randomly(df, seed=0)
# Keep only the columns used to build the BT design matrix:
# the two models and whether model_a won.
bt_columns = ['model_a', 'model_b', 'winner_model_a']
rawBT = df_no_ties[bt_columns]
rawBT.head()

Unnamed: 0,model_a,model_b,winner_model_a
0,gpt-3.5-turbo-0613,mistral-medium,0
1,llama-2-13b-chat,mistral-7b-instruct,1
2,koala-13b,gpt-3.5-turbo-0314,0
3,vicuna-13b,gpt-4-0314,0
4,mixtral-8x7b-instruct-v0.1,vicuna-13b,1


In [13]:
### Move 'gpt-4-1106-preview' to index 0.
# Put every matchup involving the reference model at the top of the frame
# (original order is kept within each group), then renumber the rows.
reference_model = 'gpt-4-1106-preview'
mask = df[['model_a', 'model_b']].eq(reference_model).any(axis=1)
df = pd.concat([df.loc[mask], df.loc[~mask]]).reset_index(drop=True)
# Columns needed for the BT design matrix, taken from the reordered frame.
new_rawBT = df[['model_a', 'model_b', 'winner_model_a']]
new_rawBT.head()

Unnamed: 0,model_a,model_b,winner_model_a
0,gpt-4-1106-preview,wizardlm-70b,1
1,claude-1,gpt-4-1106-preview,1
2,deepseek-llm-67b-chat,gpt-4-1106-preview,1
3,gpt-4-1106-preview,claude-2.1,1
4,gpt-4-1106-preview,gpt-4-0613,0


In [15]:
# Assign integer ids by order of first appearance, scanning the whole first
# column (model_a) and then the whole second column (model_b).
first_col, second_col = new_rawBT.iloc[:, 0], new_rawBT.iloc[:, 1]
all_players = pd.concat([first_col, second_col])
unique_players = all_players.unique()
player_to_id = dict(zip(unique_players, range(len(unique_players))))
player_to_id

{'gpt-4-1106-preview': 0,
 'claude-1': 1,
 'deepseek-llm-67b-chat': 2,
 'gpt-4-0613': 3,
 'gpt-3.5-turbo-0613': 4,
 'palm-2': 5,
 'gemini-pro': 6,
 'claude-instant-1': 7,
 'claude-2.1': 8,
 'gpt-3.5-turbo-1106': 9,
 'mistral-medium': 10,
 'mixtral-8x7b-instruct-v0.1': 11,
 'llama2-70b-steerlm-chat': 12,
 'claude-2.0': 13,
 'solar-10.7b-instruct-v1.0': 14,
 'llama-2-7b-chat': 15,
 'openchat-3.5': 16,
 'gemini-pro-dev-api': 17,
 'gpt-4-0125-preview': 18,
 'stripedhyena-nous-7b': 19,
 'mistral-7b-instruct': 20,
 'tulu-2-dpo-70b': 21,
 'qwen-14b-chat': 22,
 'nous-hermes-2-mixtral-8x7b-dpo': 23,
 'codellama-34b-instruct': 24,
 'chatglm3-6b': 25,
 'llama-2-70b-chat': 26,
 'gpt-4-0314': 27,
 'llama-2-13b-chat': 28,
 'wizardlm-70b': 29,
 'vicuna-33b': 30,
 'pplx-70b-online': 31,
 'zephyr-7b-beta': 32,
 'openhermes-2.5-mistral-7b': 33,
 'pplx-7b-online': 34,
 'qwen1.5-4b-chat': 35,
 'starling-lm-7b-alpha': 36,
 'vicuna-13b': 37,
 'dolphin-2.2.1-mistral-7b': 38,
 'yi-34b-chat': 39,
 'gpt-3.5-tur

In [16]:
# make a design matrix.
# NOTE: this rebinds `player_to_id`, replacing the mapping built by hand in
# the previous cell with the one returned by make_BT_design_matrix.
X, y, player_to_id = make_BT_design_matrix(new_rawBT)
# In the recorded output X has one row per matchup and 63 columns, i.e. one
# fewer than the 64 distinct models.
X.shape, y.shape

((49938, 63), (49938,))

#### Robustness results depend on how ties are broken
Changing the random seed used to break ties changes the robustness results.

In [None]:
# Repeat the robustness check for several tie-breaking seeds.
# Each iteration works on a copy of `df` and uses loop-local names, so the
# notebook-level `df`, `X`, `y`, `player_to_id` and `results` used by the
# other cells are not overwritten by whichever seed happens to run last.
rs_results = []
for rand_seed in range(10):
    # break_ties_randomly mutates its argument, hence the explicit copy.
    df_seed = break_ties_randomly(df.copy(), seed=rand_seed)
    # select the model_a, model_b, and winner columns
    rawBT_seed = df_seed[['model_a', 'model_b', 'winner_model_a']]
    # make a design matrix.
    X_seed, y_seed, _player_ids_seed = make_BT_design_matrix(rawBT_seed)
    amip_seed = LogisticAMIP(X_seed, y_seed, fit_intercept=False, penalty=None)
    # NOTE(review): the meaning of the positional arguments (10, 8, 54) is
    # defined by LogisticAMIP.AMIP_sign_change, which is not shown in this
    # notebook -- confirm and consider naming them.
    rs_results.append(amip_seed.AMIP_sign_change(10, 8, 54))

In [26]:
# Fraction of tie-breaking seeds for which the arena is non-robust to
# dropping 10 points (first element of each per-seed result).
# NOTE(review): with 10 seeds and a boolean first element this would be a
# multiple of 0.1; the recorded 0.28 suggests the output is stale or the
# element is not a plain flag -- re-run and confirm.
non_robust_flags = [seed_result[0] for seed_result in rs_results]
sum(non_robust_flags) / len(non_robust_flags)

0.28

In [17]:
# Settings to evaluate; every (k, alphaN) combination is passed to
# isRankingRobust below.
# NOTE(review): the precise meaning of k and alphaN is defined by
# isRankingRobust (not shown here). The recorded result for alphaN = 10
# contains 10 row indices, so alphaN presumably is the number of dropped
# matchups -- confirm.
ks = [1]
alphaNs = [10]

# results maps (k, alphaN) -> the 5-tuple returned by isRankingRobust.
results = {}
for k in ks:
    for alphaN in alphaNs:
        chatbotA, chatbotB, chatbotOriginalBetaDiff, chatNewBetaDiff, chatIndices = isRankingRobust(k, alphaN, X, y)
        results[(k, alphaN)] = (chatbotA, chatbotB, chatbotOriginalBetaDiff, chatNewBetaDiff, chatIndices)
# NOTE: the loop variables `k` and `alphaN` stay defined after the loop; the
# plotting cell further down reads `alphaN`.

testing new matchup:  17 None


In [18]:
# Display the stored robustness results, keyed by (k, alphaN).
results

{(1, 10): (17,
  None,
  0.0026024924730363454,
  -0.02396859349241591,
  array([4230, 6362, 1141, 4393,  555, 6400,  583,  599, 4505, 5096]))}

In [56]:
import importlib
import package.RankAMIP.plot_util
# After editing plot_util.py, reload the module FIRST and star-import
# afterwards: names bound by `from ... import *` are not refreshed by
# importlib.reload, so importing before the reload would leave stale objects.
importlib.reload(package.RankAMIP.plot_util)
from package.RankAMIP.plot_util import *

<module 'package.RankAMIP.plot_util' from '/Users/JennyH/Desktop/IsRankingRobust/package/RankAMIP/plot_util.py'>

In [None]:
# Bring the plotting helpers (return_rankings_list, plot_bt_scores, ...) into
# the notebook namespace.
from package.RankAMIP.plot_util import *
# NOTE(review): the literals 1 and 10 match the (k, alphaN) key stored in
# `results`; presumably they select that entry -- confirm against
# return_rankings_list.
rankings = return_rankings_list(X, y, results, 1, 10, player_to_id)
# In the recorded output each entry looks like
# [model name, player id, score on full data, score after dropping].
rankings

[['gpt-4-0125-preview', 18, 0.0026024924730363454, -0.023968593492253342],
 ['gpt-4-1106-preview', 0, 0.0, 0.0],
 ['gpt-4-0314', 27, -0.44664797519705846, -0.44973806596932986],
 ['gpt-4-0613', 3, -0.5161884248635662, -0.5196444422371607],
 ['qwen1.5-72b-chat', 46, -0.521835041972785, -0.5258672359642106],
 ['mistral-medium', 10, -0.6171203954360839, -0.6209980561245061],
 ['claude-1', 1, -0.7077837419976245, -0.710994031968644],
 ['claude-2.0', 13, -0.7658476724968879, -0.7689856198288763],
 ['mixtral-8x7b-instruct-v0.1', 11, -0.7732079432536962, -0.7764696103440489],
 ['pplx-70b-online', 31, -0.8085599127047469, -0.8119145372406565],
 ['gemini-pro-dev-api', 17, -0.8102657362736976, -0.8135107100566449],
 ['gemini-pro', 6, -0.810454533058289, -0.813207788956057],
 ['starling-lm-7b-alpha', 36, -0.8151904718828286, -0.8186996566623835],
 ['gpt-3.5-turbo-0613', 4, -0.8651733305639048, -0.8686675671610002],
 ['claude-2.1', 8, -0.8715399571415868, -0.8748098207766041],
 ['wizardlm-70b', 29

In [47]:
# verify that the sign of gpt-4-0125-preview flipped.
# Unpack the stored result for (k, alphaN) = (1, 10); `indices` holds the row
# positions that were selected for dropping.
playerA, playerB, orig_out, new_out, indices = results[1, 10]
# Refit on the full data ...
model_full = run_logistic_regression(X, y)
# ... and on the data with the selected rows removed.
Xd = np.delete(X, indices, axis=0)
yd = np.delete(y, indices, axis=0)
model_d  = run_logistic_regression(Xd, yd)
# NOTE(review): playerA indexes a fitted coefficient (a column of X), not a
# player id. In the recorded outputs playerA is 17 while gpt-4-0125-preview
# has player id 18 and the same coefficient value, so the column index appears
# to be id - 1 -- confirm against make_BT_design_matrix.
model_full.coef_[0][playerA], model_d.coef_[0][playerA]

(0.0026024924730363454, -0.023968593492253342)

In [None]:
### Plot Results.
# Output path for the figure (relative to the notebook's working directory).
filename_to_save = 'fig/llm_judge.png'
plot_title = 'Model Rankings in LLM Judge Arena'
# `alphaN` is the loop variable left over from the isRankingRobust cell.
# NOTE(review): the rankings cell passes (1, 10) in the corresponding
# positions of return_rankings_list, whereas here (alphaN, 10) is passed --
# check plot_bt_scores' signature to confirm this argument order is intended.
plot_bt_scores(X, y, rankings, alphaN, 10, plot_title, filename_to_save)

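We find that the ranking of gpt-4-0125-preview and gpt-4-1106-preview flips when only 10 out of roughly 50k matches are dropped. These were the original first- and second-ranked models: they lead the rest of the field by a wide margin (the third-ranked model's score is about 0.45 lower), but are separated from each other by only about 0.003.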

### Inspect the dropped LLM-judge evaluations.

In [32]:
# Check the LLM-judge evaluations that were dropped to change the top model.
# `indices` are row positions in the reordered `df`; show the two models and
# the outcome for each dropped comparison.
mis_df = df.iloc[indices][['model_a', 'model_b', 'winner_model_a']]
mis_df
# Optionally save the dropped rows to a csv file:
# mis_df.to_csv('fig/mistral_df.csv', index=False)


Unnamed: 0,model_a,model_b,winner_model_a
4230,gpt-4-0125-preview,gpt-4-1106-preview,1
6362,gpt-4-0125-preview,gpt-4-1106-preview,1
1141,gpt-4-0125-preview,gpt-4-1106-preview,1
4393,gpt-4-0125-preview,gpt-4-1106-preview,1
555,gpt-4-0125-preview,gpt-4-1106-preview,1
6400,gpt-4-0125-preview,gpt-4-1106-preview,1
583,gpt-4-0125-preview,gpt-4-1106-preview,1
599,gpt-4-0125-preview,gpt-4-1106-preview,1
4505,gpt-4-0125-preview,gpt-4-1106-preview,1
5096,gpt-4-0125-preview,gpt-4-1106-preview,1


In all 10 of the dropped matches, gpt-4-0125-preview wins against gpt-4-1106-preview. Dropping these 10 matches is enough to demote gpt-4-0125-preview to 2nd place!

In [37]:
# MIS: prompts.
# Print the question text of the first five dropped matchups.
dropped_head = df.iloc[indices[:5]]
for prompt in dropped_head['question']:
    print(prompt)


If you were a gamer, what would your gaming PC be? 
 Who is the antagonist of the game Deus Ex?
   A: Bowser
   B: The Strogg General
   C: Anna Navarre
   D: None of the above
   E: I don't know/there's no such thing.
Make a table that compares the pros and cons of the Apple Vision Pro with the Quest 3 and the Quest Pro
I have 5 oranges today, I ate 3 oranges last week. How many oranges do I have left?
Hi, how are you?


In [39]:
# MIS: response A.
# Print model_a's answer for the first five dropped matchups.
dropped_head = df.iloc[indices[:5]]
for responseA in dropped_head['answer_a']:
    print(responseA)

If I were a gamer, and could choose the specifications for a gaming PC, I would aim for a high-performance setup that balances power, future-proofing, and cost-effectiveness. Here's an ideal configuration based on the latest technology as of early 2023:

### Processor (CPU)
- **AMD Ryzen 9 7950X** or **Intel Core i9-13900K**
  - Both of these CPUs are top-of-the-line for gaming and productivity, offering a high number of cores and threads, excellent single-core performance (important for gaming), and support for the latest technologies.

### Graphics Card (GPU)
- **NVIDIA GeForce RTX 4080** or **AMD Radeon RX 7900 XT**
  - These GPUs are among the best for gaming, offering fantastic performance at 4K resolution, ray tracing capabilities, and support for the latest graphical technologies and APIs.

### Memory (RAM)
- **32GB DDR5-5600MHz** (or higher)
  - 32GB of RAM is more than enough for gaming and ensures smooth performance even in multitasking scenarios or with memory-intensive appl

In [41]:
# MIS: response B.
# Print model_b's answer for the first five dropped matchups.
dropped_head = df.iloc[indices[:5]]
for responseB in dropped_head['answer_b']:
    print(responseB)

If I were a gamer, my ideal gaming PC would likely focus on high performance to handle the latest games at their best quality settings. Here's a hypothetical build that would make for a powerful gaming system as of my last update in early 2023:

**Processor (CPU):** AMD Ryzen 9 5900X or Intel Core i9-12900K – Both of these CPUs are top-of-the-line options for gaming and offer plenty of cores and threads for multitasking and content creation.

**Graphics Card (GPU):** NVIDIA GeForce RTX 3080 Ti or AMD Radeon RX 6900 XT – These GPUs are among the best for gaming, offering high frame rates at 4K resolution and support for ray tracing.

**Memory (RAM):** 32GB DDR4 3600MHz – This is ample RAM for gaming and will also be beneficial for any streaming or video editing tasks.

**Storage:**
- Primary SSD: 1TB NVMe PCIe 4.0 SSD for the operating system and frequently played games, which offers fast load times and quick data transfer.
- Secondary Storage: 2TB SATA SSD or a larger HDD for additiona