In [68]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
import matplotlib.pyplot as plt
from package.RankAMIP.logistic import run_logistic_regression
from package.RankAMIP.data_script import make_BT_design_matrix
from package.RankAMIP.logistic import LogisticAMIP
from package.RankAMIP.logistic import find_closest_matchups
from package.RankAMIP.logistic import isRankingRobust
from package.RankAMIP.data_script import break_ties_randomly

### Is the LLM-as-a-Judge Chatbot Arena Ranking Robust to Data Dropping?

The potsawee/chatbot-arena-llm-judges dataset is a collection of evaluations where large language models (LLMs) act as judges, assessing the quality of responses generated by other LLMs.



### Load Data

In [None]:
# Load the LLM-as-a-judge Chatbot Arena comparisons:
# https://huggingface.co/datasets/potsawee/chatbot-arena-llm-judges
# NOTE(review): this import sits outside the notebook's top import cell;
# consider moving it there so all dependencies are visible in one place.
from datasets import load_dataset
# `ds` holds whatever load_dataset returns for this dataset id (printed in the next cell).
ds = load_dataset("potsawee/chatbot-arena-llm-judges")

In [33]:
# Inspect the available splits.
print(ds)  
# Grab the 'train' split (the only split listed in the printed DatasetDict).
train = ds["train"]
# Look at the first example to see the fields of a single comparison.
print(train[0])

DatasetDict({
    train: Dataset({
        features: ['id', 'model_a', 'model_b', 'winner_model_a', 'winner_model_b', 'winner_tie', 'question', 'answer_a', 'answer_b'],
        num_rows: 49938
    })
})
{'id': 65089, 'model_a': 'gpt-3.5-turbo-0613', 'model_b': 'mistral-medium', 'winner_model_a': 0, 'winner_model_b': 0, 'winner_tie': 1, 'question': 'explain function calling. how would you call a function?', 'answer_a': 'Function calling is the process of invoking or executing a function in a programming language. To call a function, you need to use the function\'s name followed by parentheses "()". Within the parentheses, you can pass arguments if the function requires any.\n\nHere is an example of how to call a function in Python:\n\n```python\n# Define a function\ndef greet(name):\n    print("Hello, " + name + "!")\n\n# Call the function\ngreet("John")\n```\n\nIn this example, the function "greet" is defined with a single parameter "name" which is used to print a greeting message. To 

In [34]:
# Convert the train split to a pandas DataFrame for the analysis below.
df = train.to_pandas()
# Preview the first rows.
df.head()

Unnamed: 0,id,model_a,model_b,winner_model_a,winner_model_b,winner_tie,question,answer_a,answer_b
0,65089,gpt-3.5-turbo-0613,mistral-medium,0,0,1,explain function calling. how would you call a...,Function calling is the process of invoking or...,Function calling is the process of invoking a ...
1,96401,llama-2-13b-chat,mistral-7b-instruct,1,0,0,How can I create a test set for a very rare ca...,Creating a test set for a very rare category c...,When building a classifier for a very rare cat...
2,198779,koala-13b,gpt-3.5-turbo-0314,0,1,0,What is the best way to travel from Tel-Aviv t...,The best way to travel from Tel Aviv to Jerusa...,The best way to travel from Tel-Aviv to Jerusa...
3,292873,vicuna-13b,gpt-4-0314,0,1,0,"Construct a rap battle, in the style of Epic R...","[Zeus]\nYo, it's the king of the gods on the m...","(Verse 1 - Zeus)\n\nI'm the king of the gods, ..."
4,313413,mixtral-8x7b-instruct-v0.1,vicuna-13b,1,0,0,Why water is not used in bath tub?,Water is actually used in a bath tub. A bath t...,Water is not used in a bath tub because it is ...


In [None]:
# Put every comparison involving 'gpt-4-1106-preview' first so that this model
# ends up at index 0 (i.e. it serves as the baseline model).
baseline_model = 'gpt-4-1106-preview'
baseline_is_a = df['model_a'] == baseline_model
baseline_is_b = df['model_b'] == baseline_model
mask = baseline_is_a | baseline_is_b
# Baseline games first, everything else after, with a fresh 0..n-1 index.
df = pd.concat([df[mask], df[~mask]], ignore_index=True)

In [None]:
# Collect the distinct model names that appear on either side of a comparison.
model_a_names = df['model_a'].unique()
model_b_names = df['model_b'].unique()
names_from_both_sides = np.concatenate((model_a_names, model_b_names))
# np.unique removes the overlap between the two sides and sorts the names.
model_names = np.unique(names_from_both_sides)

n_unique_models = len(model_names)
print(f"Number of unique model names: {n_unique_models}")

Number of unique model names: 64


In [54]:
# Comparisons that the judge scored as a tie.
is_tie = df['winner_tie'] == 1
ties = df[is_tie]
n_ties = len(ties)
print(f"Number of ties: {n_ties}")
# Share of all comparisons that ended in a tie.
tie_share = n_ties / len(df)
print(f"Proportion of ties: {tie_share:.2%}")

Number of ties: 15641
Proportion of ties: 31.32%


#### Drop the rows that correspond to ties.

In [70]:
# Keep only decisive comparisons (winner_tie == 0).
has_winner = df['winner_tie'] == 0
df_ties_dropped = df[has_winner]
# The Bradley-Terry input only needs the two model names and whether model_a won.
bt_columns = ['model_a', 'model_b', 'winner_model_a']
rawBT_noTies = df_ties_dropped[bt_columns]
rawBT_noTies.head()

Unnamed: 0,model_a,model_b,winner_model_a
0,gpt-4-1106-preview,wizardlm-70b,1
3,gpt-4-1106-preview,claude-2.1,1
4,gpt-4-1106-preview,gpt-4-0613,0
6,gpt-3.5-turbo-0613,gpt-4-1106-preview,0
7,palm-2,gpt-4-1106-preview,0


In [None]:
# Create the BT design matrix from the tie-free comparisons.
# NOTE(review): per the output below, X has one row per comparison and 63 columns
# for 64 models — presumably one (baseline) model is left out; confirm in
# make_BT_design_matrix. player_to_id is reversed in a later cell to map ids to names.
X, y, player_to_id = make_BT_design_matrix(rawBT_noTies)
# Show the shapes as a quick sanity check.
X.shape, y.shape

((34297, 63), (34297,))

#### Run Top-k Robustness Check.

In [None]:
# For each k, increase the number of dropped evaluations (alphaN) one at a time
# until isRankingRobust reports that the top-k ranking is no longer robust.
ks = [1, 3, 5, 10, 20]
# alphaN counts dropped evaluations, so it can never exceed the number of rows.
# Without this cap the search would loop forever on a ranking that never flips.
max_alphaN = X.shape[0]

results = {}
for k in ks:
    alphaN = 1
    chatbotA = -1  # -1 in the first slot means "still robust at this alphaN"
    while chatbotA == -1 and alphaN <= max_alphaN:
        chatbotA, chatbotB, chatbotOriginalBetaDiff, chatNewBetaDiff, chatIndices = isRankingRobust(k, alphaN, X, y)
        # Every attempt is stored, robust ones included; the non-robust entries
        # are filtered out in the next cell.
        results[(k, alphaN)] = (chatbotA, chatbotB, chatbotOriginalBetaDiff, chatNewBetaDiff, chatIndices)
        alphaN += 1

In [73]:
# Keep only the (k, alphaN) attempts whose first entry is not the -1 "robust" marker.
results_nonrobust = {
    key: outcome
    for key, outcome in results.items()
    if outcome[0] != -1
}
results_nonrobust

{(1, 4): (24,
  None,
  0.013837807361522918,
  -0.004433233431111085,
  array([1977, 1796,  823, 1069])),
 (3, 15): (21,
  42,
  0.16061075763847033,
  -0.0005652617887471623,
  array([22886, 26632, 33446, 31675, 13794,  7375, 32166, 25601, 13680,
         11507, 15159, 16105, 16276, 31866, 24294])),
 (5, 10): (42,
  7,
  0.0920873022068236,
  -0.006170522526522726,
  array([2193, 1084, 1636, 4485, 3098, 1113, 4005, 4004, 2361, 4057])),
 (10, 2): (2,
  8,
  0.006779695894059357,
  -0.00013762951326445894,
  array([ 149, 4412])),
 (20, 2): (3,
  25,
  0.005544624303261525,
  -0.0019872249816650367,
  array([7858, 8478]))}

In [161]:
# NOTE(review): wildcard import — return_rankings_list (and presumably the
# plot_bt_scores / add_match_proportions helpers used in later cells) come from
# here; consider importing them by name in the top import cell.
from package.RankAMIP.plot_util import *
# Rankings for the k=1, alphaN=4 entry of `results` (presumably before/after
# dropping the 4 evaluations — confirm against return_rankings_list).
rankings = return_rankings_list(X, y, results, 1, 4, player_to_id)

In [None]:
# Plot the rankings for the full arena.
# NOTE(review): the file name reads 'LLMAren' — possibly a typo for 'LLMArena'.
filename_to_save = 'fig/top20_LLMAren.png'
plot_title = 'Model Rankings in LLM Arena'
# NOTE(review): `alphaN` here is whatever the robustness loop left behind (its
# value after the final k=20 search), not the 4 used to build `rankings` —
# confirm which value plot_bt_scores expects.
plot_bt_scores(X, y, rankings, alphaN, 20, plot_title, filename_to_save)

#### Summary statistics for the pair of models that swapped 1st and 2nd place

In [None]:
### Load in the saved non-robust results.
# NOTE(review): pickle.load can execute arbitrary code, so only load result
# files produced by this project. This import would also be better placed in
# the notebook's top import cell.
import pickle

# A later cell indexes the loaded object with a (k, alphaN) tuple, e.g. (1, 4).
with open("results/LLMArenaNonrobust.pkl", "rb") as f:
    LLMArenaDataDropped = pickle.load(f)

In [91]:
# Dropping 4 evaluations swaps the top spot between player 24 and the baseline
# (reported as None, i.e. id 0). After shifting ids by +1 for the baseline slot:
# 25 -> 'gpt-4-0125-preview', 0 -> 'gpt-4-1106-preview' (see player_to_id).
# Load the tie-free arena data from disk and report its dimensions.
LLMArena_noTies = pd.read_csv("data/LLMArena_noTies.csv")
LLMArena_noTies.shape

(34297, 9)

In [82]:
## Count number of games between the two models that changed rankings.
# Names are compared with exact equality: str.contains does regex *substring*
# matching, so it would also count any model whose name merely contains one of
# these strings, and it yields NaN (not False) for missing names.
gpt4_0125_name = 'gpt-4-0125-preview'
gpt4_1106_name = 'gpt-4-1106-preview'
is_gpt41106_gpt40125 = (
    ((LLMArena_noTies['model_a'] == gpt4_0125_name) & (LLMArena_noTies['model_b'] == gpt4_1106_name)) |
    ((LLMArena_noTies['model_a'] == gpt4_1106_name) & (LLMArena_noTies['model_b'] == gpt4_0125_name))
)

# Number of rows where the two models face each other, in either order.
num_gpt41106_gpt40125 = LLMArena_noTies[is_gpt41106_gpt40125].shape[0]
print("Number of games between GPT-4-1106 and GPT-4-0125: ", num_gpt41106_gpt40125)

Number of games between GPT-4-1106 and GPT-4-0125:  58


In [84]:
# Label every game with an order-independent pair key, so that (A, B) and
# (B, A) fall into the same group.
LLMArena_noTies['model_pair'] = [
    tuple(sorted(pair))
    for pair in zip(LLMArena_noTies['model_a'], LLMArena_noTies['model_b'])
]

# Games played per unordered pair
pair_counts = LLMArena_noTies['model_pair'].value_counts()

# Mean number of games across all pairs that met at least once
average_games_per_pair = pair_counts.mean()
print("Average number of games per model pair:", average_games_per_pair)

Average number of games per model pair: 27.793354943273908


In [88]:
# Win margin between 'gpt-4-0125-preview' and 'gpt-4-1106-preview':
# restrict to their head-to-head games, then count those 'gpt-4-0125-preview' won.
dfFlippedRanking = LLMArena_noTies[is_gpt41106_gpt40125]
# Each comparison is parenthesised explicitly: `&` binds tighter than `==`, so
# `a & b == 1` is evaluated as `(a & b) == 1`, which gives the intended answer
# only by accident while the winner columns hold nothing but 0/1.
# Model names are matched exactly rather than with regex substring matching.
gpt40125_wins = (
    ((dfFlippedRanking['model_a'] == 'gpt-4-0125-preview') & (dfFlippedRanking['winner_model_a'] == 1)) |
    ((dfFlippedRanking['model_b'] == 'gpt-4-0125-preview') & (dfFlippedRanking['winner_model_b'] == 1))
)
num_gpt40125_wins = dfFlippedRanking[gpt40125_wins].shape[0]
# num_gpt41106_gpt40125 is the head-to-head game count computed in an earlier cell.
print("Proportion of games that GPT-4-0125 won: ", num_gpt40125_wins / num_gpt41106_gpt40125)

Proportion of games that GPT-4-0125 won:  0.5517241379310345


#### Player involvement in dropped matches

In [None]:
# In the saved (1, 4) result the second player is None, which stands for the
# baseline 'gpt-4-1106-preview'; encode it as -1 so it can be shifted by +1
# like every other player id.
old_tuple = LLMArenaDataDropped[(1, 4)]
patched_fields = list(old_tuple)
if len(patched_fields) > 1 and patched_fields[1] is None:
    patched_fields[1] = -1
new_tuple = tuple(patched_fields)
# Overwrite the in-memory entry with the patched tuple.
results_nonrobust[(1, 4)] = new_tuple

In [136]:
# Show the non-robust results again; the (1, 4) entry now carries -1 instead of None.
results_nonrobust

{(1, 4): (24,
  -1,
  0.013837807361522918,
  -0.004433233431111085,
  array([1977, 1796,  823, 1069])),
 (3, 15): (21,
  42,
  0.16061075763847033,
  -0.0005652617887471623,
  array([22886, 26632, 33446, 31675, 13794,  7375, 32166, 25601, 13680,
         11507, 15159, 16105, 16276, 31866, 24294])),
 (5, 10): (42,
  7,
  0.0920873022068236,
  -0.006170522526522726,
  array([2193, 1084, 1636, 4485, 3098, 1113, 4005, 4004, 2361, 4057])),
 (10, 2): (2,
  8,
  0.006779695894059357,
  -0.00013762951326445894,
  array([ 149, 4412])),
 (20, 2): (3,
  25,
  0.005544624303261525,
  -0.0019872249816650367,
  array([7858, 8478]))}

In [137]:
# Construct the DataFrame: one summary row per non-robust (k, alphaN) result.
# Player ids are shifted by +1 so that 0 is free for the baseline model.
# A None player (the baseline, as reported for the top-1 case) is treated as
# -1 here, so this cell does not raise TypeError when the earlier cell that
# rewrites None -> -1 has not been run.
rows = []
for (k, aN), (playerA, playerB, original_beta_diff, new_beta_diff_refit, indices) in results_nonrobust.items():
    playerA_id = -1 if playerA is None else playerA
    playerB_id = -1 if playerB is None else playerB
    rows.append({
        "k-aN": (k, aN),
        "playerA": playerA_id + 1, # to account for the None index.
        "playerB": playerB_id + 1,
        "original_beta_diff": original_beta_diff,
        "new_beta_diff_refit": new_beta_diff_refit,
        "indices": indices
    })
llm_arena_results = pd.DataFrame(rows)
llm_arena_results.head()

Unnamed: 0,k-aN,playerA,playerB,original_beta_diff,new_beta_diff_refit,indices
0,"(1, 4)",25,0,0.013838,-0.004433,"[1977, 1796, 823, 1069]"
1,"(3, 15)",22,43,0.160611,-0.000565,"[22886, 26632, 33446, 31675, 13794, 7375, 3216..."
2,"(5, 10)",43,8,0.092087,-0.006171,"[2193, 1084, 1636, 4485, 3098, 1113, 4005, 400..."
3,"(10, 2)",3,9,0.00678,-0.000138,"[149, 4412]"
4,"(20, 2)",4,26,0.005545,-0.001987,"[7858, 8478]"


In [None]:
# Invert player_to_id so that ids can be looked up to recover model names.
id_to_player = {player_id: name for name, player_id in player_to_id.items()}

In [139]:
# Translate the (shifted) playerA ids into model names.
llm_arena_results['playerA'].map(id_to_player)

0    gpt-4-0125-preview
1            gpt-4-0314
2      qwen1.5-72b-chat
3            gemini-pro
4      claude-instant-1
Name: playerA, dtype: object

In [140]:
# Translate the (shifted) playerB ids into model names.
llm_arena_results['playerB'].map(id_to_player)

0            gpt-4-1106-preview
1              qwen1.5-72b-chat
2                mistral-medium
3    mixtral-8x7b-instruct-v0.1
4               pplx-70b-online
Name: playerB, dtype: object

In [145]:
# Read the saved non-robust results back in and attach model names to both
# player ids.
# NOTE: this replaces the in-memory llm_arena_results with the CSV version.
llm_arena_results = pd.read_csv("results/LLMArenaNonrobust.csv")
llm_arena_results['playerA_Name'] = llm_arena_results['playerA'].map(id_to_player)
llm_arena_results['playerB_Name'] = llm_arena_results['playerB'].map(id_to_player)
# Preview last so the cell actually displays the frame, including the new name
# columns; a mid-cell .head() is evaluated and thrown away.
llm_arena_results.head()

In [None]:
# Look up the games dropped for the (20, 2) result by their row positions.
indices = [7858, 8478]
LLMArena_noTies.iloc[indices]
# TODO: find the proportion of dropped games in which neither flipped model played
# (presumably what the prop_neither column reports further down — confirm).

Unnamed: 0,id,model_a,model_b,winner_model_a,winner_model_b,winner_tie,question,answer_a,answer_b
7858,460124099,chatglm3-6b,pplx-70b-online,1,0,0,consider any natural number. if the number is ...,"Yes, this process will end in a loop for some ...",Consider a natural number. If the number is ev...
8478,549514947,pplx-70b-online,chatglm3-6b,0,1,0,Come up with a conspiracy theory about what is...,"A conspiracy theory about ""Q*"" in the context ...","Sure, here's a conspiracy theory about ""Q*"" in..."


[149, 4412]: in this case, both dropped games are ones in which *one* of the two flipped models (gemini-pro) beat a very strong model (gpt-4-1106-preview).

[7858, 8478]: in this case, both dropped games are ones in which one of the two flipped models (pplx-70b-online) lost to a very low-ranked model (chatglm3-6b).

In [178]:
# Add per-result proportions for the dropped games. Judging by the output column
# names (prop_both / prop_one / prop_neither), these are presumably the shares of
# dropped games involving both, exactly one, or neither of the two flipped
# models — confirm against add_match_proportions.
llm_arena_results_with_props = add_match_proportions(llm_arena_results, LLMArena_noTies)
llm_arena_results_with_props.head()

Unnamed: 0,k-aN,playerA,playerB,original_beta_diff,new_beta_diff_refit,indices,playerA_Name,playerB_Name,prop_both,prop_one,prop_neither
0,"(1, 4)",25,0,0.013838,-0.004433,[1977 1796 823 1069],gpt-4-0125-preview,gpt-4-1106-preview,1.0,0.0,0.0
1,"(3, 15)",22,43,0.160611,-0.000565,[22886 26632 33446 31675 13794 7375 32166 256...,gpt-4-0314,qwen1.5-72b-chat,0.0,1.0,0.0
2,"(5, 10)",43,8,0.092087,-0.006171,[2193 1084 1636 4485 3098 1113 4005 4004 2361 ...,qwen1.5-72b-chat,mistral-medium,0.0,1.0,0.0
3,"(10, 2)",3,9,0.00678,-0.000138,[ 149 4412],gemini-pro,mixtral-8x7b-instruct-v0.1,0.0,1.0,0.0
4,"(20, 2)",4,26,0.005545,-0.001987,[7858 8478],claude-instant-1,pplx-70b-online,0.0,1.0,0.0
