In [None]:
import pickle
import textwrap

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression

from package.RankAMIP.logistic import run_logistic_regression
from package.RankAMIP.data_script import make_BT_design_matrix
from package.RankAMIP.logistic import LogisticAMIP
from package.RankAMIP.logistic import find_closest_matchups
from package.RankAMIP.logistic import isRankingRobust
from package.RankAMIP.plot_util import *

### Is ChatBot Arena Data Robust?

### Load Data.

In [55]:
# Chatbot Arena human-preference data, hosted on the Hugging Face Hub:
# https://huggingface.co/datasets/lmarena-ai/arena-human-preference-55k
from datasets import load_dataset

ds = load_dataset(path="lmarena-ai/arena-human-preference-55k")

In [None]:
# Print the loaded dataset object to inspect the available splits.
print(ds)  
# Grab the 'train' split; it is converted to a pandas DataFrame in the next step.
train = ds["train"]

DatasetDict({
    train: Dataset({
        features: ['id', 'model_a', 'model_b', 'prompt', 'response_a', 'response_b', 'winner_model_a', 'winner_model_b', 'winner_tie'],
        num_rows: 57477
    })
})
{'id': 30192, 'model_a': 'gpt-4-1106-preview', 'model_b': 'gpt-4-0613', 'prompt': '["Is it morally right to try to have a certain percentage of females on managerial positions?","OK, does pineapple belong on a pizza? Relax and give me fun answer."]', 'response_a': '["The question of whether it is morally right to aim for a certain percentage of females in managerial positions is a complex ethical issue that involves considerations of fairness, equality, diversity, and discrimination.\\n\\nHere are some arguments in favor of and against such policies:\\n\\n**Arguments in favor:**\\n\\n1. **Correcting Historical Inequities:** Women have historically been underrepresented in leadership roles due to various cultural, institutional, and social barriers. Aiming for a specific percentage can

In [57]:
# Convert the 'train' split to a pandas DataFrame and preview its first five rows.
df = train.to_pandas()
df.head(n=5)

Unnamed: 0,id,model_a,model_b,prompt,response_a,response_b,winner_model_a,winner_model_b,winner_tie
0,30192,gpt-4-1106-preview,gpt-4-0613,"[""Is it morally right to try to have a certain...","[""The question of whether it is morally right ...","[""As an AI, I don't have personal beliefs or o...",1,0,0
1,53567,koala-13b,gpt-4-0613,"[""What is the difference between marriage lice...","[""A marriage license is a legal document that ...","[""A marriage license and a marriage certificat...",0,1,0
2,65089,gpt-3.5-turbo-0613,mistral-medium,"[""explain function calling. how would you call...","[""Function calling is the process of invoking ...","[""Function calling is the process of invoking ...",0,0,1
3,96401,llama-2-13b-chat,mistral-7b-instruct,"[""How can I create a test set for a very rare ...","[""Creating a test set for a very rare category...","[""When building a classifier for a very rare c...",1,0,0
4,198779,koala-13b,gpt-3.5-turbo-0314,"[""What is the best way to travel from Tel-Aviv...","[""The best way to travel from Tel Aviv to Jeru...","[""The best way to travel from Tel-Aviv to Jeru...",0,1,0


In [58]:
# Distinct model names seen on each side of a matchup.
model_a_names = df['model_a'].unique()
model_b_names = df['model_b'].unique()
# Sorted union of both sides: every model that appears at least once.
model_names = np.union1d(model_a_names, model_b_names)
# Report how many distinct models there are.
print(f"Number of unique model names: {len(model_names)}")

Number of unique model names: 64


In [59]:
# Rows whose 'winner_tie' flag is set.
is_tie = df['winner_tie'].eq(1)
ties = df.loc[is_tie]
num_ties = len(ties)
print(f"Number of ties: {num_ties}")
# Share of all rows that are ties.
print(f"Proportion of ties: {num_ties / len(df):.2%}")

Number of ties: 17761
Proportion of ties: 30.90%


#### Drop the rows that correspond to ties.

In [60]:
# Keep only the rows that are not ties (winner_tie == 0).
df_ties_dropped = df[df['winner_tie'] == 0]
# The BT design matrix is built from just the two model names and whether model_a won.
# (A bare `df_ties_dropped.shape` used to sit here; as a mid-cell expression it displayed
# nothing, so it was removed.)
rawBT_noTies = df_ties_dropped[['model_a', 'model_b', 'winner_model_a']]
rawBT_noTies.head()

Unnamed: 0,model_a,model_b,winner_model_a
0,gpt-4-1106-preview,gpt-4-0613,1
1,koala-13b,gpt-4-0613,0
3,llama-2-13b-chat,mistral-7b-instruct,1
4,koala-13b,gpt-3.5-turbo-0314,0
5,vicuna-13b,gpt-4-0314,0


In [None]:
# For every model, count the tie-free matches in which it appears on either side.
for model in model_names:
    played = df_ties_dropped['model_a'].eq(model) | df_ties_dropped['model_b'].eq(model)
    print(f"{model}: {played.sum()}")

RWKV-4-Raven-14B: 799
alpaca-13b: 982
chatglm-6b: 826
chatglm2-6b: 393
chatglm3-6b: 695
claude-1: 2792
claude-2.0: 1741
claude-2.1: 3969
claude-instant-1: 2922
codellama-34b-instruct: 1027
deepseek-llm-67b-chat: 519
dolly-v2-12b: 556
dolphin-2.2.1-mistral-7b: 240
falcon-180b-chat: 203
fastchat-t5-3b: 702
gemini-pro: 1008
gemini-pro-dev-api: 1043
gpt-3.5-turbo-0125: 576
gpt-3.5-turbo-0314: 968
gpt-3.5-turbo-0613: 4866
gpt-3.5-turbo-1106: 2317
gpt-4-0125-preview: 798
gpt-4-0314: 2923
gpt-4-0613: 4306
gpt-4-1106-preview: 5360
gpt4all-13b-snoozy: 277
guanaco-33b: 457
koala-13b: 1102
llama-13b: 392
llama-2-13b-chat: 1769
llama-2-70b-chat: 2315
llama-2-7b-chat: 1174
llama2-70b-steerlm-chat: 458
mistral-7b-instruct: 1098
mistral-7b-instruct-v0.2: 72
mistral-medium: 2231
mixtral-8x7b-instruct-v0.1: 2406
mpt-30b-chat: 404
mpt-7b-chat: 643
nous-hermes-2-mixtral-8x7b-dpo: 235
oasst-pythia-12b: 1068
openchat-3.5: 1066
openchat-3.5-0106: 159
openhermes-2.5-mistral-7b: 609
palm-2: 1433
pplx-70b-onli

In [None]:
# Build the design matrix X and response vector y for BT from the tie-free matchups.
# `player_to_id` is returned alongside them; it is inverted later in the notebook to turn
# player ids back into model names.
X, y, player_to_id = make_BT_design_matrix(rawBT_noTies)
# Show both shapes as a quick sanity check.
X.shape, y.shape

((39716, 63), (39716,))

#### Run Top-k Robustness Check.

In [None]:
# For each k, increase alphaN one step at a time until isRankingRobust reports a
# pair of models (chatbotA != -1). Every attempt is stored under its (k, alphaN) key.
ks = [1, 3, 5, 10, 20]
results = {}
# Upper bound on alphaN: the number of evaluations (rows of y). Without a bound the
# loop would never terminate for a k whose result stays at the -1 sentinel.
max_alphaN = y.shape[0]
for k in ks:
    alphaN = 1
    chatbotA = -1  # -1 is the "nothing found yet" sentinel that keeps the search going
    while chatbotA == -1 and alphaN <= max_alphaN:
        chatbotA, chatbotB, chatbotOriginalBetaDiff, chatNewBetaDiff, chatIndices = isRankingRobust(k, alphaN, X, y)
        results[(k, alphaN)] = (chatbotA, chatbotB, chatbotOriginalBetaDiff, chatNewBetaDiff, chatIndices)
        # Incremented after storing, so on exit alphaN is one past the last value tried.
        alphaN += 1
    if chatbotA == -1:
        # Only reachable when the cap was hit; make that visible instead of looping forever.
        print(f"top-{k}: search stopped at alphaN = {max_alphaN} without a non-robust result")

In [None]:
# Build the rankings that are passed to the plotting call.
# NOTE(review): the positional literals 10 and 3 presumably select k = 10 and alphaN = 3,
# i.e. the (10, 3) entry of `results` — confirm against return_rankings_list's signature.
rankings = return_rankings_list(X, y, results, 10, 3, player_to_id)

In [None]:
# Plot the top-20 models on the full Chatbot Arena data.
filename_to_save = 'fig/top20_cba.png'
plot_title = 'Model Rankings in Chatbot Arena'
# This call used to read `alphaN`, a variable left over from the robustness-search loop.
# After that loop finishes on the recorded run (last stored key (20, 2)) its value is 3,
# which also matches the alphaN passed when building `rankings`. It is spelled out here
# so the plot no longer depends on leftover loop state.
# NOTE(review): confirm 3 is the alphaN intended for plot_bt_scores.
alphaN_for_plot = 3
plot_bt_scores(X, y, rankings, alphaN_for_plot, 20, plot_title, filename_to_save)

In [None]:
# Keep the (k, alphaN) entries that are top-k non-robust, i.e. those whose first
# field is not the -1 sentinel.
results_nonrobust = dict(
    (key, outcome) for key, outcome in results.items() if outcome[0] != -1
)
results_nonrobust

{(1, 9): (16,
  None,
  0.03365783604990017,
  -0.0012388871146928558,
  array([ 2370,  3227, 38242, 10155, 11353, 17714, 18638, 19742, 32828])),
 (3, 24): (6,
  30,
  0.2132132330351283,
  -0.007985048942084516,
  array([24342, 29361, 38576, 36203, 12188, 15215,  3643, 36863, 27987,
          2443, 14012, 12035, 17662,  9137,  4812, 15290, 15514, 27250,
         36462,   469, 32825,  7193, 27463, 34265])),
 (5, 5): (30,
  38,
  0.03657306106365754,
  -0.00615664236957858,
  array([ 9754, 19106, 20511,  9565, 26870])),
 (10, 3): (39,
  4,
  0.011066418306820314,
  -0.0020685804166893362,
  array([16514,   601,  7082])),
 (20, 2): (56,
  19,
  0.015632264920975247,
  -0.01379039709834684,
  array([ 2783, 22323]))}

#### Below, we inspect the ranking flip between the first- and second-place models.

In [None]:
### Load in results.
# `pickle` is imported in the notebook's top import cell.
# NOTE: pickle.load can execute arbitrary code while unpickling. Only load a results file
# produced by this project, never one from an untrusted source.
with open("results/ChatbotArenaNonrobust.pkl", "rb") as f:
    chatBotArenaDataDropped = pickle.load(f)

In [None]:
# Read the tie-free matchups from disk and preview the first five rows.
# NOTE(review): this CSV is not written anywhere in the visible notebook; presumably it is a
# saved copy of the tie-free frame with a fresh 0..N-1 index — confirm its provenance.
chatBotArena_noTies = pd.read_csv("data/chatBotArena_noTies.csv")
chatBotArena_noTies.head(n=5)


Unnamed: 0,id,model_a,model_b,prompt,response_a,response_b,winner_model_a,winner_model_b,winner_tie
0,30192,gpt-4-1106-preview,gpt-4-0613,"[""Is it morally right to try to have a certain...","[""The question of whether it is morally right ...","[""As an AI, I don't have personal beliefs or o...",1,0,0
1,53567,koala-13b,gpt-4-0613,"[""What is the difference between marriage lice...","[""A marriage license is a legal document that ...","[""A marriage license and a marriage certificat...",0,1,0
2,96401,llama-2-13b-chat,mistral-7b-instruct,"[""How can I create a test set for a very rare ...","[""Creating a test set for a very rare category...","[""When building a classifier for a very rare c...",1,0,0
3,198779,koala-13b,gpt-3.5-turbo-0314,"[""What is the best way to travel from Tel-Aviv...","[""The best way to travel from Tel Aviv to Jeru...","[""The best way to travel from Tel-Aviv to Jeru...",0,1,0
4,292873,vicuna-13b,gpt-4-0314,"[""Construct a rap battle, in the style of Epic...","[""[Zeus]\nYo, it's the king of the gods on the...","[""(Verse 1 - Zeus)\n\nI'm the king of the gods...",0,1,0


9 evals were dropped to flip model i.d. 16 and i.d. None (the reference model).

The name of these models are: 
('gpt-4-0125-preview', 0: 'gpt-4-1106-preview')

In [None]:
## Count the games played between the two models whose ranks flipped.
model_a_col = chatBotArena_noTies['model_a']
model_b_col = chatBotArena_noTies['model_b']
# 0125 seated as model_a against 1106 as model_b ...
gpt40125_as_a = model_a_col.str.contains('gpt-4-0125-preview') & model_b_col.str.contains('gpt-4-1106-preview')
# ... and the mirrored seating.
gpt41106_as_a = model_a_col.str.contains('gpt-4-1106-preview') & model_b_col.str.contains('gpt-4-0125-preview')
is_gpt41106_gpt40125 = gpt40125_as_a | gpt41106_as_a

num_gpt41106_gpt40125 = len(chatBotArena_noTies[is_gpt41106_gpt40125])
print("Number of games between GPT-4-1106 and GPT-4-0125: ", num_gpt41106_gpt40125)

Number of games between GPT-4-1106 and GPT-4-0125:  67


In [None]:
# Canonical key per matchup: the two model names in sorted order, so that (a, b) and
# (b, a) fall into the same group.
chatBotArena_noTies['model_pair'] = pd.Series(
    [
        tuple(sorted(pair))
        for pair in zip(chatBotArena_noTies['model_a'], chatBotArena_noTies['model_b'])
    ],
    index=chatBotArena_noTies.index,
    dtype=object,
)
chatBotArena_noTies['model_pair']

0                (gpt-4-0613, gpt-4-1106-preview)
1                         (gpt-4-0613, koala-13b)
2         (llama-2-13b-chat, mistral-7b-instruct)
3                 (gpt-3.5-turbo-0314, koala-13b)
4                        (gpt-4-0314, vicuna-13b)
                           ...                   
39711                      (claude-1, gpt-4-0613)
39712              (claude-2.0, llama-2-13b-chat)
39713                      (alpaca-13b, claude-1)
39714                    (palm-2, tulu-2-dpo-70b)
39715    (gemini-pro-dev-api, gpt-4-1106-preview)
Name: model_pair, Length: 39716, dtype: object

In [85]:
# Games played by each (unordered) model pair.
pair_counts = chatBotArena_noTies['model_pair'].value_counts()

# Mean games per pair = total games / number of distinct pairs.
average_games_per_pair = pair_counts.sum() / pair_counts.size
print("Average number of games per model pair:", average_games_per_pair)

Average number of games per model pair: 31.900401606425703


In [None]:
# Win margin between 'gpt-4-0125-preview' and 'gpt-4-1106-preview':
# start from the games played between the two models.
dfFlippedRanking = chatBotArena_noTies[is_gpt41106_gpt40125]
## Count the games in that subset that 'gpt-4-0125-preview' won.
# Each `== 1` comparison is parenthesised because `&` binds tighter than `==`. Without
# the parentheses the expression parses as `(contains & winner) == 1`, which gives the
# right answer only because the winner columns happen to hold 0/1.
gpt40125_wins = (
    (dfFlippedRanking['model_a'].str.contains('gpt-4-0125-preview') & (dfFlippedRanking['winner_model_a'] == 1)) |
    (dfFlippedRanking['model_b'].str.contains('gpt-4-0125-preview') & (dfFlippedRanking['winner_model_b'] == 1))
)
num_gpt40125_wins = dfFlippedRanking[gpt40125_wins].shape[0]
print("Proportion of games that GPT-4-0125 won: ", num_gpt40125_wins / num_gpt41106_gpt40125)

Proportion of games that GPT-4-0125 won:  0.5373134328358209


#### Player involvement in dropped matches

In [159]:
# Display the loaded results. Keys are (k, alphaN) pairs; each value is a 5-tuple that is
# later unpacked as (playerA, playerB, original_beta_diff, new_beta_diff_refit, indices).
chatBotArenaDataDropped

{(1, 9): (16,
  None,
  0.03365783604990017,
  -0.0012388871146928558,
  array([ 2370,  3227, 38242, 10155, 11353, 17714, 18638, 19742, 32828])),
 (3, 24): (6,
  30,
  0.2132132330351283,
  -0.007985048942084516,
  array([24342, 29361, 38576, 36203, 12188, 15215,  3643, 36863, 27987,
          2443, 14012, 12035, 17662,  9137,  4812, 15290, 15514, 27250,
         36462,   469, 32825,  7193, 27463, 34265])),
 (5, 5): (30,
  38,
  0.03657306106365754,
  -0.00615664236957858,
  array([ 9754, 19106, 20511,  9565, 26870])),
 (10, 3): (39,
  4,
  0.011066418306820314,
  -0.0020685804166893362,
  array([16514,   601,  7082])),
 (20, 2): (56,
  19,
  0.015632264920975247,
  -0.01379039709834684,
  array([ 2783, 22323]))}

In [161]:
# The reference model (gpt-4-1106-preview) is stored with playerB = None. Re-encode it as
# -1 so that the "+ 1" id shift applied when the results are tabulated maps it to id 0.
# Every loaded entry is checked, not just the (1, 9) key, so the patch keeps working if the
# flip involving the reference model shows up under a different (k, alphaN).
for key, old_tuple in chatBotArenaDataDropped.items():
    if old_tuple[1] is None:
        new_tuple = tuple(-1 if i == 1 and val is None else val for i, val in enumerate(old_tuple))
        results_nonrobust[key] = new_tuple

In [None]:
# One record per non-robust (k, alphaN) setting; player ids are shifted by one to
# account for the reference index.
rows = [
    {
        "k-aN": (k, aN),
        "playerA": playerA + 1,
        "playerB": playerB + 1,
        "original_beta_diff": original_beta_diff,
        "new_beta_diff_refit": new_beta_diff_refit,
        "indices": indices,
    }
    for (k, aN), (playerA, playerB, original_beta_diff, new_beta_diff_refit, indices)
    in results_nonrobust.items()
]
cba_results = pd.DataFrame(rows)
cba_results.head()

Unnamed: 0,k-aN,playerA,playerB,original_beta_diff,new_beta_diff_refit,indices
0,"(1, 9)",17,0,0.033658,-0.001239,"[2370, 3227, 38242, 10155, 11353, 17714, 18638..."
1,"(3, 24)",7,31,0.213213,-0.007985,"[24342, 29361, 38576, 36203, 12188, 15215, 364..."
2,"(5, 5)",31,39,0.036573,-0.006157,"[9754, 19106, 20511, 9565, 26870]"
3,"(10, 3)",40,5,0.011066,-0.002069,"[16514, 601, 7082]"
4,"(20, 2)",57,20,0.015632,-0.01379,"[2783, 22323]"


In [None]:
# Invert player_to_id so that ids can be looked up to get names.
id_to_player = dict(zip(player_to_id.values(), player_to_id.keys()))

In [None]:
# Attach model names to the (shifted) player ids via the id -> name mapping.
# (A bare `cba_results.head()` used to open this cell; as a mid-cell expression it
# displayed nothing, so it was removed.)
cba_results['playerA_Name'] = cba_results['playerA'].map(id_to_player)
cba_results['playerB_Name'] = cba_results['playerB'].map(id_to_player)

In [None]:
# Look up, by row position, the evaluations behind a set of dropped indices.
# These two values are the indices stored under the (20, 2) key of the results.
indices = [2783, 22323] # NOTE(review): an earlier alternative, [149, 4412], was left here — confirm it is obsolete.
chatBotArena_noTies.iloc[indices]
# TODO: unfinished note in the original — "find the proportion of games where neither"
# (presumably: where neither of the two flipped models played; confirm).

Unnamed: 0,id,model_a,model_b,prompt,response_a,response_b,winner_model_a,winner_model_b,winner_tie
2783,301329900,gpt-4-1106-preview,nous-hermes-2-mixtral-8x7b-dpo,"[""Your task is to perform relation and entity ...","[""GutMicrobiota(x) \u2227 GoodHealth(y) \u2192...","[""Extraction 1:\n\nGutMicrobiota(x) \u2227 Goo...",0,1,0
22323,2387156917,gpt-4-1106-preview,nous-hermes-2-mixtral-8x7b-dpo,"[""In a context of plant encyclopedia where we ...","[""1. **Unusual Foliage Coloration**: The White...","[""1. White Firs (Abies concolor) are native to...",0,1,0


### Inspect the dropped human evals.

In [174]:
# Add the match-proportion columns to the results table and preview the first five rows.
chatbot_arena_results_with_props = add_match_proportions(cba_results, chatBotArena_noTies)
chatbot_arena_results_with_props.head(n=5)

Unnamed: 0,k-aN,playerA,playerB,original_beta_diff,new_beta_diff_refit,indices,playerA_Name,playerB_Name,prop_both,prop_one,prop_neither
0,"(1, 9)",17,0,0.033658,-0.001239,"[2370, 3227, 38242, 10155, 11353, 17714, 18638...",gpt-4-0125-preview,gpt-4-1106-preview,1.0,0.0,0.0
1,"(3, 24)",7,31,0.213213,-0.007985,"[24342, 29361, 38576, 36203, 12188, 15215, 364...",gpt-4-0314,qwen1.5-72b-chat,0.0,1.0,0.0
2,"(5, 5)",31,39,0.036573,-0.006157,"[9754, 19106, 20511, 9565, 26870]",qwen1.5-72b-chat,mistral-medium,0.0,1.0,0.0
3,"(10, 3)",40,5,0.011066,-0.002069,"[16514, 601, 7082]",yi-34b-chat,gemini-pro,1.0,0.0,0.0
4,"(20, 2)",57,20,0.015632,-0.01379,"[2783, 22323]",nous-hermes-2-mixtral-8x7b-dpo,vicuna-33b,0.0,1.0,0.0


In the rank flip between mistral-medium and qwen1.5-72b-chat, a total of 5 evaluations in which qwen played and won against gpt-4-1106-preview were enough to place qwen's ranking above that of mistral; dropping those 5 evaluations reverses the order of the two models.

In [None]:
# Investigate the evaluations that changed the Top-5 models: take the dropped indices
# from the row at position 2 of the results table and show the matching rows.
mistral_qwen_inds = chatbot_arena_results_with_props.iloc[2]["indices"]
chatBotArena_noTies.iloc[mistral_qwen_inds]

Unnamed: 0,id,model_a,model_b,prompt,response_a,response_b,winner_model_a,winner_model_b,winner_tie
9754,1051544564,qwen1.5-72b-chat,gpt-4-1106-preview,"[""\nread this content. Do not change the conte...","[""6. Use AI Writing Tools\n\nIncorporating AI ...","[""6. Use AI to Your Advantage\n\nIn this age o...",1,0,0
19106,2049273519,gpt-4-1106-preview,qwen1.5-72b-chat,"[""Which is the newest model that also contain ...","[""I'm sorry, but your question isn't clear. Wh...","[""As an AI, I don't have real-time access to u...",0,1,0
20511,2199125192,qwen1.5-72b-chat,gpt-4-1106-preview,"[""Please answer this questions in 8 sentences ...","[""1. When a legal description appears clear in...","[""1. The type of ambiguity encountered is like...",1,0,0
9565,1029897539,gpt-4-1106-preview,qwen1.5-72b-chat,"[""Is it possible to give a transformer custom ...","[""Yes, it is possible to give a transformer mo...","[""Yes, it is possible to give a transformer mo...",0,1,0
26870,2888250053,gpt-4-1106-preview,qwen1.5-72b-chat,"[""How to use Poetry to install packages in pyt...","[""Poetry is a tool for dependency management a...","[""Poetry is a dependency manager for Python th...",0,1,0


In [194]:
# MIS: prompts.
# Print the prompt of each dropped evaluation, wrapped to 100 columns.
# Rows are selected by position (.iloc), the same way these indices are used when the
# dropped rows are displayed; .loc gave the same rows only because the frame's index is
# 0..N-1. `textwrap` is imported in the notebook's top import cell.
for i, prompt in enumerate(chatBotArena_noTies['prompt'].iloc[mistral_qwen_inds]):
    print(f"\n=== prompt {i+1} ===\n")
    print(textwrap.fill(prompt, width=100))


=== prompt 1 ===

["\nread this content. Do not change the content written. Add 5 more tips in the same tone of voice
and style. \nhave one be use AI \n\nHey freelance writers: Let's talk about writer's block.\n\nI
know, I know. You don't want to say it out loud. It's like \"Voldemort\" or actors saying
\"Macbeth\" out loud (that's why they call it \"The Scottish Play\"). But, like the common cold,
writer's block is out there ... and odds are, it's going to get every freelance writer at some
point.\n\nThe Terror of Writer's Block\n\nYou've probably had it happen to you at one time or
another. You're sitting at your desk, a cup of coffee or cold beverage nearby, ready to start
writing, knocking off those assignments, and adding some zeroes to the bank account. The words are
ready to go ... but somehow, someway, they just won't come out. The blank page remains there,
looking at you, taunting you.\n\nThen the terror starts. \"Oh no,\" you think. \"How long will this
last? Will I get para

In [195]:
# MIS: Answer ModelA.
# Print model A's response for each dropped evaluation, wrapped to 100 columns.
# Rows are selected by position (.iloc), the same way these indices are used when the
# dropped rows are displayed; .loc gave the same rows only because the frame's index is
# 0..N-1. `textwrap` is imported in the notebook's top import cell.
for i, response_a in enumerate(chatBotArena_noTies['response_a'].iloc[mistral_qwen_inds]):
    print(f"\n=== Response {i+1} ===\n")
    print(textwrap.fill(response_a, width=100))


=== Response 1 ===

["6. Use AI Writing Tools\n\nIncorporating AI writing tools can occasionally help break the creative
gridlock. There are various AI assistants that can generate prompts, suggest sentences, or even
complete paragraphs for you. Of course, you'll want to maintain your unique voice and edit AI-
generated content, but it's a useful starting point to overcome writer's block. It can spark new
ideas or simply provide a gentle nudge to get your creative juices flowing.\n\n7. Engage in Physical
Exercise\n\nPhysical activity can help clear your mind and boost your energy levels. Go for a run,
hit the gym, or do a brief workout session. Exercise releases endorphins that can improve your mood
and stimulate your brain, making it easier to overcome writer's block. Plus, you'll feel refreshed
and ready to tackle your writing once you're done.\n\n8. Set Realistic Writing Goals\n\nSometimes,
the pressure of a looming deadline or an overly ambitious goal can contribute to writer's bl

In [196]:
# Print model B's response for each dropped evaluation, wrapped to 100 columns.
# Rows are selected by position (.iloc), the same way these indices are used when the
# dropped rows are displayed; .loc gave the same rows only because the frame's index is
# 0..N-1. `textwrap` is imported in the notebook's top import cell.
for i, response_b in enumerate(chatBotArena_noTies['response_b'].iloc[mistral_qwen_inds]):
    print(f"\n=== Response {i+1} ===\n")
    print(textwrap.fill(response_b, width=100))


=== Response 1 ===

["6. Use AI to Your Advantage\n\nIn this age of technology, leverage the power of AI to overcome
writer's block. AI writing tools can provide prompts, continue a thought, or even help outline your
piece. It's like having a brainstorming buddy at your fingertips, one who's available 24\/7 and
doesn't mind you bouncing ideas off of them, no matter how rough they might be. Just type in a
sentence or a theme, and let the artificial intelligence give you a nudge in the right direction.
Remember, it's not about replacing your creativity but enhancing it.\n\n7. Set Small, Achievable
Goals\n\nSometimes writer's block stems from feeling overwhelmed by the size of the project. Break
it down into small, manageable pieces. Set a goal for a paragraph, a page, or maybe just a great
opening sentence. Achieving these smaller goals can give you a sense of progress and help build
momentum. As you tick off these mini milestones, the larger project won't seem so daunting.\n\n8.
Engage