In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from package.RankAMIP.logistic import run_logistic_regression
from package.RankAMIP.data_script import make_BT_design_matrix
from package.RankAMIP.logistic import LogisticAMIP
from package.RankAMIP.logistic import find_closest_matchups
from package.RankAMIP.logistic import isRankingRobust
from package.RankAMIP.plot_util import *

### Is ChatBot Arena Data Robust? (Version with Ties, As Implementated on Live Chatbot Arena)
https://colab.research.google.com/drive/1KdwokPjirkTmpO_P1WByFNFiqxWQquwH#scrollTo=mSizG3Pzglte

### Load Data.

In [2]:
# Import datasets from https://huggingface.co/datasets/lmarena-ai/arena-human-preference-55k
from datasets import load_dataset
ds = load_dataset("lmarena-ai/arena-human-preference-55k")

In [3]:
# inspect the available splits
print(ds)  
# grab the ‘train’ split
train = ds["train"]

DatasetDict({
    train: Dataset({
        features: ['id', 'model_a', 'model_b', 'prompt', 'response_a', 'response_b', 'winner_model_a', 'winner_model_b', 'winner_tie'],
        num_rows: 57477
    })
})


In [4]:
df = train.to_pandas()
df.head()

Unnamed: 0,id,model_a,model_b,prompt,response_a,response_b,winner_model_a,winner_model_b,winner_tie
0,30192,gpt-4-1106-preview,gpt-4-0613,"[""Is it morally right to try to have a certain...","[""The question of whether it is morally right ...","[""As an AI, I don't have personal beliefs or o...",1,0,0
1,53567,koala-13b,gpt-4-0613,"[""What is the difference between marriage lice...","[""A marriage license is a legal document that ...","[""A marriage license and a marriage certificat...",0,1,0
2,65089,gpt-3.5-turbo-0613,mistral-medium,"[""explain function calling. how would you call...","[""Function calling is the process of invoking ...","[""Function calling is the process of invoking ...",0,0,1
3,96401,llama-2-13b-chat,mistral-7b-instruct,"[""How can I create a test set for a very rare ...","[""Creating a test set for a very rare category...","[""When building a classifier for a very rare c...",1,0,0
4,198779,koala-13b,gpt-3.5-turbo-0314,"[""What is the best way to travel from Tel-Aviv...","[""The best way to travel from Tel Aviv to Jeru...","[""The best way to travel from Tel-Aviv to Jeru...",0,1,0


Create Weighted Dataset (duplicate wins/losses, and add split the ties -- A wins one, B wins one).

In [None]:
# Step 1: Add a preference_id column.
df['preference_id'] = np.arange(1, len(df) + 1)

# Step 2: Split into ties and non-ties
df_ties = df[df['winner_tie'] == 1].copy()
df_non_ties = df[df['winner_tie'] == 0].copy()

# Step 3: Duplicate non-tie rows
df_non_ties_duplicated = pd.concat([df_non_ties, df_non_ties], ignore_index=True)

# Step 4: Flip winner_model_a for tie rows
df_ties_flipped = df_ties.copy()
df_ties_flipped['winner_model_a'] = 1 - df_ties_flipped['winner_model_a']

# Tag each version for sorting
df_ties['version'] = 0  # Original tie
df_ties_flipped['version'] = 1  # Flipped tie

# Combine ties with proper grouping
df_ties_combined = pd.concat([df_ties, df_ties_flipped], ignore_index=True)

# Step 5: Combine all rows
df_non_ties_duplicated['version'] = -1  # So non-ties always come first
df_weighted = pd.concat([df_non_ties_duplicated, df_ties_combined], ignore_index=True)

# Step 6: Sort by preference_id and version to ensure tie + flipped_tie adjacency
df_weighted = df_weighted.sort_values(by=['preference_id', 'version']).reset_index(drop=True)

# (Optional) Drop 'version' if no longer needed
df_weighted = df_weighted.drop(columns='version')

df_weighted[df_weighted['winner_tie'] == 1].head(10)

Unnamed: 0,id,model_a,model_b,prompt,response_a,response_b,winner_model_a,winner_model_b,winner_tie,preference_id
4,65089,gpt-3.5-turbo-0613,mistral-medium,"[""explain function calling. how would you call...","[""Function calling is the process of invoking ...","[""Function calling is the process of invoking ...",0,0,1,3
5,65089,gpt-3.5-turbo-0613,mistral-medium,"[""explain function calling. how would you call...","[""Function calling is the process of invoking ...","[""Function calling is the process of invoking ...",1,0,1,3
28,862324,vicuna-13b,koala-13b,"[""Write me a poem in urdu in the style of Iqba...","[""Jism ko kisi ka paisa nahi,\nWo kisi ka jaan...","[""In the realm of selflessness dwells the true...",0,0,1,15
29,862324,vicuna-13b,koala-13b,"[""Write me a poem in urdu in the style of Iqba...","[""Jism ko kisi ka paisa nahi,\nWo kisi ka jaan...","[""In the realm of selflessness dwells the true...",1,0,1,15
36,933555,llama-2-13b-chat,claude-instant-1,"[""Mary has 6 others siblings named Monday, Tue...","[""Hello! I'm happy to help you with your quest...","[""The last child's name is likely Saturday.""]",0,0,1,19
37,933555,llama-2-13b-chat,claude-instant-1,"[""Mary has 6 others siblings named Monday, Tue...","[""Hello! I'm happy to help you with your quest...","[""The last child's name is likely Saturday.""]",1,0,1,19
40,1256092,claude-2.1,vicuna-13b,"[""Write a python code that calculates sum of 5...","[""Here is the python code that calculates the ...","[""Here is a Python code that calculates the su...",0,0,1,21
41,1256092,claude-2.1,vicuna-13b,"[""Write a python code that calculates sum of 5...","[""Here is the python code that calculates the ...","[""Here is a Python code that calculates the su...",1,0,1,21
44,1440765,llama2-70b-steerlm-chat,gpt-4-0125-preview,"[""Can you explain what the Cypher Query Langua...","[""The Cypher Query Language is a declarative l...","[""Certainly! The Cypher Query Language, often ...",0,0,1,23
45,1440765,llama2-70b-steerlm-chat,gpt-4-0125-preview,"[""Can you explain what the Cypher Query Langua...","[""The Cypher Query Language is a declarative l...","[""Certainly! The Cypher Query Language, often ...",1,0,1,23


In [7]:
df_weighted.shape[0]

114954

In [None]:
# get the unique names in both columns
model_a_names = df_weighted['model_a'].unique()
model_b_names = df_weighted['model_b'].unique()
# combine the two arrays and get the unique names
model_names = np.unique(np.concatenate((model_a_names, model_b_names)))
# print the number of unique model names
print(f"Number of unique model names: {len(model_names)}")

Number of unique model names: 64


In [9]:
rawBT = df_weighted[['model_a', 'model_b', 'winner_model_a']]
rawBT.head()

Unnamed: 0,model_a,model_b,winner_model_a
0,gpt-4-1106-preview,gpt-4-0613,1
1,gpt-4-1106-preview,gpt-4-0613,1
2,koala-13b,gpt-4-0613,0
3,koala-13b,gpt-4-0613,0
4,gpt-3.5-turbo-0613,mistral-medium,0


In [10]:
rawBT_orig = df[['model_a', 'model_b', 'winner_model_a']]
rawBT_orig.head()

Unnamed: 0,model_a,model_b,winner_model_a
0,gpt-4-1106-preview,gpt-4-0613,1
1,koala-13b,gpt-4-0613,0
2,gpt-3.5-turbo-0613,mistral-medium,0
3,llama-2-13b-chat,mistral-7b-instruct,1
4,koala-13b,gpt-3.5-turbo-0314,0


#### Run Top-k Robustness Check.

In [None]:
ks = [1, 3, 5, 10, 20]
results = {}
for k in ks:
    alphaN = 1
    chatbotA = -1
    while chatbotA == -1:
        chatbotA, chatbotB, chatbotOriginalBetaDiff, chatNewBetaDiff, chatIndices = isRankingRobust(k, alphaN, X_dup, y_dup)
        results[(k, alphaN)] = (chatbotA, chatbotB, chatbotOriginalBetaDiff, chatNewBetaDiff, chatIndices)
        alphaN += 1
        print(f'alphaN: {alphaN}')

alphaN: 2
alphaN: 3
alphaN: 4
alphaN: 2
alphaN: 3
alphaN: 4
alphaN: 5
alphaN: 6
alphaN: 7
alphaN: 8
alphaN: 9
alphaN: 10
alphaN: 11
alphaN: 12
alphaN: 13
alphaN: 14
alphaN: 15
alphaN: 16
alphaN: 17
alphaN: 18
alphaN: 19
alphaN: 20
alphaN: 21
alphaN: 22
alphaN: 23
alphaN: 24
alphaN: 25
alphaN: 26
alphaN: 2
alphaN: 3
alphaN: 4
alphaN: 5
alphaN: 6
alphaN: 7
alphaN: 8
alphaN: 9
alphaN: 10
alphaN: 11
alphaN: 12
alphaN: 13
alphaN: 14
alphaN: 15
alphaN: 16
alphaN: 17
alphaN: 18
alphaN: 2
alphaN: 2


In [None]:
# plot top-20 models on full chatbot arena.
filename_to_save = 'fig/top20_cba.png'
plot_title = 'Model Rankings in Chatbot Arena'
# plot_bt_scores(X, y, rankings, alphaN, 20, plot_title, filename_to_save)

In [127]:
# find the (k, alpha N) pairs that are top-k non-robust.
results_nonrobust = {k: v for k, v in results.items() if v[0] != -1}
results_nonrobust

{(1, 3): (21,
  None,
  997.4844821153303,
  1001.2768504277008,
  array([46259,  3527,  6212])),
 (3, 25): (6,
  41,
  50.71912577777253,
  -1.216407540959752,
  array([35121, 52352, 55829, 42442, 40406,  3483, 20172, 25481, 17355,
         13244, 22020,  6887, 22343, 39307, 17575,  5270, 53348, 21912,
         52752, 49539, 10429, 39612, 47469,   661, 37846])),
 (5, 17): (41,
  47,
  31.472403260684565,
  -1.7693612459962704,
  array([49509, 49513, 54538, 56334, 29584, 14102, 50133, 55511, 13835,
         20425, 27552, 38755, 51339, 27090, 12812, 31461, 27279])),
 (10, 1): (5, 4, -0.47590682407854246, 0.335368506890088, array([47669])),
 (20, 1): (29, 48, -3.3594514105587336, 0.27216327930572604, array([38092]))}

In [None]:
import pickle

# Save to pickle
with open('ChatbotArenaWithTiesNonrobust.pkl', 'wb') as f:
    pickle.dump(results, f)

#### Below, we inspect the ranking flip between the first- and second-place models.

In [30]:
### Load in results.
import pickle
with open("results/ChatbotArenaWithTiesNonrobust.pkl", "rb") as f:
    chatBotArenaDataDropped = pickle.load(f)

In [245]:
chatBotArena_noTies = pd.read_csv("data/chatBotArena_noTies.csv")
chatBotArena_noTies.head()

Unnamed: 0,id,model_a,model_b,prompt,response_a,response_b,winner_model_a,winner_model_b,winner_tie
0,30192,gpt-4-1106-preview,gpt-4-0613,"[""Is it morally right to try to have a certain...","[""The question of whether it is morally right ...","[""As an AI, I don't have personal beliefs or o...",1,0,0
1,53567,koala-13b,gpt-4-0613,"[""What is the difference between marriage lice...","[""A marriage license is a legal document that ...","[""A marriage license and a marriage certificat...",0,1,0
2,96401,llama-2-13b-chat,mistral-7b-instruct,"[""How can I create a test set for a very rare ...","[""Creating a test set for a very rare category...","[""When building a classifier for a very rare c...",1,0,0
3,198779,koala-13b,gpt-3.5-turbo-0314,"[""What is the best way to travel from Tel-Aviv...","[""The best way to travel from Tel Aviv to Jeru...","[""The best way to travel from Tel-Aviv to Jeru...",0,1,0
4,292873,vicuna-13b,gpt-4-0314,"[""Construct a rap battle, in the style of Epic...","[""[Zeus]\nYo, it's the king of the gods on the...","[""(Verse 1 - Zeus)\n\nI'm the king of the gods...",0,1,0


9 evals were dropped to flip model i.d. 16 and i.d. None (the reference model).

The name of these models are: 
('gpt-4-0125-preview', 0: 'gpt-4-1106-preview')

In [33]:
## Count number of games between the two models that changed ranks.
is_gpt41106_gpt40125 = (
    (df['model_a'].str.contains('gpt-4-0125-preview') & df['model_b'].str.contains('gpt-4-1106-preview')) |
    (df['model_a'].str.contains('gpt-4-1106-preview') & df['model_b'].str.contains('gpt-4-0125-preview'))
)

num_gpt41106_gpt40125 = df[is_gpt41106_gpt40125].shape[0]
print("Number of games between GPT-4-1106 and GPT-4-0125: ", num_gpt41106_gpt40125)

Number of games between GPT-4-1106 and GPT-4-0125:  134


In [34]:
# model pairs (sorted to group symmetric pairs)
df['model_pair'] = df.apply(lambda row: tuple(sorted([row['model_a'], row['model_b']])), axis=1)
df['model_pair']

0                (gpt-4-0613, gpt-4-1106-preview)
1                         (gpt-4-0613, koala-13b)
2            (gpt-3.5-turbo-0613, mistral-medium)
3         (llama-2-13b-chat, mistral-7b-instruct)
4                 (gpt-3.5-turbo-0314, koala-13b)
                           ...                   
57472                      (claude-1, gpt-4-0613)
57473              (claude-2.0, llama-2-13b-chat)
57474                      (alpaca-13b, claude-1)
57475                    (palm-2, tulu-2-dpo-70b)
57476    (gemini-pro-dev-api, gpt-4-1106-preview)
Name: model_pair, Length: 57477, dtype: object

In [35]:
# Count number of games per model pair
pair_counts = df['model_pair'].value_counts()

# Compute average
average_games_per_pair = pair_counts.mean()
print("Average number of games per model pair:", average_games_per_pair)

Average number of games per model pair: 45.08


In [36]:
# Find the win margin between 'gpt-4-0125-preview' and 'gpt-4-1106-preview'
# that is, find all games that are between the two models.
dfFlippedRanking = df[is_gpt41106_gpt40125]
## Count number of games between that 'gpt-4-0125-preview' won.
gpt40125_wins = (
    (dfFlippedRanking['model_a'].str.contains('gpt-4-0125-preview') & dfFlippedRanking['winner_model_a'] == 1) |
    (dfFlippedRanking['model_b'].str.contains('gpt-4-0125-preview') & dfFlippedRanking['winner_model_b'] == 1)
)
num_gpt40125_wins = dfFlippedRanking[gpt40125_wins].shape[0]
print("Proportion of games that GPT-4-0125 won: ", num_gpt40125_wins / num_gpt41106_gpt40125)

Proportion of games that GPT-4-0125 won:  0.26865671641791045


#### Player involvement in dropped matches

In [42]:
chatBotArenaDataDropped

{(1, 3): (21,
  None,
  -0.004146461890131415,
  0.0031921260689085195,
  array([46259,  3527,  6212])),
 (3, 25): (6,
  41,
  0.17525045894466218,
  -0.0030410188523184445,
  array([35121, 52352, 55829, 42442, 40406,  3483, 20172, 25481, 17355,
         13244, 22020,  6887, 22343, 39307, 17575,  5270, 53348, 21912,
         52752, 49539, 10429, 39612, 47469,   661, 37846])),
 (5, 17): (41,
  47,
  0.013760738510641968,
  -0.004423403115031421,
  array([49509, 49513, 54538, 56334, 29584, 14102, 50133, 55511, 13835,
         20425, 27552, 38755, 51339, 27090, 12812, 31461, 27279])),
 (10, 1): (5, 4, 0.000824661678016203, -0.003135680049496492, array([28908])),
 (20, 1): (29, 48, 0.01478311487693984, -0.01627043204034595, array([53747]))}

In [283]:
# Filter out rows where either model_a or model_b is one of the specified models
exclude_models = ['gpt-4-0125-preview', 'gpt-4-1106-preview']

not_gpt_previews = ~(
    chatBotArena_noTies['model_a'].isin(exclude_models) |
    chatBotArena_noTies['model_b'].isin(exclude_models)
)

chatBotArena_nogpt = chatBotArena_noTies[not_gpt_previews]

In [285]:
chatBotArena_nogpt.shape[0] # run the top-1 procedure on the filtered data.

33625

In [None]:
results_wout_gpt_preview = {}
k = 1
alphaN = 1
chatbotA = -1
while chatbotA == -1:
    chatbotA, chatbotB, chatbotOriginalBetaDiff, chatNewBetaDiff, chatIndices = isRankingRobust(k, alphaN, X, y)
    results_wout_gpt_preview[(k, alphaN)] = (chatbotA, chatbotB, chatbotOriginalBetaDiff, chatNewBetaDiff, chatIndices)
    alphaN += 1

#### Player involvement w out in-question matches

In [290]:
# make design matrix for BT.
rawBT_chatBotArena_nogpt = chatBotArena_nogpt[['model_a', 'model_b', 'winner_model_a']]

newX, newy, player_to_id = make_BT_design_matrix(rawBT_chatBotArena_nogpt)
newX.shape, newy.shape

((33625, 61), (33625,))

In [291]:
# run logistic regression on X, y
gpt_AMIP = LogisticAMIP(newX, newy, fit_intercept=False, penalty=None)

In [294]:
# compare the two GPT-preview models.
model_1 = 16
model_2 = None
alphaN = 1
sign_change_refit = False
while sign_change_refit == False: # increment alphaN until we find a sign change.
        print("testing alphaN: ", alphaN)
        sign_change_amip, sign_change_refit, original_beta_diff, new_beta_diff_amip, new_beta_diff_refit, indices = LogisticAMIP.AMIP_sign_change(gpt_AMIP, alphaN, model_1, model_2, 
                         method = "1sN", refit = True)
        alphaN += 1

testing alphaN:  1
testing alphaN:  2
testing alphaN:  3
testing alphaN:  4
testing alphaN:  5
testing alphaN:  6
testing alphaN:  7
testing alphaN:  8
testing alphaN:  9
testing alphaN:  10
testing alphaN:  11
testing alphaN:  12
testing alphaN:  13
testing alphaN:  14
testing alphaN:  15
testing alphaN:  16
testing alphaN:  17
testing alphaN:  18
testing alphaN:  19
testing alphaN:  20
testing alphaN:  21
testing alphaN:  22
testing alphaN:  23
testing alphaN:  24
testing alphaN:  25
testing alphaN:  26
testing alphaN:  27
testing alphaN:  28
testing alphaN:  29
testing alphaN:  30
testing alphaN:  31
testing alphaN:  32
testing alphaN:  33
testing alphaN:  34
testing alphaN:  35
testing alphaN:  36
testing alphaN:  37
testing alphaN:  38
testing alphaN:  39
testing alphaN:  40
testing alphaN:  41
testing alphaN:  42
testing alphaN:  43
testing alphaN:  44
testing alphaN:  45
testing alphaN:  46
testing alphaN:  47
testing alphaN:  48
testing alphaN:  49
testing alphaN:  50
testing a

In [296]:
# it takes 
sign_change_amip, sign_change_refit, original_beta_diff, new_beta_diff_amip, new_beta_diff_refit, indices
print("it takes ", alphaN, " iterations w/out the specified models to find a sign change.")

it takes  229  iterations w/out the specified models to find a sign change.


### Manuel check (for each k) on the MIS! 

In [107]:
chatBotArenaDataDropped = results_nonrobust

In [108]:
old_tuple = chatBotArenaDataDropped[(1, 3)] # = 0 # change none to -1 (gpt-4-1106-preview).
new_tuple = tuple(-1 if i == 1 and val is None else val for i, val in enumerate(old_tuple))
chatBotArenaDataDropped[(1, 3)] = new_tuple

In [109]:
chatBotArenaDataDropped

{(1, 3): (21,
  -1,
  -0.006288794711674339,
  0.0031921260689085195,
  array([46259,  3527,  6212])),
 (3, 25): (6,
  41,
  0.12679781444443133,
  -0.0030410188523184445,
  array([35121, 52352, 55829, 42442, 40406,  3483, 20172, 25481, 17355,
         13244, 22020,  6887, 22343, 39307, 17575,  5270, 53348, 21912,
         52752, 49539, 10429, 39612, 47469,   661, 37846])),
 (5, 17): (41,
  47,
  0.07868100815171142,
  -0.004423403115031421,
  array([49509, 49513, 54538, 56334, 29584, 14102, 50133, 55511, 13835,
         20425, 27552, 38755, 51339, 27090, 12812, 31461, 27279])),
 (10, 1): (5,
  4,
  -0.0011897670601963561,
  0.0008384212669726443,
  array([34831])),
 (20, 1): (29,
  48,
  -0.008398628526396834,
  0.0006804081985379851,
  array([38092]))}

In [110]:
rows = []
for (k, aN), (playerA, playerB, original_beta_diff, new_beta_diff_refit, indices) in chatBotArenaDataDropped.items():
    rows.append({
        "k-aN": (k, aN),
        "playerA": playerA + 1, # to account for the reference index.
        "playerB": playerB + 1,
        "original_beta_diff": original_beta_diff,
        "new_beta_diff_refit": new_beta_diff_refit,
        "indices": indices
    })
cba_results = pd.DataFrame(rows)
cba_results.head()

Unnamed: 0,k-aN,playerA,playerB,original_beta_diff,new_beta_diff_refit,indices
0,"(1, 3)",22,0,-0.006289,0.003192,"[46259, 3527, 6212]"
1,"(3, 25)",7,42,0.126798,-0.003041,"[35121, 52352, 55829, 42442, 40406, 3483, 2017..."
2,"(5, 17)",42,48,0.078681,-0.004423,"[49509, 49513, 54538, 56334, 29584, 14102, 501..."
3,"(10, 1)",6,5,-0.00119,0.000838,[34831]
4,"(20, 1)",30,49,-0.008399,0.00068,[38092]


In [112]:
mis_inds = cba_results['indices'][4]
X_new = np.delete(X, mis_inds, axis=0)
y_new = np.delete(y, mis_inds, axis=0)

res_full = run_logistic_regression(X,
                                    y,
                                    fit_intercept=False, 
                                    penalty=None
                                    )
res_dropped = run_logistic_regression(X_new,
                                y_new,
                                fit_intercept=False, 
                                penalty=None
                                )

In [113]:
player1 = cba_results['playerA'][4] - 1
player2 = cba_results['playerB'][4] - 1
beta_diff = res_full.coef_[0][player1] - res_full.coef_[0][player2]
beta_diff_refit = res_dropped.coef_[0][player1] - res_dropped.coef_[0][player2]
# res_full.coef_[0][player1], res_dropped.coef_[0][player1] # for the "None" version
beta_diff, beta_diff_refit

(-0.008398628526396834, 0.0006804081988217581)

In [46]:
cba_results['indices'][0]

array([46259,  3527,  6212])

In [47]:
# reverse the mapping.
id_to_player = {v: k for k, v in player_to_id.items()}

In [49]:
# read in the results.
cba_results.head()
cba_results['playerA_Name'] = cba_results['playerA'].map(id_to_player)
cba_results['playerB_Name'] = cba_results['playerB'].map(id_to_player)