In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import csv
from glicko2 import Player

In [None]:
# Path of the battle-log CSV that the loading cells below open with csv.reader.
csv_path='data/0825.csv'

In [None]:

# Maps a model name to its glicko2 Player (filled lazily by get_or_create_player).
models = {}

# One dict per rated battle holding the before/after ratings (appended by update_ratings).
rating_history = []

def get_or_create_player(model_name):
    """Return the Player registered under ``model_name``, creating it on first use.

    A newly created player starts at rating 1500, RD 350 and volatility 0.06
    and is stored in the module-level ``models`` dict.
    """
    try:
        return models[model_name]
    except KeyError:
        newcomer = Player(rating=1500, rd=350, vol=0.06)
        models[model_name] = newcomer
        return newcomer

def update_ratings(model1, model2, result, index):
    """Apply one battle to both players and append a record to ``rating_history``.

    Args:
        model1: Name of the first model.
        model2: Name of the second model.
        result: Score from model1's side: 1 = model1 wins, 0.5 = draw,
            0 = model2 wins. model2 is updated with ``1 - result``.
        index: Identifier of the battle, stored verbatim in the history record.
    """
    first = get_or_create_player(model1)
    second = get_or_create_player(model2)

    # Snapshot both sides up front so that each update is computed against the
    # opponent's pre-battle rating/RD, not an already-updated one.
    first_rating, first_rd = first.rating, first.rd
    second_rating, second_rd = second.rating, second.rd

    first.update_player([second_rating], [second_rd], [result])
    second.update_player([first_rating], [first_rd], [1 - result])

    # Log the change for this battle.
    record = {
        'index': index,
        'model1': model1,
        'model2': model2,
        'rating1_before': first_rating,
        'rating1_after': first.rating,
        'rating2_before': second_rating,
        'rating2_after': second.rating,
        'result': result,
    }
    rating_history.append(record)


In [None]:
# Per-model battle counts and win counts (used later to compute win rates).
total_battle_count = {}
win_count = {}
total_battles=0

# Evaluation code in the CSV -> score from model1's point of view.
#   1 = model1 wins, 2 = model2 wins, 3 = both good (draw), 0 = both bad (draw)
RESULT_BY_EVALUATION = {1: 1, 2: 0, 3: 0.5, 0: 0.5}

# Read the battle log.
with open(csv_path, 'r', encoding='utf-8') as file:
    reader = csv.reader(file)
    next(reader, None)  # skip the header row (tolerates an empty file)

    for index, row in enumerate(reader, start=1):
        try:
            evaluation = int(row[3])
            model1 = row[5]
            model2 = row[6]
        except (ValueError, IndexError):
            # Rows with a blank / non-numeric evaluation or too few columns
            # are skipped; any other error is allowed to surface.
            continue

        if evaluation not in RESULT_BY_EVALUATION:
            # Unrecognised code: no rating update is defined for it, so the
            # row must not be counted as a battle either.
            continue

        total_battles+=1
        update_ratings(model1, model2, RESULT_BY_EVALUATION[evaluation], index)

        # Update the per-model battle and win counters.
        for name in (model1, model2):
            total_battle_count[name] = total_battle_count.get(name, 0) + 1
            win_count.setdefault(name, 0)
        if evaluation == 1:
            win_count[model1] += 1
        elif evaluation == 2:
            win_count[model2] += 1

total_battle_count,win_count

In [None]:
# Sanity check: each counted battle adds one to both of its models' counts,
# so half of the summed per-model counts is shown next to total_battles.
sum(total_battle_count.values())/2,total_battles

In [None]:
# Half-width of the reported interval, in multiples of the rating deviation (1 = one sigma).
sigma_val=1

# List the models from the highest rating to the lowest.
print("Model Ratings (sorted by rating):")
sorted_models = list(models.items())
sorted_models.sort(key=lambda entry: entry[1].rating, reverse=True)

def rename_model(model):
    """Turn a raw model identifier into a short display name.

    The transformation is, in order:
      1. if the identifier contains "/", keep the segment after the first
         slash (drops an "organisation/" prefix);
      2. any identifier containing "plamo" becomes "PLAMO-100B";
      3. "gpt" is replaced by "GPT" and "3-5" by "3.5";
      4. the first character is upper-cased.

    Args:
        model: Raw model identifier string.

    Returns:
        The display name. An input that reduces to the empty string (e.g.
        "" or "org/") is returned as "" instead of raising IndexError.
    """
    if "/" in model:
        model_ = model.split("/")[1]
    else:
        model_ = model
    if "plamo" in model:
        model_ = "PLAMO-100B"
    model_ = model_.replace("gpt", "GPT").replace("3-5", "3.5")

    # Slicing (rather than indexing) keeps this safe for an empty name.
    return model_[:1].upper() + model_[1:]

# Build one summary record per model, in descending rating order.
sorted_data=[]
for model, player in sorted_models:
    rating = player.rating
    rd = player.rd
    # Interval of +/- sigma_val rating deviations around the rating.
    lower_bound = rating - sigma_val * rd
    upper_bound = rating + sigma_val * rd
    # The label reports the actual interval width instead of claiming a
    # 95% CI (which would need roughly 2 RD, not sigma_val = 1).
    print(f"{model}: Rating = {rating:.2f}, RD = {rd:.2f} (+/-{sigma_val} RD: {lower_bound:.2f} - {upper_bound:.2f})")

    # Display name (organisation prefix dropped, first letter upper-cased).
    model_=rename_model(model)
    # Fraction of this model's battles that it won.
    win_rate=win_count[model]/total_battle_count[model]
    d={"name":model_,"name_original":model,
       "win_rate":win_rate,
       "total_battle_count":total_battle_count[model],
       "rating":rating,"rd":rd,
       "lower_bound":lower_bound,"upper_bound":upper_bound,
       }
    sorted_data.append(d)

# Optional export of the rating history. Disabled by default (it was
# previously disabled by wrapping the code in a string literal, which the
# notebook then echoed as the cell output). Set the flag to True to write it.
SAVE_RATING_HISTORY = False
RATING_HISTORY_PATH = 'data/rating_history.csv'

if SAVE_RATING_HISTORY:
    with open(RATING_HISTORY_PATH, 'w', newline='', encoding='utf-8') as file:
        writer = csv.DictWriter(file, fieldnames=[
            'index', 'model1', 'model2', 'rating1_before', 'rating1_after',
            'rating2_before', 'rating2_after', 'result'
        ])
        writer.writeheader()
        writer.writerows(rating_history)

    print(f"Rating history has been saved to '{RATING_HISTORY_PATH}'")

In [None]:
# Horizontal bar chart of the ratings with error bars of two rating deviations.
df=pd.DataFrame(sorted_data)
fig, ax = plt.subplots(figsize=(8,5),dpi=150)
sns.barplot(x="rating",y="name",data=df,ax=ax)
# Draw the error bars only; no markers are plotted.
ax.errorbar(df["rating"],df["name"],xerr=df["rd"]*2,fmt='None',color='black',capsize=5,elinewidth=1,alpha=0.5)
ax.set_xlim(1100,1900)
ax.set_xlabel("Glicko-2 Rating")
ax.set_ylabel("Model name")

In [None]:
# Horizontal bar chart of the ratings with error bars of one rating deviation.
df=pd.DataFrame(sorted_data)
fig, ax = plt.subplots(figsize=(8,5),dpi=150)
sns.barplot(x="rating",y="name",data=df,ax=ax)
# Draw the error bars only; no markers are plotted.
ax.errorbar(df["rating"],df["name"],xerr=df["rd"],fmt='None',color='black',capsize=5,elinewidth=1,alpha=0.5)
ax.set_xlim(1100,1800)
ax.set_xlabel("Glicko-2 Rating")
ax.set_ylabel("Model name")

In [None]:
#信頼区間での順位を予測
import copy

# Best-case / worst-case rank of every model given its rating interval.
df=pd.DataFrame(sorted_data)
df["lowerst_rank"]=0
df["highest_rank"]=0
model_list=df["name"].tolist()


def _rank_under_scenario(table, target, own_bound, rivals_bound):
    """Return the 1-based rank of ``target`` when its rating is taken from
    column ``own_bound`` and every other model's from ``rivals_bound``."""
    scenario=table.copy()
    is_target=scenario["name"]==target
    scenario["rating"]=scenario[rivals_bound]
    scenario.loc[is_target,"rating"]=scenario.loc[is_target,own_bound]
    ordered=scenario.sort_values("rating",ascending=False).reset_index(drop=True)
    return ordered[ordered["name"]==target].index[0]+1


for model in model_list:
    # Most pessimistic case for this model: it sits at its lower bound while
    # all the others sit at their upper bounds.
    df.loc[df["name"]==model,"lowerst_rank"]=_rank_under_scenario(df,model,"lower_bound","upper_bound")

    # Most optimistic case: it sits at its upper bound, the others at their lower bounds.
    df.loc[df["name"]==model,"highest_rank"]=_rank_under_scenario(df,model,"upper_bound","lower_bound")


# Present the range as "best-worst" and tidy the numeric columns for display.
df["rank_range"]=df["highest_rank"].astype(str).str.cat(df["lowerst_rank"].astype(str),sep="-")
df=df.astype({"rating":int,"lower_bound":int,"upper_bound":int,"rd":int})
df["win_rate"]=df["win_rate"].map(lambda value:round(value,2))
df=df.drop(columns=["lowerst_rank","highest_rank","name_original","lower_bound","upper_bound"])


# Reference scores from Leaderboard 3, keyed by display name.
# Row layout: (name, overall average, Jaster zero-shot, Japanese MT-Bench).
# A score of -1 is the value the plotting cells exclude.
_LD3_REFERENCE_SCORES = [
    ('Claude-3.5-sonnet-20240620', 0.82, 0.7, 8.7),
    ('GPT-4o-2024-05-13', 0.78, 0.69, 8.6),
    ('Tanuki-8x8B-dpo-v1.0', 0.57, 0.32, 7.0),
    ('Gemini-1.5-pro', 0.73, 0.59, 7.9),
    ('Gemini-1.5-flash', 0.70, 0.57, 7.6),
    ('GPT-4o-mini-2024-07-18', 0.72, 0.64, 8.3),
    ('Calm3-22b-chat', 0.65, 0.57, 6.9),
    ('Tanuki-8B-dpo-v1.0', 0.52, 0.22, 6.6),
    ('PLAMO-100B', -1, -1, -1),
    ('Llama-3-ELYZA-JP-8B', 0.62, 0.42, 6.1),
    ('Karakuri-lm-8x7b-chat-v0.1', 0.6, 0.48, 5.8),
    ('GPT-3.5-turbo', 0.58, 0.44, 6.8),
    ('Llama-3-Swallow-70B-Instruct-v0.1', 0.65, 0.5, 6.2),
]

# Leaderboard 3 overall average.
ld3_score_dict = {name: average for name, average, _, _ in _LD3_REFERENCE_SCORES}
# Leaderboard 3 Jaster zero-shot score.
ld3_jaster_zero_shot_score_dict = {name: jaster for name, _, jaster, _ in _LD3_REFERENCE_SCORES}
# Leaderboard 3 MT-Bench score.
ld3_mt_score_dict = {name: mt_bench for name, _, _, mt_bench in _LD3_REFERENCE_SCORES}

# Attach the three reference columns by display name.
df = df.assign(**{
    "LD3-ave": df["name"].map(ld3_score_dict),
    "LD3-Jaster-zero-shot": df["name"].map(ld3_jaster_zero_shot_score_dict),
    "JMT-Bench": df["name"].map(ld3_mt_score_dict),
})

# Order by win rate, persist the leaderboard, and display it.
df = df.sort_values("win_rate", ascending=False, ignore_index=True)
df.to_csv("data/leaderboard.csv",index=False)
df

In [None]:
# Win rate against Japanese MT-Bench score, with tick marks pointing inwards.
plt.rcParams.update({'xtick.direction': 'in', 'ytick.direction': 'in'})
fig, ax = plt.subplots(figsize=(4,4),dpi=150)

# Rows whose MT-Bench score is the -1 placeholder are left out.
sns.scatterplot(x="JMT-Bench",y="win_rate",data=df[df["JMT-Bench"]!=-1],ax=ax)
ax.set_xlabel("Japanese MT-Bench")
ax.set_ylabel("Win rate")


In [None]:

# Rating against the Jaster zero-shot score, with tick marks pointing inwards.
plt.rcParams.update({'xtick.direction': 'in', 'ytick.direction': 'in'})
plt.figure(figsize=(4,4),dpi=150)

# Rows whose Jaster score is the -1 placeholder are left out.
jaster_scored = df.loc[df["LD3-Jaster-zero-shot"].ne(-1)]
sns.scatterplot(data=jaster_scored,x="LD3-Jaster-zero-shot",y="rating")
plt.xlabel("Jaster-zero-shot")



In [None]:

# Rating against Japanese MT-Bench score, with tick marks pointing inwards.
plt.rcParams.update({'xtick.direction': 'in', 'ytick.direction': 'in'})
fig, ax = plt.subplots(figsize=(4,4),dpi=150)

# Rows whose MT-Bench score is the -1 placeholder are left out.
sel_df=df[df["JMT-Bench"]!=-1]
sns.scatterplot(x="JMT-Bench",y="rating",data=sel_df,ax=ax)
# Vertical error bars of one rating deviation (yerr is rd itself, not doubled).
ax.errorbar(sel_df["JMT-Bench"],sel_df["rating"],yerr=sel_df["rd"],
            fmt='None',color='gray',capsize=5,elinewidth=1,alpha=0.5)
ax.set_xlabel("Japanese MT-Bench")
ax.set_ylabel("Rating")


In [None]:
# Display the current leaderboard table.
df

In [None]:

# Rating against the Leaderboard 3 average score, with tick marks pointing inwards.
plt.figure(figsize=(4,4),dpi=150)
plt.rcParams['xtick.direction'] = 'in'
plt.rcParams['ytick.direction'] = 'in'

# Exclude rows by the placeholder of the column that is actually plotted
# (the filter previously tested "JMT-Bench" while plotting "LD3-ave").
sns.scatterplot(x="LD3-ave",y="rating",data=df[df["LD3-ave"]!=-1])
plt.xlabel("Leaderboard3 average score")


In [None]:
# Correlation coefficients between win rate, rating and the benchmark scores,
# computed on the rows whose MT-Bench score is not the -1 placeholder.
sel_df=df.loc[df["JMT-Bench"].ne(-1)]
sel_df.loc[:, ["win_rate","rating","JMT-Bench","LD3-Jaster-zero-shot"]].corr()

In [None]:

# Japanese MT-Bench score against the Jaster zero-shot score, inward tick marks.
plt.rcParams.update({'xtick.direction': 'in', 'ytick.direction': 'in'})
fig, ax = plt.subplots(figsize=(4,4),dpi=150)

# Rows whose MT-Bench score is the -1 placeholder are left out.
sns.scatterplot(x="LD3-Jaster-zero-shot",y="JMT-Bench",data=df[df["JMT-Bench"]!=-1],ax=ax)
ax.set_xlabel("LD3-Jaster-zero-shot")
ax.set_ylabel("JMT-Bench")



In [None]:

# Rating against win rate, with tick marks pointing inwards.
plt.rcParams.update({'xtick.direction': 'in', 'ytick.direction': 'in'})
fig, ax = plt.subplots(figsize=(4,4),dpi=150)

# Vertical error bars of one rating deviation, drawn without markers.
ax.errorbar(df["win_rate"],df["rating"],yerr=df["rd"],
            fmt='None',color='gray',capsize=5,elinewidth=1,alpha=0.5)

# Points plus a fitted regression line (no confidence band) via seaborn.
sns.regplot(x="win_rate",y="rating",data=df,ci=0,line_kws={"alpha":0.4},ax=ax)

ax.set_xlabel("win_rate")
ax.set_ylabel("rating")



In [None]:
# Display the leaderboard table again before the Elo section.
df

# Chatbot Arena 本家の方式（MLE Elo / Bradley-Terry）による rating

In [None]:
import csv
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
import math
from tqdm import tqdm
import plotly.express as px

def compute_mle_elo(df, SCALE=400, BASE=10, INIT_RATING=1000):
    """Estimate Elo-style ratings for all models with a weighted logistic regression.

    Args:
        df: Battle table with columns "model_a", "model_b" and "winner".
            "winner" is matched against "model_a", "model_b", "tie" and
            "tie (bothbad)".
        SCALE: Multiplier applied to the fitted coefficients.
        BASE: Logarithm base of the rating scale; features are +/- log(BASE).
        INIT_RATING: Offset added to the scaled coefficients.

    Returns:
        A pandas Series of ratings indexed by model name, sorted in
        descending order. If "weblab-GENIAC/Tanuki-8B-dpo-v1.0" is among the
        models, all ratings are shifted so that this model sits at 1000.
    """
    # Number of battles won by the model in the "model_a" slot, per (model_a, model_b) pair.
    ptbl_a_win = pd.pivot_table(
        df[df["winner"] == "model_a"],
        index="model_a",
        columns="model_b",
        aggfunc="size",
        fill_value=0,
    )
    
    # Tie counts; when there are no ties at all, use an all-zero table shaped like ptbl_a_win.
    if sum(df["winner"].isin(["tie", "tie (bothbad)"])) == 0:
        ptbl_tie = pd.DataFrame(0, index=ptbl_a_win.index, columns=ptbl_a_win.columns)
    else:
        ptbl_tie = pd.pivot_table(
            df[df["winner"].isin(["tie", "tie (bothbad)"])],
            index="model_a",
            columns="model_b",
            aggfunc="size",
            fill_value=0,
        )
        # Symmetrise: a tie is credited to both orderings of the pair.
        ptbl_tie = ptbl_tie + ptbl_tie.T

    # Number of battles won by the model in the "model_b" slot, per (model_a, model_b) pair.
    ptbl_b_win = pd.pivot_table(
        df[df["winner"] == "model_b"],
        index="model_a",
        columns="model_b",
        aggfunc="size",
        fill_value=0,
    )
    # Entry [x, y] scores x against y: every win counts 2 (b-wins are
    # transposed so the winner is the row) and every tie counts 1.
    ptbl_win = ptbl_a_win * 2 + ptbl_b_win.T * 2 + ptbl_tie

    # Model name -> column position in the design matrix.
    models = pd.Series(np.arange(len(ptbl_win.index)), index=ptbl_win.index)

    # Two rows are reserved per ordered pair of distinct models; unused rows are trimmed below.
    p = len(models)
    X = np.zeros([p * (p - 1) * 2, p])
    Y = np.zeros(p * (p - 1) * 2)

    cur_row = 0
    sample_weights = []
    for m_a in ptbl_win.index:
        for m_b in ptbl_win.columns:
            if m_a == m_b:
                continue
            # Skip pairs with a NaN score in either direction.
            # NOTE(review): NaN presumably appears where the summed pivot
            # tables do not share a row/column label - confirm.
            if math.isnan(ptbl_win.loc[m_a, m_b]) or math.isnan(ptbl_win.loc[m_b, m_a]):
                continue
            # Row with label 1, weighted by m_a's score against m_b.
            X[cur_row, models[m_a]] = +math.log(BASE)
            X[cur_row, models[m_b]] = -math.log(BASE)
            Y[cur_row] = 1.0
            sample_weights.append(ptbl_win.loc[m_a, m_b])

            # Same features with label 0, weighted by m_b's score against m_a.
            X[cur_row + 1, models[m_a]] = math.log(BASE)
            X[cur_row + 1, models[m_b]] = -math.log(BASE)
            Y[cur_row + 1] = 0.0
            sample_weights.append(ptbl_win.loc[m_b, m_a])
            cur_row += 2
    # Drop the rows that were reserved but never filled.
    X = X[:cur_row]
    Y = Y[:cur_row]

    # Unpenalised logistic regression without intercept; one coefficient per model.
    lr = LogisticRegression(fit_intercept=False, penalty=None, tol=1e-6)
    lr.fit(X, Y, sample_weight=sample_weights)
    elo_scores = SCALE * lr.coef_[0] + INIT_RATING
    # Anchor the scale: shift every rating so the reference model is exactly 1000.
    if "weblab-GENIAC/Tanuki-8B-dpo-v1.0" in models.index:
        elo_scores += 1000 - elo_scores[models["weblab-GENIAC/Tanuki-8B-dpo-v1.0"]]
    return pd.Series(elo_scores, index=models.index).sort_values(ascending=False)

def get_bootstrap_result(battles, func_compute_elo, num_round):
    """Re-estimate the ratings on ``num_round`` bootstrap resamples of ``battles``.

    Each round draws a same-size sample with replacement and passes it to
    ``func_compute_elo``. The results are stacked into a DataFrame (one row
    per round) whose columns are ordered by descending median.
    """
    estimates = [
        func_compute_elo(battles.sample(frac=1.0, replace=True))
        for _ in tqdm(range(num_round), desc="bootstrap")
    ]
    stacked = pd.DataFrame(estimates)
    by_median = stacked.median().sort_values(ascending=False).index
    return stacked[by_median]

def visualize_bootstrap_scores(df, title):
    """Plot the bootstrap median rating of each model with a 2.5%-97.5% error bar.

    Args:
        df: Bootstrap results, one column per model and one row per round.
        title: Figure title.

    Returns:
        The plotly figure.
    """
    quantiles = {
        "lower": df.quantile(.025),
        "rating": df.quantile(.5),
        "upper": df.quantile(.975),
    }
    bars = (
        pd.DataFrame(quantiles)
        .reset_index(names="model")
        .sort_values("rating", ascending=False)
    )
    # Asymmetric error-bar lengths around the median, plus display helpers.
    bars = bars.assign(
        error_y=bars["upper"] - bars["rating"],
        error_y_minus=bars["rating"] - bars["lower"],
        rating_rounded=np.round(bars["rating"], 2),
        model_name=bars["model"].map(rename_model),
    )
    fig = px.scatter(
        bars,
        x="model_name",
        y="rating",
        error_y="error_y",
        error_y_minus="error_y_minus",
        text="rating_rounded",
        title=title,
    )
    fig.update_layout(xaxis_title="Model", yaxis_title="Rating", height=600)
    return fig

# Evaluation code in the CSV -> winner label expected by compute_mle_elo.
WINNER_BY_EVALUATION = {
    1: "model_a",
    2: "model_b",
    3: "tie",            # both answers good
    0: "tie (bothbad)",  # both answers bad
}

# Read the battle log into a list of {model_a, model_b, winner} records.
battles = []
with open(csv_path, 'r', encoding='utf-8') as file:
    reader = csv.reader(file)
    next(reader, None)  # skip the header row (tolerates an empty file)

    for index, row in enumerate(reader, start=1):
        try:
            evaluation = int(row[3])
            model_a = row[5]
            model_b = row[6]
        except (ValueError, IndexError):
            # Rows with a blank / non-numeric evaluation or too few columns
            # are skipped; any other error is allowed to surface.
            continue

        winner = WINNER_BY_EVALUATION.get(evaluation)
        if winner is None:
            # Unrecognised code: skip the row instead of reusing the winner
            # of the previous row (or hitting a NameError on the first row).
            continue

        battles.append({
            "model_a": model_a,
            "model_b": model_b,
            "winner": winner
        })

# Turn the collected battle records into a DataFrame.
df_battles = pd.DataFrame(battles)

# Point estimate of the Elo ratings by maximum likelihood.
elo_mle_ratings = compute_mle_elo(df_battles)
elo_dict=elo_mle_ratings.to_dict()

# Print the ranking.
print("Model Ratings (sorted by MLE Elo rating):")
for i, (model, rating) in enumerate(elo_mle_ratings.items(), 1):
    print(f"{i}. {model}: MLE Elo Rating = {rating:.2f}")

# Persist the point estimates.
elo_mle_ratings.to_csv('mle_elo_ratings.csv', header=True)
print("MLE Elo ratings have been saved to 'mle_elo_ratings.csv'")

# Bootstrap the estimate to obtain confidence intervals (seeded for repeatability).
BOOTSTRAP_ROUNDS = 100
np.random.seed(42)
bootstrap_elo_lu = get_bootstrap_result(df_battles, compute_mle_elo, BOOTSTRAP_ROUNDS)

# Visualise the bootstrap distribution.
fig = visualize_bootstrap_scores(bootstrap_elo_lu, "Bootstrap of MLE Elo Rating Estimates")
fig.show()

# Table of the 2.5% / 50% / 97.5% bootstrap quantiles, best median first.
confidence_intervals = pd.DataFrame({
    label: bootstrap_elo_lu.quantile(q)
    for label, q in (('Lower CI', 0.025), ('Median', 0.5), ('Upper CI', 0.975))
}).sort_values('Median', ascending=False)

print("\nModel Ratings with Confidence Intervals:")
for model, ci_row in confidence_intervals.iterrows():
    lower, median, upper = ci_row['Lower CI'], ci_row['Median'], ci_row['Upper CI']
    print(f"{model}: Median = {median:.2f}, 95% CI = [{lower:.2f}, {upper:.2f}]")

# Persist the interval table.
confidence_intervals.to_csv('mle_elo_ratings_with_ci.csv', header=True)
print("MLE Elo ratings with confidence intervals have been saved to 'mle_elo_ratings_with_ci.csv'")

In [None]:
# Re-key the Elo ratings by display name so they line up with df["name"].
elo_dict_=dict((rename_model(raw_name),elo) for raw_name,elo in elo_dict.items())

# Copy of the leaderboard with the integer Elo rating attached and the
# Glicko-specific columns removed.
show_df=(
    df.assign(Rating=df["name"].map(elo_dict_).astype(int))
    .drop(columns=["rd","rank_range"])
)
show_df

In [None]:
# Correlation matrix of the remaining columns, restricted to rows whose
# MT-Bench score is not the -1 placeholder.
# (Variant over every row: show_df.drop(["name","total_battle_count"],axis=1).corr())
(
    show_df.loc[show_df["JMT-Bench"].ne(-1)]
    .drop(columns=["name","total_battle_count"])
    .corr()
)

In [None]:
# Current Glicko rating of every model, keyed by the raw model name.
glicko_dict={name:player.rating for name,player in models.items()}

# Side-by-side comparison of the Elo and Glicko ratings and their correlation.
compare_df=pd.DataFrame({"Elo":elo_dict,"Glicko":glicko_dict})
compare_df.corr()

In [None]:
# Elo against Glicko with a fitted regression line (no confidence band).
ax=sns.regplot(data=compare_df,x="Elo",y="Glicko",ci=0)
ax.set_xlabel("Bradley-Terry")