# Introduction
In this notebook, we compute the agreement between humans and the GPT-4 judge on the MT-bench human judgment dataset (https://huggingface.co/datasets/lmsys/mt_bench_human_judgments). Our results show that, when tie votes are excluded, humans and the GPT-4 judge reach over 80\% agreement, which is the same level of agreement observed between humans themselves.


In [None]:
# import packages
!pip install datasets

import argparse
import json
import os

import numpy as np
from datasets import load_dataset

Collecting datasets
  Downloading datasets-2.13.1-py3-none-any.whl (486 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/486.2 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m481.3/486.2 kB[0m [31m19.4 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m486.2/486.2 kB[0m [31m13.3 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.7,>=0.3.0 (from datasets)
  Downloading dill-0.3.6-py3-none-any.whl (110 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m110.5/110.5 kB[0m [31m11.1 MB/s[0m eta [36m0:00:00[0m
Collecting xxhash (from datasets)
  Downloading xxhash-3.2.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (212 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m212.5/212.5 kB[0m [31m20.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting multiprocess (from datasets)
  Downloading multiprocess-0.70.14-py310-non

# Download data


In [None]:
# Load the MT-bench human judgment dataset referenced in the introduction.
dataset = load_dataset("lmsys/mt_bench_human_judgments")
# Export the "human" and "gpt4_pair" splits to local files; the agreement code
# later in this notebook reads these files back one JSON record per line.
dataset["human"].to_json("human_judgments.json")
dataset["gpt4_pair"].to_json("gpt4_pair_judgments.json")


Downloading readme:   0%|          | 0.00/1.65k [00:00<?, ?B/s]

Downloading and preparing dataset None/None to /root/.cache/huggingface/datasets/lmsys___parquet/lmsys--mt_bench_human_judgments-06854dea3e1614aa/0.0.0/14a00e99c0d15a23649d0db8944380ac81082d4b021f398733dd84f3a6c569a7...


Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/650k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/739k [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

Generating gpt4_pair split:   0%|          | 0/2400 [00:00<?, ? examples/s]

Generating human split:   0%|          | 0/3355 [00:00<?, ? examples/s]

Dataset parquet downloaded and prepared to /root/.cache/huggingface/datasets/lmsys___parquet/lmsys--mt_bench_human_judgments-06854dea3e1614aa/0.0.0/14a00e99c0d15a23649d0db8944380ac81082d4b021f398733dd84f3a6c569a7. Subsequent calls will reuse this data.


  0%|          | 0/2 [00:00<?, ?it/s]

Creating json from Arrow format:   0%|          | 0/4 [00:00<?, ?ba/s]

Creating json from Arrow format:   0%|          | 0/3 [00:00<?, ?ba/s]

11356420

# Agreement Computation Code

In [None]:
def get_judge_name(judge):
    """Map a raw ``judge`` field to the canonical judge label used for grouping.

    Args:
        judge: Either a list whose first two items are the judge model and the
            prompt name (e.g. ``["gpt-4", "pair-v2"]``), or a string identifier.

    Returns:
        ``"gpt4-pair"`` for a GPT-4 pairwise list, ``"human"`` for strings
        starting with ``"expert"``, ``"author"`` for strings starting with
        ``"author"``, and the input itself otherwise.

    Note:
        The list form yields ``"gpt4-pair"`` (hyphen), whereas a plain
        ``"gpt4_pair"`` string is passed through unchanged (underscore).
        Callers must query with whichever label their data produces.
        A list that is not a GPT-4 pairwise judge raises ``AttributeError``
        because lists have no ``startswith``.
    """
    is_gpt4_pairwise = (
        isinstance(judge, list)
        and judge[0] == "gpt-4"
        and judge[1].startswith("pair")
    )
    if is_gpt4_pairwise:
        return "gpt4-pair"

    # Prefix -> canonical label; checked in the same order as before.
    for prefix, label in (("expert", "human"), ("author", "author")):
        if judge.startswith(prefix):
            return label
    return judge


def revert(vote):
    """Swap the winning side of a vote.

    ``"model_a"`` becomes ``"model_b"`` and vice versa; any other value
    (for example a tie label) is returned unchanged.
    """
    if vote not in ("model_a", "model_b"):
        return vote
    return "model_b" if vote == "model_a" else "model_a"


def get_mt_bench_votes_data(raw_votes):
    """Group raw vote records by turn, question/model pair, and judge.

    Args:
        raw_votes: Iterable of vote lists (one list per input file). Each vote
            is a mapping with the keys ``"turn"``, ``"question_id"``,
            ``"model_a"``, ``"model_b"``, ``"winner"`` and ``"judge"``.

    Returns:
        A two-element list indexed by ``turn - 1``. Each element maps
        ``(question_id, first_model, second_model)`` to a dict of
        ``judge label -> list of winners``, where the two models are stored in
        ascending order and the winner is flipped to match that order.
    """
    per_turn = [{}, {}]

    for judge_votes in raw_votes:
        for record in judge_votes:
            turn_idx = record["turn"] - 1
            model_a, model_b = record["model_a"], record["model_b"]
            if model_a < model_b:
                pair_key = (record["question_id"], model_a, model_b)
                outcome = record["winner"]
            else:
                # Models are stored in sorted order, so the recorded winner
                # must be mirrored (this branch also covers equal names).
                pair_key = (record["question_id"], model_b, model_a)
                outcome = revert(record["winner"])
            judge_label = get_judge_name(record["judge"])
            by_judge = per_turn[turn_idx].setdefault(pair_key, {})
            by_judge.setdefault(judge_label, []).append(outcome)

    return per_turn


def convertvote(vote):
    """Collapse every tie variant (any vote containing ``"tie"``) to ``"tie"``.

    Non-tie votes are returned unchanged.
    """
    return "tie" if "tie" in vote else vote


def equalvote(vote1, vote2):
    """Return whether two votes agree, treating all tie variants as equal.

    Two votes that both contain ``"tie"`` agree regardless of their exact
    text; otherwise the votes must compare equal.
    """
    both_are_ties = "tie" in vote1 and "tie" in vote2
    return both_are_ties or vote1 == vote2


# data: Dict[(question_id, model_a, model_b) -> Dict[judge -> List[vote]]]
def get_mt_bench_agreement(data, judge1, judge2, ban):
    """Count agreeing vote pairs between two judges for one turn.

    Args:
        data: Mapping from a question/model-pair key to a dict of
            ``judge label -> list of votes``.
        judge1: Label of the first judge; must start with ``"gpt4"`` or be
            ``"human"``.
        judge2: Label of the second judge; must be ``"human"``.
        ban: Collection of normalized votes (see ``convertvote``) to exclude;
            a pair is skipped if either vote is banned.

    Returns:
        ``(agree, total)``: the number of agreeing pairs and the number of
        pairs compared.

    Raises:
        Exception: If the judge combination is neither GPT-4 vs. human nor
            human vs. human.
        AssertionError: If a GPT-4 judge has anything other than exactly one
            vote for a key.
    """
    gpt4_vs_human = judge1.startswith("gpt4") and judge2 == "human"
    human_vs_human = judge1 == "human" and judge2 == "human"
    if not (gpt4_vs_human or human_vs_human):
        raise Exception("Unsupported judges.")

    agree = 0
    total = 0

    if gpt4_vs_human:
        # Compare the single GPT-4 vote against every human vote on that key.
        for votes in data.values():
            if not (judge1 in votes and judge2 in votes):
                continue
            model_votes = votes[judge1]
            assert len(model_votes) == 1
            model_vote = model_votes[0]
            if convertvote(model_vote) in ban:
                continue
            for human_vote in votes[judge2]:
                if convertvote(human_vote) not in ban:
                    total += 1
                    agree += equalvote(model_vote, human_vote)
        return agree, total

    # Human vs. human: compare every unordered pair of human votes per key.
    for votes in data.values():
        if "human" not in votes:
            continue
        human_votes = votes["human"]
        for idx, first in enumerate(human_votes):
            for second in human_votes[idx + 1:]:
                if convertvote(first) in ban or convertvote(second) in ban:
                    continue
                total += 1
                agree += equalvote(first, second)
    return agree, total


def run_mt_bench_agreement(judges, votefiles):
    """Load vote files and print agreement statistics for two judges.

    For each of the two turns, agreement is reported twice: once counting all
    votes ("with tie") and once excluding tie votes ("without tie").

    Args:
        judges: Sequence of two judge labels, passed to
            ``get_mt_bench_agreement`` as ``judge1`` and ``judge2``.
        votefiles: Paths of JSON Lines files, one vote record per line.

    Returns:
        None. The results are printed, one line per turn/tie setting.
    """
    # votes[i]: list of vote records parsed from votefiles[i]
    votes = []
    for filename in votefiles:
        # The context manager closes the file promptly; blank lines are
        # skipped so they do not crash json.loads.
        with open(filename, "r", encoding="utf-8") as vote_file:
            votes.append([json.loads(line) for line in vote_file if line.strip()])

    data = get_mt_bench_votes_data(votes)

    tie_settings = (("with tie", []), ("without tie", ["tie"]))
    for turn_number, turn_data in enumerate(data, start=1):
        for label, ban in tie_settings:
            agree, total = get_mt_bench_agreement(turn_data, judges[0], judges[1], ban=ban)
            # With no comparable votes (e.g. a judge label that matches no
            # records), report nan instead of raising ZeroDivisionError.
            ratio = agree / total if total else float("nan")
            print(f"turn {turn_number} {label}. #total: {total}, #agree: {agree}, ratio: {ratio:.2f}")

# Results

In [None]:
# Compute agreement between GPT-4 and humans
run_mt_bench_agreement(["gpt4_pair", "human"], ["gpt4_pair_judgments.json", "human_judgments.json"])

turn 1 with tie. #total: 1343, #agree: 886, ratio: 0.66
turn 1 without tie. #total: 859, #agree: 727, ratio: 0.85
turn 2 with tie. #total: 1325, #agree: 871, ratio: 0.66
turn 2 without tie. #total: 864, #agree: 731, ratio: 0.85


In [None]:
# Compute agreement between humans and humans
run_mt_bench_agreement(["human", "human"], ["human_judgments.json"])

turn 1 with tie. #total: 721, #agree: 454, ratio: 0.63
turn 1 without tie. #total: 479, #agree: 388, ratio: 0.81
turn 2 with tie. #total: 707, #agree: 471, ratio: 0.67
turn 2 without tie. #total: 474, #agree: 388, ratio: 0.82
