This notebook is adapted from the original mt_metrics_eval demo notebook.

To run this notebook, first register the dataset in mt_metrics_eval by adding the following definitions to `meta_info.py`:

'''
LITEVAL = MetaInfo('src', {'sys': 'mqm_score', 'seg': 'mqm_score'}, set(), set())

LITEVAL_PRIMARIES = {'mqm_score', 'rating', 'cometkiwi', 'comet_xl', 'comet_xxl', 'm-pro', 'treqa',
       'openai_gpt-4o-mini_final_set_with_Qlevel_step',
       'openai_gpt-4o-mini_final_set_with_Qlevel_step_weighted',
       'meta-llama_llama-3.3-70b-instruct_PAR3-final_set_with_plevel_stepv2',
       'meta-llama_llama-3.3-70b-instruct_PAR3-final_set_with_plevel_stepv2_weighted',
       'openai_gpt-4o-mini_final_set_with_plevel_stepv2',
       'openai_gpt-4o-mini_final_set_with_plevel_stepv2_weighted',
       'openai_gpt-4o-mini_PAR3-final_set_with_plevel_stepv2',
       'openai_gpt-4o-mini_PAR3-final_set_with_plevel_stepv2_weighted',
       'meta-llama_llama-3.3-70b-instruct_PAR3-final_set_with_QA',
       'meta-llama_llama-3.3-70b-instruct_PAR3-final_set_with_QA_weighted',
       'openai_gpt-4o-mini_final_set_with_QA',
       'openai_gpt-4o-mini_final_set_with_QA_weighted',
       'Qwen_Qwen2.5-32B-Instruct_final_set_with_plevel_stepv2',
       'Qwen_Qwen2.5-32B-Instruct_final_set_with_plevel_stepv2_weighted',
       'meta-llama_llama-3.1-405b-instruct_final_set_with_plevel_stepv2',
       'meta-llama_llama-3.1-405b-instruct_final_set_with_plevel_stepv2_weighted',
       'openai_gpt-4o-mini_PAR3-final_set_with_QA',
       'openai_gpt-4o-mini_PAR3-final_set_with_QA_weighted',
       'qwen_qwen-2.5-72b-instruct_final_set_with_plevel_stepv2',
       'qwen_qwen-2.5-72b-instruct_final_set_with_plevel_stepv2_weighted',
       'meta-llama_llama-3.3-70b-instruct_final_set_with_plevel_stepv2',
       'meta-llama_llama-3.3-70b-instruct_final_set_with_plevel_stepv2_weighted',
       'quar_full_20250320_2116568550', 'quar_fr_en_20250320_19440812300',
       'half_xxen_20250320_16390712500', 'quar_xxen_20250320_21051212150',
       'half_full_20250321_11562622350', 'half_fr_en_20250320_16162112500',
}

LITEVAL_BASELINES = LITEVAL_PRIMARIES
'''


---
This is a demo colab for MTME. It assumes you have mt_metrics_eval installed on your runtime, and have downloaded the data onto that machine. Run the cells below in order.




# Preliminaries

In [None]:
import pandas as pd
import os
import numpy as np
import re
folder = "datasets/"
# Human-annotated LitEval scores from the previous study.
df_list = pd.read_csv(os.path.join(folder, "LitEval_human_annotated_init_bench.csv"))

# Fill missing text BEFORE any string operation; filling afterwards (as this cell
# used to do) is too late because the string call already fails on NaN.
df_list["source"] = df_list["source"].fillna("")
df_list["tgt"] = df_list["tgt"].fillna("")

# Short source key: the first 50 characters of the source with newlines removed.
df_list["source_"] = (
    df_list["source"]
    .str.replace("\n ", "", regex=False)
    .str.replace("\n", "", regex=False)
    .str[:50]
)

# Replace every newline with two spaces (the text is not otherwise altered).
df_list["source"] = df_list["source"].str.replace("\n", "  ", regex=False)
df_list["tgt"] = df_list["tgt"].str.replace("\n", "  ", regex=False)

df_list["mqm_score"] = df_list["mqm_score"].astype(float)
df_list["rating"] = df_list["rating"].astype(float)

In [None]:
# load treqa
import json
files = ["bench-lit2-1000.jsonl", "bench-lit2-2000.jsonl", "bench-lit2-3666.jsonl"]
# Read every JSON-lines shard and stack the records in file order.
records = []
for shard in files:
    with open("eval_results/treqa/" + shard, "r") as handle:
        records.extend(json.loads(row) for row in handle)
res = pd.DataFrame(records)
# The segment-level treqa score is the mean of the per-question scores.
res["treqa"] = res["per_q_scores"].map(np.mean)

In [None]:
# Load the raw m-pro outputs; the free-text answer lives in the `llm_response`
# column and is turned into a number by extract_number below.
mp = pd.read_csv("eval_results/mpro/m-premetheous.csv")
def extract_number(text):
    """Extract an integer score from a free-text model response.

    Lookup order:
      1. the number following a ``[RESULT]`` tag;
      2. the first character, when the text starts with a digit (only that
         single digit is used, so ``"42 points"`` gives 4);
      3. the first run of digits anywhere in the text.

    Args:
        text: the raw response. Non-string values (e.g. NaN or None coming
            from a missing CSV cell) are accepted.

    Returns:
        The extracted ``int``, or ``None`` when ``text`` is not a string, is
        empty, or contains no number.
    """
    if not isinstance(text, str):
        return None

    tagged = re.search(r"\[RESULT\]\s*(\d+)", text)
    if tagged:
        return int(tagged.group(1))

    # text[:1] is "" for an empty string, so this cannot raise IndexError;
    # isdecimal() only accepts characters that int() can actually convert.
    if text[:1].isdecimal():
        return int(text[0])

    anywhere = re.search(r"(\d+)", text)
    return int(anywhere.group(1)) if anywhere else None

# Parse every free-text response into an integer score (None when no number is found).
mp["score"] = mp["llm_response"].apply(extract_number)

In [None]:
# Subset only the test data: the rows listed in the sampled benchmark (the
# "del set") are excluded from df_list further down.
del_set = pd.read_csv("datasets/sampled_benchmark.csv")
del_set_liteval = del_set[del_set["dataset"] == "LitEval"]
del_set.groupby(["dataset"]).size()
# Select several columns with a list: the tuple form `["pair", "ID"]` is
# deprecated and raises in pandas >= 2.0.
del_set.groupby("dataset")[["pair", "ID"]].nunique()

  del_set.groupby("dataset")["pair", "ID"].nunique()


Unnamed: 0_level_0,pair,ID
dataset,Unnamed: 1_level_1,Unnamed: 2_level_1
LitEval,4,70
literarytran,18,99


In [None]:
# Load the benchmark rows and attach the automatic metric scores column by column.
df = pd.read_csv("datasets/benchmark_dataset_all_src_tgt.csv")
df_comet = pd.read_csv("eval_results/xcomet/comet_df_full.csv")
# The COMET table must list the same IDs in the same order as the benchmark.
assert (df_comet["ID"] == df["ID"]).all()
for comet_col in ("cometkiwi", "comet_xl", "comet_xxl"):
    df[comet_col] = df_comet[comet_col]
# NOTE(review): m-pro and treqa are aligned by row index only -- confirm that
# both result tables are in the same row order as `df`.
df["m-pro"] = mp["score"]
df["treqa"] = res["treqa"]

In [56]:
df.columns

Index(['pair', 'model', 'ID', 'dataset', 'cometkiwi', 'comet_xl', 'comet_xxl',
       'm-pro', 'treqa'],
      dtype='object')

In [57]:
import re
def _tally_answers(answers):
    """Return ``(counts, score)`` for a ``{question_id: answer}`` mapping.

    ``counts`` holds how many values equal "YES", "NO" and "MAYBE";
    ``score`` is ``YES + 0.5 * MAYBE - NO``.
    """
    counts = {
        label: sum(1 for value in answers.values() if value == label)
        for label in ("YES", "NO", "MAYBE")
    }
    score = counts["YES"] + 0.5*counts["MAYBE"]-counts["NO"]
    return counts, score


def parse_answers(completion):
    """Extract YES/NO/MAYBE counts from a JSON response.

    The text is cleaned (markdown fences, a leading ``json`` marker, doubled
    double quotes, escaped and duplicated newlines), reduced to one JSON
    object, converted to double quotes and parsed with ``json.loads``. When
    any of that fails, the answers are recovered with a regular expression
    over the partially cleaned text and a diagnostic line is printed.

    Args:
        completion: raw model response (str).

    Returns:
        ``(counts, answers, score)`` where ``counts`` maps "YES"/"NO"/"MAYBE"
        to their frequencies, ``answers`` maps question ids to answers and
        ``score`` is ``YES + 0.5 * MAYBE - NO``.
    """
    try:
        completion = (
            completion.strip()
            .replace("```", "")        # markdown fence markers
            .replace("json\n", "")     # language tag left over from ```json
            .replace('""', '"')
            .replace("\\n", "\n")      # literal backslash-n -> real newline
            .replace(" \n", "\n")
            .replace("\n\n", "\n")
        )
        completion = completion.strip()
        try:
            # Two objects separated by "}\n{": keep the second one.
            completion = "{" + completion.split("}\n{")[1]
        except IndexError:
            # Single object: keep what follows the first "{".
            completion = "{" + completion.split("{")[1]
        if "}" not in completion:
            completion = completion + "}"
        completion = completion.replace("'", '"')
        answers = json.loads(completion)
        counts, score = _tally_answers(answers)
        return counts, answers, score
    except Exception:
        # Report only real failures. This message used to sit in a `finally:`
        # clause and was therefore printed for every successful parse as well.
        completion = completion.replace('"', "'")
        print("Error parsing answers:", repr(completion))
        answers = dict(re.findall(r"'(\d+)':\s*'(YES|NO|MAYBE)'", completion))
        counts, score = _tally_answers(answers)
        return counts, answers, score

def parse_answers_simple(completion):
    """Regex-only extraction of ``'<id>': '<YES|NO|MAYBE>'`` pairs.

    Double quotes are first turned into single quotes so both quoting styles
    match. Returns ``(counts, answers, score)`` with
    ``score = YES + 0.5 * MAYBE - NO``.
    """
    single_quoted = completion.replace('"', "'")
    answers = dict(re.findall(r"'(\d+)':\s*'(YES|NO|MAYBE)'", single_quoted))
    given = list(answers.values())
    counts = {label: given.count(label) for label in ("YES", "NO", "MAYBE")}
    score = counts["YES"] + 0.5*counts["MAYBE"]-counts["NO"]
    return counts, answers, score

def score_function(df_list):
    """Parse the ``response`` column and add ``score``, ``answers`` and ``counts``.

    Each response goes through ``parse_answers_simple``; the three parts of
    its result are written back as new columns. The frame is modified in
    place and also returned.
    """
    parsed = [parse_answers_simple(response) for response in df_list["response"]]
    # parse_answers_simple returns (counts, answers, score); the columns are
    # added in the order score, answers, counts.
    for column, position in (("score", 2), ("answers", 1), ("counts", 0)):
        df_list[column] = [
            item[position] if item is not None else None for item in parsed
        ]

    return df_list

In [None]:
# Load the prompt-based results: one CSV per configuration, each adding a plain
# and a weighted score column to `df`.
folder = "eval_results/litransproqa/"
# The question weights are identical for every results file, so read them once.
weights = pd.read_csv("datasets/QA_weights.csv").score.values
answer_value = {"YES": 1, "NO": 0, "MAYBE": 0.5}
# sorted(): os.listdir returns files in arbitrary order, which would make the
# order of the appended columns differ between runs/machines.
for file in sorted(os.listdir(folder)):
    if not file.endswith(".csv"):
        continue
    col = file.replace(".csv", "")
    tmp = score_function(pd.read_csv(os.path.join(folder, file)))
    # NOTE(review): 5 is presumably the number of questions per item -- confirm.
    df[col] = tmp["score"]/5
    # One column per question, holding the YES/NO/MAYBE answer given for it.
    answers_wide = pd.DataFrame(tmp["answers"].apply(lambda x: list(x.values())).tolist())
    df[col + "_weighted"] = answers_wide.apply(lambda x: x.map(answer_value)).mul(weights).mean(axis=1)
df_ = df

In [None]:
import json
import re
folder = "eval_results/xcomet_ranking/"
# Substrings removed from a file name, in this order, to obtain the column name.
name_noise = (
    ".json",
    "xcomet_ranking_",
    "_checkpoints_checkpoint-",
    "xcomet_ranking_",
    "referencefree_output_",
)
for file in os.listdir(folder):
    if not file.endswith(".json"):
        continue
    with open(os.path.join(folder, file), "r") as f:
        tmp = json.load(f)
    col = file
    for token in name_noise:
        col = col.replace(token, "")
    # One segment-level score per benchmark row.
    df[col] = tmp["segment_scores"]

In [60]:
# Attach the metric columns of the LitEval rows of `df` to the annotation table.
# `iloc[:, 4:]` skips the first four columns of `df`, and the right-hand side
# gets a fresh 0..n-1 index, so rows are matched purely by position.
# NOTE(review): assumes `df_list` is in the same row order as the LitEval rows
# of `df` -- confirm. Re-running this cell appends the columns a second time.
df_list = pd.concat([df_list, df[df["dataset"] == "LitEval"].iloc[:, 4:].reset_index(drop=True)], axis=1)

In [None]:
# Identify the held-out ("del set") LitEval rows inside df_list.
del_set = pd.read_csv("datasets/sampled_benchmark.csv")
del_set_liteval = del_set.loc[del_set["dataset"] == "LitEval"]
# ID = first 15 characters of the source (spaces removed) + "-" + pair + "-" + model_x
df_list["ID"] = [
    src[:15].replace(" ", "") + "-" + pair + "-" + system
    for src, pair, system in zip(df_list["source"], df_list["pair"], df_list["model_x"])
]
# Every held-out ID must be found in df_list, with as many rows as the del set has.
held_out_ids = del_set_liteval["ID"].unique()
subshape = len(df_list.set_index("ID").loc[held_out_ids])
assert subshape == len(del_set_liteval)

In [62]:
# Keep only rows whose ID is NOT in the held-out set. `isin` is a vectorised
# membership test, replacing a per-row scan that also shadowed the builtin `id`.
mask = ~df_list["ID"].isin(del_set_liteval["ID"])
df_list = df_list[mask]
df_list.shape

(1976, 42)

In [63]:
df_list.columns

Index(['source', 'tgt', 'model', 'mqm_score', 'rating', 'pair', 'model_x',
       'source_', 'cometkiwi', 'comet_xl', 'comet_xxl', 'm-pro', 'treqa',
       'openai_gpt-4o-mini_final_set_with_Qlevel_step',
       'openai_gpt-4o-mini_final_set_with_Qlevel_step_weighted',
       'meta-llama_llama-3.3-70b-instruct_PAR3-final_set_with_plevel_stepv2',
       'meta-llama_llama-3.3-70b-instruct_PAR3-final_set_with_plevel_stepv2_weighted',
       'openai_gpt-4o-mini_final_set_with_plevel_stepv2',
       'openai_gpt-4o-mini_final_set_with_plevel_stepv2_weighted',
       'openai_gpt-4o-mini_PAR3-final_set_with_plevel_stepv2',
       'openai_gpt-4o-mini_PAR3-final_set_with_plevel_stepv2_weighted',
       'meta-llama_llama-3.3-70b-instruct_PAR3-final_set_with_QA',
       'meta-llama_llama-3.3-70b-instruct_PAR3-final_set_with_QA_weighted',
       'openai_gpt-4o-mini_final_set_with_QA',
       'openai_gpt-4o-mini_final_set_with_QA_weighted',
       'Qwen_Qwen2.5-32B-Instruct_final_set_with_plevel_ste

In [15]:
# Score columns of df_list that are exported to the mt_metrics_eval layout and
# compared below. "mqm_score" is the human gold score; the rest are metrics.
# NOTE(review): 'gemba_all_orig' is not created by any cell shown above --
# confirm the column exists in df_list before running the export.
metrics = ["mqm_score",  'gemba_all_orig', 'comet_xl', 'comet_xxl', 'm-pro', "cometkiwi", 'treqa',
           'openai_gpt-4o-mini_final_set_with_QA', # vanilla
       'openai_gpt-4o-mini_final_set_with_QA_weighted',
       'openai_gpt-4o-mini_final_set_with_plevel_stepv2', # promptstep
       'openai_gpt-4o-mini_final_set_with_plevel_stepv2_weighted',
       'openai_gpt-4o-mini_final_set_with_Qlevel_step', # questionstep
       'openai_gpt-4o-mini_final_set_with_Qlevel_step_weighted',    
       'quar_fr_en_20250320_19440812300', # quarter xcomet Fr-en
       'quar_xxen_20250320_21051212150', # multi xcomet xx-en
]

In [None]:
# Build the mt_metrics_eval on-disk dataset layout under benchmark_dataset/liteval/:
#   sources/, references/, documents/, system-outputs/<pair>/,
#   metric-scores/<pair>/ and human-scores/.
ref = "src"
out_root = "benchmark_dataset/liteval"
for sub_dir in ("sources", "references", "documents", "human-scores"):
    os.makedirs(f"{out_root}/{sub_dir}/", exist_ok=True)

# One row per (pair, source_) with one mqm_score column per model, plus the
# full source text merged back in.
df_format = df_list.pivot(index=["pair", "source_"], columns=["model"], values="mqm_score").reset_index()
df_format = (
    df_format
    .merge(df_list[["pair", "source_", "source"]], on=["pair", "source_"], how="left")
    .drop_duplicates(subset=["pair", "source_"], keep="first")
    .reset_index(drop=True)
)
lst_src = df_format["source"].values
lst_src_ = df_format["source_"].values
lst_pair = df_format["pair"].values
# Column layout at this point: pair, source_, <one column per model>, source.
lst_model = df_format.columns[2:-1]

# Group by the scalar key "pair": grouping by the one-element list ["pair"]
# yields tuple keys such as ('en-de',) in pandas >= 2, which would end up in
# every file name below.
# NOTE(review): with sep=" " pandas quotes any field that contains a space, so
# multi-word lines are written wrapped in double quotes -- confirm the loader
# expects that.
for p, g in df_format.groupby("pair"):
    sources = g[["source"]].fillna("NaN")
    sources.to_csv(f"{out_root}/sources/{p}.txt", sep=" ", index=False, header=False)
    # The source doubles as the "reference" named by `ref`.
    sources.to_csv(f"{out_root}/references/{p}-{ref}.txt", sep=" ", index=False, header=False)
    # Empty document-boundary file.
    pd.DataFrame().to_csv(f"{out_root}/documents/{p}.docs", sep=" ", index=False, header=False)


# System outputs: one text file per (pair, model), rows selected with the same
# (source_, pair) keys as above.
df_format = df_list.pivot(index=["pair", "source_"], columns=["model"], values="tgt").reset_index()
df_format = df_format.set_index(["source_", "pair"]).loc[(lst_src_, lst_pair), lst_model]

for p, g in df_format.groupby("pair"):
    os.makedirs(f"{out_root}/system-outputs/{p}/", exist_ok=True)
    # Iterate the column index directly: a set() has no stable order.
    for model in lst_model:
        g[[model]].fillna("NaN").to_csv(f"{out_root}/system-outputs/{p}/{model}.txt", index=False, sep=" ", header=False)


# Scores: segment-level lines "<model> <score>" and system-level means.
# Human scores go to human-scores/, everything else to metric-scores/<pair>/.
for score in metrics:
    df_format = df_list.pivot(index=["pair", "source_"], columns=["model"], values=score).reset_index()
    df_format = df_format.set_index(["source_", "pair"]).loc[(lst_src_, lst_pair), lst_model]
    print(score)
    for p, g in df_format.groupby("pair"):
        print(p)
        os.makedirs(f"{out_root}/metric-scores/{p}/", exist_ok=True)
        # rename/assign return new frames, so no column is set on a slice of `g`.
        per_model = [
            g[[model]].rename(columns={model: "score"}).assign(model=model)
            for model in lst_model
        ]
        output_concat = pd.concat(per_model)
        seg_scores = output_concat[["model", "score"]].fillna("NaN")
        sys_scores = output_concat.groupby("model", as_index=False)["score"].mean()[["model", "score"]]
        if score not in ("mqm_score", "rating"):
            seg_scores.to_csv(f"{out_root}/metric-scores/{p}/{score}-{ref}.seg.score", index=False, sep=" ", header=False)
            sys_scores.to_csv(f"{out_root}/metric-scores/{p}/{score}-{ref}.sys.score", index=False, sep=" ", header=False)
        else:
            seg_scores.to_csv(f"{out_root}/human-scores/{p}.{score}.seg.score", index=False, sep=" ", header=False)
            sys_scores.to_csv(f"{out_root}/human-scores/{p}.{score}.sys.score", index=False, sep=" ", header=False)


In [1]:
# @title Imports

import numpy as np
import scipy.stats
from pathlib import Path
import sys
# Add the local mt_metrics_eval directory to sys.path so the imports below resolve
COMET_ROOT = Path("mt_metrics_eval")
sys.path.append(str(COMET_ROOT))
from mt_metrics_eval import meta_info
from mt_metrics_eval import data
from mt_metrics_eval import stats
from mt_metrics_eval import tasks

In [2]:
# @title Print all available evalsets

# meta_info.DATA is keyed by test-set name; each entry is keyed by language pair.
for testset, lps in meta_info.DATA.items():
  print(f'{testset}:', ' '.join(lps))


wmt24pp: en-ar_EG en-ar_SA en-bg_BG en-bn_IN en-ca_ES en-cs_CZ en-da_DK en-de_DE en-el_GR en-es_MX en-et_EE en-fa_IR en-fi_FI en-fil_PH en-fr_CA en-fr_FR en-gu_IN en-he_IL en-hi_IN en-hr_HR en-hu_HU en-id_ID en-is_IS en-it_IT en-ja_JP en-kn_IN en-ko_KR en-lt_LT en-lv_LV en-ml_IN en-mr_IN en-nl_NL en-no_NO en-pa_IN en-pl_PL en-pt_BR en-pt_PT en-ro_RO en-ru_RU en-sk_SK en-sl_SI en-sr_RS en-sv_SE en-sw_KE en-sw_TZ en-ta_IN en-te_IN en-th_TH en-tr_TR en-uk_UA en-ur_PK en-vi_VN en-zh_CN en-zh_TW en-zu_ZA
wmt24: en-de en-es ja-zh cs-uk en-cs en-hi en-is en-ja en-ru en-uk en-zh
wmt23.sent: en-de
wmt23: en-de he-en zh-en cs-uk de-en en-cs en-he en-ja en-ru en-uk en-zh ja-en ru-en uk-en
wmt22: en-de en-ru zh-en cs-en cs-uk de-en de-fr en-cs en-hr en-ja en-liv en-uk en-zh fr-de ja-en liv-en ru-en ru-sah sah-ru uk-cs uk-en
wmt21.news: en-cs en-de en-ha en-is en-ja en-ru en-zh cs-en de-en de-fr fr-de ha-en is-en ja-en ru-en zh-en
wmt21.tedtalks: en-de en-ru zh-en
wmt21.flores: bn-hi hi-bn xh-zu zu

In [3]:
# @title Load the LitEval language pairs whose standard gold score is mqm_score

all_evs = {}  # 'testset/lp' -> EvalSet
for testset in meta_info.DATA:
  if testset.startswith('liteval'):
    for lp in meta_info.DATA[testset]:
      gold_names = meta_info.DATA[testset][lp].std_gold.values()
      if 'mqm_score' in gold_names:
        all_evs[f'{testset}/{lp}'] = data.EvalSet(testset, lp, True, path="./benchmark_dataset")

print('\n'.join(all_evs))

# @title Print summaries for all loaded evalsets

# One line per evalset: segment, system, metric and reference counts, plus the
# system-level gold score name and the standard reference.
print(f'{"name":<20}  segs sys metrics gold  refs std')
for name, evs in all_evs.items():
  summary = (
      f'{name:<20} {len(evs.src):5d} {len(evs.sys_names):3d} '
      f'{len(evs.metric_basenames):7d} {evs.StdHumanScoreName("sys"):5} '
      f'{len(evs.ref_names):4d} {evs.std_ref}'
  )
  print(summary)

liteval/de-en
liteval/en-de
liteval/en-zh
liteval/de-zh
name                  segs sys metrics gold  refs std
liteval/de-en           45  12      14 mqm_score    0 src
liteval/en-de           46  12      14 mqm_score    0 src
liteval/en-zh           48  12      14 mqm_score    0 src
liteval/de-zh           44  12      14 mqm_score    0 src


# Comparing metrics

In [21]:
# @title Set up for comparing metrics

# There are many different ways to evaluate the performance of MT metrics. The
# most obvious question is what correlation statistic we should use to capture
# the similarity between a vector of metric scores and a vector of gold scores
# (human ratings). A less obvious question is where those vectors come from.
# We'll defer the choice of correlation statistic to later cells, and begin
# by setting some parameters that precisely define the vectors we're interested
# in comparing.

# Use all evalsets that we've loaded.
evs_list = all_evs.values()

# Choose the version of each metric that uses the standard reference for each
# evalset. (Printed for inspection only: the comparison cells below pass
# {evs.std_ref} directly.)
main_refs = [{evs.std_ref} for evs in evs_list]
print(main_refs)


# Include 'human' systems (ie, reference translations) among systems to be
# scored. This can make the task more challenging, since some metrics are
# biased against less literal references.
include_human = True

# Don't include systems considered to be outliers. These are systems that are
# much better or worse than all other systems, so they are easy for all metrics
# to rank correctly.
include_outliers = False

# Use the 'mqm_score' human scores as gold scores.
gold_name = 'mqm_score'

# Only compare metrics that have been designated as primary submissions. This
# removes metric variants that are similar to each other, and reduces the size
# of the comparison matrix.
# NOTE(review): for LitEval this is presumably the LITEVAL_PRIMARIES set added
# to meta_info.py -- confirm.
primary_metrics = True

# Don't limit the results to a particular domain.
domain = None

# Set the number of resampling runs for determining whether one metric is better
# than another according to the permutation test. At least 1000 is required
# for stable results, which is what is used here; a much smaller value (e.g. 5)
# makes the cells finish quickly for a dry run.
k = 1000

# Set the size of blocks for 'early stopping' checks during resampling. If
# you're using k = 1000, this can speed up the computation, usually with
# only minimal changes to the results.
psd = stats.PermutationSigDiffParams(block_size = 100)

# Set the p-value for deciding whether metrics are considered to be significantly
# different. Lower values make the test more stringent.
pval = 0.05

[{'src'}, {'src'}, {'src'}, {'src'}]


In [None]:
# @title Evaluate metrics using segment-level Kendall correlation

# Segment-level ('seg') Kendall correlation against the gold scores. The
# 'average_by' argument is 'none', i.e. the system x segment score matrices are
# flattened into single vectors before correlating. Significance between
# metrics is estimated with k resampling runs (perm_test = "pairs").

# `dct` accumulates one block of rows per language pair; this cell starts it
# from scratch and the acc_eq cell below appends to it.
dct = pd.DataFrame()

for p in ["en-de", "de-en", "en-zh", "de-zh"]:
    print(p)
    evs = all_evs[f"liteval/{p}"]
    corrs = data.GetCorrelations(
        evs, 'seg', {evs.std_ref}, {'src'}, include_human, include_outliers,
        gold_name, primary_metrics, domain)
    ranks, matrix, _, _ = data.CompareMetrics(
        corrs, stats.KendallVariants, 'none', k, psd, pval, perm_test = "pairs")

    print(data.PrintMetricComparison(ranks, matrix, pval, evs))

    # One row per metric: first component of `ranks`, tagged with pair and statistic.
    per_metric = pd.DataFrame(ranks).T[[0]].assign(pair=p, metric="segment-level_Kendall")
    # Pairwise comparison matrix, columns labelled with the metric names.
    pairwise = pd.DataFrame(matrix)
    pairwise.columns = per_metric.index
    pair_block = pd.concat(
        [per_metric.reset_index(drop = False), pairwise.reset_index(drop = True)],
        axis = 1)
    dct = pd.concat([dct, pair_block])

en-de
*openai_gpt-4o-mini_final_set_with_QA_weighted[noref]             1  0.6994126  . > > > > > > > > > > > > >
*openai_gpt-4o-mini_final_set_with_QA[noref]                      2  0.6944583  . . = = > > > > > > > > > >
*openai_gpt-4o-mini_final_set_with_Qlevel_step_weighted[noref]    2  0.6933177  . . . = > > > > > > > > > >
*openai_gpt-4o-mini_final_set_with_plevel_stepv2_weighted[noref]  2  0.6895099  . . . . > > > > > > > > > >
*openai_gpt-4o-mini_final_set_with_Qlevel_step[noref]             3  0.6888958  . . . . . = > > > > > > > >
*openai_gpt-4o-mini_final_set_with_plevel_stepv2[noref]           3  0.6881700  . . . . . . > > > > > > > >
*m-pro[noref]                                                     4  0.6673118  . . . . . . . = = = = = = >
*gemba_all_orig[noref]                                            4  0.6392037  . . . . . . . . = = = = = >
*comet_xxl[noref]                                                 4  0.5992598  . . . . . . . . . > > > > >
*cometkiwi[noref]     

In [None]:
# @title Evaluate metrics using seg-level accuracy with optimized tie threshold.

# This is an implementation of the acc*_eq pairwise ranking accuracy proposed in
# https://arxiv.org/abs/2305.14324. It additionally gives metrics credit for
# predicting ties in gold scores. To avoid bias due to differences in scoring
# precision, an optimal tie threshold is computed automatically for each metric
# and test set.

# Significance testing is enabled here: the k resampling runs configured above
# are passed through, so this cell can be slow.

# Appends to the `dct` frame started in the Kendall cell.
for p in ["en-de", "de-en", "en-zh", "de-zh"]:
    print(p)
    evs = all_evs[f"liteval/{p}"]
    corrs = data.GetCorrelations(
        evs, 'seg', {evs.std_ref}, {'src'}, include_human, include_outliers,
        gold_name, primary_metrics, domain)
    ranks, matrix, _, _ = data.CompareMetrics(
        corrs, stats.KendallWithTiesOpt, 'item', k, psd, pval, variant='acc23',
        sample_rate=1, perm_test = "pairs")

    print(data.PrintMetricComparison(ranks, matrix, pval, evs))

    # One row per metric: first component of `ranks`, tagged with pair and statistic.
    per_metric = pd.DataFrame(ranks).T[[0]].assign(pair=p, metric="acc_eq")
    # Pairwise comparison matrix, columns labelled with the metric names.
    pairwise = pd.DataFrame(matrix)
    pairwise.columns = per_metric.index
    pair_block = pd.concat(
        [per_metric.reset_index(drop = False), pairwise.reset_index(drop = True)],
        axis = 1)
    dct = pd.concat([dct, pair_block])

en-de
*openai_gpt-4o-mini_final_set_with_QA_weighted[noref]             1  0.6284585  . = = > > > > > > > > > > >
*comet_xxl[noref]                                                 1  0.6248353  . . = = = > > > > > > > > >
*openai_gpt-4o-mini_final_set_with_plevel_stepv2_weighted[noref]  1  0.6235178  . . . = > > > > > > > > > >
*openai_gpt-4o-mini_final_set_with_QA[noref]                      2  0.6212121  . . . . = > = > > > > > > >
*openai_gpt-4o-mini_final_set_with_Qlevel_step_weighted[noref]    2  0.6179183  . . . . . > > > > > > > > >
*openai_gpt-4o-mini_final_set_with_plevel_stepv2[noref]           3  0.6175889  . . . . . . = = = = > > > >
*openai_gpt-4o-mini_final_set_with_Qlevel_step[noref]             3  0.6149539  . . . . . . . > > > > > > >
*comet_xl[noref]                                                  4  0.5757576  . . . . . . . . = = > > > >
*cometkiwi[noref]                                                 4  0.5731225  . . . . . . . . . = = > > >
*quar_fr_en_20250320_1

In [None]:
# Persist the raw comparison table (before any column clean-up).
dct.to_csv("significance_test.csv", index=False)
# "level_0" only exists if `dct.reset_index()` was run on the frame beforehand;
# on a fresh Restart & Run All it is absent, so tolerate that instead of
# raising KeyError.
dct = dct.drop(columns=["level_0"], errors="ignore")

In [49]:
# Give the four bookkeeping columns readable names:
#   "index"  (metric name, from reset_index)  -> "metric"
#   0        (first component of `ranks`)      -> "score"
#   "metric" (name of the statistic)           -> "corr"
# The remaining columns already carry the metric names assigned in the
# comparison cells, so they are left untouched. Renaming by label replaces a
# positional 18-name list that hard-coded one run's ranking order and the
# current contents of `metrics`.
# The guard keeps the cell safe to re-run: after the first run "index" is gone.
if "index" in dct.columns:
    dct = dct.rename(columns={"index": "metric", 0: "score", "metric": "corr"})
assert {"metric", "score", "pair", "corr"} <= set(dct.columns), "unexpected column layout in dct"

In [50]:
# Average each metric's score over the language pairs, per correlation statistic.
t1 = (
    dct
    .groupby(["corr", "metric"], as_index=False)["score"]
    .mean()
)
# Wide view: one row per metric, one column per correlation statistic.
t1.pivot(index="metric", columns="corr", values="score")

corr,acc_eq,segment-level_Kendall
metric,Unnamed: 1_level_1,Unnamed: 2_level_1
comet_xl-src,0.527984,0.386816
comet_xxl-src,0.540267,0.399597
cometkiwi-src,0.552289,0.455079
gemba_all_orig-src,0.534419,0.560933
m-pro-src,0.444877,0.569929
openai_gpt-4o-mini_final_set_with_QA-src,0.606393,0.605087
openai_gpt-4o-mini_final_set_with_QA_weighted-src,0.615658,0.604629
openai_gpt-4o-mini_final_set_with_Qlevel_step-src,0.595377,0.594319
openai_gpt-4o-mini_final_set_with_Qlevel_step_weighted-src,0.599794,0.596584
openai_gpt-4o-mini_final_set_with_plevel_stepv2-src,0.58519,0.585067


In [15]:
# check if human scores are ranked higher than system scores for each source
def check_humanvsllm(df_list, metric, source_col = "source_", human_col = ["translator1", "translator2", "translator3"], llm_col = ["gpt4o", "google_translate", "qwen2", "deepl"]):
    """Measure how often the best human translation outscores machine output.

    For every language pair and every source segment, the highest ``metric``
    value among the human systems is compared (strictly greater) with the
    highest value among three groups of other systems.

    Args:
        df_list: frame with ``pair``, ``model``, ``source_col`` and ``metric``
            columns; it is not modified.
        metric: name of the score column to compare (higher is treated as better).
        source_col: column identifying the source segment.
        human_col: model names counted as human translators.
        llm_col: model names counted as LLM systems. When empty or None the
            LLM and "other MT" comparisons are skipped and reported as 0.

    Returns:
        A list with one tuple per language pair:
        ``(pair, metric, humanllm, humanmt, humansmallmt)`` where the last
        three are the fractions of source segments in which the best human
        score beats, respectively, the best LLM score, the best score of any
        non-human system, and the best score of the non-human, non-LLM
        systems. Pairs without any source segment are skipped with a message.
    """
    human_names = set(human_col)
    llm_names = set(llm_col) if llm_col else set()
    lst = []
    for p in df_list["pair"].unique():
        n_humanllm = 0
        n_humanmt = 0
        n_smt = 0
        count = 0
        for _, g in df_list[df_list["pair"] == p].groupby(source_col):
            # Work on a de-duplicated copy (one row per model) instead of
            # mutating the groupby slice in place, and index it once.
            scores = g.drop_duplicates(subset=["model", source_col]).set_index("model")[metric]
            models = set(scores.index)
            human_m = models & human_names
            mt_m = models - human_m
            # max() of an empty selection is NaN, and every comparison with
            # NaN is False, so a missing group never counts as a human win.
            human_max = scores.loc[list(human_m)].max()
            if human_max > scores.loc[list(mt_m)].max():
                n_humanmt += 1
            if llm_col:
                llm_m = models & llm_names
                smt_m = mt_m - llm_names
                if human_max > scores.loc[list(llm_m)].max():
                    n_humanllm += 1
                if human_max > scores.loc[list(smt_m)].max():
                    n_smt += 1
            count += 1
        if count == 0:
            # E.g. a missing (NaN) pair value, which matches no row. Skip it and
            # keep going: the old branch printed loop variables that were
            # unbound here and then abandoned all remaining pairs.
            print(p, metric, "no source segments found, skipping")
            continue
        lst.append((p, metric, n_humanllm/count, n_humanmt/count, n_smt/count))
    return lst


In [52]:
import warnings
# Silence pandas warnings for the rest of the notebook.
warnings.filterwarnings("ignore")
adequacy_columns = ["pair", "metric", "humanllm", "humanmt", "humansmallmt"]
# Collect one frame per metric (one row per language pair) and stack them once.
adequacy_frames = [pd.DataFrame()]
for m in metrics:
    print(m)
    adequacy_frames.append(pd.DataFrame(check_humanvsllm(df_list, m), columns=adequacy_columns))
adequacy_df = pd.concat(adequacy_frames)

mqm_score
gemba_all_orig
comet_xl
comet_xxl
m-pro
cometkiwi
treqa
openai_gpt-4o-mini_final_set_with_QA
openai_gpt-4o-mini_final_set_with_QA_weighted
openai_gpt-4o-mini_final_set_with_plevel_stepv2
openai_gpt-4o-mini_final_set_with_plevel_stepv2_weighted
openai_gpt-4o-mini_final_set_with_Qlevel_step
openai_gpt-4o-mini_final_set_with_Qlevel_step_weighted
quar_fr_en_20250320_19440812300
quar_xxen_20250320_21051212150


In [53]:
# Do not truncate cell contents when displaying (the metric names are long),
# then average the three human-vs-machine win rates over the language pairs
# for every metric.
pd.set_option('display.max_colwidth', None)
adequacy_df.groupby(["metric"], as_index=False)[["humanllm", "humanmt", "humansmallmt"]].mean()

Unnamed: 0,metric,humanllm,humanmt,humansmallmt
0,comet_xl,0.170256,0.119619,0.544845
1,comet_xxl,0.266853,0.239065,0.61176
2,cometkiwi,0.07349,0.062253,0.525899
3,gemba_all_orig,0.061027,0.061027,0.63124
4,m-pro,0.164847,0.148054,0.566608
5,mqm_score,0.452761,0.436215,0.868286
6,openai_gpt-4o-mini_final_set_with_QA,0.387029,0.370483,0.856596
7,openai_gpt-4o-mini_final_set_with_QA_weighted,0.414118,0.403128,0.856717
8,openai_gpt-4o-mini_final_set_with_Qlevel_step,0.258561,0.225349,0.801377
9,openai_gpt-4o-mini_final_set_with_Qlevel_step_weighted,0.269925,0.236712,0.807058
