In [1]:
from mt_metrics_eval import data
from itertools import combinations
import pandas as pd
import xxhash


def example_id(src_lang, tgt_lang, src, ref, hyp, system):
    """Return the xxh128 hex digest that identifies one scored example.

    The digest is taken over a single string that joins the language pair,
    the system name, the source, the reference and the hypothesis with fixed
    separators, so the same inputs always map to the same id.
    """
    key = f'{src_lang}-{tgt_lang}@{system}##{src}##{ref}##{hyp}'
    return xxhash.xxh128_hexdigest(key)

In [3]:
# Display the metadata registry: test-set name -> language pair -> MetaInfo record.
data.meta_info.DATA

{'wmt24pp': {'en-ar_EG': MetaInfo(std_ref='posteditA', std_gold={}, outlier_systems=set(), primary_metrics={'MetricX-24-QE', 'XCOMET', 'ChrF', 'Gemini-DA-QE', 'MetricX-24', 'Gemini-DA', 'COMETKiwi-23', 'BLEU', 'XCOMET-QE'}, baseline_metrics=None),
  'en-ar_SA': MetaInfo(std_ref='posteditA', std_gold={}, outlier_systems=set(), primary_metrics={'MetricX-24-QE', 'XCOMET', 'ChrF', 'Gemini-DA-QE', 'MetricX-24', 'Gemini-DA', 'COMETKiwi-23', 'BLEU', 'XCOMET-QE'}, baseline_metrics=None),
  'en-bg_BG': MetaInfo(std_ref='posteditA', std_gold={}, outlier_systems=set(), primary_metrics={'MetricX-24-QE', 'XCOMET', 'ChrF', 'Gemini-DA-QE', 'MetricX-24', 'Gemini-DA', 'COMETKiwi-23', 'BLEU', 'XCOMET-QE'}, baseline_metrics=None),
  'en-bn_IN': MetaInfo(std_ref='posteditA', std_gold={}, outlier_systems=set(), primary_metrics={'MetricX-24-QE', 'XCOMET', 'ChrF', 'Gemini-DA-QE', 'MetricX-24', 'Gemini-DA', 'COMETKiwi-23', 'BLEU', 'XCOMET-QE'}, baseline_metrics=None),
  'en-ca_ES': MetaInfo(std_ref='posteditA

In [17]:
def get_all_data(eval_set: data.EvalSet, score_name: str = 'mqm'):
    """Flatten one evaluation set into a list of per-segment records.

    For every segment (source paired with the standard reference) and every
    system output, one dict is emitted when a segment-level score named
    `score_name` is available for that system and segment.

    Args:
        eval_set: Evaluation set to read sources, references, system outputs
            and segment-level scores from.
        score_name: Name of the scorer passed to `eval_set.Scores`.

    Returns:
        A list of dicts with the keys 'lp', 'src', 'ref', 'hyp', 'system',
        'score', 'score_name' and 'example_id'. Within each segment the
        systems appear in sorted name order.
    """
    src_lang = eval_set.src_lang
    tgt_lang = eval_set.tgt_lang
    lp = f"{src_lang}-{tgt_lang}"  # identical for every row, so build it once
    scores = eval_set.Scores(level='seg', scorer=score_name)
    hypotheses = eval_set.sys_outputs
    references = eval_set.all_refs[eval_set.std_ref]

    # These two entries are never emitted as systems.
    # NOTE(review): presumably they are reference translations rather than MT systems - confirm.
    candidates = set(hypotheses.keys()) - {'synthetic_ref', 'refB'}
    # Sort for a reproducible row order: iterating a set of strings directly
    # can change from one interpreter run to the next.
    # Systems with no entry in the score mapping are skipped up front instead
    # of raising KeyError inside the loop.
    systems = [system for system in sorted(candidates) if system in scores]

    result = []
    for i, (src, ref) in enumerate(zip(eval_set.src, references)):
        for system in systems:
            system_scores = scores[system]
            # A score list shorter than the segment list means "no score here".
            if len(system_scores) <= i:
                continue
            score = system_scores[i]
            if score is None:
                continue
            hyp = hypotheses[system][i]
            result.append({
                'lp': lp,
                'src': src,
                'ref': ref,
                'hyp': hyp,
                'system': system,
                'score': score,
                'score_name': score_name,
                'example_id': example_id(src_lang, tgt_lang, src, ref, hyp, system)
            })
    return result
    

In [7]:
def get_lps(comp: str, score: str = 'mqm'):
    """List the language pairs of test set `comp` whose segment-level gold score is `score`.

    Args:
        comp: Key into `data.meta_info.DATA` (for example 'wmt22').
        score: Score name that the 'seg' entry of `std_gold` must equal.

    Returns:
        The matching language-pair keys, in registry order.
    """
    registry = data.meta_info.DATA[comp]
    return [
        lang_pair
        for lang_pair, info in registry.items()
        if 'seg' in info.std_gold and info.std_gold['seg'] == score
    ]

In [8]:
# WMT22 language pairs whose segment-level gold score is 'wmt-appraise', tagged as 'wmt22_da'.
wmt22_appraise = pd.concat(
    [
        pd.DataFrame(get_all_data(data.EvalSet('wmt22', lp), score_name='wmt-appraise'))
        for lp in get_lps('wmt22', 'wmt-appraise')
    ]
).assign(source="wmt22_da")

In [9]:
# WMT22 language pairs whose segment-level gold score is 'mqm', tagged as 'wmt22_mqm'.
wmt22_mqm = pd.concat(
    [
        pd.DataFrame(get_all_data(data.EvalSet('wmt22', lp), score_name='mqm'))
        for lp in get_lps('wmt22', 'mqm')
    ]
).assign(source="wmt22_mqm")

In [10]:
# WMT23 language pairs whose segment-level gold score is 'da-sqm', tagged as 'wmt23_da'.
wmt23_da_sqm = pd.concat(
    [
        pd.DataFrame(get_all_data(data.EvalSet('wmt23', lp), score_name='da-sqm'))
        for lp in get_lps('wmt23', 'da-sqm')
    ]
).assign(source="wmt23_da")

In [11]:
# WMT23 language pairs whose segment-level gold score is 'mqm', tagged as 'wmt23_mqm'.
wmt23_mqm = pd.concat(
    [
        pd.DataFrame(get_all_data(data.EvalSet('wmt23', lp), score_name='mqm'))
        for lp in get_lps('wmt23', 'mqm')
    ]
).assign(source="wmt23_mqm")

In [12]:
# WMT24 language pairs whose segment-level gold score is 'esa', tagged as 'wmt24_esa'.
wmt24_esa = pd.concat(
    [
        pd.DataFrame(get_all_data(data.EvalSet('wmt24', lp), score_name='esa'))
        for lp in get_lps('wmt24', 'esa')
    ]
).assign(source="wmt24_esa")


In [13]:
# WMT24 language pairs whose segment-level gold score is 'mqm', tagged as 'wmt24_mqm'.
wmt24_mqm = pd.concat(
    [
        pd.DataFrame(get_all_data(data.EvalSet('wmt24', lp), score_name='mqm'))
        for lp in get_lps('wmt24', 'mqm')
    ]
).assign(source="wmt24_mqm")

In [22]:
# WMT21 news language pairs whose segment-level gold score is 'mqm', tagged as 'wmt21.news_mqm'.
wmt21news_mqm = pd.concat(
    [
        pd.DataFrame(get_all_data(data.EvalSet('wmt21.news', lp), score_name='mqm'))
        for lp in get_lps('wmt21.news', 'mqm')
    ]
).assign(source="wmt21.news_mqm")


In [23]:
# WMT21 news language pairs whose segment-level gold score is 'wmt-raw', tagged as 'wmt21.news_da'.
wmt21news_da = pd.concat(
    [
        pd.DataFrame(get_all_data(data.EvalSet('wmt21.news', lp), score_name='wmt-raw'))
        for lp in get_lps('wmt21.news', 'wmt-raw')
    ]
).assign(source="wmt21.news_da")

In [24]:
# WMT21 TED talks language pairs whose segment-level gold score is 'mqm', tagged as 'wmt21.tedtalks_mqm'.
wmt21tedtalks_mqm = pd.concat(
    [
        pd.DataFrame(get_all_data(data.EvalSet('wmt21.tedtalks', lp), score_name='mqm'))
        for lp in get_lps('wmt21.tedtalks', 'mqm')
    ]
).assign(source="wmt21.tedtalks_mqm")

In [26]:
# WMT21 FLORES language pairs whose segment-level gold score is 'wmt-raw', tagged as 'wmt21.flores_da'.
wmt21flores_da = pd.concat(
    [
        pd.DataFrame(get_all_data(data.EvalSet('wmt21.flores', lp), score_name='wmt-raw'))
        for lp in get_lps('wmt21.flores', 'wmt-raw')
    ]
).assign(source="wmt21.flores_da")

In [27]:
# WMT20 language pairs whose segment-level gold score is 'wmt-raw', tagged as 'wmt20_da'.
wmt20_da = pd.concat(
    [
        pd.DataFrame(get_all_data(data.EvalSet('wmt20', lp), score_name='wmt-raw'))
        for lp in get_lps('wmt20', 'wmt-raw')
    ]
).assign(source="wmt20_da")

# WMT19 language pairs whose segment-level gold score is 'wmt-raw', tagged as 'wmt19_da'.
wmt19_da = pd.concat(
    [
        pd.DataFrame(get_all_data(data.EvalSet('wmt19', lp), score_name='wmt-raw'))
        for lp in get_lps('wmt19', 'wmt-raw')
    ]
).assign(source="wmt19_da")

In [29]:
import datasets as ds

In [30]:
# Stack every per-campaign frame into one table; the order below is the row order of the train split.
wmt_train_df = pd.concat([
    wmt21news_mqm,
    wmt21news_da,
    wmt21tedtalks_mqm,
    wmt21flores_da,
    wmt20_da,
    wmt19_da,
    wmt22_appraise,
    wmt22_mqm,
    # NOTE(review): en-cs is dropped from the WMT23 'da-sqm' data; the reason is not recorded here - confirm.
    wmt23_da_sqm.query('lp != "en-cs"'),
    wmt23_mqm,
    wmt24_esa,
    wmt24_mqm,
])

# Wrap the combined table as the single 'train' split of a DatasetDict.
wmt = ds.DatasetDict({'train': ds.Dataset.from_pandas(wmt_train_df)})

In [31]:
# Dataset.from_pandas keeps a non-default DataFrame index as the column '__index_level_0__'.
# Drop it only from splits that still have it, so re-running this cell does not
# fail on a column that has already been removed.
wmt = ds.DatasetDict({
    split: (
        split_data.remove_columns(['__index_level_0__'])
        if '__index_level_0__' in split_data.column_names
        else split_data
    )
    for split, split_data in wmt.items()
})

In [32]:
from collections import Counter
# Number of train-split rows carrying each `source` tag.
Counter(wmt['train']['source'])

Counter({'wmt19_da': 307563,
         'wmt20_da': 210705,
         'wmt21.news_da': 169138,
         'wmt22_da': 91019,
         'wmt24_esa': 78967,
         'wmt23_da': 77646,
         'wmt22_mqm': 68890,
         'wmt23_mqm': 35472,
         'wmt24_mqm': 26511,
         'wmt21.news_mqm': 25437,
         'wmt21.tedtalks_mqm': 22492,
         'wmt21.flores_da': 14443})

In [33]:
# Publish the assembled DatasetDict to the Hugging Face Hub dataset repository.
wmt.push_to_hub("Rexhaif/wmt-metrics-19-24")

Uploading the dataset shards:   0%|          | 0/2 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/565 [00:00<?, ?ba/s]

Uploading...:   0%|          | 0.00/59.0M [00:00<?, ?B/s]

Creating parquet from Arrow format:   0%|          | 0/565 [00:00<?, ?ba/s]

Uploading...:   0%|          | 0.00/59.8M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/datasets/Rexhaif/wmt-metrics-19-24/commit/c42112f314c3f75ec36d729f16ecf6f14df2dc4d', commit_message='Upload dataset', commit_description='', oid='c42112f314c3f75ec36d729f16ecf6f14df2dc4d', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/Rexhaif/wmt-metrics-19-24', endpoint='https://huggingface.co', repo_type='dataset', repo_id='Rexhaif/wmt-metrics-19-24'), pr_revision=None, pr_num=None)

In [70]:
# Spot-check the last record of the train split.
wmt['train'][-1]

{'lp': 'ja-zh',
 'src': 'ところでこれに対して廻座の人々には、自分たちがかつては、小さくとも一城の主だったという矜恃がある。それで廻座という別格の位置を楯に横車を押す事が多かった。',
 'ref': '然而对此回座的诸位则认为，自己这些人，再不济也曾是一城之主，有自己的尊严。因而常常利用回座的特别身份强推无理蛮横之事。',
 'hyp': '不过对这些人而言，他们心中怀有一种自豪：他们曾经是小城的领主。所以他们总是利用这种特殊的身份横行无忌。',
 'system': 'Unbabel-Tower70B',
 'score': -0.0,
 'score_name': 'mqm',
 'example_id': 'ba9bbacef8208fb3a36fa92ddb68f83c',
 'source': 'wmt24_mqm'}

In [51]:
# Inspect a sample of the WMT22 DA rows for sah-ru.
# A fixed random_state makes the displayed sample the same on every run.
wmt['train'].to_pandas().query('source == "wmt22_da" and lp == "sah-ru"').sample(n=1000, random_state=42).to_dict(orient='records')

[{'lp': 'sah-ru',
  'src': 'Федерация Сэбиэтин социальнай бэлиитикэҕэ кэмитиэтин солбуйааччы Елена Бибикова: «Бу инфляция хаамыытыттан балтараа төгүл улахан.',
  'ref': 'Заместитель председателя комитета Совета Федерации по социальной политике Елена Бибикова заявила: «Это в полтора раза выше темпа инфляции.',
  'hyp': 'Собеседник Совета Федерации по социальной политике Елена Бибикова: "Будет большая проблема с инфляцией.',
  'system': 'Lan-Bridge',
  'score': 2.0,
  'score_name': 'wmt-appraise',
  'example_id': '32069b9859ed6d61702e8dbf98a8c604',
  'source': 'wmt22_da'},
 {'lp': 'sah-ru',
  'src': 'Талакааҥҥа түспүппүтүгэр биһигини оптуобус кэтэһэн турара.Онно олордон баран, үлэлиэхтээх сирбитигэр илтилэр.',
  'ref': 'В Талакане нас ждал автобус, который довез нас до места работы.',
  'hyp': 'Когда мы приземлились в Талакане, нас ждал автобус.Посадили его туда и отвезли на место, где он должен был работать.',
  'system': 'Online-G',
  'score': 80.0,
  'score_name': 'wmt-appraise',
  'e