In [2]:
from mt_metrics_eval import data
from itertools import combinations
import pandas as pd
import xxhash


def example_id(src_lang, tgt_lang, src, ref, hyp, system):
    """Return the xxh128 hex digest that identifies one translation example.

    The six arguments are interpolated into a single string
    (``<src_lang>-<tgt_lang>@<system>##<src>##<ref>##<hyp>``) and that string
    is passed to ``xxhash.xxh128_hexdigest``.
    """
    fingerprint = f'{src_lang}-{tgt_lang}@{system}##{src}##{ref}##{hyp}'
    return xxhash.xxh128_hexdigest(fingerprint)

In [3]:
# Load the 'wmt23' / 'cs-uk' evaluation set on its own.
# NOTE(review): none of the later cells shown here read `eval_s`.
eval_s = data.EvalSet('wmt23', 'cs-uk')

In [4]:
def get_all_data(
    eval_set: data.EvalSet,
    score_name: str = 'mqm',
    exclude_systems=('synthetic_ref', 'refB'),
):
    """Flatten one evaluation set into per-segment, per-system records.

    Args:
        eval_set: Evaluation set for a single language pair. The function
            reads its ``src_lang``, ``tgt_lang``, ``src``, ``all_refs``,
            ``std_ref`` and ``sys_outputs`` attributes and calls ``Scores``.
        score_name: Scorer whose segment-level scores are attached to rows.
        exclude_systems: Iterable of system names to leave out. The default
            is the pair of names that used to be hard-coded here.

    Returns:
        A list of dicts with the keys ``lp``, ``src``, ``ref``, ``hyp``,
        ``system``, ``score``, ``score_name`` and ``example_id``. Segments
        whose score is ``None`` are omitted, as are systems that have no
        entry in the score table. Rows are ordered by segment index and then
        by system name, so repeated runs give the same order.

    Raises:
        ValueError: If ``eval_set.Scores`` returns ``None`` for ``score_name``.
    """
    src_lang = eval_set.src_lang
    tgt_lang = eval_set.tgt_lang
    lang_pair = f"{src_lang}-{tgt_lang}"

    scores = eval_set.Scores(level='seg', scorer=score_name)
    if scores is None:
        # Fail with an explicit message instead of a TypeError on indexing None.
        raise ValueError(
            f"no segment-level '{score_name}' scores available for {lang_pair}"
        )

    hypotheses = eval_set.sys_outputs
    # Sort the names: iterating a set of strings has no stable order across
    # interpreter runs, which made the row order of the result irreproducible.
    # Systems without an entry in `scores` are skipped rather than raising
    # KeyError, mirroring how individual None scores are dropped below.
    systems = sorted(
        name
        for name in set(hypotheses) - set(exclude_systems)
        if name in scores
    )

    references = eval_set.all_refs[eval_set.std_ref]
    result = []
    for i, (src, ref) in enumerate(zip(eval_set.src, references)):
        for system in systems:
            score = scores[system][i]
            if score is None:
                continue
            hyp = hypotheses[system][i]
            result.append({
                'lp': lang_pair,
                'src': src,
                'ref': ref,
                'hyp': hyp,
                'system': system,
                'score': score,
                'score_name': score_name,
                'example_id': example_id(src_lang, tgt_lang, src, ref, hyp, system)
            })
    return result
    

In [5]:
def get_lps(comp: str, score: str = 'mqm'):
    """Return the language pairs of `comp` whose `std_gold['seg']` is `score`.

    Looks up `comp` in `data.meta_info.DATA` and keeps, in iteration order,
    every language-pair key whose metadata has a 'seg' entry in `std_gold`
    equal to `score`.
    """
    return [
        lang_pair
        for lang_pair, info in data.meta_info.DATA[comp].items()
        if 'seg' in info.std_gold and info.std_gold['seg'] == score
    ]

In [6]:
# One frame per 'wmt23' language pair returned by get_lps for 'da-sqm',
# stacked into a single table (the per-frame row indices are kept as-is).
all_data_sqm = pd.concat(
    pd.DataFrame(get_all_data(data.EvalSet('wmt23', lang_pair), score_name='da-sqm'))
    for lang_pair in get_lps('wmt23', 'da-sqm')
)

In [152]:
# One frame per 'wmt24' language pair returned by get_lps for 'mqm',
# stacked into a single table (the per-frame row indices are kept as-is).
all_data_mqm = pd.concat(
    pd.DataFrame(get_all_data(data.EvalSet('wmt24', lang_pair), score_name='mqm'))
    for lang_pair in get_lps('wmt24', 'mqm')
)

In [154]:
import datasets as ds

  from .autonotebook import tqdm as notebook_tqdm


In [159]:
# 'train' holds the wmt23 'da-sqm' rows and 'test' holds the wmt24 'mqm' rows,
# so the two splits carry scores produced under different score names.
wmt23_24 = ds.DatasetDict({
    split_name: ds.Dataset.from_pandas(frame)
    for split_name, frame in (('train', all_data_sqm), ('test', all_data_mqm))
})

In [160]:
# Drop the '__index_level_0__' column (the pandas index carried over by
# from_pandas). Each split is checked first so that re-running this cell after
# the column is already gone is a no-op instead of raising.
wmt23_24 = ds.DatasetDict({
    split_name: (
        split.remove_columns(['__index_level_0__'])
        if '__index_level_0__' in split.column_names
        else split
    )
    for split_name, split in wmt23_24.items()
})

In [162]:
# Exploratory: prints the signature and docstring of push_to_hub; it does not
# modify `wmt23_24`.
help(wmt23_24.push_to_hub)

Help on method push_to_hub in module datasets.dataset_dict:

push_to_hub(repo_id, config_name: str = 'default', set_default: Optional[bool] = None, data_dir: Optional[str] = None, commit_message: Optional[str] = None, commit_description: Optional[str] = None, private: Optional[bool] = None, token: Optional[str] = None, revision: Optional[str] = None, create_pr: Optional[bool] = False, max_shard_size: Union[str, int, NoneType] = None, num_shards: Optional[dict[str, int]] = None, embed_external_files: bool = True) -> huggingface_hub.hf_api.CommitInfo method of datasets.dataset_dict.DatasetDict instance
    Pushes the [`DatasetDict`] to the hub as a Parquet dataset.
    The [`DatasetDict`] is pushed using HTTP requests and does not need to have neither git or git-lfs installed.

    Each dataset split will be pushed independently. The pushed dataset will keep the original split names.

    The resulting Parquet files are self-contained by default: if your dataset contains [`Image`] or [`A

In [163]:
# Upload every split of the DatasetDict to the Hub dataset repository below.
wmt23_24.push_to_hub(repo_id="Rexhaif/wmt23-24")

Creating parquet from Arrow format: 100%|██████████| 96/96 [00:00<00:00, 2154.52ba/s]
Uploading the dataset shards: 100%|██████████| 1/1 [00:01<00:00,  1.88s/it]
Creating parquet from Arrow format: 100%|██████████| 27/27 [00:00<00:00, 1550.00ba/s]
Uploading the dataset shards: 100%|██████████| 1/1 [00:01<00:00,  1.29s/it]


CommitInfo(commit_url='https://huggingface.co/datasets/Rexhaif/wmt23-24/commit/f86bd4f46c832238dcf5da256fe3851d60c0799a', commit_message='Upload dataset', commit_description='', oid='f86bd4f46c832238dcf5da256fe3851d60c0799a', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/Rexhaif/wmt23-24', endpoint='https://huggingface.co', repo_type='dataset', repo_id='Rexhaif/wmt23-24'), pr_revision=None, pr_num=None)