In [1]:
from mt_metrics_eval import data
from itertools import combinations
import pandas as pd
import xxhash


def example_id(src_lang, tgt_lang, src, ref, hyp, system):
    """Return the 128-bit xxHash hex digest that identifies one example.

    The digest is taken over a single key string that combines the language
    pair, the system name, the source, the reference and the hypothesis, so
    changing any of the six arguments changes the key that is hashed.
    """
    key = f'{src_lang}-{tgt_lang}@{system}##{src}##{ref}##{hyp}'
    return xxhash.xxh128_hexdigest(key)

In [2]:
def get_all_data(eval_set: data.EvalSet, score_name: str = 'mqm',
                 exclude_systems=('synthetic_ref', 'refB')):
    """Flatten one evaluation set into a list of per-segment, per-system records.

    Args:
        eval_set: Evaluation set to read. The function uses its ``src_lang``,
            ``tgt_lang``, ``src``, ``all_refs``, ``std_ref`` and ``sys_outputs``
            attributes and its ``Scores`` method.
        score_name: Name of the segment-level scorer passed to ``Scores``.
        exclude_systems: Names in ``sys_outputs`` that must not be emitted as
            systems. Defaults to the two names this function always skipped.

    Returns:
        A list of dicts with the keys ``lp``, ``src``, ``ref``, ``hyp``,
        ``system``, ``score``, ``score_name`` and ``example_id``. Segments whose
        score is ``None`` are left out, as are systems with no entry in the
        score table. Within each segment, records are ordered by system name.

    Raises:
        ValueError: If ``Scores`` returns ``None`` for ``score_name``.
    """
    src_lang = eval_set.src_lang
    tgt_lang = eval_set.tgt_lang
    lp = f"{src_lang}-{tgt_lang}"

    scores = eval_set.Scores(level='seg', scorer=score_name)
    if scores is None:
        # Fail with a clear message instead of a TypeError when the table is
        # subscripted further down.
        raise ValueError(f"no segment-level '{score_name}' scores available for {lp}")

    hypotheses = eval_set.sys_outputs
    # sorted(): iterating a set of strings has no stable order across
    # interpreter runs, which made the row order of the result irreproducible.
    candidates = sorted(set(hypotheses) - set(exclude_systems))
    # A system can have outputs without having been scored; skip it rather than
    # raising KeyError halfway through the export.
    systems = [system for system in candidates if system in scores]

    references = eval_set.all_refs[eval_set.std_ref]
    result = []
    for i, (src, ref) in enumerate(zip(eval_set.src, references)):
        for system in systems:
            score = scores[system][i]
            if score is None:
                continue
            hyp = hypotheses[system][i]
            result.append({
                'lp': lp,
                'src': src,
                'ref': ref,
                'hyp': hyp,
                'system': system,
                'score': score,
                'score_name': score_name,
                'example_id': example_id(src_lang, tgt_lang, src, ref, hyp, system)
            })
    return result
    

In [3]:
def get_lps(comp: str, score: str = 'mqm'):
    """List the language pairs of ``comp`` whose segment-level gold score is ``score``.

    Looks at every entry of ``data.meta_info.DATA[comp]`` and keeps the
    language pair when its ``std_gold`` mapping has a ``'seg'`` key equal to
    ``score``. The result follows the iteration order of that mapping.
    """
    return [
        lang_pair
        for lang_pair, info in data.meta_info.DATA[comp].items()
        if 'seg' in info.std_gold and info.std_gold['seg'] == score
    ]

In [17]:
# Show which wmt22 language pairs have 'wmt' as their segment-level gold score.
get_lps(comp="wmt22", score='wmt')

['cs-en', 'de-en', 'ja-en', 'ru-en', 'uk-en']

In [28]:
# One frame per wmt22 language pair scored with 'wmt-appraise', stacked and
# tagged with the name of this source.
wmt22_appraise = pd.concat(
    [
        pd.DataFrame(get_all_data(data.EvalSet('wmt22', lang_pair), score_name='wmt-appraise'))
        for lang_pair in get_lps('wmt22', 'wmt-appraise')
    ]
).assign(source="wmt22_appraise")

In [29]:
# One frame per wmt22 language pair scored with 'mqm', stacked and tagged with
# the name of this source.
wmt22_mqm = pd.concat(
    [
        pd.DataFrame(get_all_data(data.EvalSet('wmt22', lang_pair), score_name='mqm'))
        for lang_pair in get_lps('wmt22', 'mqm')
    ]
).assign(source="wmt22_mqm")

In [30]:
# One frame per wmt23 language pair scored with 'da-sqm', stacked and tagged
# with the name of this source.
wmt23_da_sqm = pd.concat(
    [
        pd.DataFrame(get_all_data(data.EvalSet('wmt23', lang_pair), score_name='da-sqm'))
        for lang_pair in get_lps('wmt23', 'da-sqm')
    ]
).assign(source="wmt23_da_sqm")

In [31]:
# One frame per wmt23 language pair scored with 'mqm', stacked and tagged with
# the name of this source.
wmt23_mqm = pd.concat(
    [
        pd.DataFrame(get_all_data(data.EvalSet('wmt23', lang_pair), score_name='mqm'))
        for lang_pair in get_lps('wmt23', 'mqm')
    ]
).assign(source="wmt23_mqm")

In [32]:
import datasets as ds

  from .autonotebook import tqdm as notebook_tqdm


In [34]:
# Stack the four per-source frames and expose them as a single 'train' split.
all_sources = pd.concat([wmt22_appraise, wmt22_mqm, wmt23_da_sqm, wmt23_mqm])
wmt = ds.DatasetDict({'train': ds.Dataset.from_pandas(all_sources)})

In [36]:
# Drop the `__index_level_0__` column (presumably the pandas index carried over
# by `Dataset.from_pandas` -- confirm). The membership check keeps this cell
# safe to re-run: once the column is gone, removing it again would fail.
if '__index_level_0__' in wmt['train'].column_names:
    wmt = wmt.remove_columns(['__index_level_0__'])

In [38]:
from collections import Counter

# Count how many rows of the train split come from each source.
train_sources = wmt['train']['source']
Counter(train_sources)

Counter({'wmt23_da_sqm': 95214,
         'wmt22_appraise': 91019,
         'wmt22_mqm': 68890,
         'wmt23_mqm': 35472})

In [39]:
# Interactive lookup of the `push_to_hub` signature and docstring; it only
# prints documentation and does not change `wmt`.
help(wmt.push_to_hub)

Help on method push_to_hub in module datasets.dataset_dict:

push_to_hub(repo_id, config_name: str = 'default', set_default: Optional[bool] = None, data_dir: Optional[str] = None, commit_message: Optional[str] = None, commit_description: Optional[str] = None, private: Optional[bool] = None, token: Optional[str] = None, revision: Optional[str] = None, create_pr: Optional[bool] = False, max_shard_size: Union[str, int, NoneType] = None, num_shards: Optional[dict[str, int]] = None, embed_external_files: bool = True) -> huggingface_hub.hf_api.CommitInfo method of datasets.dataset_dict.DatasetDict instance
    Pushes the [`DatasetDict`] to the hub as a Parquet dataset.
    The [`DatasetDict`] is pushed using HTTP requests and does not need to have neither git or git-lfs installed.
    
    Each dataset split will be pushed independently. The pushed dataset will keep the original split names.
    
    The resulting Parquet files are self-contained by default: if your dataset contains [`Image`

In [40]:
# Upload the dataset to the Hub repository; the returned commit info is the
# cell's displayed value.
wmt.push_to_hub(repo_id="Rexhaif/wmt22-23")

Creating parquet from Arrow format: 100%|██████████| 291/291 [00:00<00:00, 1667.19ba/s]
Uploading files as a binary IO buffer is not supported by Xet Storage. Falling back to HTTP upload.
100%|██████████| 1/1 [00:02<00:00,  2.16s/it]
Uploading the dataset shards: 100%|██████████| 1/1 [00:02<00:00,  2.79s/it]


CommitInfo(commit_url='https://huggingface.co/datasets/Rexhaif/wmt22-23/commit/f3fdd29c8cb1f63816b694490fab25979c58216e', commit_message='Upload dataset', commit_description='', oid='f3fdd29c8cb1f63816b694490fab25979c58216e', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/Rexhaif/wmt22-23', endpoint='https://huggingface.co', repo_type='dataset', repo_id='Rexhaif/wmt22-23'), pr_revision=None, pr_num=None)