In [1]:
import datasets as ds
import pandas as pd
from itertools import combinations
from rich.progress import track
from matplotlib import pyplot as plt
from rich import print

%matplotlib inline

In [2]:
dataset = ds.load_dataset("Rexhaif/wmt23-24")

In [3]:
def make_pairs(dataset):
    data = dataset.to_pandas()
    result = []
    for src, subset in track(data.groupby('src'), description='Processing', show_speed=True):
        pairs = list(combinations(subset.to_dict(orient='records'), 2))
        for item1, item2 in pairs:
            result.append({
                'lp': item1['lp'],
                'dataset': "mt-ranking",
                'src': item1['src'],
                'ref': item1['ref'],
                'hyp0': item1['hyp'],
                'hyp1': item2['hyp'],
                'score0': item1['score'],
                'score1': item2['score'],
                'system0': item1['system'],
                'system1': item2['system'],
                'score_diff': abs(abs(item1['score']) - abs(item2['score'])),
                'score_name': item1['score_name'],
                'best_hyp': 0 if item1['score'] > item2['score'] else 1,
            })
    return pd.DataFrame(result)


In [4]:
pairs_train = make_pairs(dataset['train'])

Output()

In [5]:
pairs_train = pairs_train[pairs_train.lp != 'en-cs']

In [6]:
pairs_train[pairs_train.score_diff > 20].shape

(270354, 13)

In [7]:
pairs_train = ds.Dataset.from_pandas(pairs_train)


In [8]:
pairs_train = pairs_train.filter(lambda x: x['score_diff'] > 20)

Filter:   0%|          | 0/893383 [00:00<?, ? examples/s]

In [9]:
pairs_train = pairs_train.remove_columns(['__index_level_0__'])

In [10]:
pairs_train = pairs_train.shuffle(seed=42)

In [11]:
import sys
sys.path.append('..')
sys.path.append('../training')
from metric_utils import DEFAULT_INSTRUCTION, JUDGE_PROMPT_NO_THINKING, JUDGE_PROMPT_THINKING, LANG_CODES, SYSTEM_NO_CHOSEN, SYSTEM_CHOSEN

def transform_fn(example, tokenizer=None):
        lang1, lang2 = example['lp'].split('-')
        source_text = example['src']
        instruction = DEFAULT_INSTRUCTION.format(
            source_language=LANG_CODES[lang1],
            target_language=LANG_CODES[lang2],
            source_text=source_text
        )

        input_message = JUDGE_PROMPT_THINKING.format(
            instruction=instruction,
            assistant_a_response=example["hyp0"],
            assistant_b_response=example["hyp1"]
        )

        answer = "A" if example["best_hyp"] == 0 else "B"
        answer = f"{answer}"

        return {
            "messages": [
                {"role": "system", "content": SYSTEM_NO_CHOSEN},
                {"role": "user", "content": input_message},
                {"role": "assistant", "content": answer}
            ]
        }

In [12]:
pairs_train.column_names

['lp',
 'dataset',
 'src',
 'ref',
 'hyp0',
 'hyp1',
 'score0',
 'score1',
 'system0',
 'system1',
 'score_diff',
 'score_name',
 'best_hyp']

In [13]:
pairs_train = pairs_train.map(transform_fn, remove_columns=pairs_train.column_names)

Map:   0%|          | 0/270354 [00:00<?, ? examples/s]

In [14]:
pairs_train[0]

{'messages': [{'content': 'You are a helpful translation evaluator. You will provide a verdict in a strict format, do not include any other text. Just letter "A" or "B".',
   'role': 'system'},
  {'content': "Please act as an impartial judge and evaluate the quality of the translations provided by two AI assistants in response to the user's request below.\nSelect the assistant that best adheres to the user's instructions while producing the highest-quality translation overall.\nBegin by comparing the two translations and reason before you answer.\nAvoid personal opinions or biases, and do not favor one assistant over the other.\nYour judgment should be based solely on the quality of the translations and their alignment with the user's instructions.\nBe objective and impartial. If both translations are equally good, you can choose the one that you prefer.\n\nAfter providing your explanation, response strictly in this format: \n\n<think>\n... your reasoning process ...\n</think>\n<answer

In [15]:

pairs_train.push_to_hub('Rexhaif/wmt23-pairs-sft')

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/271 [00:00<?, ?ba/s]

Uploading files as bytes or binary IO objects is not supported by Xet Storage. Falling back to HTTP upload.


README.md:   0%|          | 0.00/354 [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/datasets/Rexhaif/wmt23-pairs-sft/commit/d8e8aa723483ad34e1a7af51fa716287fee9c7de', commit_message='Upload dataset', commit_description='', oid='d8e8aa723483ad34e1a7af51fa716287fee9c7de', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/Rexhaif/wmt23-pairs-sft', endpoint='https://huggingface.co', repo_type='dataset', repo_id='Rexhaif/wmt23-pairs-sft'), pr_revision=None, pr_num=None)

In [None]:
import datasets as ds
