# Dependencies

In [None]:
# @title Imports

import sys
import numpy as np
import scipy.stats
import glob
from mt_metrics_eval import meta_info
from mt_metrics_eval import data
from mt_metrics_eval import stats

In [None]:
# @title Task-handling utils

def task_to_dict(taskname):
  """Parse a task name into an attribute -> value dict.

  The name is split on whitespace into fields, and every field is split on
  '=' into an (attribute, value) pair. Values are kept as strings.
  """
  attr_to_val = {}
  for field in taskname.split():
    attr, val = field.split('=')
    attr_to_val[attr] = val
  return attr_to_val

def get_val(taskname, attr):
  """Look up the value assigned to a single attribute in a task name."""
  attributes = task_to_dict(taskname)
  return attributes[attr]

def attr_vals(tasknames):
  """Return an attribute -> {values} map for a collection of task names.

  Args:
    tasknames: Iterable of task-name strings, each parseable by
      task_to_dict().

  Returns:
    Dict mapping every attribute that occurs in any of the task names to the
    set of (string) values it takes across all of them.
  """
  attr_val_dict = {}
  for name in tasknames:
    # Walk the (attribute, value) pairs of *this* task name.
    for attr, val in task_to_dict(name).items():
      attr_val_dict.setdefault(attr, set()).add(val)
  return attr_val_dict

def partition_by_attribute(tasknames, attr):
  """Group task names by the value each one assigns to `attr`.

  Returns a dict from attribute value to the list of task names carrying
  that value. Groups appear in order of first occurrence, and names keep
  their input order within a group.
  """
  groups = {}  # val -> [task names where attr = val]
  for task in tasknames:
    groups.setdefault(get_val(task, attr), []).append(task)
  return groups

def sort_by_attrs(tasknames, attr_list):
  """Order task names by grouping on each attribute in turn.

  Names are partitioned on the first attribute, and each group is ordered
  recursively with the remaining attributes. With no attributes left the
  input collection is returned unchanged (the same object, not a copy).
  """
  if attr_list:
    first_attr, remaining_attrs = attr_list[0], attr_list[1:]
    groups = partition_by_attribute(tasknames, first_attr)
    return [name
            for group in groups.values()
            for name in sort_by_attrs(group, remaining_attrs)]
  return tasknames


In [None]:
# @title Main evaluation function

def reformat(results):
  """Convert CompareMetrics() output into mtme-style per-metric tuples.

  Args:
    results: A (metrics, sig_matrix) pair. `metrics` maps a metric name to
      (corr, rank); `sig_matrix` is indexed by a metric's position in
      `metrics` to get a row of p-values.

  Returns:
    Dict from metric name to (rank, corr, sig_string). sig_string has one
    space-separated flag per column: 'x' for columns up to and including the
    metric's own position, otherwise '1' if the p-value is < 0.05, else '0'.
  """
  metrics, sig_matrix = results
  reformatted = {}
  for row_idx, name in enumerate(metrics):
    corr, rank = metrics[name]
    flags = ['1' if pval < 0.05 else '0' for pval in sig_matrix[row_idx]]
    # Mask everything up to and including the diagonal.
    flags[:row_idx + 1] = ['x'] * (row_idx + 1)
    reformatted[name] = (rank, corr, ' '.join(flags))
  return reformatted

def eval_metrics(eval_sets, langs, levels, primary_only, k, gold_name='std',
                 include_domains=True, seg_level_no_avg=False,
                 include_human_with_acc=False):
  """Evaluate all metrics for eval sets, across multiple task settings.

  Args:
    eval_sets: Map from lang-pair to eval_set objects.
    langs: List of language pairs (eg 'en-de') for which to compute results.
    levels: List of levels for which to compute results, allowed elements are
      'sys' and 'seg'.
    primary_only: Include only primary metrics.
    k: Number of bootstrap draws. If 0, no significance tests for metric-score
      differences are run, and execution is much faster.
    gold_name: Name of gold scores to use, standard scores if 'std'.
    include_domains: Generate domain-specific results in addition to global
      results.
    seg_level_no_avg: If True, use only the average_by=None setting for segment-
      level correlations
    include_human_with_acc: If True, include human outputs in accuracy tasks.

  Returns:
    Map from task names to metric -> (rank, corr, sig_string) stats.
  """
  results = {}

  # First task is global accuracy, iff more than one language is given.
  if len(langs) > 1:
    evs_list = [eval_sets[lp] for lp in langs]
    main_refs = [{evs.std_ref} for evs in evs_list]
    close_refs = [set() for _ in evs_list]
    if gold_name == 'std':
      gold = evs_list[0].StdHumanScoreName('sys')
    else:
      gold = gold_name
    humans = [True, False] if include_human_with_acc else [False]
    for human in humans:
      taskname = data.MakeTaskName(
          'wmt22', langs, None, 'sys', human, 'none', 'accuracy', k, gold,
          main_refs, close_refs, False, primary_only)
      print(taskname)
      res = data.CompareMetricsWithGlobalAccuracy(
          evs_list, main_refs, close_refs, include_human=human,
          include_outliers=False, gold_name=gold,
          primary_metrics=primary_only,
          domain=None, k=k, pval=0.05)
      results[taskname] = reformat(res)

  # Correlation name -> function; loop-invariant, so built once.
  corr_fcns = {'pearson': scipy.stats.pearsonr,
               'kendall': scipy.stats.kendalltau}

  # Remaining tasks are specific to language, domain, etc.
  for lp in langs:
    evs = eval_sets[lp]
    main_refs = {evs.std_ref}
    close_refs = set()
    domains = [None] + (list(evs.domain_names) if include_domains else [])
    for domain in domains:
      for level in levels:
        # Resolve 'std' to the concrete score name for this level.
        gold = evs.StdHumanScoreName(level) if gold_name == 'std' else gold_name
        for avg in 'none', 'sys', 'item':
          # Averaging variants only apply to segment-level tasks, and only
          # when they have not been switched off.
          if (level == 'sys' or seg_level_no_avg) and avg != 'none':
            continue
          for human in True, False:
            if human and len(evs.ref_names) == 1:
              continue  # Single ref
            for corr, corr_fcn in corr_fcns.items():
              taskname = data.MakeTaskName(
                  'wmt22', lp, domain, level, human, avg, corr, k, gold,
                  main_refs, close_refs, False, primary=primary_only)
              print(taskname)
              # Pass the resolved `gold` (not the raw `gold_name`, which may
              # be the 'std' placeholder) so the scores used agree with the
              # task name and with the accuracy task above.
              corrs = data.GetCorrelations(
                  evs=evs, level=level, main_refs=main_refs,
                  close_refs=close_refs, include_human=human,
                  include_outliers=False, gold_name=gold,
                  primary_metrics=primary_only, domain=domain)
              metrics, sig_matrix = data.CompareMetrics(
                  corrs, corr_fcn, average_by=avg, k=k, pval=0.05)
              # Make compatible with accuracy results.
              metrics = {evs.DisplayName(m): v for m, v in metrics.items()}
              results[taskname] = reformat((metrics, sig_matrix))

  return results


In [None]:
# @title Load data

import sys

# Load an EvalSet for every wmt22 language pair, echoing each pair name to
# stderr as it is loaded.
eval_sets = {}
wmt22_lang_pairs = meta_info.DATA['wmt22']
for lp in wmt22_lang_pairs:
  sys.stderr.write(f'{lp}\n')
  eval_sets[lp] = data.EvalSet('wmt22', lp, True)

In [None]:
# @title Define more global vars

focus_lps = ['en-de', 'en-ru', 'zh-en']
# Focus pairs first, then every other loaded pair in eval_sets order.
focus_first_lps = list(focus_lps)
focus_first_lps.extend(lp for lp in eval_sets if lp not in focus_lps)

# Define order of attributes for grouping, etc.
main_attributes = [
    'lang',
    'domain',
    'level',
    'human',
    'avg_by',
    'corr',
]

# System scores

In [None]:
# @title System performance

def rank_systems(evs, system_list, scorer):
  """Rank the given systems by their system-level score under `scorer`.

  Systems with no entry, or a None score, get the placeholder score -1000.
  Systems are ordered by (score, name) descending, and ranks start at 1.

  Returns:
    Dict from system name to (rank, score), in rank order.
  """
  all_scores = evs.Scores('sys', scorer)

  def _score_or_placeholder(name):
    value = all_scores[name][0] if name in all_scores else None
    return -1000 if value is None else value

  ordered = sorted(
      ((_score_or_placeholder(name), name) for name in system_list),
      reverse=True)
  ranking = {}
  for position, (value, name) in enumerate(ordered, start=1):
    ranking[name] = (position, value)
  return ranking

def format(scorer, score):
  """Render a score as text, with scorer-specific sign, scale and precision.

  'mqm' scores are negated and 'wmt-z' scores are left as they are; both use
  three decimals. Scorers whose name starts with 'BLEURT' or 'COMET' are
  multiplied by 100. Every scorer other than 'mqm' and 'wmt-z' uses one
  decimal.
  """
  if scorer in ('mqm', 'wmt-z'):
    shown = -score if scorer == 'mqm' else score
    return f'{shown:<0.3f}'
  if scorer.startswith(('BLEURT', 'COMET')):
    score *= 100.0
  return f'{score:<0.1f}'

metrics = ['BLEU', 'chrF', 'BLEURT-20', 'COMET-20']

# For each language pair, print one row per system: its standard human score
# and rank, followed by score and rank under the other human scores and the
# selected metrics.
for lp in focus_first_lps:
  evs = eval_sets[lp]

  std_gold = evs.StdHumanScoreName('sys')
  if not std_gold:  # Some lps have no human scores.
    continue

  # Only systems that actually have a standard human score are listed.
  std_gold_scores = evs.Scores('sys', std_gold)
  systems = [s for s in std_gold_scores if std_gold_scores[s][0] is not None]

  other_gold = [h for h in evs.human_score_names if h != std_gold]
  full_metrics = [f'{m}-{evs.std_ref}' for m in metrics]
  other_scorers = other_gold + full_metrics

  print(lp)
  scorer_list = '\t'.join([std_gold] + other_gold + metrics)
  print(f'system\t{scorer_list}')
  std_gold_ranking = rank_systems(evs, systems, std_gold)
  other_rankings = {scorer: rank_systems(evs, systems, scorer)
                    for scorer in other_scorers}

  # The loop variable is deliberately not called `sys`: at notebook scope that
  # would rebind the imported `sys` module to a system name.
  for system, (rank, score) in std_gold_ranking.items():
    print(f'{system}\t{format(std_gold, score)} ({rank})', end='\t')
    for scorer in other_scorers:
      other_rank, other_score = other_rankings[scorer][system]
      if other_score == -1000:  # rank_systems() placeholder for "no score".
        print('---', end='\t')
      else:
        print(f'{format(scorer, other_score)} ({other_rank})', end='\t')
    print('')
  print()



In [None]:
# @title Per-domain system performance

def _num(v):
  """Return v, or the placeholder -1000 when v is None."""
  if v is None:
    return -1000
  return v

def _pos(v):
  """Return v with its sign flipped if it is negative, else v unchanged."""
  if v < 0:
    return -v
  return v

# Metrics whose system-level scores are appended to each row.
metrics = ['BLEU', 'chrF', 'BLEURT-20', 'COMET-20']

for lp in focus_lps:
  evs = eval_sets[lp]

  human_scores = ['mqm']
  if lp == 'en-ru':
    human_scores.append('mqm.unb')  # Unbabel scores w/ Google weights.

  for gold in human_scores:

    # Sort systems by global gold score, highest first. Missing scores become
    # -1000 via _num(), so they sort last.
    scores = evs.Scores('sys', gold)
    scores = sorted(((s, _num(v[0])) for s, v in scores.items()),
                    key=lambda x: -x[1])
    scores = {k: [v] for k, v in scores}

    # Add domain-specific gold scores.
    domain_header = ''
    if 'domain' in evs.levels:
      domain_header = ' '.join(evs.domain_names) + ' '
      # Looked up once rather than once per (domain, system) pair.
      domain_scores = evs.Scores('domain', gold)
      for i, _ in enumerate(evs.domain_names):
        for system in scores:
          scores[system].append(_num(domain_scores[system][i]))

    # Add system-level scores for selected metrics.
    for metric in metrics:
      metric_scores = evs.Scores('sys', f'{metric}-{evs.std_ref}')
      assert metric_scores, metric
      for s in scores:
        # Systems without a score for this metric are shown as 0.
        scores[s].append(metric_scores[s][0] if s in metric_scores else 0)

    print(lp, gold)
    print(f'sys ALL {domain_header}{" ".join(metrics)}')
    for sysname in scores:
      # _pos() flips negative values, so every number prints without a sign.
      print(sysname, ' '.join(f'{_pos(s):0.3f}' for s in scores[sysname]))
    print()

# Main results

In [None]:
# @title MQM correlations for primary metrics

# This is VERY slow unless k < 5; it is provided mostly as an example.
# Main results are computed in parallel using mtme, and read in using the cell
# below. They should exactly match the results computed here, modulo variance in
# rank assignment due to sampling if k > 0.

# Primary metrics only, on the three focus language pairs, at both levels,
# with a single resampling draw.
main_results = eval_metrics(
    eval_sets=eval_sets,
    langs=['en-de', 'en-ru', 'zh-en'],
    levels=['sys', 'seg'],
    primary_only=True,
    k=1,
)


In [None]:
# @title Read pre-computed MQM correlations for primary metrics

# Also see above cell.
# Runs were generated by wmt22-metric-ranking.sh.

corpus = '/usr/local/google/home/fosterg/corpora/mt-metrics-eval/work'
run = '/metric-ranking.wmt22'

# Each task*.out file holds a task name on its first line, followed by one
# "metric rank corr sigs..." line per metric.
main_results = {}  # task -> metric -> (rank, corr, sigs)
# glob.glob() returns matches in arbitrary order. Later cells derive task
# order from the insertion order of main_results, so sort for reproducible
# output.
for filename in sorted(glob.glob(f'{corpus}/{run}/task*.out')):
  with open(filename) as f:
    # Drop blank lines (e.g. a trailing newline) so they do not break the
    # four-way split below.
    lines = [line.strip() for line in f if line.strip()]
  if not lines:
    print(f'Skipping empty results file: {filename}')
    continue
  taskname = lines[0]
  print(taskname)
  res = {}
  for line in lines[1:]:
    # Everything after the third space is the significance string.
    m, rank, corr, sigs = line.split(' ', maxsplit=3)
    res[m] = (int(rank), float(corr), sigs)
  main_results[taskname] = res


In [None]:
# @title Print MQM sig matrices for primary metrics

# Canonical task order is a sort by main attributes.
ordered_tasks = sort_by_attrs(main_results, main_attributes)

# Works with main_results from either of the two cells above.
for taskname in ordered_tasks:
  task_results = main_results[taskname]
  print(taskname)
  for m in task_results:
    rank, corr, sigs = task_results[m]
    print(m, rank, corr, sigs)
  print()

In [None]:
# @title Compute global task weights

# Attributes over which weight is split, outermost first.
ordered_attributes = [
    'test_set',
    'lang',
    'domain',
    'level',
    'human',
    'avg_by',
    'corr',
]

def distribute_mass(task_names, attr_list, weight):
  """Recursively distribute mass to tasks according to ordered attributes.

  Tasks are partitioned on the first attribute and `weight` is split evenly
  across the resulting groups. Each group's share is then distributed
  recursively using the remaining attributes. Once no attributes are left,
  the weight is split evenly across the remaining tasks.

  Args:
    task_names: Collection of task-name strings.
    attr_list: Attributes to partition on, outermost first.
    weight: Total mass to distribute over `task_names`.

  Returns:
    Dict from task name to weight; empty if `task_names` is empty.
  """
  if not task_names:
    # Nothing to receive the mass; also avoids dividing by zero below.
    return {}
  if not attr_list:
    share = weight / len(task_names)
    return {name: share for name in task_names}
  partition = partition_by_attribute(task_names, attr_list[0])
  group_weight = weight / len(partition)
  weight_map = {}
  for names in partition.values():
    weight_map.update(distribute_mass(names, attr_list[1:], group_weight))
  return weight_map  # task -> weight

task_weights = distribute_mass(main_results, ordered_attributes, 1.0)

# Check and print: list every task weight, then their running total.
total_weight = 0
for task in task_weights:
  wt = task_weights[task]
  print(task, wt)
  total_weight = total_weight + wt
print(f'{total_weight=}')

In [None]:
# @title Average ranks for various MQM task partitions

# These are the main results for the eval.

def rank_metrics(tasknames, results, task_weights):
  """Compute each metric's weighted average rank over a set of tasks.

  Args:
    tasknames: List of task names to average over.
    results: Map task -> metric -> (rank, corr, sigs).
    task_weights: Map task -> weight. Weights are renormalized over
      `tasknames`.

  Returns:
    Dict metric -> average rank, ordered by increasing average rank. Metrics
    that are missing from at least one task come last, with value None.
  """
  weighted_sum, num_tasks_seen = {}, {}
  mass = 0
  for task in tasknames:
    task_weight = task_weights[task]
    mass += task_weight
    for metric, (rank, _, _) in results[task].items():
      weighted_sum[metric] = weighted_sum.get(metric, 0) + task_weight * rank
      num_tasks_seen[metric] = num_tasks_seen.get(metric, 0) + 1

  num_tasks = len(tasknames)

  def _in_all_tasks(metric):
    return num_tasks_seen[metric] == num_tasks

  def _sort_key(item):
    metric, total = item
    # Metrics that don't have values for all tasks go last.
    return total if _in_all_tasks(metric) else 1000000

  ordered = sorted(weighted_sum.items(), key=_sort_key)

  scale = 1 / mass
  return {metric: (scale * total if _in_all_tasks(metric) else None)
          for metric, total in ordered}

def display_rank(ranking, metric):
  """Format a metric's rank to two decimals, or 'None' if it has no rank."""
  value = ranking[metric] if metric in ranking else None
  return 'None' if value is None else f'{value:0.2f}'

# Compute ranking for partitions according to selected attributes.
ranking_by_attr = {}  # attr/val -> metric -> rank
for attr in ['test_set', 'lang', 'domain', 'level', 'human', 'corr']:
  for val, tasks in partition_by_attribute(main_results, attr).items():
    ranking_by_attr[f'{attr}={val}'] = rank_metrics(
        tasks, main_results, task_weights)

# One row per metric, in the order of the overall ranking: the overall average
# rank first, then the average rank within each other partition.
prime = 'test_set=wmt22'
# Not named `attr_vals`: that would overwrite the attr_vals() helper function
# defined with the task-handling utils.
other_columns = [k for k in ranking_by_attr if k != prime]
print('metric', prime, ' '.join(other_columns))
for metric in ranking_by_attr[prime]:
  ranks = [display_rank(ranking_by_attr[prime], metric)]
  ranks.extend(
      display_rank(ranking_by_attr[col], metric) for col in other_columns)
  print(metric, ' '.join(ranks))


In [None]:
# @title Print raw MQM ranking results

attrs = ['lang', 'domain', 'level', 'human', 'corr']
ordered_tasks = sort_by_attrs(main_results, attrs)
ordered_metrics = ranking_by_attr['test_set=wmt22']

# Print big table: one header row per attribute, then one row of per-task
# ranks for each metric ('--' where a metric has no result for a task).
for attr in attrs:
  header_vals = [get_val(task, attr) for task in ordered_tasks]
  print(attr, ' '.join(header_vals))
for metric in ordered_metrics:
  cells = []
  for task in ordered_tasks:
    task_res = main_results[task]
    cells.append(f' {task_res[metric][0]}' if metric in task_res else ' --')
  print(metric, ''.join(cells), sep='')


# Raw correlations

In [None]:
# @title Utils for raw correlations

def get_canonical_metric_list(results_map, attr_vals):
  """Derive a canonical metric order from one designated task.

  Args:
    results_map: Map task -> metric -> stats.
    attr_vals: Sequence of (attribute, value) pairs that together must select
      exactly one task in `results_map`.

  Returns:
    A (metric_list, main_task) pair. metric_list starts with the designated
    task's metrics in their stored order, followed by any metrics that only
    occur in other tasks, in order of first appearance.
  """
  # Narrow the tasks down one (attribute, value) pair at a time.
  candidates = results_map
  for attr, val in attr_vals:
    candidates = partition_by_attribute(candidates, attr)[val]
  assert len(candidates) == 1
  main_task = candidates[0]

  metric_list = list(results_map[main_task])

  # Append metrics that aren't available for the selected task.
  for task in results_map:
    extras = [m for m in results_map[task] if m not in metric_list]
    metric_list += extras

  return metric_list, main_task

def print_results_table(results_map, sortby_attrs, metric_list):
  """Print a metrics-by-tasks table of the second field of each result.

  Tasks (columns) are ordered by `sortby_attrs`. One header row is printed
  per attribute, then one row per metric in `metric_list`, with '--' where a
  metric has no result for a task.
  """
  ordered_tasks = sort_by_attrs(results_map, sortby_attrs)

  # Header rows: each attribute's value for every task column.
  for attr in sortby_attrs:
    print(attr, ' '.join(get_val(task, attr) for task in ordered_tasks))

  # Table content.
  for metric in metric_list:
    row = [
        f' {results_map[task][metric][1]}'
        if metric in results_map[task] else ' --'
        for task in ordered_tasks
    ]
    print(metric, *row, sep='')

In [None]:
# @title Correlations using MQM

# Differences from main results:
# - no significance
# - correlations for all metrics, not just primary submissions

mqm_results = eval_metrics(
    eval_sets=eval_sets,
    langs=focus_lps,
    levels=['sys', 'seg'],
    primary_only=False,
    k=0,
    gold_name='mqm',
)
print()

# Metric order comes from the single accuracy task.
ordered_metrics, _ = get_canonical_metric_list(
    mqm_results, [('corr', 'accuracy')])
print_results_table(mqm_results, main_attributes, ordered_metrics)


In [None]:
# @title Correlations using Appraise

appraise_lps = [
    'cs-uk', 'en-cs', 'en-de', 'en-hr', 'en-ja', 'en-liv', 'en-ru',
    'en-uk', 'en-zh', 'liv-en', 'sah-ru', 'uk-cs', 'zh-en',
]

appraise_results = eval_metrics(
    eval_sets=eval_sets,
    langs=appraise_lps,
    levels=['sys', 'seg'],
    primary_only=False,
    k=0,
    gold_name='wmt-appraise',
    include_domains=False,
    seg_level_no_avg=True,
    include_human_with_acc=True,
)
print()

# Metric order comes from the accuracy task that includes human outputs.
ordered_metrics, _ = get_canonical_metric_list(
    appraise_results, [('corr', 'accuracy'), ('human', 'True')])
attrs = ['lang', 'level', 'human', 'corr']
print_results_table(appraise_results, attrs, ordered_metrics)


In [None]:
# @title Correlations using DA

da_lps = ['cs-en', 'de-en', 'ja-en', 'ru-en', 'uk-en', 'zh-en']

da_results = eval_metrics(
    eval_sets=eval_sets,
    langs=da_lps,
    levels=['sys', 'seg'],
    primary_only=False,
    k=0,
    gold_name='wmt',
    include_domains=False,
    seg_level_no_avg=True,
)
print()

# Metric order comes from the single accuracy task.
ordered_metrics, _ = get_canonical_metric_list(
    da_results, [('corr', 'accuracy')])
attrs = ['lang', 'level', 'human', 'corr']
print_results_table(da_results, attrs, ordered_metrics)


In [None]:
# @title Latex tables for appendices

# Depends on the cells above to compute: 
# mqm_results, da_results, appraise_results.

# Metric names that make_table() underlines instead of printing in bold.
BASELINES = {
    'BERTScore',
    'BLEU',
    'BLEURT-20',
    'COMET-20',
    'COMET-QE',
    'YiSi-1',
    'chrF',
    'f101spBLEU',
    'f200spBLEU',
}

def make_table(res, level='sys', corr='pearson', acc_human='True',
               other_humans=None):
  """Print a LaTeX table of scores for a selected subset of tasks.

  Args:
    res: Map task -> metric -> stats; the second field of each stats tuple is
      printed, to three decimals.
    level: Value of the 'level' attribute that non-accuracy tasks must have.
    corr: Value of the 'corr' attribute that non-accuracy tasks must have.
    acc_human: 'human' attribute value (a string) identifying the accuracy
      task. That task is always included and fixes the metric order.
    other_humans: Allowed 'human' attribute values for the non-accuracy tasks,
      ['True', 'False'] if None.
  """
  metrics_in_order, main_task = get_canonical_metric_list(
      res, [('corr', 'accuracy'), ('human', acc_human)])

  if other_humans is None:
    other_humans = ['True', 'False']

  def _selected(task):
    """True for the accuracy task and for tasks matching the settings."""
    if task == main_task:
      return True
    attrs = task_to_dict(task)
    return (attrs['level'] == level and attrs['domain'] == 'None' and
            attrs['corr'] == corr and attrs['avg_by'] == 'none' and
            attrs['human'] in other_humans)

  def _latex_name(name):
    """Return the metric name as it appears in the first table column."""
    star = ''
    if name.endswith('[noref]'):
      # The '[noref]' suffix is shown as a trailing '*'.
      name, star = name[:-len('[noref]')], '*'
    if name.startswith('*'):
      # Names with a leading '*' are printed without that '*' and unwrapped.
      return name[1:] + star
    macro = 'underline' if name in BASELINES else 'textbf'
    return f'\\{macro}{{{name}{star}}}'

  # Pick out subset of tasks, keeping the order of `res`.
  tasks_to_print = {task: res[task] for task in res if _selected(task)}

  # Header: one column per task. Tasks whose lang value starts with '[' are
  # labelled 'acc'.
  print(f'level={level} corr={corr}')
  langs = [get_val(task, 'lang') for task in tasks_to_print]
  header = ''.join(
      'acc' if lang.startswith('[') else f' & {lang}' for lang in langs)
  print(header + '\\\\')
  print(' & '.join(get_val(t, 'human') for t in tasks_to_print), '\\\\')

  # Contents: one row per metric, '--' where a task has no value for it.
  for orig_m in metrics_in_order:
    cells = []
    for task_scores in tasks_to_print.values():
      if orig_m in task_scores:
        cells.append(f' & {task_scores[orig_m][1]:0.3f}')
      else:
        cells.append(' & --')
    print(_latex_name(orig_m) + ''.join(cells) + '\\\\')
  print()


# For each gold standard, print a system-level Pearson table followed by a
# segment-level Kendall table.
for table_title, table_res, table_acc_human, table_other_humans in (
    ('MQM results', mqm_results, 'False', None),
    ('DA results', da_results, 'False', ['False']),
    ('Appraise results', appraise_results, 'True', ['False']),
):
  for table_level, table_corr in (('sys', 'pearson'), ('seg', 'kendall')):
    print(table_title)
    make_table(table_res, table_level, table_corr, table_acc_human,
               table_other_humans)


# Extras

In [None]:
# @title Unbabel vs Google MQM stats for en-ru

# This computes system-level Pearson and Kendall for all metrics
# with both MQM scoring schemes.

goog_results = eval_metrics(
    eval_sets=eval_sets, langs=['en-ru'], levels=['sys'], primary_only=True,
    k=100, gold_name='mqm', include_domains=False)
unb_results = eval_metrics(
    eval_sets=eval_sets, langs=['en-ru'], levels=['sys'], primary_only=True,
    k=100, gold_name='mqm.unb', include_domains=False)

# For each scoring scheme, take the first task of each correlation type.
unb_by_corr = partition_by_attribute(unb_results, 'corr')
goog_by_corr = partition_by_attribute(goog_results, 'corr')
unb_pearson = unb_by_corr['pearson'][0]
unb_kendall = unb_by_corr['kendall'][0]
goog_pearson = goog_by_corr['pearson'][0]
goog_kendall = goog_by_corr['kendall'][0]

print('metric',
      'unb-pears-rank unb-pears-corr unb-kend-rank unb-kend-corr '
      'goog-pears-rank goog-pears-corr goog-kend-rank goog-kend-corr')
# (results map, task) for each rank/corr column pair, in header order.
column_sources = (
    (unb_results, unb_pearson),
    (unb_results, unb_kendall),
    (goog_results, goog_pearson),
    (goog_results, goog_kendall),
)
for m in unb_results[unb_pearson]:
  fields = []
  for res_map, task in column_sources:
    m_rank, m_corr, _ = res_map[task][m]
    fields.append(f'{m_rank} {m_corr:0.3f}')
  print(m, ' '.join(fields))


In [None]:
# @title Accuracy vs Kendall

# en-de only, testing the difference between these two tasks.
# (The ranking is identical and significance clusters are almost identical.)

kf = stats.KendallLike(thresh=0)
def acc(v1, v2):
  """Return (concordant / num_pairs, num_pairs) for two score vectors.

  Uses the second and third elements of the 4-tuple returned by kf.Corr() as
  the pair count and the concordant-pair count.
  NOTE(review): element meanings are taken from the names used here; confirm
  against stats.KendallLike.Corr.
  """
  _, num_pairs, concordant, _ = kf.Corr(v1, v2)
  accuracy = concordant / num_pairs
  return accuracy, num_pairs

evs = eval_sets['en-de']

corrs = data.GetCorrelations(
    evs=evs, level='sys', main_refs={evs.std_ref}, close_refs=set(),
    include_human=True, include_outliers=False, gold_name='mqm',
    primary_metrics=True)

# Kendall results first, then accuracy results, separated by a blank line.
for corr_fcn in (scipy.stats.kendalltau, acc):
  if corr_fcn is acc:
    print()
  res, sig_matrix = data.CompareMetrics(corrs, corr_fcn, k=1000)
  for m in res:
    corr, rank = res[m]
    print(m, rank, f'{corr:0.3f}')
