# Translation vs. Prediction

In [1]:
import pandas as pd
from scipy import stats as st

from memolon.src import utils, constants

Define the evaluation for a single dataset:

In [2]:
def evaluate_translation(gold, mt, pred, min_overlap=50):
    """Correlate TargetMT and TargetPred with a gold lexicon and with each other.

    The three frames are first restricted to the index labels and columns they
    all share. Duplicated TargetMT entries are then averaged so that every
    shared entry has exactly one TargetMT rating per column.

    Parameters
    ----------
    gold, mt, pred : pandas.DataFrame
        Rating tables indexed by entry, with one column per rated variable.
        ``mt`` may contain duplicated index labels. ``gold`` and ``pred`` are
        not de-duplicated here; NOTE(review): they are assumed to have unique
        index labels -- otherwise the correlation inputs differ in length.
    min_overlap : int, default 50
        Correlations are only computed when strictly more than this many
        entries are shared by all three frames.

    Returns
    -------
    dict
        Always contains ``'overlap'`` (number of shared entries). If the
        overlap exceeds ``min_overlap`` it additionally contains
        ``'GoldvMT'``, ``'GoldvPred'`` and ``'MTvPred'``, each mapping a shared
        column name to the Pearson correlation coefficient of the two tables.
    """
    shared_rows = sorted(set(gold.index) & set(mt.index) & set(pred.index))
    # Follow the column order of ``gold`` so the result does not depend on the
    # (hash-dependent) iteration order of a set.
    shared_columns = [c for c in gold.columns if c in mt.columns and c in pred.columns]

    gold = gold.loc[shared_rows, shared_columns]
    mt = mt.loc[shared_rows, shared_columns]
    pred = pred.loc[shared_rows, shared_columns]

    # TargetMT may have duplicated entries. Those are averaged to get a single rating per type.
    # (The reason why duplicated TargetMT entries are averaged here (and not in other analyses such as the
    # silver evaluation) is that we want to determine which of the two target-side lexica, TargetPred or
    # TargetMT, is closer to the gold standard. As such, averaging TargetMT entries is expected to result in
    # higher performance and leads to a more challenging comparison. Additionally, if TargetMT were used in a
    # downstream application, having competing entries may not be acceptable.)
    # A single grouped mean replaces the former row-by-row construction: it keeps numeric dtypes and
    # avoids repeatedly enlarging a DataFrame. ``reindex`` restores the row order of ``shared_rows``.
    mt = mt.groupby(level=0).mean().reindex(shared_rows)

    rt = {'overlap': len(shared_rows)}
    if rt['overlap'] > min_overlap:
        rt['GoldvMT'] = {}
        rt['GoldvPred'] = {}
        rt['MTvPred'] = {}
        for c in shared_columns:
            # All three frames are row-aligned to ``shared_rows`` at this point.
            rt['GoldvMT'][c] = st.pearsonr(gold[c], mt[c])[0]
            rt['GoldvPred'][c] = st.pearsonr(gold[c], pred[c])[0]
            rt['MTvPred'][c] = st.pearsonr(mt[c], pred[c])[0]

    return rt
    

Run evaluation for all gold datasets:

In [3]:
# Result table: one row per (lexicon, measure) pair, one column per emotion plus the overlap size.
measures = ['GoldvMT', 'GoldvPred', 'MTvPred']
index = pd.MultiIndex.from_product([utils.conditions.keys(), measures], names=['lexicon', 'measure'])
df = pd.DataFrame(index=index, columns=constants.emotions + ['overlap'])

for key, data in utils.conditions.items():
    print(key, end='\r')
    iso = data['iso']
    if iso == 'en':
        # English lexica are skipped; their rows stay empty.
        continue

    gold = data['get']()
    # This analysis was done on the respective train sets because using TargetMT
    # rather than TargetPred is only an option for entries known at training time.
    mt = utils.get_TargetMT(iso, split='train')
    pred = utils.get_TargetPred(iso, split='train')

    rt = evaluate_translation(gold=gold, mt=mt, pred=pred)

    for measure in measures:
        df.loc[(key, measure), 'overlap'] = rt['overlap']
        # Correlations are absent from ``rt`` when the measure was not computed.
        scores = rt.get(measure, {})
        for emo in constants.emotions:
            if emo in scores:
                df.loc[(key, measure), emo] = scores[emo]

df.to_csv(constants.TRANSLATION_VS_PREDICTION_RESULTS)

en1en2en3es1

  exec(code_obj, self.user_global_ns, self.user_ns)


hr2

Loading and formatting results:

In [4]:
# Reload the stored results; the first two CSV columns form the row MultiIndex.
results_path = constants.TRANSLATION_VS_PREDICTION_RESULTS
df = pd.read_csv(results_path, index_col=[0, 1])
df

Unnamed: 0_level_0,Unnamed: 1_level_0,valence,arousal,dominance,joy,anger,sadness,fear,disgust,overlap
lexicon,measure,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
en1,GoldvMT,,,,,,,,,
en1,GoldvPred,,,,,,,,,
en1,MTvPred,,,,,,,,,
en2,GoldvMT,,,,,,,,,
en2,GoldvPred,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...
tr2,GoldvPred,,,,0.772422,0.713524,0.739871,0.704026,0.659582,502.0
tr2,MTvPred,,,,0.923422,0.933139,0.888376,0.892582,0.866980,502.0
hr,GoldvMT,0.814773,0.535266,,,,,,,1374.0
hr,GoldvPred,0.864108,0.687535,,,,,,,1374.0


In [5]:
# Average each correlation over all lexica (NaN entries are skipped by ``mean``) and keep
# only the emotion columns, in the order given by ``constants.emotions``.
mt_mean = df.xs('GoldvMT', level='measure').mean().reindex(constants.emotions)
pred_mean = df.xs('GoldvPred', level='measure').mean().reindex(constants.emotions)

# Rows: mean Gold-vs-MT, mean Gold-vs-Pred, and their difference (Pred minus MT).
table = pd.DataFrame([mt_mean, pred_mean, pred_mean - mt_mean], index=['MT', 'Pred', 'Diff'])
table

Unnamed: 0,valence,arousal,dominance,joy,anger,sadness,fear,disgust
MT,0.795791,0.514922,0.613122,0.69894,0.67698,0.636101,0.654108,0.578932
Pred,0.871302,0.652009,0.732621,0.767076,0.734397,0.692273,0.728254,0.649699
Diff,0.075511,0.137087,0.119499,0.068136,0.057416,0.056172,0.074146,0.070767


In [6]:
def _format_without_leading_zero(value):
    """Format ``value`` with three decimals and drop the zero before the decimal point.

    Negative values are handled too (``-0.057`` -> ``-.057``). A plain
    ``lstrip('0')`` leaves them untouched because the string starts with ``-``,
    which would mix two notations in the same table.
    """
    text = "{:.3f}".format(value)
    if text.startswith('-0.'):
        return '-' + text[2:]
    if text.startswith('0.'):
        return text[1:]
    return text


# Render the summary table as LaTeX with correlations written as ".796" instead of "0.796".
s = table.round(3).to_latex(float_format=_format_without_leading_zero)
print(s)

\begin{tabular}{lrrrrrrrr}
\toprule
{} &  valence &  arousal &  dominance &  joy &  anger &  sadness &  fear &  disgust \\
\midrule
MT   &     .796 &     .515 &       .613 & .699 &   .677 &     .636 &  .654 &     .579 \\
Pred &     .871 &     .652 &       .733 & .767 &   .734 &     .692 &  .728 &     .650 \\
Diff &     .076 &     .137 &       .119 & .068 &   .057 &     .056 &  .074 &     .071 \\
\bottomrule
\end{tabular}



---