# Gold Evaluation

In [1]:
import numpy as np
import scipy.stats as st
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

from memolon.src import utils, constants

from pathlib import Path
import json

import copy
from io import StringIO

Define gold evaluation for one particular dataset:

In [2]:
def gold_eval_one_dataset(targetGold, targetPred_test):
    """Evaluate predicted ratings against one gold dataset.

    Only the entries whose index labels occur in both frames are evaluated.

    Args:
        targetGold: DataFrame of gold ratings; the index holds the vocabulary
            and each column is one emotional variable.
        targetPred_test: DataFrame of predicted ratings for the test split.
            It must provide every column of ``targetGold``.

    Returns:
        dict with
        * ``'shared'``: number of entries shared by both frames,
        * ``'(%)'``: share of the gold entries covered, as a truncated integer
          percentage (0 if ``targetGold`` is empty),
        * one key per column of ``targetGold`` holding the Pearson correlation
          between gold and predicted values on the shared entries (NaN if
          fewer than two entries are shared).
    """
    rt = {}

    # Find shared vocab between TargetGold and TargetPred-test. This is what we test on.
    shared = utils.list_intersection(targetGold.index, targetPred_test.index)
    n_shared = len(shared)
    n_gold = len(targetGold)
    rt['shared'] = n_shared
    # Guard against an empty gold set, which would otherwise divide by zero.
    rt['(%)'] = int(n_shared / n_gold * 100) if n_gold else 0

    # Compute Pearson correlation for each emotional variable.
    for c in targetGold.columns:
        if n_shared < 2:
            # A correlation is undefined for fewer than two observations.
            rt[c] = np.nan
        else:
            rt[c] = st.pearsonr(targetGold.loc[shared, c], targetPred_test.loc[shared, c])[0]
    return rt


Run evaluation for all gold datasets and save to disk:

In [3]:
# One result row per evaluation condition; metrics that a dataset does not
# provide show up as missing values in the saved table.
result_columns = ['valence', 'arousal', 'dominance', 'joy', 'anger', 'sadness',
                  'fear', 'disgust', 'shared', '(%)']
df = pd.DataFrame(columns=result_columns, index=utils.conditions.keys())

for name, condition in utils.conditions.items():
    print(name, end='\r')  # progress indicator; '\r' keeps it on a single line
    iso = condition['iso']
    target_gold = condition['get']()
    target_pred_test = utils.get_TargetPred(iso=iso, split='test')
    df.loc[name] = gold_eval_one_dataset(targetGold=target_gold,
                                         targetPred_test=target_pred_test)

# Persist the results so the formatting cells can reload them from disk.
df.to_csv(constants.GOLD_EVALUATION_RESULTS)

en1

  exec(code_obj, self.user_global_ns, self.user_ns)


hr2

Inspect and format into VA(D) and BE5 datasets:

In [4]:
# Reload the saved evaluation results; the first CSV column holds the dataset keys.
results_path = constants.GOLD_EVALUATION_RESULTS
df = pd.read_csv(results_path, index_col=0)
df.head()

Unnamed: 0,valence,arousal,dominance,joy,anger,sadness,fear,disgust,shared,(%)
en1,0.940544,0.760695,0.878971,,,,,,1032.0,100.0
en2,0.921625,0.708846,0.782097,,,,,,1034.0,100.0
en3,,,,0.890146,0.834981,0.79838,0.821245,0.776637,1033.0,99.0
es1,0.905345,0.714292,0.823378,,,,,,612.0,59.0
es2,0.789475,0.637861,,,,,,,7685.0,54.0


In [5]:
# Columns of `df` that feed each output table, in output-column order.
vad_source = ['shared', '(%)', 'valence', 'arousal', 'dominance']
be_source = ['shared', '(%)', 'joy', 'anger', 'sadness', 'fear', 'disgust']

df_vad = pd.DataFrame(columns=['Shared', '(%)', 'Val', 'Aro', 'Dom'])
df_be = pd.DataFrame(columns=['Shared', '(%)', 'Joy', 'Ang', 'Sad', 'Fea', 'Dis'])

for name, condition in utils.conditions.items():
    emo = condition['emo']
    # Route each dataset to the table that matches its emotion format.
    if emo == 'vad':
        target, source_cols = df_vad, vad_source
    elif emo == 'be':
        target, source_cols = df_be, be_source
    else:
        raise ValueError('{} neither vad nor be'.format(emo))
    target.loc[name] = [df.loc[name, col] for col in source_cols]

In [6]:
# Append the column-wise average over all rows as a final 'Mean' row.
df_vad.loc['Mean'] = df_vad.mean(axis=0)

# Round the scores and show the count and percentage columns as integers.
vad_rounded = df_vad.round(2)
vad_table = vad_rounded.astype({'Shared': int, '(%)': int})
s = vad_table.to_latex(float_format=utils.formatter)

print(s)

\begin{tabular}{lrrrrr}
\toprule
{} &  Shared &  (\%) &  Val &  Aro &  Dom \\
\midrule
en1  &    1032 &  100 &  .94 &  .76 &  .88 \\
en2  &    1034 &  100 &  .92 &  .71 &  .78 \\
es1  &     612 &   59 &  .91 &  .71 &  .82 \\
es2  &    7685 &   54 &  .79 &  .64 &  --- \\
es3  &     363 &   41 &  .91 &  .73 &  --- \\
de1  &     677 &   67 &  .89 &  .78 &  .68 \\
de2  &    2329 &   80 &  .75 &  .64 &  --- \\
de3  &     916 &   91 &  .80 &  .67 &  --- \\
pl1  &    2271 &   46 &  .83 &  .74 &  .60 \\
pl2  &    1381 &   47 &  .82 &  .61 &  --- \\
zh1  &    1685 &   60 &  .84 &  .56 &  --- \\
zh2  &     701 &   63 &  .84 &  .44 &  --- \\
it   &     660 &   58 &  .89 &  .63 &  .76 \\
pt   &     645 &   62 &  .89 &  .71 &  .75 \\
nl   &    2064 &   48 &  .85 &  .58 &  --- \\
id   &     696 &   46 &  .84 &  .64 &  .63 \\
el   &     633 &   61 &  .86 &  .50 &  .74 \\
tr1  &     721 &   35 &  .75 &  .57 &  --- \\
hr   &    1331 &   44 &  .81 &  .66 &  --- \\
Mean &    1444 &   61 &  .85 &  .65 &  

Mean over the datasets for which comparable monolingual results are available (Buechel, NAACL 2018):

In [7]:
# Datasets averaged for the comparison with the monolingual results.
comparable_datasets = ['en1', 'en2', 'es1', 'es2', 'de1', 'pl1', 'zh1', 'it', 'pt', 'nl', 'id']
df_vad.loc[comparable_datasets].mean(axis=0)

Shared    1732.818182
(%)         63.636364
Val          0.871590
Aro          0.677322
Dom          0.738592
dtype: float64

Comparison against monolingual results from Buechel (NAACL 2018). Experimental results are available [here](https://github.com/JULIELab/wordEmotions/tree/master/naacl/prediction/experiments/main/results). (Data for the proposed model are found in `my_model_relu.tsv`. For the English datasets, the `_common_crawl` versions showed the best performance.)

In [8]:
# Monolingual reference scores per dataset, ordered like `constants.vad`;
# None marks a variable for which no score is listed.
buechel_scores = {
    "en1": [0.8695357984492947, 0.6743475845073215, 0.7585379325056423],
    "en2": [0.9182468721131594, 0.7298232760549845, 0.8247833442109835],
    "es1": [0.8795993715288171, 0.6981487340135996, 0.8346854175355294],
    "es2": [0.8176857473317852, 0.7388663857337776, None],
    "de1": [0.8675498005735844, 0.7996219516790998, 0.7365101991148422],
    "pl1": [0.7417973149362131, 0.6987762216663478, 0.694064739363759],
    "zh1": [0.8547400710739769, 0.6338107344827453, None],
    "it": [0.8605569379688705, 0.6452377487011844, 0.7458722210180169],
    "pt": [0.8612869524191618, 0.7144457538585538, 0.7290024427024978],
    "nl": [0.792326386848945, 0.7384957169011812, None],
    "id": [0.7993408905574892, 0.603200774683224, 0.5778482333058352],
}

buechel = pd.DataFrame(columns=constants.vad)
for dataset, scores in buechel_scores.items():
    buechel.loc[dataset] = scores

# Column-wise average over all datasets listed above.
buechel.loc["MEAN"] = buechel.mean(axis=0)
buechel

Unnamed: 0,valence,arousal,dominance
en1,0.869536,0.674348,0.758538
en2,0.918247,0.729823,0.824783
es1,0.879599,0.698149,0.834685
es2,0.817686,0.738866,
de1,0.86755,0.799622,0.73651
pl1,0.741797,0.698776,0.694065
zh1,0.85474,0.633811,
it,0.860557,0.645238,0.745872
pt,0.861287,0.714446,0.729002
nl,0.792326,0.738496,


Formatting for BE5:

In [10]:
# Append the column-wise average over all rows as a final 'Mean' row.
df_be.loc['Mean'] = df_be.mean(axis=0)

# Round the scores and show the count and percentage columns as integers.
be_rounded = df_be.round(2)
be_table = be_rounded.astype({'Shared': int, '(%)': int})
s = be_table.to_latex(float_format=utils.formatter)

print(s)

\begin{tabular}{lrrrrrrr}
\toprule
{} &  Shared &  (\%) &  Joy &  Ang &  Sad &  Fea &  Dis \\
\midrule
en3  &    1033 &   99 &  .89 &  .83 &  .80 &  .82 &  .78 \\
es4  &     363 &   41 &  .86 &  .84 &  .84 &  .84 &  .76 \\
es5  &    6096 &   58 &  .64 &  .72 &  .72 &  .72 &  .63 \\
es6  &     992 &   43 &  .80 &  .74 &  .71 &  .72 &  .68 \\
de4  &     848 &   43 &  .80 &  .66 &  .52 &  .68 &  .42 \\
pl3  &    1381 &   47 &  .78 &  .71 &  .66 &  .69 &  .71 \\
tr2  &     721 &   35 &  .77 &  .69 &  .71 &  .70 &  .65 \\
Mean &    1633 &   52 &  .79 &  .74 &  .71 &  .74 &  .66 \\
\bottomrule
\end{tabular}



---