In [1]:
import scipy
import pandas as pd
from pathlib import Path
from utils.classificaton_utils import evaluate_results

avg = "none" # either macro or none (if none, we consider label 1)

  from pandas import Panel


# Table 2: Main evaluation

In [3]:
results_1850 = evaluate_results(Path(f'results_1850'),avg=avg)
results_1920 = evaluate_results(Path(f'results_1920'),avg=avg)
results_2000 = evaluate_results(Path(f'results_2000'),avg=avg)

In [4]:
df_1850 = pd.DataFrame.from_dict(results_1850, orient='index', columns=['precision','recall','fscore','preds'])
df_1920 = pd.DataFrame.from_dict(results_1920, orient='index', columns=['precision','recall','fscore','preds'])
df_2000 = pd.DataFrame.from_dict(results_2000, orient='index', columns=['precision','recall','fscore','preds'])

In [5]:
renamed_columns = {"def_tok_overlap_ranking": "Lesk: token overlap",
                   "sent_embedding": "Lesk: sentence embedding",
                   "w2v_lesk_ranking": "Lesk: w2v",
                   "svm_wemb_baseline": "SVM classifier",
                   "bert_binary_centroid_vector_bert_base_-1,-2,-3,-4_mean": "BERT_base binary centroid",
                   "bert_centroid_sense_vector_bert_base_-1,-2,-3,-4_mean": "BERT_base sense centroid",
                   "bert_ml_perceptron_vector_bert_base_-1,-2,-3,-4_mean": "BERT_base perceptron",
                   "bert_binary_centroid_vector_blert_base_-1,-2,-3,-4_mean": "BERT_1900 binary centroid",
                   "bert_centroid_sense_vector_blert_base_-1,-2,-3,-4_mean": "BERT_1900 sense centroid",
                   "bert_ml_perceptron_vector_blert_base_-1,-2,-3,-4_mean": "BERT_1900 perceptron",
                   "bert_binary_centroid_vector_bert_1850_-1,-2,-3,-4_mean": "BERT_1850 binary centroid",
                   "bert_centroid_sense_vector_bert_1850_-1,-2,-3,-4_mean": "BERT_1850 sense centroid",
                   "bert_ml_perceptron_vector_bert_1850_-1,-2,-3,-4_mean": "BERT_1850 perceptron",
                   "bert_ts_nearest_centroid_sense_vector_bert_base_-1,-2,-3,-4_mean": "BERT_base nearest sense centroid",
                   "bert_ts_weighted_centroid_sense_vector_bert_base_-1,-2,-3,-4_mean": "BERT_base weighted sense centroid",
                   "bert_ts_nearest_centroid_sense_vector_blert_-1,-2,-3,-4_mean": "BERT_1900 nearest sense centroid",
                   "bert_ts_weighted_centroid_sense_vector_blert_base_-1,-2,-3,-4_mean": "BERT_1900 weighted sense centroid",
                   "bert_ts_nearest_centroid_sense_vector_bert_1850_-1,-2,-3,-4_mean": "BERT_1850 nearest sense centroid",
                   "bert_ts_weighted_centroid_sense_vector_bert_1850_-1,-2,-3,-4_mean": "BERT_1850 weighted sense centroid"}

df_1850 = df_1850.rename(index=renamed_columns)
df_1920 = df_1920.rename(index=renamed_columns)
df_2000 = df_2000.rename(index=renamed_columns)

In [6]:
df_fscores = pd.concat([df_1850[['precision', 'recall', 'fscore']], df_1920[['precision', 'recall', 'fscore']], df_2000[['precision', 'recall', 'fscore']]],axis=1) # ,df_2000['fscore']
cols_baselines = [c for c in df_fscores.index if not c.startswith("bert_")]
cols_bert = [c for c in df_fscores.index if c.startswith("bert_") and not 'ts' in c and not 'contrast' in c]
df_fscores = df_fscores.loc[cols_baselines + cols_bert]

In [7]:
print(df_fscores.to_latex())

\begin{tabular}{lrrrrrrrrr}
\toprule
{} &  precision &  recall &  fscore &  precision &  recall &  fscore &  precision &  recall &  fscore \\
\midrule
random                    &      0.103 &   0.516 &   0.171 &      0.090 &   0.508 &   0.153 &      0.085 &   0.503 &   0.145 \\
Lesk: token overlap       &      0.271 &   0.284 &   0.278 &      0.232 &   0.280 &   0.254 &      0.250 &   0.285 &   0.266 \\
Lesk: sentence embedding  &      0.235 &   0.165 &   0.194 &      0.256 &   0.196 &   0.222 &      0.251 &   0.195 &   0.219 \\
Lesk: w2v                 &      0.318 &   0.306 &   0.312 &      0.270 &   0.278 &   0.274 &      0.275 &   0.261 &   0.268 \\
SVM classifier            &      0.489 &   0.089 &   0.150 &      0.492 &   0.093 &   0.156 &      0.525 &   0.086 &   0.148 \\
BERT\_base binary centroid &      0.253 &   0.702 &   0.372 &      0.236 &   0.700 &   0.353 &      0.237 &   0.721 &   0.356 \\
BERT\_base sense centroid  &      0.732 &   0.480 &   0.580 &      0.658 &   0.4

In [8]:
rows_1850 = [r for r in df_fscores.index if '1850' in r]
rows_1920 = [r for r in df_fscores.index if '1900' in r]
rows_2000 = [r for r in df_fscores.index if 'base' in r]

In [9]:
diff_1850_2000 = df_1850.loc[df_1850.index.isin(rows_1850)].fscore.values - df_1850.loc[df_1850.index.isin(rows_2000)].fscore.values
diff_1920_2000 = df_1920.loc[df_1920.index.isin(rows_1920)].fscore.values - df_1920.loc[df_1920.index.isin(rows_2000)].fscore.values

In [10]:
df_diff = pd.DataFrame([diff_1850_2000,diff_1920_2000],columns=rows_2000).T
#df_diff['sum'] = df_diff.sum(axis=1)
print(df_diff.to_latex())

\begin{tabular}{lrr}
\toprule
{} &      0 &      1 \\
\midrule
BERT\_base binary centroid & -0.010 & -0.011 \\
BERT\_base sense centroid  &  0.011 &  0.025 \\
BERT\_base perceptron      & -0.009 &  0.001 \\
\bottomrule
\end{tabular}



# Table 3: Time-sensitive methods

In [11]:
results_ts_1850 = evaluate_results(Path("results_ts_1850"),avg=avg)
results_ts_1920 = evaluate_results(Path("results_ts_1920"),avg=avg)

In [12]:
df_ts_1850 = pd.DataFrame.from_dict(results_ts_1850, orient='index', columns=['precision','recall','fscore','preds'])
df_ts_1920 = pd.DataFrame.from_dict(results_ts_1920, orient='index', columns=['precision','recall','fscore','preds'])
df_fscores = pd.concat([df_ts_1850['fscore'], df_ts_1920['fscore']],axis=1)
df_fscores = df_fscores.rename(index=renamed_columns)

In [13]:
df_fscores.index

Index(['BERT_base sense centroid', 'BERT_base nearest sense centroid',
       'BERT_base weighted sense centroid',
       'bert_centroid_sense_vector_blert_base_-1,-2,-3,-4_mean',
       'bert_ts_nearest_centroid_sense_vector_blert_base_-1,-2,-3,-4_mean',
       'bert_ts_weighted_centroid_sense_vector_blert_base_-1,-2,-3,-4_mean',
       'BERT_1850 sense centroid', 'BERT_1850 nearest sense centroid',
       'BERT_1850 weighted sense centroid'],
      dtype='object')

In [14]:
print(df_fscores.to_latex())

\begin{tabular}{lrr}
\toprule
{} &  fscore &  fscore \\
\midrule
BERT\_base sense centroid                           &   0.575 &   0.552 \\
BERT\_base nearest sense centroid                   &   0.458 &   0.433 \\
BERT\_base weighted sense centroid                  &   0.593 &   0.556 \\
bert\_centroid\_sense\_vector\_blert\_base\_-1,-2,-3,... &   0.604 &   0.592 \\
bert\_ts\_nearest\_centroid\_sense\_vector\_blert\_bas... &   0.505 &   0.464 \\
bert\_ts\_weighted\_centroid\_sense\_vector\_blert\_ba... &   0.627 &   0.584 \\
BERT\_1850 sense centroid                           &   0.602 &   0.579 \\
BERT\_1850 nearest sense centroid                   &   0.489 &   0.441 \\
BERT\_1850 weighted sense centroid                  &   0.609 &   0.562 \\
\bottomrule
\end{tabular}



# Curated examples

In [25]:
results_curated_seed = evaluate_results(Path(f'results_curated_1920_seed'),avg=avg)
results_curated_synonym = evaluate_results(Path(f'results_curated_1920_syn'),avg=avg)

In [26]:
df_curated_seed = pd.DataFrame.from_dict(results_curated_seed, orient='index', columns=['precision','recall','fscore','preds'])
df_curated_synonym = pd.DataFrame.from_dict(results_curated_synonym, orient='index', columns=['precision','recall','fscore','preds'])

In [27]:
df_fscore = pd.concat([df_curated_seed['fscore'],df_curated_synonym['fscore']], axis=1)
df_fscore = df_fscore.rename(index=renamed_columns)
df_fscore.columns=['vertical','horizontal']
print(df_fscore.to_latex())

\begin{tabular}{lrr}
\toprule
{} &  vertical &  horizontal \\
\midrule
BERT\_base sense centroid                           &     0.691 &       0.536 \\
BERT\_base weighted sense centroid                  &     0.582 &       0.521 \\
BERT\_base perceptron                               &     0.710 &       0.493 \\
bert\_centroid\_sense\_vector\_blert\_base\_-1,-2,-3,... &     0.700 &       0.554 \\
bert\_ts\_weighted\_centroid\_sense\_vector\_blert\_ba... &     0.613 &       0.566 \\
bert\_ml\_perceptron\_vector\_blert\_base\_-1,-2,-3,-... &     0.612 &       0.526 \\
BERT\_1850 sense centroid                           &     0.658 &       0.563 \\
BERT\_1850 weighted sense centroid                  &     0.564 &       0.540 \\
BERT\_1850 perceptron                               &     0.621 &       0.482 \\
\bottomrule
\end{tabular}



# Assess Statistical Significance

In [None]:
res = evaluate_results(Path("results_1920/"),avg=avg)

selected = "bert_centroid_sense_vector_bert_base_-1,-2,-3,-4_mean"

selected_pred = res[selected][3]
print (selected, res[selected][:3], "\n\nIs the difference significant?\n")

for method,values in res.items():
    if method != selected:
        pred = values[3]
        p_value = scipy.stats.ttest_rel(selected_pred[0],pred[0])[1]
        if p_value<0.05:
            print (method, values[:3], "YES")
        else:
            print (method, values[:3], "NO p_value:",round(p_value,5))