In [None]:
from pathlib import Path

import pandas as pd
import scipy
import scipy.stats

from utils.classificaton_utils import evaluate_results

# Averaging mode passed to every evaluate_results(...) call in this notebook.
# Either macro or none (if none, we consider label 1).
# NOTE(review): confirm whether "none" means the string "none" or Python None.
avg = "macro"

# Table 2: Main evaluation

In [None]:
# Run evaluate_results once per results directory, in the order 1850, 1920, 2000.
results_1850, results_1920, results_2000 = (
    evaluate_results(Path(results_dir), avg=avg)
    for results_dir in ("results_1850", "results_1920", "results_2000")
)

In [None]:
# Each results dict becomes a frame with one row per dict key and the four
# record fields named precision / recall / fscore / preds.
df_1850, df_1920, df_2000 = (
    pd.DataFrame.from_dict(
        scores,
        orient="index",
        columns=["precision", "recall", "fscore", "preds"],
    )
    for scores in (results_1850, results_1920, results_2000)
)

In [None]:
# Table 2 body: precision / recall / F-score of the 1850, 1920 and 2000 result
# sets placed side by side (three columns per result set, in that order).
metric_cols = ['precision', 'recall', 'fscore']
df_fscores = pd.concat([frame[metric_cols] for frame in (df_1850, df_1920, df_2000)], axis=1)

# Baselines are listed explicitly so they appear first and in this fixed order.
cols_baselines = ["random", "def_tok_overlap_ranking", "sent_embedding", "w2v_lesk_ranking", "svm_wemb_baseline"]

# Remaining rows: every method whose name has no 'ts' tag, plus any 'contrast'
# method regardless of tag. Baseline names never contain 'ts', so they must be
# excluded here; otherwise `cols_baselines + cols_bert` lists each baseline
# twice and `.loc` emits duplicate rows in the table.
cols_bert = [
    c for c in df_fscores.index
    if c not in cols_baselines and ('ts' not in c or 'contrast' in c)
]

df_fscores = df_fscores.loc[cols_baselines + cols_bert]
print(df_fscores.to_latex())

In [None]:
# Display the row order used for the table: baselines first, then the rest.
[*cols_baselines, *cols_bert]

In [None]:
def _rows_tagged(tag):
    """Return the df_fscores row labels containing `tag`, in table order."""
    return [label for label in df_fscores.index if tag in label]


# Group the table rows by the substring that identifies each model family:
# '1850' -> rows_1850, 'blert' -> rows_1920, 'bert_base' -> rows_2000.
rows_1850 = _rows_tagged('1850')
rows_1920 = _rows_tagged('blert')
rows_2000 = _rows_tagged('bert_base')

In [None]:
# F-score difference between each historical model group and the bert_base
# group, measured on the 1850 and the 1920 result sets respectively.
#
# Rows are selected with an explicit label list rather than `index.isin(...)`:
# the values then follow the order of rows_1850 / rows_1920 / rows_2000, which
# is the same order later used to label the differences with rows_2000, and a
# method missing from a frame raises KeyError instead of being dropped silently.
#
# NOTE(review): the subtraction pairs rows by position, so it assumes the i-th
# historical row and the i-th bert_base row are the same method configuration.
diff_1850_2000 = df_1850.loc[rows_1850, 'fscore'].values - df_1850.loc[rows_2000, 'fscore'].values
diff_1920_2000 = df_1920.loc[rows_1920, 'fscore'].values - df_1920.loc[rows_2000, 'fscore'].values

In [None]:
# One row per bert_base method label; column 0 holds the 1850 differences and
# column 1 the 1920 differences.
df_diff = pd.DataFrame({0: diff_1850_2000, 1: diff_1920_2000}, index=rows_2000)
print(df_diff.to_latex())

# Table 3: Time-sensitive methods

In [None]:
# Run evaluate_results on the two time-sensitive results directories (1850, then 1920).
results_ts_1850, results_ts_1920 = (
    evaluate_results(Path(results_dir), avg=avg)
    for results_dir in ("results_ts_1850", "results_ts_1920")
)

In [None]:
# One frame per time-sensitive result set: rows are the dict keys, columns the
# four record fields.
df_ts_1850, df_ts_1920 = (
    pd.DataFrame.from_dict(
        scores,
        orient="index",
        columns=["precision", "recall", "fscore", "preds"],
    )
    for scores in (results_ts_1850, results_ts_1920)
)

# Only the F-score is reported here: 1850 in the first column, 1920 in the second.
# Note that this rebinds df_fscores, replacing the Table 2 frame of the same name.
df_fscores = pd.concat([frame["fscore"] for frame in (df_ts_1850, df_ts_1920)], axis=1)

In [None]:
# Emit the current df_fscores frame as LaTeX source for the paper.
latex_ts_table = df_fscores.to_latex()
print(latex_ts_table)

# Curated examples

In [None]:
# Results directories for the curated experiments: seed first, synonym second.
# NOTE(review): "results_cuevaluate_results" looks like a paste accident
# (compare "results_curated_1920_syn"); confirm the intended seed directory.
curated_dirs = ("results_cuevaluate_results", "results_curated_1920_syn")
results_curated_seed, results_curated_synonym = (
    evaluate_results(Path(results_dir), avg=avg) for results_dir in curated_dirs
)

In [None]:
# Tabulate both curated result dicts: rows are the dict keys, columns the four
# record fields.
df_curated_seed, df_curated_synonym = (
    pd.DataFrame.from_dict(
        scores,
        orient="index",
        columns=["precision", "recall", "fscore", "preds"],
    )
    for scores in (results_curated_seed, results_curated_synonym)
)

In [None]:
# F-scores of the two curated experiments side by side; the seed run is
# labelled 'vertical' and the synonym run 'horizontal'.
curated_fscores = [df_curated_seed["fscore"], df_curated_synonym["fscore"]]
df_fscore = pd.concat(curated_fscores, axis=1)
df_fscore.columns = ["vertical", "horizontal"]
print(df_fscore.to_latex())

# Assess Statistical Significance

In [None]:
# Paired t-test (scipy.stats.ttest_rel) between the predictions of one selected
# method and those of every other method in results_1850.
# Requires `scipy.stats` to be imported explicitly: a bare `import scipy` does
# not guarantee that the `stats` submodule is loaded.

# Threshold below which a difference is reported as significant.
SIGNIFICANCE_LEVEL = 0.05

res = evaluate_results(Path("results_1850/"), avg=avg)

selected = "bert_binary_centroid_vector_bert_base_-1,-2,-3,-4_mean"

# Each record is (precision, recall, fscore, preds); the first element of
# `preds` is what gets compared.
# NOTE(review): presumably the per-example prediction vector -- confirm in evaluate_results.
selected_pred = res[selected][3][0]
print(selected, res[selected][:3], "\n\nIs the difference significant?\n")

for method, values in res.items():
    if method == selected:
        continue
    pred = values[3][0]
    p_value = scipy.stats.ttest_rel(selected_pred, pred).pvalue
    # A NaN p-value compares False against the threshold, so it is reported
    # under "NO" with the NaN shown.
    if p_value < SIGNIFICANCE_LEVEL:
        print(method, values[:3], "YES")
    else:
        print(method, values[:3], "NO p_value:", round(p_value, 5))