# Exploration of results

In [1]:
from pathlib import Path
import pandas as pd
from collections import defaultdict,Counter
import statistics
import re

In [2]:
import matplotlib.pyplot as plt
import numpy as np
%matplotlib inline
import seaborn as sns
sns.set()

In [3]:
def select_lemmas(setting):
    """Return the list of lemma identifiers belonging to a named group.

    Args:
        setting: Name of the lemma group, e.g. ``"all_lemmas"``,
            ``"industrial"`` or ``"man_woman"``.

    Returns:
        A new list of ``"<lemma>_NN"`` strings for the requested group, in
        the fixed order given in the table below. An unrecognised group
        name yields an empty list.
    """
    # One entry per group; the table is rebuilt on every call so that the
    # caller always receives a fresh list it may freely mutate.
    groups = {
        "all_lemmas": ["anger_NN", "apple_NN", "art_NN", "democracy_NN",
                       "happiness_NN", "labour_NN", "machine_NN", "man_NN",
                       "nation_NN", "power_NN", "slave_NN", "woman_NN"],
        "non_industrial": ["anger_NN", "apple_NN", "democracy_NN",
                           "happiness_NN", "man_NN",
                           "nation_NN", "slave_NN", "woman_NN"],
        "industrial": ["art_NN", "labour_NN", "machine_NN", "power_NN"],
        "non_tech": ["anger_NN", "apple_NN", "art_NN", "democracy_NN",
                     "happiness_NN", "labour_NN", "man_NN",
                     "nation_NN", "slave_NN", "woman_NN"],
        "tech": ["machine_NN", "power_NN"],
        "human": ["slave_NN", "woman_NN", "man_NN"],
        "emotion": ["happiness_NN", "anger_NN"],
        "abstract": ["happiness_NN", "anger_NN", "art_NN", "democracy_NN",
                     "labour_NN", "nation_NN"],
        "abstract_wo_emotions": ["art_NN", "democracy_NN",
                                 "labour_NN", "nation_NN"],
        "concrete": ["apple_NN", "machine_NN", "man_NN", "slave_NN", "woman_NN"],
        "concrete_wo_machine": ["apple_NN", "man_NN", "slave_NN", "woman_NN"],
        "man_apple_woman": ["apple_NN", "man_NN", "woman_NN"],
        "apple": ["apple_NN"],
        "machine": ["machine_NN"],
        "slave": ["slave_NN"],
        "man_woman": ["man_NN", "woman_NN"],
        "all_wo_machine": ["anger_NN", "apple_NN", "art_NN", "democracy_NN",
                           "happiness_NN", "labour_NN", "man_NN",
                           "nation_NN", "power_NN", "slave_NN", "woman_NN"],
        "work_related": ["slave_NN", "machine_NN", "labour_NN", "power_NN"],
        "non_work_related": ["anger_NN", "apple_NN", "art_NN", "democracy_NN",
                             "happiness_NN", "man_NN",
                             "nation_NN", "woman_NN"],
    }
    return groups.get(setting, [])

###  Optimal time range for a Language Model

In [4]:
def find_optimal_daterange(results_path, lemmas, timestart, year_window, metric):
    """Score each language model on the quotations falling inside a time window.

    Every CSV found (recursively) under ``results_path`` whose first directory
    below ``results_path`` is one of ``lemmas`` is loaded, restricted to rows
    whose ``year`` lies in ``[timestart, timestart + 2 * year_window]`` (both
    ends inclusive), and pooled with the rows of all other selected files.
    Binary precision and recall for the positive class (label ``1``) are then
    computed once per model column over the pooled rows.

    Args:
        results_path: Directory holding the result CSVs; assumed to be laid
            out as ``results_path/<lemma>/.../<file>.csv``.
        lemmas: Collection of lemma directory names to include.
        timestart: First year of the window.
        year_window: Half-width of the window in years.
        metric: ``"precision"``, ``"recall"`` or ``"fscore"``.

    Returns:
        Dict mapping ``"bert_base_sense_centroid"``,
        ``"bert_1850_sense_centroid"`` and ``"bert_1900_sense_centroid"`` to
        the requested score rounded to three decimals. The dict is empty when
        no CSV contributed any columns or ``metric`` is not one of the three
        names above.
    """
    # Long prediction-column names in the CSVs -> short model names.
    column_aliases = {
        "bert_centroid_sense_vector_bert_base_-1,-2,-3,-4_mean": "bert_base_sense_centroid",
        "bert_centroid_sense_vector_bert_1850_-1,-2,-3,-4_mean": "bert_1850_sense_centroid",
        "bert_centroid_sense_vector_blert_base_-1,-2,-3,-4_mean": "bert_1900_sense_centroid",
    }
    wanted_columns = ["label", "year", "quotation_id", *column_aliases]
    window_end = timestart + (year_window * 2)

    root = Path(results_path)
    clf_dict = defaultdict(list)
    for csv_path in root.glob("**/*.csv"):
        # The lemma is the first path component below the results directory.
        # Working on path parts (not on "/"-splitting the string) keeps this
        # correct for absolute or nested results directories.
        current_lemma = csv_path.relative_to(root).parts[0]
        if current_lemma not in lemmas:
            # Skip before touching the file: unselected lemmas cost no I/O.
            continue

        try:
            df = pd.read_csv(csv_path)
            df = df[wanted_columns].rename(columns=column_aliases)
            df = df[df["year"].between(timestart, window_end)]
        except Exception:
            # Best effort: a file that cannot be read or that lacks one of
            # the expected columns is left out of the pooled results.
            continue

        for col in df.columns:
            clf_dict[col].extend(df[col])

    results = {}
    if not clf_dict:
        return results

    # Imported only once there is something to score.
    from sklearn.metrics import precision_recall_fscore_support

    labels = clf_dict["label"]
    for colname, classifications in clf_dict.items():
        if not colname.startswith("bert_"):
            continue
        # A score of exactly 0 is a legitimate outcome and must be kept;
        # discarding it would leave fewer than two values to unpack.
        p, r = (
            round(score, 3)
            for score in precision_recall_fscore_support(
                labels, classifications, average='binary', pos_label=1
            )[:2]
        )
        # F1 is derived from the rounded precision/recall; it is defined as 0
        # when both are 0 so the harmonic mean never divides by zero.
        f1 = round((2 * (p * r)) / (p + r), 3) if (p + r) else 0.0
        scores = {"precision": p, "recall": r, "fscore": f1}
        if metric in scores:
            results[colname] = scores[metric]

    return results

In [6]:
# --- Configuration ---------------------------------------------------------
setting = "all_lemmas"  # lemma group name passed to select_lemmas
metric = "fscore"  # "precision", "recall" or "fscore"

lemmas = select_lemmas(setting)

path = Path('figures')
path.mkdir(exist_ok=True)

time_experiment = 2000 # Folder from which we select results
test_daterange_start = 1760 # Quotations starting from
# Half-width of the test window: find_optimal_daterange keeps quotations dated
# between date_start and date_start + 2 * year_window.
year_window = 50

# --- Sweep the window start in 10-year steps, one score per model ----------
bert_base = []
bert_1850 = []
bert_1900 = []
time_mean = []
results_dir = Path('results_' + str(time_experiment))
for date_start in range(test_daterange_start, time_experiment - year_window, 10):
    results = find_optimal_daterange(results_dir, lemmas, date_start, year_window, metric)
    bert_base.append(results["bert_base_sense_centroid"])
    bert_1850.append(results["bert_1850_sense_centroid"])
    bert_1900.append(results["bert_1900_sense_centroid"])
    time_mean.append(date_start + year_window)  # centre of the window

# --- Plot: one line per language model -------------------------------------
fig, ax = plt.subplots()
ax.plot(time_mean, bert_base, label="BERTbase")
ax.plot(time_mean, bert_1850, label="BERT1850")
ax.plot(time_mean, bert_1900, label="BERT1900")
ax.set_xlabel("Centre of test window (year)")
ax.set_ylabel(f"{metric} (class 1)")
ax.legend(loc='lower right')

# The window size is part of the file name so that runs with a different
# year_window do not overwrite each other.
fig.savefig(path / f"plot_LM_{setting}_class1{metric}_{year_window}.png", dpi=300, bbox_inches="tight")

TypeError: unsupported operand type(s) for +: 'PosixPath' and 'str'

# Data exploration

**[WARNING]** To run the following cell you will need to have generated the `lemma` and `quotation` dataframes for each headword.

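The following cell generates Table 1, which describes the data. For each headword it reports the number of seed senses, the number of derived senses per seed sense (synonym/other), and the number of quotations per seed sense (positive/negative):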

In [None]:
import pandas as pd
from utils.classificaton_utils import binarize

# Headwords to describe; every entry is a [lemma, part-of-speech tag] pair.
words = [[headword, "NN"] for headword in (
    "anger", "apple", "art", "democracy",
    "happiness", "labour", "machine", "man",
    "nation", "power", "slave", "woman",
)]

# Time span and filtering flags forwarded to binarize for every headword.
experiment = dict(start=1760, end=1850, filter_val=True, filter_test=True)

# These two settings are the same for every headword.
relations = ['seed', 'synonym']
eval_mode = "lemma_etal"


def _count_per_seed(values, n_seeds):
    """Distinct entries in `values` divided by `n_seeds`, passed through round()."""
    return round(len(values.unique()) / n_seeds)


dr = {}
for lemma, pos in words:
    print("### lemma: {} ###".format(lemma))
    quotations_path = f"./data/sfrel_quotations_{lemma}_{pos}.pickle"  # not read below in this cell
    lemma_senses = pd.read_pickle(f'./data/lemma_senses_{lemma}_{pos}.pickle')

    # Ids of the senses whose word_id starts with "<lemma>_<pos in lower case>".
    belongs_to_headword = lemma_senses.word_id.str.startswith(f'{lemma}_{pos.lower()}')
    senses = set(lemma_senses[belongs_to_headword]["id"])

    df_train, df_val, df_test = binarize(
        lemma=lemma,
        pos=pos,
        senses=senses,
        start=experiment["start"],
        end=experiment["end"],
        relations=relations,
        eval_mode=eval_mode,
        filter_val_by_year=experiment["filter_val"],
        filter_test_by_year=experiment["filter_test"],
        strict_filter=True,
    )

    # All three splits are described together.
    df_all = pd.concat([df_train, df_val, df_test])

    provenance = df_all["provenance_type"]
    is_seed = provenance == "seed"
    is_synonym = provenance == "synonym"
    is_other = ~provenance.isin(["synonym", "seed"])

    unique_seed_senses = len(df_all[is_seed]["sense_id"].unique())
    # Despite their names, the next two values are counts *per seed sense*.
    unique_syn_senses = _count_per_seed(df_all[is_synonym]["sense_id"], unique_seed_senses)
    unique_other_senses = _count_per_seed(df_all[is_other]["sense_id"], unique_seed_senses)

    # NOTE(review): labels are matched against the strings "1" and "0" --
    # assumes binarize returns string labels; confirm.
    df_all_posq = df_all[df_all["label"] == "1"]
    df_all_negq = df_all[df_all["label"] == "0"]
    quotations_p = _count_per_seed(df_all_posq.quotation_id, unique_seed_senses)
    quotations_n = _count_per_seed(df_all_negq.quotation_id, unique_seed_senses)

    quotations = f"{quotations_p}/{quotations_n}"
    derived_senses = f"{unique_syn_senses}/{unique_other_senses}"

    dr[lemma] = [unique_seed_senses, derived_senses, quotations]

# One row per headword, in the order the headwords were processed.
data_description = pd.DataFrame.from_dict(dr, orient='index', columns=['Seeds', 'ExpSenses', 'Quotations'])

In [None]:
# Print the data-description table as LaTeX source.
print(data_description.to_latex())