In [1]:
# Load IPython's autoreload extension for this kernel session.
%load_ext autoreload

In [2]:
# Autoreload mode 2: edits to imported modules (e.g. the local `tasks` and
# `utils` packages) are picked up before each cell runs, without a restart.
%autoreload 2

In [3]:
import pickle
import pandas as pd
from tasks import wsd
from pathlib import Path
from utils import nlp_tools
from tqdm.auto import tqdm
from gensim.models import Word2Vec
from sentence_transformers import SentenceTransformer
from utils.classificaton_utils import binarize,generate_definition_df

# Register tqdm's pandas integration so that `progress_apply` (used by the
# baseline cells further down) is available on DataFrames.
tqdm.pandas()

# Experiment configuration: the lemma under study, the sense id(s) handed to
# `binarize`, and the relation types to take into account.
lemma_id = 'machine_nn01'
senses = {'machine_nn01-38475923'}
relations = ['seed', 'synonym', 'descendant', 'sibling']

# Evaluation mode, either "lemma" or "lemma_etal". It controls whether the
# predictive model only sees information on the lemma itself (e.g. only the
# lemma's sense definitions for the Lesk baselines).
eval_mode = "lemma_etal"

# Build the train / validation / test frames.
# `start` / `end` are presumably the year bounds of the date filter reported
# in the cell output -- TODO confirm against `binarize`.
df_train, df_val, df_test = binarize(
    lemma_id,
    senses,
    relations,
    strict_filter=True,
    start=1700,
    end=1910,
    eval_mode=eval_mode,
)

# senses before filtering by date = 8383
# senses after filtering by date = 5904


# of seed senses 23 
# of synonyms 312 
# of branch senses 4968


# of seeds selected 1 
# of synonyms selected 8 
# of branches selected 5


In [4]:
# Add an "nlp_full_text" column to every split by running
# `nlp_tools.preprocess` on each row's "full_text" value.
# The splits are handled in the order train -> validation -> test.
for split_df in (df_train, df_val, df_test):
    split_df["nlp_full_text"] = split_df.apply(
        lambda quotation: nlp_tools.preprocess(quotation["full_text"]),
        axis=1,
    )

In [5]:
# Frame of sense definitions derived from the training split.
# NOTE(review): with eval_mode="lemma_etal" the recorded output of this cell
# reports that the functionality is not offered yet and that the call falls
# back to 'lemma'.
df_selected_senses = generate_definition_df(df_train, lemma_id, eval_mode=eval_mode)

# Preprocessed version of every definition, stored next to the raw text.
df_selected_senses["nlp_definition"] = df_selected_senses.apply(
    lambda sense_row: nlp_tools.preprocess(sense_row["definition"]),
    axis=1,
)


We are not offering this functionality yet, defaulting to 'lemma' !!


# Lesk-based Unsupervised Approaches

In [6]:
# Random baseline: one call to `wsd.random_predict()` per test row (the row
# content itself is ignored).
# NOTE(review): no random seed is set anywhere in this notebook, so the
# scores of this baseline are expected to vary between runs.
df_test["random"] = df_test.progress_apply(
    lambda _quotation: wsd.random_predict(),
    axis=1,
)

# Last expression of the cell -> displayed as the cell output.
# Presumably per-label [precision, recall, F1] -- TODO confirm in `wsd.eval`.
wsd.eval("random", df_test)

100%|██████████| 84/84 [00:00<00:00, 37617.08it/s]


{'1': [0.154, 0.462, 0.231], '0': [0.844, 0.535, 0.655]}

In [7]:
# Token-overlap baseline: each preprocessed quotation is passed, together with
# the definitions frame, to `wsd.tok_overlap_ranking`.
df_test["def_tok_overlap_ranking"] = df_test.progress_apply(
    lambda quotation: wsd.tok_overlap_ranking(
        quotation["nlp_full_text"], df_selected_senses
    ),
    axis=1,
)

# Score the new prediction column; the result is shown as the cell output.
wsd.eval("def_tok_overlap_ranking", df_test)

100%|██████████| 84/84 [00:00<00:00, 458.34it/s]


{'1': [1.0, 0.077, 0.143], '0': [0.855, 1.0, 0.922]}

In [8]:
# Sentence-embedding baseline: each preprocessed quotation is passed, together
# with the definitions frame, to `wsd.sent_embedding`.
df_test["sent_embedding"] = df_test.progress_apply(
    lambda quotation: wsd.sent_embedding(
        quotation["nlp_full_text"], df_selected_senses
    ),
    axis=1,
)

# Score the new prediction column; the result is shown as the cell output.
wsd.eval("sent_embedding", df_test)

100%|██████████| 84/84 [00:00<00:00, 485.52it/s]


{'1': [0.0, 0.0, 0.0], '0': [0.845, 1.0, 0.916]}

In [9]:
# Warning: this cell needs a Word2Vec model that is stored locally; according
# to the original author it was trained on the whole 19th-century BL corpus.
wemb_model = Word2Vec.load("models/w2v/w2v_v004/w2v_words.model")

# Word-embedding Lesk variant: quotation text, definitions frame and the
# loaded embedding model are handed to `wsd.w2v_lesk_ranking` row by row.
df_test["w2v_lesk_ranking"] = df_test.progress_apply(
    lambda quotation: wsd.w2v_lesk_ranking(
        quotation["nlp_full_text"], df_selected_senses, wemb_model
    ),
    axis=1,
)

# Score the new prediction column; the result is shown as the cell output.
wsd.eval("w2v_lesk_ranking", df_test)

100%|██████████| 84/84 [00:04<00:00, 17.50it/s]


{'1': [0.0, 0.0, 0.0], '0': [0.841, 0.972, 0.902]}

In [10]:
# The sentence-BERT model must be downloaded beforehand and unpacked under
# models/bert/ (warning: this is a model trained on contemporary text):
# https://public.ukp.informatik.tu-darmstadt.de/reimers/sentence-transformers/v0.2/bert-base-nli-mean-tokens.zip
bert_sentsim_model = SentenceTransformer('models/bert/bert-base-nli-mean-tokens')

# Unlike the other Lesk cells, this one reads the raw text through the nested
# "text" -> "full_text" lookup rather than the "nlp_full_text" column.
# NOTE(review): presumably the same string as the "full_text" column -- confirm.
df_test["bert_lesk_ranking"] = df_test.progress_apply(
    lambda quotation: wsd.bert_lesk_ranking(
        quotation["text"]["full_text"], df_selected_senses, bert_sentsim_model
    ),
    axis=1,
)

# Score the new prediction column; the result is shown as the cell output.
wsd.eval("bert_lesk_ranking", df_test)

100%|██████████| 84/84 [02:37<00:00,  1.88s/it]


{'1': [0.25, 0.154, 0.19], '0': [0.855, 0.915, 0.884]}

# Supervised Approaches

In [11]:
# Warning: this cell needs a Word2Vec model that is stored locally; according
# to the original author it was trained on the whole 19th-century BL corpus.
# NOTE(review): the same model file is already loaded by the w2v Lesk cell
# earlier in this notebook, so this second load is redundant on a full run.
wemb_model = Word2Vec.load("models/w2v/w2v_v004/w2v_words.model")

# Supervised baseline: `wsd.svm_wemb_baseline` receives the training split,
# the test split and the embedding model; its return value becomes the
# prediction column.
df_test["svm_wemb_baseline"] = wsd.svm_wemb_baseline(
    df_train,
    df_test,
    wemb_model,
)

# Score the new prediction column; the result is shown as the cell output.
wsd.eval("svm_wemb_baseline", df_test)

{'1': [0.714, 0.385, 0.5], '0': [0.896, 0.972, 0.932]}

In [12]:
# Export the gold label and every baseline's predictions for the test split to
# results/<lemma_id>/<eval_mode>/<senses>~<relations>.csv
results_path = 'results/' + lemma_id + "/" + eval_mode + "/"

# `senses` is a set, and the iteration order of a set of strings is not stable
# across interpreter runs (string hashing is randomized). Sorting makes the
# file name deterministic when more than one sense id is selected; for a
# single sense the name is unchanged.
results_filename = "+".join(sorted(senses)) + "~" + "+".join(relations) + ".csv"

# Create the output directory (and any missing parents); a no-op if it exists.
Path(results_path).mkdir(parents=True, exist_ok=True)

# Columns to export. `DataFrame.filter` keeps only the listed columns that are
# actually present, so a baseline cell that was skipped simply leaves its
# column out of the CSV instead of raising.
result_columns = [
    'id_x',
    'label',
    'random',
    'def_tok_overlap_ranking',
    'sent_embedding',
    'w2v_lesk_ranking',
    'bert_lesk_ranking',
    'svm_wemb_baseline',
]
out_df = df_test.filter(result_columns, axis=1)

out_df.to_csv(Path(results_path) / results_filename, index=False)