In [1]:
# Load IPython's autoreload extension (it is configured in the next cell).
%load_ext autoreload

In [2]:
# Mode 2: re-import all modules before running each cell, so edits to the
# project's own modules are picked up without restarting the kernel.
%autoreload 2

In [3]:
import pickle
import pandas as pd
from tasks import wsd
from utils import nlp_tools
from utils.classificaton_utils import binarize,generate_definition_df
from tqdm.auto import tqdm
from gensim.models import Word2Vec
from sentence_transformers import SentenceTransformer

# Register tqdm's `progress_apply` on pandas objects; the prediction cells rely on it.
tqdm.pandas()

# Experiment configuration handed to `binarize`: the lemma under study,
# the selected sense ids and the relation types to consider.
lemma_id = 'machine_nn01'
senses = {'machine_nn01-38475835', 'machine_nn01-38475923'}
relations = ['seed', 'synonym', 'descendant', 'sibling']

# Evaluation mode. "lemma" means the predictive model only uses information
# about the lemma itself (e.g. only the definitions of the lemma's senses for
# the Lesk baselines). The alternative value is "lemma_etal".
eval_mode = "lemma"

# Build the train / validation / test frames for this lemma and sense set,
# with the strict filter enabled and the 1700-1910 bounds passed as start/end.
df_train, df_val, df_test = binarize(
    lemma_id,
    senses,
    relations,
    strict_filter=True,
    start=1700,
    end=1910,
)

# senses before filtering by date = 8383
# senses after filtering by date = 5904


# of seed senses 23 
# of synonyms 312 
# of branch senses 4968


# of seeds selected 1 
# of synonyms selected 8 
# of branches selected 5


In [4]:
# Apply the project's text preprocessing to the `full_text` column of every
# split and keep the result in a new `nlp_full_text` column.
for split_df in (df_train, df_val, df_test):
    split_df["nlp_full_text"] = split_df.apply(
        lambda record: nlp_tools.preprocess(record["full_text"]),
        axis=1,
    )

In [5]:
# Build the frame of sense definitions from the training split.
# The evaluation mode comes from the `eval_mode` setting in the configuration
# cell rather than a hard-coded "lemma", so changing that setting actually
# changes which definitions are selected here.
df_selected_senses = generate_definition_df(df_train, lemma_id, eval_mode=eval_mode)

# Preprocess each definition with the same `nlp_tools.preprocess` routine
# that is applied to the example sentences, stored as `nlp_definition`.
df_selected_senses["nlp_definition"] = df_selected_senses.apply(
    lambda row: nlp_tools.preprocess(row["definition"]),
    axis=1,
)


# Lesk-based Unsupervised Approaches

In [6]:
approach = "random"

# One `wsd.random_predict()` call per test row; the row content is ignored.
# NOTE(review): no seed is set in this notebook, so these scores may differ
# between runs unless `wsd.random_predict` seeds itself - confirm.
df_test[approach] = df_test.progress_apply(
    lambda _row: wsd.random_predict(),
    axis=1,
)

# Display the evaluation result for the column that was just written.
wsd.eval(approach, df_test)

100%|██████████| 84/84 [00:00<00:00, 37713.72it/s]


{'1': [0.2, 0.692, 0.31], '0': [0.897, 0.493, 0.636]}

In [7]:
# Be careful: the preprocessed example sentence (`nlp_full_text`) is used as
# the input sentence, and `wsd.tok_overlap_ranking` compares it against the
# sense definitions (see that function). If the examples should instead serve
# as training data, they have to be split into train/test first.

approach = "def_tok_overlap_ranking"

df_test[approach] = df_test.progress_apply(
    lambda record: wsd.tok_overlap_ranking(record["nlp_full_text"], df_selected_senses),
    axis=1,
)

# Display the evaluation result for this approach.
wsd.eval(approach, df_test)

100%|██████████| 84/84 [00:00<00:00, 471.84it/s]


{'1': [1.0, 0.077, 0.143], '0': [0.855, 1.0, 0.922]}

In [8]:
approach = "sent_embedding"

# Predict a label for every test row from its preprocessed text and the
# sense-definition frame, using `wsd.sent_embedding`.
df_test[approach] = df_test.progress_apply(
    lambda record: wsd.sent_embedding(record["nlp_full_text"], df_selected_senses),
    axis=1,
)

# Display the evaluation result for this approach.
wsd.eval(approach, df_test)

100%|██████████| 84/84 [00:00<00:00, 456.64it/s]


{'1': [0.0, 0.0, 0.0], '0': [0.845, 1.0, 0.916]}

In [9]:
approach = "w2v_lesk_ranking"

# Warning: this needs a locally stored Word2vec model trained on the whole
# 19thC BL corpus; the path below must exist on the machine running the cell.
wemb_model = Word2Vec.load("models/w2v/w2v_v004/w2v_words.model")

# Predict a label for every test row from its preprocessed text, the
# sense-definition frame and the loaded embeddings.
df_test[approach] = df_test.progress_apply(
    lambda record: wsd.w2v_lesk_ranking(
        record["nlp_full_text"], df_selected_senses, wemb_model
    ),
    axis=1,
)

# Display the evaluation result for this approach.
wsd.eval(approach, df_test)

100%|██████████| 84/84 [00:04<00:00, 17.59it/s]


{'1': [0.0, 0.0, 0.0], '0': [0.841, 0.972, 0.902]}

In [10]:
approach = "bert_lesk_ranking"

# The sentence-similarity model must be downloaded beforehand and unpacked
# under models/bert/ (warning: this is a contemporary model):
# https://public.ukp.informatik.tu-darmstadt.de/reimers/sentence-transformers/v0.2/bert-base-nli-mean-tokens.zip
bert_sentsim_model = SentenceTransformer('models/bert/bert-base-nli-mean-tokens')

# Unlike the other prediction cells, this one passes the text found under
# row["text"]["full_text"] rather than the preprocessed `nlp_full_text` column.
df_test[approach] = df_test.progress_apply(
    lambda record: wsd.bert_lesk_ranking(
        record["text"]["full_text"], df_selected_senses, bert_sentsim_model
    ),
    axis=1,
)

# Display the evaluation result for this approach.
wsd.eval(approach, df_test)

100%|██████████| 84/84 [05:50<00:00,  4.17s/it]


{'1': [0.25, 0.154, 0.19], '0': [0.855, 0.915, 0.884]}

# Supervised Approaches

In [11]:
approach = "svm_wemb_baseline"

# Warning: this needs a locally stored Word2vec model trained on the whole
# 19thC BL corpus.
# NOTE(review): this is the same path already loaded for "w2v_lesk_ranking";
# the reload presumably keeps this cell runnable on its own - confirm.
wemb_model = Word2Vec.load("models/w2v/w2v_v004/w2v_words.model")

# NOTE(review): `df_train` is handed to the helper for every test row; if
# `wsd.svm_wemb_baseline` fits its classifier on each call, training is
# repeated once per row - confirm against the helper.
df_test[approach] = df_test.progress_apply(
    lambda record: wsd.svm_wemb_baseline(df_train, record["nlp_full_text"], wemb_model),
    axis=1,
)

# Display the evaluation result for this approach.
wsd.eval(approach, df_test)

100%|██████████| 84/84 [00:32<00:00,  2.56it/s]


{'1': [0.625, 0.385, 0.476], '0': [0.895, 0.958, 0.925]}