In [1]:
import pickle
import random

import numpy as np
import pandas as pd
from gensim.models import Word2Vec
from sentence_transformers import SentenceTransformer
from tqdm.auto import tqdm

from tasks import wsd
from utils import nlp_tools
from utils.classificaton_utils import binarize,retrieve_labelled_definitions

# Register tqdm's `progress_apply` on pandas objects; the later cells rely on it.
tqdm.pandas()

# Experiment configuration: the target lemma, the sense ids handed to
# `binarize`, and the relation types to include.
lemma_id = 'machine_nn01'
senses = {'machine_nn01-38475835','machine_nn01-38475923'}
relations = ['seed','synonym','descendant','sibling']

# Build the labelled quotations frame. `start`/`end` bound the quotation years
# (the printed summary below reports sense counts before/after the date filter).
df_quotations = binarize(
    lemma_id,
    senses,
    relations,
    strict_filter=True,
    start=1700,
    end=1910,
)

# senses before filtering by date = 8383
# senses after filtering by date = 5904


# of seed senses 23 
# of synonyms 312 
# of branch senses 4968


# of seeds selected 1 
# of synonyms selected 8 
# of branches selected 5


In [2]:
def _preprocess_definition(row):
    """Run `nlp_tools.preprocess` on the "definition" field of one row."""
    return nlp_tools.preprocess(row["definition"])


# Labelled sense definitions for the lemma, derived from the quotations frame.
df_selected_senses = retrieve_labelled_definitions(lemma_id, df_quotations)

# Store the preprocessed form of each definition next to the raw text.
df_selected_senses["nlp_definition"] = df_selected_senses.apply(
    _preprocess_definition, axis=1
)

df_selected_senses.head()

Unnamed: 0,id,label,lemma,definition,nlp_definition
0,mover_nn01-35820685,1,mover,More fully first mover. An initial source (nat...,"(More, fully, first, mover, ., An, initial, so..."
1,mover_nn01-35820598,0,mover,In full first mover. The outermost of the conc...,"(In, full, first, mover, ., The, outermost, of..."
2,power_nn01-28687898,1,power,"Mechanics. In full mechanical power, †mathemat...","(Mechanics, ., In, full, mechanical, power, ,,..."
3,power_nn01-224965906,0,power,"The mechanical advantage of a pulley, tackle, ...","(The, mechanical, advantage, of, a, pulley, ,,..."
4,rocker_nn01-25157825,1,rocker,A device or apparatus which rocks or is rocked...,"(A, device, or, apparatus, which, rocks, or, i..."


In [3]:
def _preprocess_quotation(row):
    """Run `nlp_tools.preprocess` on the quotation held in row["text"]["full_text"]."""
    quotation = row["text"]["full_text"]
    return nlp_tools.preprocess(quotation)


# Add the preprocessed quotation as a new column; the raw "text" dict is kept as-is.
df_quotations["nlp_full_text"] = df_quotations.apply(_preprocess_quotation, axis=1)
df_quotations.head()

Unnamed: 0,id,text,year,lemma,source,oed_url,word_id,sense_id,datestring,first_in_word,oed_reference,first_in_sense,label,nlp_full_text
25430,mover_nn01-35820849,"{'keyword': 'mover', 'full_text': 'Providence,...",1704,mover,"{'title': '15th Rep. Royal Comm. Hist. MSS', '...",https://www.oed.com/view/Entry/123034#eid35820849,mover_nn01,mover_nn01-35820773,1704,False,"mover, n.1, sense 2a",False,0,"(Providence, ,, which, I, humbly, recognize, a..."
25431,mover_nn01-35820717,"{'keyword': 'first Mover', 'full_text': 'The M...",1711,mover,"{'title': 'Ship-builders Assistant', 'author':...",https://www.oed.com/view/Entry/123034#eid35820717,mover_nn01,mover_nn01-35820685,1711,False,"mover, n.1, sense 1c",False,1,"(The, Main, -, mast, is, the, first, Mover, .)"
25432,mover_nn01-35820860,"{'keyword': 'mover', 'full_text': 'They will w...",1711,mover,"{'title': 'Jrnl. to Stella', 'author': 'J. Swi...",https://www.oed.com/view/Entry/123034#eid35820860,mover_nn01,mover_nn01-35820773,1711,False,"mover, n.1, sense 2a",False,0,"(They, will, want, him, prodigiously, in, the,..."
25433,mover_nn01-35821031,"{'keyword': 'Movers', 'full_text': 'Nor is the...",1736,mover,"{'title': 'Analogy of Relig.', 'author': 'Bp. ...",https://www.oed.com/view/Entry/123034#eid35821031,mover_nn01,mover_nn01-35821002,1736,False,"mover, n.1, sense 3",False,0,"(Nor, is, there, any, Ground, to, think, .., t..."
25434,mover_nn01-35821108,"{'keyword': 'Mover', 'full_text': 'Therefore w...",1737,mover,"{'title': 'Gentleman's Mag.', 'author': None, ...",https://www.oed.com/view/Entry/123034#eid35821108,mover_nn01,mover_nn01-35821082,1737,False,"mover, n.1, sense 4",False,0,"(Therefore, we, must, suppose, ,, that, withou..."


In [4]:
approach = "random"

# Seed the global RNGs so the random baseline (and the metrics reported for it)
# is repeatable under Restart & Run All.
# NOTE(review): assumes wsd.random_predict draws from Python's `random` module
# or NumPy's global generator -- confirm against tasks/wsd.py. If it uses its
# own generator, the seed has to be passed there instead.
RANDOM_SEED = 42
random.seed(RANDOM_SEED)
np.random.seed(RANDOM_SEED)

# The row itself is ignored: each quotation gets a prediction that depends
# only on df_selected_senses.
df_quotations[approach] = df_quotations.progress_apply(
    lambda row: wsd.random_predict(df_selected_senses), axis=1
)

wsd.eval(approach, df_quotations)

100%|██████████| 420/420 [00:00<00:00, 627.44it/s]


(0.158, 0.723, 0.26, 0.362)

In [5]:
# Caveat (from the original author's note): the example sentence
# (row["text"]["full_text"], used here in its preprocessed "nlp_full_text" form)
# is the input sentence, and its word overlap with the definition is measured
# (see wsd.tok_overlap_ranking). If the examples are to be used as training
# data instead, the data must first be split into train/test.
approach = "def_tok_overlap_ranking"


def _rank_by_token_overlap(row):
    """Apply `wsd.tok_overlap_ranking` to one quotation row."""
    return wsd.tok_overlap_ranking(row["nlp_full_text"], df_selected_senses)


df_quotations[approach] = df_quotations.progress_apply(_rank_by_token_overlap, axis=1)

wsd.eval(approach, df_quotations)

100%|██████████| 420/420 [00:01<00:00, 311.46it/s]


(0.154, 0.877, 0.263, 0.238)

In [6]:
approach = "sent_embedding"


def _predict_with_sentence_embedding(row):
    """Apply `wsd.sent_embedding` to the preprocessed quotation of one row."""
    return wsd.sent_embedding(row["nlp_full_text"], df_selected_senses)


# One prediction per quotation, stored in a column named after the approach.
df_quotations[approach] = df_quotations.progress_apply(
    _predict_with_sentence_embedding, axis=1
)

wsd.eval(approach, df_quotations)

100%|██████████| 420/420 [00:01<00:00, 365.19it/s]


(0.256, 0.815, 0.39, 0.605)

In [7]:
approach = "w2v_lesk_ranking"

# Warning: this relies on a locally stored Word2Vec model trained on the whole
# 19th-century BL corpus; the path below must exist on the machine running this.
wemb_model = Word2Vec.load("models/w2v/w2v_v004/w2v_words.model")


def _rank_with_w2v_lesk(row):
    """Apply `wsd.w2v_lesk_ranking` to one preprocessed quotation using `wemb_model`."""
    return wsd.w2v_lesk_ranking(row["nlp_full_text"], df_selected_senses, wemb_model)


df_quotations[approach] = df_quotations.progress_apply(_rank_with_w2v_lesk, axis=1)

wsd.eval(approach, df_quotations)

100%|██████████| 420/420 [00:48<00:00,  8.59it/s]


(0.232, 0.785, 0.358, 0.564)

In [8]:
approach = "bert_lesk_ranking"

# The model is loaded from a local directory. Download it from
# (warning: this is a contemporary model):
# https://public.ukp.informatik.tu-darmstadt.de/reimers/sentence-transformers/v0.2/bert-base-nli-mean-tokens.zip
bert_sentsim_model = SentenceTransformer('models/bert/bert-base-nli-mean-tokens')


def _rank_with_bert_lesk(row):
    """Apply `wsd.bert_lesk_ranking` to the raw quotation text of one row."""
    # Unlike the other approaches, this one receives the unprocessed string.
    raw_quotation = row["text"]["full_text"]
    return wsd.bert_lesk_ranking(raw_quotation, df_selected_senses, bert_sentsim_model)


# Slow cell: the recorded run took roughly 30 minutes for 420 quotations.
df_quotations[approach] = df_quotations.progress_apply(_rank_with_bert_lesk, axis=1)

wsd.eval(approach, df_quotations)

100%|██████████| 420/420 [29:51<00:00,  4.27s/it]


(0.265, 0.738, 0.39, 0.643)