In [1]:
# Load IPython's `autoreload` extension so that the `%autoreload` magic is available.
%load_ext autoreload

In [2]:
# Mode 2: re-import modules automatically before each cell executes, so edits
# to the project modules are picked up without restarting the kernel.
%autoreload 2

In [3]:
import pickle
import pandas as pd
from tasks import wsd
from utils import nlp_tools
from utils.classificaton_utils import binarize
from tqdm.auto import tqdm
from gensim.models import Word2Vec
from sentence_transformers import SentenceTransformer

# Register tqdm's `progress_apply` on pandas objects (used by the baseline cells).
tqdm.pandas()

# --- Experiment configuration ------------------------------------------------
# Target lemma, the sense ids handed to `binarize` as `senses`, and the relation
# types handed to it as `relations`.
lemma_id = 'machine_nn01'
senses = {'machine_nn01-38475835', 'machine_nn01-38475923'}
relations = ['seed', 'synonym', 'descendant', 'sibling']

# Evaluation mode. "lemma": the predictive model only uses information on the
# lemma itself (e.g. only the lemma's sense definitions for the lesk baselines).
# The alternative value is "lemma_etal".
eval_mode = "lemma"

# Keyword options forwarded to `binarize`.
# NOTE(review): `start`/`end` are presumably the year bounds of the date
# filter -- confirm against `binarize`.
binarize_options = {
    'strict_filter': True,
    'start': 1700,
    'end': 1910,
}

df_quotations = binarize(lemma_id, senses, relations, **binarize_options)

# senses before filtering by date = 8383
# senses after filtering by date = 5904


# of seed senses 23 
# of synonyms 312 
# of branch senses 4968


# of seeds selected 1 
# of synonyms selected 8 
# of branches selected 5


In [4]:
# Preview the first five rows of the quotations frame returned by `binarize`.
df_quotations.head(n=5)

Unnamed: 0,id_x,text,year,lemma,source,oed_url,word_id,sense_id,datestring,first_in_word,oed_reference,first_in_sense,label,id_y,daterange,definition,provenance,provenance_type,relation_to_core_senses,relation_to_seed_senses
0,mover_nn01-35820849,"{'keyword': 'mover', 'full_text': 'Providence,...",1704,mover,"{'title': '15th Rep. Royal Comm. Hist. MSS', '...",https://www.oed.com/view/Entry/123034#eid35820849,mover_nn01,mover_nn01-35820773,1704,False,"mover, n.1, sense 2a",False,0,,,,,,,
1,mover_nn01-35820717,"{'keyword': 'first Mover', 'full_text': 'The M...",1711,mover,"{'title': 'Ship-builders Assistant', 'author':...",https://www.oed.com/view/Entry/123034#eid35820717,mover_nn01,mover_nn01-35820685,1711,False,"mover, n.1, sense 1c",False,1,mover_nn01-35820685,"{'end': None, 'start': 1626, 'obsolete': False...",More fully first mover. An initial source (nat...,"[[mover_nn01-35820685, synonym, machine_nn01-3...",synonym,"{vice_nn02-15526247, mover_nn01-35820685, prim...",{machine_nn01-38475923}
2,mover_nn01-35820717,"{'keyword': 'first Mover', 'full_text': 'The M...",1711,mover,"{'title': 'Ship-builders Assistant', 'author':...",https://www.oed.com/view/Entry/123034#eid35820717,mover_nn01,mover_nn01-35820685,1711,False,"mover, n.1, sense 1c",False,1,mover_nn01-35820685,"{'end': None, 'start': 1626, 'obsolete': False...",More fully first mover. An initial source (nat...,"[[214198, sibling, 214198]]",branch,"{vice_nn02-15526247, mover_nn01-35820685, prim...",{machine_nn01-38475923}
3,mover_nn01-35820717,"{'keyword': 'first Mover', 'full_text': 'The M...",1711,mover,"{'title': 'Ship-builders Assistant', 'author':...",https://www.oed.com/view/Entry/123034#eid35820717,mover_nn01,mover_nn01-35820685,1711,False,"mover, n.1, sense 1c",False,1,mover_nn01-35820685,"{'end': None, 'start': 1626, 'obsolete': False...",More fully first mover. An initial source (nat...,"[[84483, sibling, 84483]]",branch,"{vice_nn02-15526247, mover_nn01-35820685, prim...",{machine_nn01-38475923}
4,mover_nn01-35820860,"{'keyword': 'mover', 'full_text': 'They will w...",1711,mover,"{'title': 'Jrnl. to Stella', 'author': 'J. Swi...",https://www.oed.com/view/Entry/123034#eid35820860,mover_nn01,mover_nn01-35820773,1711,False,"mover, n.1, sense 2a",False,0,,,,,,,


In [5]:
# Build the sense inventory used by the baselines: the distinct
# (sense id, lemma, lemma id, definition, label) rows found in the quotations.
# Written as a single chain (no `inplace=True`) so re-running the cell always
# starts again from `df_quotations`.
df_selected_senses = (
    df_quotations[['sense_id', 'lemma', 'word_id', 'definition', 'label']]
    .rename(columns={'sense_id': 'id', 'word_id': 'lemma_id'})
    .drop_duplicates()
    .reset_index(drop=True)
)

# The two evaluation modes are mutually exclusive, hence if/elif.
if eval_mode == "lemma":
    # Keep only the senses that belong to the target lemma.
    df_selected_senses = (
        df_selected_senses[df_selected_senses['lemma_id'] == lemma_id]
        .reset_index(drop=True)
    )
elif eval_mode == "lemma_etal":
    print ("We are not covering this functionality yet.")
    # we need all definitions of all senses in the quotation dataframe

# Only the "definition" column is needed, so apply column-wise rather than
# row-wise: this avoids building a row Series per record and always yields a
# Series, including when the filter above leaves no rows.
df_selected_senses["nlp_definition"] = df_selected_senses["definition"].apply(nlp_tools.preprocess)

df_selected_senses.head()

Unnamed: 0,id,lemma,lemma_id,definition,label,nlp_definition
0,machine_nn01-38474548,machine,machine_nn01,A ship or other vessel. Now colloquial: a boat.,0,"(A, ship, or, other, vessel, ., Now, colloquia..."
1,machine_nn01-38475923,machine,machine_nn01,Mechanics. Anything that transmits force or di...,1,"(Mechanics, ., Anything, that, transmits, forc..."
2,machine_nn01-38474607,machine,machine_nn01,"A (usually wheeled) vehicle or conveyance, esp...",0,"(A, (, usually, wheeled, ), vehicle, or, conve..."
3,machine_nn01-38474405,machine,machine_nn01,"In literature, etc.: a contrivance for the sak...",0,"(In, literature, ,, etc, ., :, a, contrivance,..."
4,machine_nn01-38475164,machine,machine_nn01,"In general use: an apparatus, device, instrume...",0,"(In, general, use, :, an, apparatus, ,, device..."


In [6]:
# Run `nlp_tools.preprocess` over the full text of every quotation and keep the
# result next to the raw record. Each "text" cell is a dict holding a
# "full_text" entry (see the preview of `df_quotations`).
# Applied column-wise: only the "text" column is read, so there is no need to
# materialise a full row per record, and the result is always a Series.
df_quotations["nlp_full_text"] = df_quotations["text"].apply(
    lambda text: nlp_tools.preprocess(text["full_text"])
)
df_quotations.head()

Unnamed: 0,id_x,text,year,lemma,source,oed_url,word_id,sense_id,datestring,first_in_word,...,first_in_sense,label,id_y,daterange,definition,provenance,provenance_type,relation_to_core_senses,relation_to_seed_senses,nlp_full_text
0,mover_nn01-35820849,"{'keyword': 'mover', 'full_text': 'Providence,...",1704,mover,"{'title': '15th Rep. Royal Comm. Hist. MSS', '...",https://www.oed.com/view/Entry/123034#eid35820849,mover_nn01,mover_nn01-35820773,1704,False,...,False,0,,,,,,,,"(Providence, ,, which, I, humbly, recognize, a..."
1,mover_nn01-35820717,"{'keyword': 'first Mover', 'full_text': 'The M...",1711,mover,"{'title': 'Ship-builders Assistant', 'author':...",https://www.oed.com/view/Entry/123034#eid35820717,mover_nn01,mover_nn01-35820685,1711,False,...,False,1,mover_nn01-35820685,"{'end': None, 'start': 1626, 'obsolete': False...",More fully first mover. An initial source (nat...,"[[mover_nn01-35820685, synonym, machine_nn01-3...",synonym,"{vice_nn02-15526247, mover_nn01-35820685, prim...",{machine_nn01-38475923},"(The, Main, -, mast, is, the, first, Mover, .)"
2,mover_nn01-35820717,"{'keyword': 'first Mover', 'full_text': 'The M...",1711,mover,"{'title': 'Ship-builders Assistant', 'author':...",https://www.oed.com/view/Entry/123034#eid35820717,mover_nn01,mover_nn01-35820685,1711,False,...,False,1,mover_nn01-35820685,"{'end': None, 'start': 1626, 'obsolete': False...",More fully first mover. An initial source (nat...,"[[214198, sibling, 214198]]",branch,"{vice_nn02-15526247, mover_nn01-35820685, prim...",{machine_nn01-38475923},"(The, Main, -, mast, is, the, first, Mover, .)"
3,mover_nn01-35820717,"{'keyword': 'first Mover', 'full_text': 'The M...",1711,mover,"{'title': 'Ship-builders Assistant', 'author':...",https://www.oed.com/view/Entry/123034#eid35820717,mover_nn01,mover_nn01-35820685,1711,False,...,False,1,mover_nn01-35820685,"{'end': None, 'start': 1626, 'obsolete': False...",More fully first mover. An initial source (nat...,"[[84483, sibling, 84483]]",branch,"{vice_nn02-15526247, mover_nn01-35820685, prim...",{machine_nn01-38475923},"(The, Main, -, mast, is, the, first, Mover, .)"
4,mover_nn01-35820860,"{'keyword': 'mover', 'full_text': 'They will w...",1711,mover,"{'title': 'Jrnl. to Stella', 'author': 'J. Swi...",https://www.oed.com/view/Entry/123034#eid35820860,mover_nn01,mover_nn01-35820773,1711,False,...,False,0,,,,,,,,"(They, will, want, him, prodigiously, in, the,..."


In [7]:
# Baseline: predictions from `wsd.random_predict`, which only receives the
# sense inventory (the quotation itself is not passed in).
approach = "random"


def _random_prediction(_row):
    """Return `wsd.random_predict`'s output; the quotation row is ignored."""
    return wsd.random_predict(df_selected_senses)


df_quotations[approach] = df_quotations.progress_apply(_random_prediction, axis=1)

# NOTE(review): no random seed is set in the visible cells, so these scores may
# differ between runs unless `wsd.random_predict` seeds itself -- confirm.
wsd.eval(approach, df_quotations)

100%|██████████| 799/799 [00:01<00:00, 645.68it/s]


(0.234, 0.294, 0.261, 0.66)

In [8]:
# Caution: the example sentence (row["text"]["full_text"], preprocessed into
# "nlp_full_text") is used here as the *input* sentence, whose word overlap
# with each sense definition is then measured (see `wsd.tok_overlap_ranking`).
# If the examples are instead to be used as training data, a train/test split
# is needed first.
approach = "def_tok_overlap_ranking"


def _rank_by_token_overlap(row):
    """Apply `wsd.tok_overlap_ranking` to one preprocessed quotation."""
    return wsd.tok_overlap_ranking(row["nlp_full_text"], df_selected_senses)


df_quotations[approach] = df_quotations.progress_apply(_rank_by_token_overlap, axis=1)

wsd.eval(approach, df_quotations)

100%|██████████| 799/799 [00:02<00:00, 359.76it/s]


(0.714, 0.123, 0.209, 0.811)

In [13]:
# Baseline: `wsd.sent_embedding` applied to each preprocessed quotation
# together with the sense inventory.
# NOTE(review): this cell carries execution count 13 although it sits before
# cells 10 and 11, and its recorded scores are 0.0 for the first three metrics
# -- re-run on a fresh kernel and check the predictions before relying on it.
approach = "sent_embedding"

sent_embedding_predictions = df_quotations.progress_apply(
    lambda row: wsd.sent_embedding(row["nlp_full_text"], df_selected_senses),
    axis=1,
)
df_quotations[approach] = sent_embedding_predictions

wsd.eval(approach, df_quotations)

100%|██████████| 799/799 [00:01<00:00, 462.77it/s]


(0.0, 0.0, 0.0, 0.795)

In [10]:
approach = "w2v_lesk_ranking"

# Warning: this relies on a locally stored Word2vec model trained on the whole
# 19thC BL corpus; the path below is relative to the working directory.
w2v_model_path = "models/w2v/w2v_v004/w2v_words.model"
wemb_model = Word2Vec.load(w2v_model_path)


def _rank_with_w2v(row):
    """Apply `wsd.w2v_lesk_ranking` to one preprocessed quotation."""
    return wsd.w2v_lesk_ranking(row["nlp_full_text"], df_selected_senses, wemb_model)


df_quotations[approach] = df_quotations.progress_apply(_rank_with_w2v, axis=1)

wsd.eval(approach, df_quotations)

100%|██████████| 799/799 [00:50<00:00, 15.71it/s]


(0.0, 0.0, 0.0, 0.792)

In [11]:
approach = "bert_lesk_ranking"

# The sentence-similarity model is loaded from a local directory. It can be
# downloaded from the URL below (warning: this is a contemporary model):
# https://public.ukp.informatik.tu-darmstadt.de/reimers/sentence-transformers/v0.2/bert-base-nli-mean-tokens.zip
bert_model_dir = 'models/bert/bert-base-nli-mean-tokens'
bert_sentsim_model = SentenceTransformer(bert_model_dir)


def _rank_with_bert(row):
    """Apply `wsd.bert_lesk_ranking` to the raw full text of one quotation."""
    return wsd.bert_lesk_ranking(row["text"]["full_text"], df_selected_senses, bert_sentsim_model)


# Slow: the recorded run took about 28 minutes for 799 rows.
df_quotations[approach] = df_quotations.progress_apply(_rank_with_bert, axis=1)

wsd.eval(approach, df_quotations)

100%|██████████| 799/799 [28:26<00:00,  2.14s/it]


(0.518, 0.178, 0.265, 0.798)