In [1]:
# Load IPython's autoreload extension, which provides the %autoreload magic.
%load_ext autoreload

In [2]:
# Mode 2: reload all imported modules before each cell is executed, so edits
# to the local `tasks` / `utils` modules are picked up without a kernel restart.
%autoreload 2

In [3]:
import pickle
import random

import numpy as np
import pandas as pd
from gensim.models import Word2Vec
from sentence_transformers import SentenceTransformer
from tqdm.auto import tqdm

from tasks import wsd
from utils import nlp_tools
from utils.classificaton_utils import binarize

# --- Experiment configuration ------------------------------------------------
# Lemma under study, plus the sense ids and relation types handed to `binarize`.
lemma_id = 'machine_nn01'
senses = {'machine_nn01-38475835', 'machine_nn01-38475923'}
relations = ['seed', 'synonym', 'descendant', 'sibling']

# "lemma": the predictive models only use information on the lemma itself
# (e.g. only the lemma's sense definitions for the Lesk baselines).
# The other accepted value is "lemma_etal".
eval_mode = "lemma"

# Register `progress_apply` on pandas objects; the baseline cells rely on it.
tqdm.pandas()

# Keyword options forwarded to `binarize`.
# NOTE(review): `start` / `end` look like the year bounds of the date filter
# reported in the log output below -- confirm against `binarize`.
binarize_options = {
    "strict_filter": True,
    "start": 1700,
    "end": 1910,
}

df_quotations = binarize(lemma_id, senses, relations, **binarize_options)

# senses before filtering by date = 8383
# senses after filtering by date = 5904


# of seed senses 23 
# of synonyms 312 
# of branch senses 4968


# of seeds selected 1 
# of synonyms selected 8 
# of branches selected 5


In [4]:
# Pull the raw quotation string out of the nested `text` dict into its own column.
# A column-wise map is enough here; no other field of the row is needed.
df_quotations["full_text"] = df_quotations["text"].map(lambda text: text["full_text"])

# Remove repeated quotations *before* the NLP pass: `nlp_full_text` is derived
# solely from `full_text` (which is part of the duplicate key), so the surviving
# rows are the same, but no preprocessing is spent on rows that get discarded.
df_quotations = (
    df_quotations
    .drop_duplicates(subset=["year", "lemma", "word_id", "sense_id", "definition", "full_text"])
    .reset_index(drop=True)
)

# Preprocessed form of each quotation, as returned by `nlp_tools.preprocess`.
df_quotations["nlp_full_text"] = df_quotations["full_text"].map(nlp_tools.preprocess)

df_quotations.head()

Unnamed: 0,id_x,text,year,lemma,source,oed_url,word_id,sense_id,datestring,first_in_word,...,label,id_y,daterange,definition,provenance,provenance_type,relation_to_core_senses,relation_to_seed_senses,full_text,nlp_full_text
0,mover_nn01-35820849,"{'keyword': 'mover', 'full_text': 'Providence,...",1704,mover,"{'title': '15th Rep. Royal Comm. Hist. MSS', '...",https://www.oed.com/view/Entry/123034#eid35820849,mover_nn01,mover_nn01-35820773,1704,False,...,0,,,,,,,,"Providence, which I humbly recognize as the fi...","(Providence, ,, which, I, humbly, recognize, a..."
1,mover_nn01-35820717,"{'keyword': 'first Mover', 'full_text': 'The M...",1711,mover,"{'title': 'Ship-builders Assistant', 'author':...",https://www.oed.com/view/Entry/123034#eid35820717,mover_nn01,mover_nn01-35820685,1711,False,...,1,mover_nn01-35820685,"{'end': None, 'start': 1626, 'obsolete': False...",More fully first mover. An initial source (nat...,"[[mover_nn01-35820685, synonym, machine_nn01-3...",synonym,"{primemover_nn01-28348676, primummobile_nn01-2...",{machine_nn01-38475923},The Main-mast is the first Mover.,"(The, Main, -, mast, is, the, first, Mover, .)"
2,mover_nn01-35820860,"{'keyword': 'mover', 'full_text': 'They will w...",1711,mover,"{'title': 'Jrnl. to Stella', 'author': 'J. Swi...",https://www.oed.com/view/Entry/123034#eid35820860,mover_nn01,mover_nn01-35820773,1711,False,...,0,,,,,,,,They will want him prodigiously in the House o...,"(They, will, want, him, prodigiously, in, the,..."
3,mover_nn01-35821031,"{'keyword': 'Movers', 'full_text': 'Nor is the...",1736,mover,"{'title': 'Analogy of Relig.', 'author': 'Bp. ...",https://www.oed.com/view/Entry/123034#eid35821031,mover_nn01,mover_nn01-35821002,1736,False,...,0,,,,,,,,Nor is there any Ground to think..that his Eye...,"(Nor, is, there, any, Ground, to, think, .., t..."
4,mover_nn01-35821108,"{'keyword': 'Mover', 'full_text': 'Therefore w...",1737,mover,"{'title': 'Gentleman's Mag.', 'author': None, ...",https://www.oed.com/view/Entry/123034#eid35821108,mover_nn01,mover_nn01-35821082,1737,False,...,0,,,,,,,,"Therefore we must suppose, that without any Re...","(Therefore, we, must, suppose, ,, that, withou..."


In [5]:
# Distinct (sense id, lemma, lemma id, definition, label) rows taken from the
# quotation frame, with the id columns renamed to `id` / `lemma_id`.
df_selected_senses = (
    df_quotations[['sense_id', 'lemma', 'word_id', 'definition', 'label']]
    .rename(columns={'sense_id': 'id', 'word_id': 'lemma_id'})
    .drop_duplicates()
    .reset_index(drop=True)
)

if eval_mode == "lemma":
    # Restrict the candidate senses to those belonging to the target lemma.
    df_selected_senses = (
        df_selected_senses[df_selected_senses['lemma_id'] == lemma_id]
        .reset_index(drop=True)
    )
elif eval_mode == "lemma_etal":
    print ("We are not covering this functionality yet.")
    # we need all definitions of all senses in the quotation dataframe
else:
    # Fail fast on a mistyped mode instead of silently running on the
    # unfiltered sense table.
    raise ValueError(f"Unknown eval_mode {eval_mode!r}; expected 'lemma' or 'lemma_etal'.")

# Preprocessed form of each definition, as returned by `nlp_tools.preprocess`.
df_selected_senses["nlp_definition"] = df_selected_senses["definition"].map(nlp_tools.preprocess)

df_selected_senses.head()

Unnamed: 0,id,lemma,lemma_id,definition,label,nlp_definition
0,machine_nn01-38474548,machine,machine_nn01,A ship or other vessel. Now colloquial: a boat.,0,"(A, ship, or, other, vessel, ., Now, colloquia..."
1,machine_nn01-38475923,machine,machine_nn01,Mechanics. Anything that transmits force or di...,1,"(Mechanics, ., Anything, that, transmits, forc..."
2,machine_nn01-38474607,machine,machine_nn01,"A (usually wheeled) vehicle or conveyance, esp...",0,"(A, (, usually, wheeled, ), vehicle, or, conve..."
3,machine_nn01-38474405,machine,machine_nn01,"In literature, etc.: a contrivance for the sak...",0,"(In, literature, ,, etc, ., :, a, contrivance,..."
4,machine_nn01-38475164,machine,machine_nn01,"In general use: an apparatus, device, instrume...",0,"(In, general, use, :, an, apparatus, ,, device..."


In [6]:
approach = "random"

# Fix the global RNG state so this stochastic baseline reports the same scores
# on every Restart & Run All.
# NOTE(review): assumes `wsd.random_predict` draws from Python's `random` module
# or NumPy's global RNG -- confirm in tasks/wsd.py; if it uses its own generator,
# the seed has to be passed there instead.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)

# The prediction does not look at the quotation itself, only at the candidate senses.
df_quotations[approach] = df_quotations.progress_apply(
    lambda _row: wsd.random_predict(df_selected_senses),
    axis=1,
)

wsd.eval(approach, df_quotations)

100%|██████████| 420/420 [00:00<00:00, 630.34it/s]


{'1': [0.156, 0.231, 0.186], '0': [0.846, 0.772, 0.807]}

In [7]:
# Caution: the quotation's example sentence (in its preprocessed form,
# `nlp_full_text`) is used as the *input* sentence here, and its word overlap
# with each sense definition is measured (see `wsd.tok_overlap_ranking`).
# If the examples are to be used as training data instead, the quotations
# must first be split into train and test sets.
approach = "def_tok_overlap_ranking"

overlap_predictions = df_quotations.progress_apply(
    lambda quotation: wsd.tok_overlap_ranking(quotation["nlp_full_text"], df_selected_senses),
    axis=1,
)
df_quotations[approach] = overlap_predictions

wsd.eval(approach, df_quotations)

100%|██████████| 420/420 [00:01<00:00, 406.30it/s]


{'1': [0.5, 0.077, 0.133], '0': [0.854, 0.986, 0.915]}

In [8]:
approach = "sent_embedding"


def _predict_sent_embedding(quotation):
    """Run `wsd.sent_embedding` on one quotation row against the selected senses."""
    return wsd.sent_embedding(quotation["nlp_full_text"], df_selected_senses)


df_quotations[approach] = df_quotations.progress_apply(_predict_sent_embedding, axis=1)

wsd.eval(approach, df_quotations)

100%|██████████| 420/420 [00:00<00:00, 422.59it/s]


{'1': [0.0, 0.0, 0.0], '0': [0.845, 0.997, 0.915]}

In [9]:
approach = "w2v_lesk_ranking"

# Warning: this relies on a locally stored Word2vec model trained on the whole
# 19thC BL corpus; the file must be present at the path below.
wemb_model = Word2Vec.load("models/w2v/w2v_v004/w2v_words.model")

w2v_predictions = df_quotations.progress_apply(
    lambda quotation: wsd.w2v_lesk_ranking(
        quotation["nlp_full_text"],
        df_selected_senses,
        wemb_model,
    ),
    axis=1,
)
df_quotations[approach] = w2v_predictions

wsd.eval(approach, df_quotations)

100%|██████████| 420/420 [00:25<00:00, 16.19it/s]


{'1': [0.0, 0.0, 0.0], '0': [0.844, 0.992, 0.912]}

In [10]:
approach = "bert_lesk_ranking"

# The sentence-similarity model has to be downloaded beforehand
# (warning: it is a contemporary model, not a historical one):
# https://public.ukp.informatik.tu-darmstadt.de/reimers/sentence-transformers/v0.2/bert-base-nli-mean-tokens.zip
bert_sentsim_model = SentenceTransformer('models/bert/bert-base-nli-mean-tokens')


def _predict_bert_lesk(quotation):
    """Run `wsd.bert_lesk_ranking` on the raw quotation text of one row."""
    raw_sentence = quotation["text"]["full_text"]
    return wsd.bert_lesk_ranking(raw_sentence, df_selected_senses, bert_sentsim_model)


df_quotations[approach] = df_quotations.progress_apply(_predict_bert_lesk, axis=1)

wsd.eval(approach, df_quotations)

100%|██████████| 420/420 [14:21<00:00,  2.05s/it]


{'1': [0.387, 0.185, 0.25], '0': [0.864, 0.946, 0.903]}