In [1]:
import pickle
import pandas as pd
from tasks import wsd
from tqdm.auto import tqdm

# Enable the pandas integration of tqdm so that the `progress_apply` calls in
# later cells are available on DataFrames.
tqdm.pandas()

# Location of the pickled quotation data, relative to the notebook's working
# directory. Kept in one named constant so it can be changed in a single place.
DATA_PATH = "data/machine_nn01_all.pickle"

# SECURITY: pickle.load can execute arbitrary code while deserialising.
# Only run this cell on a pickle file that comes from a trusted source.
with open(DATA_PATH, "rb") as f:
    df = pickle.load(f)


In [2]:
# Build the sense inventory for the lemma "machine": one row per distinct
# (lemma, sense_id, definition) combination found in `df`.
# The frame is produced by a single non-mutating chain (no `inplace=True` on a
# slice of `df`), so re-running the cell always starts from `df` and cannot
# trigger SettingWithCopyWarning.
definition_df = (
    df.loc[df["lemma"] == "machine", ["lemma", "sense_id", "definition"]]
    .drop_duplicates()
    .reset_index(drop=True)
)

# Run wsd.preprocess once per definition string; only the "definition" column
# is needed, so map over that column instead of applying over whole rows.
definition_df["nlp_definition"] = definition_df["definition"].map(wsd.preprocess)

# Number of distinct sense definitions kept.
print(len(definition_df))

25


In [3]:
# Preview the first five rows of the sense inventory.
definition_df.head(n=5)

Unnamed: 0,lemma,sense_id,definition,nlp_definition
0,machine,machine_nn01-38473945,"A material or immaterial structure, esp. the f...","(A, material, or, immaterial, structure, ,, es..."
1,machine,machine_nn01-38474233,A military engine or siege-tower. Cf. war mach...,"(A, military, engine, or, siege, -, tower, ., ..."
2,machine,machine_nn01-38474097,spec. A scheme or plot. Obsolete.,"(spec, ., A, scheme, or, plot, ., Obsolete, .)"
3,machine,machine_nn01-38474140,"A living body, esp. the human body considered ...","(A, living, body, ,, esp, ., the, human, body,..."
4,machine,machine_nn01-38474301,Theatre. A (usually movable) contrivance for t...,"(Theatre, ., A, (, usually, movable, ), contri..."


In [4]:
# Keep every record of `df` whose lemma is "machine", renumber the rows from 0,
# and add "nlp_full_text": the result of wsd.preprocess on the "full_text"
# entry of each record's "text" dict.
# Only the "text" column is read, so map over that column rather than
# materialising every full row with a row-wise apply.
machine_df = (
    df.loc[df["lemma"] == "machine"]
    .reset_index(drop=True)
    .assign(
        nlp_full_text=lambda quotations: quotations["text"].map(
            lambda text: wsd.preprocess(text["full_text"])
        )
    )
)
machine_df.head()

Unnamed: 0,notes,id_quotation,root,word_id,oed_url,part_of_speech,transitivity,semantic_class_ids,source,sense_id,definition,oed_reference,text,main_current_sense,lemma,first_use,nlp_full_text
0,[],machine_nn01-38473945,True,machine_nn01,https://www.oed.com/view/Entry/111850#eid38473945,NN,,"[[1, 111290, 118635, 119024, 120162, 120172], ...","{'title': 'Early Mod. Eng. Lexicogr.', 'author...",machine_nn01-38473945,"A material or immaterial structure, esp. the f...","machine, n., sense I.1a","{'keyword': 'machyne', 'full_text': 'The hole ...",False,machine,J. Schäfer,"(The, hole, machyne, of, this, world, is, divi..."
1,[],machine_nn01-38473945,True,machine_nn01,https://www.oed.com/view/Entry/111850#eid38473945,NN,,"[[1, 111290, 118635, 119024, 120162, 120172], ...","{'title': 'Early Mod. Eng. Lexicogr.', 'author...",machine_nn01-38473945,"A material or immaterial structure, esp. the f...","machine, n., sense I.1a","{'keyword': 'Machine', 'full_text': 'Machine, ...",False,machine,J. Schäfer,"(Machine, ,, hath, many, significacions, ,, bu..."
2,[],machine_nn01-38473945,True,machine_nn01,https://www.oed.com/view/Entry/111850#eid38473945,NN,,"[[1, 111290, 118635, 119024, 120162, 120172], ...","{'title': 'Complaynt Scotl.', 'author': None, ...",machine_nn01-38473945,"A material or immaterial structure, esp. the f...","machine, n., sense I.1a","{'keyword': 'machine', 'full_text': 'The maist...",False,machine,J. Schäfer,"(The, maist, illustir, potent, prince, of, the..."
3,[],machine_nn01-38473945,True,machine_nn01,https://www.oed.com/view/Entry/111850#eid38473945,NN,,"[[1, 111290, 118635, 119024, 120162, 120172], ...","{'title': 'Hymnes', 'author': 'A. Hume', 'gend...",machine_nn01-38473945,"A material or immaterial structure, esp. the f...","machine, n., sense I.1a","{'keyword': 'machin', 'full_text': 'Be his wis...",False,machine,J. Schäfer,"(Be, his, wisedome,, .., sa, wondrouslie, of, ..."
4,[],machine_nn01-38473945,True,machine_nn01,https://www.oed.com/view/Entry/111850#eid38473945,NN,,"[[1, 111290, 118635, 119024, 120162, 120172], ...","{'title': 'Hist. Quinq-articularis', 'author':...",machine_nn01-38473945,"A material or immaterial structure, esp. the f...","machine, n., sense I.1a","{'keyword': 'Machine', 'full_text': 'They that...",False,machine,J. Schäfer,"(They, that, asserted, Universal, redemption, ..."


In [5]:
# Column name under which this approach's predictions are stored in machine_df.
approach = "random_predict"

# wsd.random_predict is given only the sense inventory; the quotation row
# itself is not passed to it.
# NOTE(review): no random seed is set anywhere in the visible cells, so the
# scores may differ between runs if random_predict is unseeded - confirm in
# tasks/wsd.py.
machine_df[approach] = machine_df.progress_apply(
    lambda _quotation: wsd.random_predict(definition_df),
    axis=1,
)

# Compare the predicted column with the recorded sense_id of each quotation.
wsd.eval(machine_df[approach], machine_df["sense_id"])

100%|██████████| 805/805 [00:00<00:00, 23106.98it/s]


(0.058, 0.057, 0.043, 0.056)

In [6]:
# Caveat: the example sentence (row["text"]["full_text"], available pre-processed
# as "nlp_full_text") is used as the *input* sentence here, and its word overlap
# with the definition is measured (see wsd.tok_overlap_ranking). If the examples
# should instead be used as training data, the data must first be split into
# train/test.

# Column name under which this approach's predictions are stored in machine_df.
approach = "def_tok_overlap_ranking"

machine_df[approach] = machine_df.progress_apply(
    lambda quotation: wsd.tok_overlap_ranking(
        quotation["nlp_full_text"], definition_df
    ),
    axis=1,
)

# Compare the predicted column with the recorded sense_id of each quotation.
wsd.eval(machine_df[approach], machine_df["sense_id"])

100%|██████████| 805/805 [00:02<00:00, 361.58it/s]


(0.107, 0.052, 0.043, 0.113)

In [7]:
# Column name under which this approach's predictions are stored in machine_df.
approach = "sent_embedding"

# Call wsd.sent_embedding once per quotation, passing the pre-processed
# quotation text together with the sense inventory.
machine_df[approach] = machine_df.progress_apply(
    lambda quotation: wsd.sent_embedding(
        quotation["nlp_full_text"], definition_df
    ),
    axis=1,
)

# Compare the predicted column with the recorded sense_id of each quotation.
wsd.eval(machine_df[approach], machine_df["sense_id"])

100%|██████████| 805/805 [00:01<00:00, 424.11it/s]


(0.19, 0.123, 0.095, 0.173)