In [None]:
import pickle
import pandas as pd
from tasks import wsd
from utils import nlp_tools
from utils.classificaton_utils import binarize
from tqdm.auto import tqdm
from gensim.models import Word2Vec
from sentence_transformers import SentenceTransformer

# Register tqdm's `progress_apply` on pandas objects; the approach cells below rely on it.
tqdm.pandas()

# Lemma under study and the sense ids passed to `binarize`
# (presumably the positive class of the binary task -- confirm against `binarize`).
lemma_id = 'machine_nn01'
senses = {'machine_nn01-38475835', 'machine_nn01-38475923'}
relations = ['seed', 'synonym', 'descendant', 'sibling']

# NOTE(review): unpickling can execute arbitrary code; only load pickles from a trusted source.
df_source = pd.read_pickle(f'./data/extended_{lemma_id}.pickle')

# Labelled quotations for the lemma; `start`/`end` look like year bounds -- TODO confirm.
df_quotations = binarize(
    lemma_id,
    senses,
    relations,
    strict_filter=True,
    start=1700,
    end=1910,
)

In [None]:
# Quick look at the first rows of the labelled quotations (5 is the pandas default).
df_quotations.head(n=5)

In [None]:
# Every sense id that occurs in the quotations, plus one (id, label) row per sense.
all_selected_senses = set(df_quotations["sense_id"])
all_labels = (
    df_quotations[["sense_id", "label"]]
    .rename(columns={"sense_id": "id"})
    .drop_duplicates()
    .reset_index(drop=True)
)

# Lemma/id/definition rows of the selected senses, without duplicates or missing
# definitions. Built as one non-mutating chain: the previous `inplace=True` calls
# operated on frames derived by slicing, which can raise SettingWithCopyWarning
# and made the cell depend on in-place state.
df_selected_senses = (
    df_source.loc[df_source["id"].isin(all_selected_senses), ["lemma", "id", "definition"]]
    .drop_duplicates()
    .dropna(subset=["definition"])
    .reset_index(drop=True)
)

# Attach the label to each sense (inner join on the sense id).
df_selected_senses = pd.merge(all_labels, df_selected_senses, on="id")

# Preprocess each definition once; a column-wise map avoids building a row Series per record.
df_selected_senses["nlp_definition"] = df_selected_senses["definition"].map(nlp_tools.preprocess)

df_selected_senses.head()

In [None]:
# Run the preprocessing pipeline over the full text of every quotation
# (each "text" entry is indexed by "full_text").
df_quotations["nlp_full_text"] = df_quotations["text"].map(
    lambda text: nlp_tools.preprocess(text["full_text"])
)
df_quotations.head()

In [None]:
approach = "random"


def _random_sense(_row):
    """Delegate to `wsd.random_predict` with the candidate senses; the row itself is unused."""
    return wsd.random_predict(df_selected_senses)


# One prediction per quotation, stored in a column named after the approach.
# NOTE(review): presumably stochastic and no seed is set here -- results may vary between runs.
df_quotations[approach] = df_quotations.progress_apply(_random_sense, axis=1)

wsd.eval(approach, df_quotations)

In [None]:
# Caution: the example sentence (row["text"]["full_text"]) is used as the input
# sentence, and its word overlap with the definition is then measured (see the
# function). If the examples should instead serve as training data, the data
# must first be split into train/test.

approach = "def_tok_overlap_ranking"

overlap_predictions = df_quotations.progress_apply(
    lambda quotation: wsd.tok_overlap_ranking(quotation["nlp_full_text"], df_selected_senses),
    axis=1,
)
df_quotations[approach] = overlap_predictions

wsd.eval(approach, df_quotations)

In [None]:
approach = "sent_embedding"


def _embed_and_predict(quotation):
    """Pass the preprocessed quotation text and the candidate senses to `wsd.sent_embedding`."""
    return wsd.sent_embedding(quotation["nlp_full_text"], df_selected_senses)


# One prediction per quotation, stored in a column named after the approach.
df_quotations[approach] = df_quotations.progress_apply(_embed_and_predict, axis=1)

wsd.eval(approach, df_quotations)

In [None]:
approach = "w2v_lesk_ranking"

# Warning: this uses a Word2vec model trained on the whole 19thC BL corpus, stored locally.
wemb_model = Word2Vec.load("models/w2v/w2v_v004/w2v_words.model")

# Use the frames built earlier in this notebook. The previous `machine_df` /
# `definition_df` names are never defined here, so the cell raised NameError
# on a fresh kernel.
# NOTE(review): assumes `wsd.w2v_lesk_ranking` accepts the same sense frame
# (`df_selected_senses`) as the other ranking functions -- confirm.
df_quotations[approach] = df_quotations.progress_apply(
    lambda row: wsd.w2v_lesk_ranking(row["nlp_full_text"], df_selected_senses, wemb_model),
    axis=1,
)

# Same call shape as the other approach cells: (column name, frame).
wsd.eval(approach, df_quotations)

In [None]:
approach = "bert_lesk_ranking"

# Download the model from (warning: this is a contemporary model):
# https://public.ukp.informatik.tu-darmstadt.de/reimers/sentence-transformers/v0.2/bert-base-nli-mean-tokens.zip
bert_sentsim_model = SentenceTransformer('models/bert/bert-base-nli-mean-tokens')

# Use the frames built earlier in this notebook. The previous `machine_df` /
# `definition_df` names are never defined here, so the cell raised NameError
# on a fresh kernel. The raw full text (not the preprocessed column) is passed,
# as in the original call.
# NOTE(review): assumes `wsd.bert_lesk_ranking` accepts the same sense frame
# (`df_selected_senses`) as the other ranking functions -- confirm.
df_quotations[approach] = df_quotations.progress_apply(
    lambda row: wsd.bert_lesk_ranking(row["text"]["full_text"], df_selected_senses, bert_sentsim_model),
    axis=1,
)

# Same call shape as the other approach cells: (column name, frame).
wsd.eval(approach, df_quotations)