# Run `eval_sense` function in parallel using multiprocessing

In [None]:
import pandas as pd
from tasks import wsd
from pathlib import Path
from utils import nlp_tools
from gensim.models import Word2Vec
from utils.classificaton_utils import binarize,generate_definition_df
from tqdm.notebook import tqdm

In [None]:
def eval_sense(lemma, pos, sense, start=1760, end=1920, train_on_dev=True):
    """Score the test quotations of one sense with several baseline WSD methods.

    This function reads the module-level names ``relations``, ``eval_mode``
    and ``wemb_model``; they must be defined before it is called.

    Args:
        lemma: Lemma passed to ``binarize`` and ``generate_definition_df``.
        pos: Part-of-speech tag passed to ``binarize``.
        sense: Sense id; handed to ``binarize`` as the one-element set
            ``{sense}``.
        start: Forwarded to ``binarize`` as ``start`` (presumably the first
            year of the time frame -- confirm against ``binarize``).
        end: Forwarded to ``binarize`` as ``end`` (presumably the last year
            of the time frame -- confirm against ``binarize``).
        train_on_dev: When true, the validation split is appended to the
            training split before the definitions are generated.

    Returns:
        ``None`` when ``binarize`` returns no training frame; otherwise the
        test DataFrame extended with the columns ``nlp_full_text``,
        ``random``, ``def_tok_overlap_ranking``, ``sent_embedding`` and
        ``w2v_lesk_ranking``.
    """
    df_train, df_val, df_test = binarize(
        lemma,
        pos,
        {sense},
        relations,
        strict_filter=True,
        start=start,
        end=end,
        eval_mode=eval_mode,
    )

    # No quotations for this sense and time frame.
    if df_train is None:
        return None

    if train_on_dev:
        df_train = pd.concat([df_train, df_val], axis=0)

    # Only the training and test frames are used below, so the validation
    # frame is not preprocessed on its own: with train_on_dev its rows are
    # already part of df_train, and without it df_val is never read again.
    for frame in (df_train, df_test):
        frame["nlp_full_text"] = frame.apply(
            lambda row: nlp_tools.preprocess(row["full_text"]), axis=1
        )

    # Random baseline.
    df_test["random"] = df_test.apply(lambda row: wsd.random_predict(), axis=1)

    # Retrieve and preprocess the sense definitions.
    df_selected_senses = generate_definition_df(df_train, lemma, eval_mode=eval_mode)
    df_selected_senses["nlp_definition"] = df_selected_senses.apply(
        lambda row: nlp_tools.preprocess(row["definition"]), axis=1
    )

    # Token overlap between the quotation and the definitions.
    df_test["def_tok_overlap_ranking"] = df_test.apply(
        lambda row: wsd.tok_overlap_ranking(row["nlp_full_text"], df_selected_senses),
        axis=1,
    )

    # spaCy sentence embeddings.
    df_test["sent_embedding"] = df_test.apply(
        lambda row: wsd.sent_embedding(row["nlp_full_text"], df_selected_senses),
        axis=1,
    )

    # Word2vec Lesk.
    # Warning: this uses a Word2vec model trained on the whole 19thC BL corpus
    # that is stored locally (the module-level ``wemb_model``).
    df_test["w2v_lesk_ranking"] = df_test.apply(
        lambda row: wsd.w2v_lesk_ranking(
            row["nlp_full_text"], df_selected_senses, wemb_model
        ),
        axis=1,
    )

    return df_test

In [None]:
# Define the serial unit of work that each parallel job will execute.
# It runs eval_sense for a single sense and writes the resulting predictions to a CSV file.

def serial_run(lemma, pos, sense, eval_mode, relations, start=1760, end=1920, train_on_dev=True):
    """Evaluate one sense with ``eval_sense`` and write its predictions to CSV.

    The output file is
    ``results/<lemma>_<pos>/<eval_mode>/<sense>~<sorted relations joined by '+'>.csv``.

    NOTE(review): ``eval_mode`` and ``relations`` are only used here to build
    the output path; ``eval_sense`` is called without them and reads its own
    module-level values, so the two must be kept in sync by the caller.

    Args:
        lemma: Lemma forwarded to ``eval_sense``; also part of the output path.
        pos: Part-of-speech tag forwarded to ``eval_sense``; also part of the
            output path.
        sense: Sense id forwarded to ``eval_sense``; also the prefix of the
            output file name.
        eval_mode: Name of the sub-directory the result is written to.
        relations: Iterable of relation names; sorted and joined with ``+``
            to form the output file name.
        start: Forwarded to ``eval_sense`` as ``start``.
        end: Forwarded to ``eval_sense`` as ``end``.
        train_on_dev: Forwarded to ``eval_sense`` as ``train_on_dev``.

    Returns:
        None. Nothing is written when ``eval_sense`` returns ``None``.
    """
    # Forward the caller's arguments instead of hard-coded defaults.
    df_test = eval_sense(lemma,
                         pos,
                         sense,
                         start=start,
                         end=end,
                         train_on_dev=train_on_dev)

    # eval_sense returns None when there is nothing to evaluate for this
    # sense; skip the job instead of failing on ``None.filter``.
    if df_test is None:
        return

    # pathlib is used because ``Path`` is already imported by this file.
    results_dir = Path('results') / f"{lemma}_{pos}" / eval_mode
    results_filename = sense + "~" + "+".join(sorted(relations)) + ".csv"
    results_dir.mkdir(parents=True, exist_ok=True)

    out_df = df_test.filter(['id_x','label','random','def_tok_overlap_ranking',
                             'sent_embedding', 'w2v_lesk_ranking'], axis=1)

    out_df.to_csv(results_dir / results_filename, index=False)

In [None]:
# Build list_jobs: one [function, argument-tuple] entry per sense of every word.

# (lemma, part-of-speech) pairs to evaluate; this is the outer loop that gets parallelised.
words = [["machine","NN"]]
#words = [["anger","NN"],["apple","NN"],["art","NN"],["democracy","NN"],["happiness","NN"],["labour","NN"],["machine","NN"],["man","NN"],["nation","NN"],["power","NN"],["slave","NN"],["technology","NN"],["woman","NN"]]

# Settings shared by all jobs (eval_sense reads relations and eval_mode as globals).
relations = ['seed','synonym'] # ,'descendant','sibling'
eval_mode = "lemma_etal" # lemma or lemma_etal

# Locally stored Word2vec model, read by eval_sense as a global.
wemb_model = Word2Vec.load("models/w2v/w2v_v004/w2v_words.model")

list_jobs = []
for lemma, pos in words:
    lemma_senses = pd.read_pickle(f'./data/lemma_senses_{lemma}_{pos}.pickle')
    # Computed for reference only; nothing in this cell reads it.
    quotations_path = f"./data/sfrel_quotations_{lemma}_{pos}.pickle"

    # Suffix appended to "<lemma>_<pos>" to form the word_id looked up below
    # (presumably an entry/homograph number -- meaning to be confirmed).
    idx = "01"
    word_id = f'{lemma}_{pos.lower()}{idx}'

    # Distinct sense ids recorded for that word_id.
    matching = lemma_senses[lemma_senses["word_id"] == word_id]
    senses = set(matching["id"])

    for sense in senses:
        job_args = (lemma, pos, sense, eval_mode, relations, 1760, 1920, True)
        list_jobs.append([serial_run, job_args])

In [None]:
# NOTE(review): `multiproc` is a project-local module that is not shown here;
# the comments in this cell only restate what is passed to it.
from multiproc import multiProc
# num_req_p is the number of requested processes to be run in parallel
myprocs = multiProc(num_req_p=10)

In [None]:
# Hand every [function, args] entry collected in list_jobs to the runner,
# then print the runner object so it can be inspected before starting.
myprocs.add_list_jobs(list_jobs)
print(myprocs)

In [None]:
# Start the queued jobs (presumably spread over up to num_req_p processes --
# confirm against the multiproc module).
myprocs.run_jobs()