# Run `eval_sense` in parallel

**Install `parhugin`** by:

```bash
pip install git+https://github.com/kasra-hosseini/parhugin.git
```

or follow the instructions [here](https://github.com/kasra-hosseini/parhugin#installation).

In [1]:
%load_ext autoreload
%autoreload 2
%matplotlib inline

In [2]:
import os
import pandas as pd
import numpy as np
import json
from tasks import wsd
from pathlib import Path
from utils import nlp_tools
from parhugin import multiFunc
from gensim.models import Word2Vec
from sentence_transformers import SentenceTransformer
from utils.dataset_download import harvest_data_from_extended_senses
from utils.classificaton_utils import binarize,generate_definition_df, vectorize_target_expressions
from tqdm.notebook import tqdm
tqdm.pandas()

# Prepare data

In [3]:


# import API credentials
with open('oed_experiments/oed_credentials.json') as f:
    auth = json.load(f)


lemma_pos = [["anger","NN"],["apple","NN"],["art","NN"],["democracy","NN"],["happiness","NN"],["labour","NN"],["machine","NN"],["man","NN"],["nation","NN"],["power","NN"],["slave","NN"],["technology","NN"],["woman","NN"]]


embedding_methods = {'bert_base': {"path":'bert-base-uncased',
                                   'layers':'-1,-2,-3,-4',
                                   'pooling_operation':'mean'}}

In [4]:


for lemma, pos in lemma_pos:
    # [WARNING] if you run code for the first time uncomment line below, comment again after running this cell
    #quotations = harvest_data_from_extended_senses(auth,f"{lemma}_{pos}")
    quotations_path = f"./data/sfrel_quotations_{lemma}_{pos}.pickle"
    vectorize_target_expressions(quotations_path,embedding_methods)

Dataframe alread contains vectors from bert_base settings
{'path': 'bert-base-uncased', 'layers': '-1,-2,-3,-4', 'pooling_operation': 'mean'}
Dataframe alread contains vectors from bert_base settings
{'path': 'bert-base-uncased', 'layers': '-1,-2,-3,-4', 'pooling_operation': 'mean'}


# Evaluate

In [30]:
def eval_sense(lemma,
                pos,
                senses,
                start=1760,
                end=1920,
                train_on_dev=False,
                vector_col='vector_bert_base_-1,-2,-3,-4_mean'):

    df_train, df_val, df_test = binarize(lemma,
                pos,
                senses, 
                relations,
                strict_filter=True,
                start=start,
                end=end,
                eval_mode=eval_mode)

    # no quotations for sense and timeframe
    if df_train is None:
        return None
    
    if train_on_dev:
        df_train = pd.concat([df_train, df_val], axis=0)
        df_train.reset_index(inplace=True,drop=True)

    df_train["nlp_full_text"] = df_train.apply (lambda row: nlp_tools.preprocess(row["full_text"]), axis=1)
    df_val["nlp_full_text"] = df_val.apply (lambda row: nlp_tools.preprocess(row["full_text"]), axis=1)
    df_test["nlp_full_text"] = df_test.apply (lambda row: nlp_tools.preprocess(row["full_text"]), axis=1)

    # random
    df_test["random"] = df_test.apply (lambda row: wsd.random_predict(), axis=1)

    # retrieve and process definitions            
    df_selected_senses = generate_definition_df(df_train,lemma,eval_mode=eval_mode)
    df_selected_senses["nlp_definition"] = df_selected_senses.apply (lambda row: nlp_tools.preprocess(row["definition"]), axis=1)

    # token overlap
    df_test["def_tok_overlap_ranking"] = df_test.apply (lambda row: wsd.tok_overlap_ranking(row["nlp_full_text"], df_selected_senses), axis=1)

    # spacy sentence embeddings
    df_test["sent_embedding"] = df_test.apply (lambda row: wsd.sent_embedding(row["nlp_full_text"], df_selected_senses), axis=1)

    #w2v lesk
    # Warning: I use a Word2vec model trained on all 19thC BL corpus that is locally stored.
    
    df_test["w2v_lesk_ranking"] = df_test.apply (lambda row: wsd.w2v_lesk_ranking(row["nlp_full_text"], df_selected_senses, wemb_model), axis=1)

    #Bert lesk
    
    #df_test["bert_lesk_ranking"] = df_test.apply (lambda row: wsd.bert_lesk_ranking(row["text"]["full_text"], df_selected_senses, bert_sentsim_model), axis=1)


    # supervised baselined (w-emb SVM) - careful this is a 19thC BL model
    df_test["svm_wemb_baseline"] = wsd.svm_wemb_baseline(df_train,df_test,wemb_model)

    
    df_test[f"bert_binary_centroid_{vector_col}"] = df_test.apply(wsd.bert_binary_centroid_vector, 
                                        df_train = df_train, 
                                        vector_col=vector_col,
                                        return_ranking=False, axis=1)

    senseid2label = dict(df_test[['sense_id','label']].values)
    df_test[f"bert_centroid_sense_{vector_col}"] = df_test.apply(wsd.bert_sense_centroid_vector,  
                                                    senseid2label= senseid2label,
                                                    vector_col=vector_col,
                                                    df_train = df_train, axis=1)

    centroid_vectors = df_train.groupby('label')[vector_col].apply(np.mean,axis=0)
    sem_axis = centroid_vectors[1] - centroid_vectors[0] 
    df_test[f"bert_contrast_{vector_col}"] = df_test[vector_col].apply(wsd.bert_semaxis_vector,
                                                    sem_axis=sem_axis,
                                                    threshold=.0)

    df_test[f"bert_ts_binary_centroid_{vector_col}"] = df_test.apply(wsd.bert_ts_binary_centroid_vector, 
                                                        df_train=df_train, 
                                                        ts_method='nearest',
                                                        vector_col=vector_col,
                                                        axis=1)

    senseid2label = dict(df_test[['sense_id','label']].values)
    df_test[f"bert_ts_centroid_sense_{vector_col}"] = df_test.apply(wsd.bert_ts_sense_centroid_vector,  
                        senseid2label= senseid2label,
                        ts_method='nearest',
                        vector_col=vector_col,
                        df_train = df_train, axis=1)

    return df_test

In [31]:
# Here, we first define the serial version of the run
# It combines both eval_sense and other parts to output the results

def serial_run(lemma, pos, senses, eval_mode, relations, start=1760, end=1920, train_on_dev=True, vector_col='vector_bert_base_-1,-2,-3,-4_mean'):
    df_test = eval_sense(lemma,
                         pos,
                         senses,
                         start=1760,
                         end=1920,
                         train_on_dev=True)

    results_path = os.path.join('results', f"{lemma}_{pos}", eval_mode)
    results_filename = '_'.join(senses) + "~" + "+".join(sorted(relations)) + ".csv"
    Path(results_path).mkdir(parents=True, exist_ok=True)

    # IF df_test is None, create an empty DataFrame
    if not isinstance(df_test, type(None)):
        out_df = df_test.filter(['id_x','label','random','def_tok_overlap_ranking', 
                                 'sent_embedding', 'w2v_lesk_ranking',#'bert_lesk_ranking',
                                 'svm_wemb_baseline',f"bert_binary_centroid_{vector_col}",
                                 f"bert_centroid_sense_{vector_col}",f"bert_contrast_{vector_col}",
                                 f"bert_ts_binary_centroid_{vector_col}",f"bert_ts_centroid_sense_{vector_col}"], axis=1)
    else:
        out_df = pd.DataFrame()

    out_df.to_csv(os.path.join(results_path, results_filename), index=False)  

In [38]:
# Collect a list of all runs, see list_jobs

words = [['machine','NN'],["democracy","NN"]]
#words = [["anger","NN"],["apple","NN"],["art","NN"],["democracy","NN"],["happiness","NN"],["labour","NN"],["machine","NN"],["man","NN"],["nation","NN"],["power","NN"],["slave","NN"],["technology","NN"],["woman","NN"]]

relations = ['seed','synonym'] # ,'descendant','sibling'
eval_mode = "lemma_etal" # lemma or lemma_etal

wemb_model = Word2Vec.load("models/w2v_004/w2v_words.model")

# Download model from (warning: this is a contemporary model):
# https://public.ukp.informatik.tu-darmstadt.de/reimers/sentence-transformers/v0.2/bert-base-nli-mean-tokens.zip

bert_sentsim_model = SentenceTransformer('models/bert/bert-base-nli-mean-tokens')


In [39]:

list_jobs = []
for lemma, pos in words:
    quotations_path = f"./data/sfrel_quotations_{lemma}_{pos}.pickle"
    lemma_senses = pd.read_pickle(f'./data/lemma_senses_{lemma}_{pos}.pickle')
    
    # this is the index of the lemma id <-- we could remove this later
    idx = "01"
    
    senses = set(lemma_senses[lemma_senses.word_id==f'{lemma}_{pos.lower()}{idx}'].id)
    
    for sense in list(senses):
        list_jobs.append([serial_run, (lemma, pos, {sense}, eval_mode, relations, 1760, 1920, True)])

In [40]:
list_jobs[:2]

[[<function __main__.serial_run(lemma, pos, senses, eval_mode, relations, start=1760, end=1920, train_on_dev=True, vector_col='vector_bert_base_-1,-2,-3,-4_mean')>,
  ('machine',
   'NN',
   {'machine_nn01-38476397'},
   'lemma_etal',
   ['seed', 'synonym'],
   1760,
   1920,
   True)],
 [<function __main__.serial_run(lemma, pos, senses, eval_mode, relations, start=1760, end=1920, train_on_dev=True, vector_col='vector_bert_base_-1,-2,-3,-4_mean')>,
  ('machine',
   'NN',
   {'machine_nn01-38475099'},
   'lemma_etal',
   ['seed', 'synonym'],
   1760,
   1920,
   True)]]

---

At this point, `list_jobs` contains a list of jobs to be run in parallel, e.g.: 

```python
[   
    [serial_run, (lemma1, pos1, ...)],
    [serial_run, (lemma2, pos2, ...)],  
    [serial_run, (...)],
    ...
] 
``` 

In [41]:
# num_req_p: number of processes to be run in parallel
myprocs = multiFunc(num_req_p=8)

[INFO] #requested processes: 8


In [42]:
# Add the list of jobs
myprocs.add_list_jobs(list_jobs)
print(myprocs)

#requested processed: 8
#jobs: 33


In [37]:
myprocs.run_jobs(verbosity=2)

[INFO] start job-0
[92m2021-01-20 12:57:37[0m [95mlwm-embeddings[0m [1m[90m[INFO][0m [92m#finished jobs: 0[0m
[92m2021-01-20 12:57:37[0m [95mlwm-embeddings[0m [1m[90m[INFO][0m [92m#running jobs: 1[0m
[92m2021-01-20 12:57:37[0m [95mlwm-embeddings[0m [1m[90m[INFO][0m [92m#remained jobs: 6[0m
[INFO] start job-1
# senses before filtering by date = 94
# senses after filtering by date = 71
[92m2021-01-20 12:57:37[0m [95mlwm-embeddings[0m [1m[90m[INFO][0m [92m#finished jobs: 0[0m
[92m2021-01-20 12:57:37[0m [95mlwm-embeddings[0m [1m[90m[INFO][0m [92m#running jobs: 2[0m
[92m2021-01-20 12:57:37[0m [95mlwm-embeddings[0m [1m[90m[INFO][0m [92m#remained jobs: 5[0m


# of seed senses 7 
# of synonyms 59 
# of branch senses 0


# of seeds selected 1 
# of synonyms selected 0 
# of branches selected 0
[INFO] start job-2
[LOG] #rows before removing None vector (24, 20)
[LOG] #rows after removing None vector (24, 20)
# senses before filtering by date 