# Run `eval_sense` function in parallel using multiprocessing

In [1]:
import pandas as pd
from tasks import wsd
import os
from pathlib import Path
from utils import nlp_tools
from gensim.models import Word2Vec
from utils.classificaton_utils import binarize,generate_definition_df
from tqdm.notebook import tqdm

In [2]:
def eval_sense(lemma,pos,sense,start=1760,end=1920,train_on_dev=True):

    # this is the second for loop to parallelize
    df_train, df_val, df_test = binarize(lemma,
                pos,
                {sense}, 
                relations,
                strict_filter=True,
                start=start,
                end=end,
                eval_mode=eval_mode)

    # no quotations for sense and timeframe
    if df_train is None:
        return None
    
    if train_on_dev:
        df_train = pd.concat([df_train, df_val], axis=0)

    df_train["nlp_full_text"] = df_train.apply (lambda row: nlp_tools.preprocess(row["full_text"]), axis=1)
    df_val["nlp_full_text"] = df_val.apply (lambda row: nlp_tools.preprocess(row["full_text"]), axis=1)
    df_test["nlp_full_text"] = df_test.apply (lambda row: nlp_tools.preprocess(row["full_text"]), axis=1)

    # random 
    df_test["random"] = df_test.apply (lambda row: wsd.random_predict(), axis=1)

    # retrieve and process definitions            
    df_selected_senses = generate_definition_df(df_train,lemma,eval_mode=eval_mode)
    df_selected_senses["nlp_definition"] = df_selected_senses.apply (lambda row: nlp_tools.preprocess(row["definition"]), axis=1)

    # token overlap
    df_test["def_tok_overlap_ranking"] = df_test.apply (lambda row: wsd.tok_overlap_ranking(row["nlp_full_text"], df_selected_senses), axis=1)

    # spacy sentence embeddings
    df_test["sent_embedding"] = df_test.apply (lambda row: wsd.sent_embedding(row["nlp_full_text"], df_selected_senses), axis=1)

    #w2v lesk
    # Warning: I use a Word2vec model trained on all 19thC BL corpus that is locally stored.
    df_test["w2v_lesk_ranking"] = df_test.apply (lambda row: wsd.w2v_lesk_ranking(row["nlp_full_text"], df_selected_senses, wemb_model), axis=1)

    return df_test

In [3]:
# Here, we first define the serial version of the run
# It combines both eval_sense and other parts to output the results

def serial_run(lemma, pos, sense, eval_mode, relations, start=1760, end=1920, train_on_dev=True):
    df_test = eval_sense(lemma,
                         pos,
                         sense,
                         start=1760,
                         end=1920,
                         train_on_dev=True)

    results_path = os.path.join('results', f"{lemma}_{pos}", eval_mode)
    results_filename = sense + "~" + "+".join(sorted(relations)) + ".csv"
    Path(results_path).mkdir(parents=True, exist_ok=True)

    out_df = df_test.filter(['id_x','label','random','def_tok_overlap_ranking', 
                             'sent_embedding', 'w2v_lesk_ranking'], axis=1)

    out_df.to_csv(os.path.join(results_path, results_filename), index=False)  

In [4]:
# Collect a list of all runs, see list_jobs

# this is the first for loop to parallelise
words = [["machine","NN"]]
#words = [["anger","NN"],["apple","NN"],["art","NN"],["democracy","NN"],["happiness","NN"],["labour","NN"],["machine","NN"],["man","NN"],["nation","NN"],["power","NN"],["slave","NN"],["technology","NN"],["woman","NN"]]

relations = ['seed','synonym'] # ,'descendant','sibling'
eval_mode = "lemma_etal" # lemma or lemma_etal

wemb_model = Word2Vec.load("models/w2v/w2v_v004/w2v_words.model")

list_jobs = []
for lemma, pos in words:
    quotations_path = f"./data/sfrel_quotations_{lemma}_{pos}.pickle"
    lemma_senses = pd.read_pickle(f'./data/lemma_senses_{lemma}_{pos}.pickle')
    
    # not sure what is this thing
    idx = "01"
    
    senses = set(lemma_senses[lemma_senses.word_id==f'{lemma}_{pos.lower()}{idx}'].id)
    
    for sense in senses:
        list_jobs.append([serial_run, (lemma, pos, sense, eval_mode, relations, 1760, 1920, True)])

In [5]:
from multiproc import multiProc
# num_req_p is the number of requested processes to be run in parallel
myprocs = multiProc(num_req_p=20)

Number of requesterd processes: 20


In [6]:
myprocs.add_list_jobs(list_jobs)
print(myprocs)

#requested processed: 20
#jobs: 26


In [7]:
myprocs.run_jobs()

--- START job number 0
--- START job number 1
Index(['sense_id', 'lemma_definition', 'definition', 'word_id', 'lemma',
       'quotation_id', 'source', 'text', 'year'],
      dtype='object')
# senses before filtering by date = 517
# senses after filtering by date = 352


# of seed senses 22 
# of synonyms 310 
# of branch senses 0
--- START job number 2


# of seeds selected 1 
# of synonyms selected 0 
# of branches selected 0
Index(['sense_id', 'lemma_definition', 'definition', 'word_id', 'lemma',
       'quotation_id', 'source', 'text', 'year'],
      dtype='object')
# senses before filtering by date = 517
# senses after filtering by date = 352


# of seed senses 22 
# of synonyms 310 
# of branch senses 0


# of seeds selected 0 
# of synonyms selected 14 
# of branches selected 0
--- START job number 3
Index(['sense_id', 'lemma_definition', 'definition', 'word_id', 'lemma',
       'quotation_id', 'source', 'text', 'year'],
      dtype='object')
# senses before filtering by date = 

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_selected_senses['definition'] = df_selected_senses.apply(merge_definitions, axis=1)


Using lemma_etal as evaluation mode.


# of seeds selected 1 
# of synonyms selected 0 
# of branches selected 0


# of seeds selected 1 
# of synonyms selected 15 
# of branches selected 0


# of seeds selected 1 
# of synonyms selected 3 
# of branches selected 0


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_selected_senses['definition'] = df_selected_senses.apply(merge_definitions, axis=1)


Using lemma_etal as evaluation mode.


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_selected_senses['definition'] = df_selected_senses.apply(merge_definitions, axis=1)


Using lemma_etal as evaluation mode.


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_selected_senses['definition'] = df_selected_senses.apply(merge_definitions, axis=1)


Using lemma_etal as evaluation mode.


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_selected_senses['definition'] = df_selected_senses.apply(merge_definitions, axis=1)


Using lemma_etal as evaluation mode.


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_selected_senses['definition'] = df_selected_senses.apply(merge_definitions, axis=1)


Using lemma_etal as evaluation mode.


  definition_df["sent_embedding"] = definition_df.apply (lambda row: sent.similarity(row["nlp_definition"]), axis=1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_selected_senses['definition'] = df_selected_senses.apply(merge_definitions, axis=1)


Using lemma_etal as evaluation mode.


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_selected_senses['definition'] = df_selected_senses.apply(merge_definitions, axis=1)


Using lemma_etal as evaluation mode.


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_selected_senses['definition'] = df_selected_senses.apply(merge_definitions, axis=1)


Using lemma_etal as evaluation mode.


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_selected_senses['definition'] = df_selected_senses.apply(merge_definitions, axis=1)


Using lemma_etal as evaluation mode.


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_selected_senses['definition'] = df_selected_senses.apply(merge_definitions, axis=1)


Using lemma_etal as evaluation mode.


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_selected_senses['definition'] = df_selected_senses.apply(merge_definitions, axis=1)


Using lemma_etal as evaluation mode.


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_selected_senses['definition'] = df_selected_senses.apply(merge_definitions, axis=1)


Using lemma_etal as evaluation mode.


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_selected_senses['definition'] = df_selected_senses.apply(merge_definitions, axis=1)


Using lemma_etal as evaluation mode.


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_selected_senses['definition'] = df_selected_senses.apply(merge_definitions, axis=1)


Using lemma_etal as evaluation mode.


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_selected_senses['definition'] = df_selected_senses.apply(merge_definitions, axis=1)


Using lemma_etal as evaluation mode.


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_selected_senses['definition'] = df_selected_senses.apply(merge_definitions, axis=1)


Using lemma_etal as evaluation mode.


  definition_df["sent_embedding"] = definition_df.apply (lambda row: sent.similarity(row["nlp_definition"]), axis=1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_selected_senses['definition'] = df_selected_senses.apply(merge_definitions, axis=1)


Using lemma_etal as evaluation mode.
--- START job number 20
Index(['sense_id', 'lemma_definition', 'definition', 'word_id', 'lemma',
       'quotation_id', 'source', 'text', 'year'],
      dtype='object')
# senses before filtering by date = 517
# senses after filtering by date = 352


# of seed senses 22 
# of synonyms 310 
# of branch senses 0


# of seeds selected 0 
# of synonyms selected 29 
# of branches selected 0
--- START job number 21
Index(['sense_id', 'lemma_definition', 'definition', 'word_id', 'lemma',
       'quotation_id', 'source', 'text', 'year'],
      dtype='object')
# senses before filtering by date = 517
# senses after filtering by date = 352


# of seed senses 22 
# of synonyms 310 
# of branch senses 0


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_selected_senses['definition'] = df_selected_senses.apply(merge_definitions, axis=1)


Using lemma_etal as evaluation mode.


# of seeds selected 1 
# of synonyms selected 1 
# of branches selected 0
--- START job number 22
Index(['sense_id', 'lemma_definition', 'definition', 'word_id', 'lemma',
       'quotation_id', 'source', 'text', 'year'],
      dtype='object')
# senses before filtering by date = 517
# senses after filtering by date = 352


# of seed senses 22 
# of synonyms 310 
# of branch senses 0


# of seeds selected 1 
# of synonyms selected 4 
# of branches selected 0
--- START job number 23
Index(['sense_id', 'lemma_definition', 'definition', 'word_id', 'lemma',
       'quotation_id', 'source', 'text', 'year'],
      dtype='object')
# senses before filtering by date = 517
# senses after filtering by date = 352


# of seed senses 22 
# of synonyms 310 
# of branch senses 0


# of seeds selected 0 
# of synonyms selected 0 
# of branches selected 0
--- START job number 24

There are not quotations available, given this sense-id and time-frame.


Process Process-24:
Traceback (most recent call last):
  File "/Users/khosseini/anaconda3_deezy/lib/python3.7/multiprocessing/process.py", line 297, in _bootstrap
    self.run()
  File "/Users/khosseini/anaconda3_deezy/lib/python3.7/multiprocessing/process.py", line 99, in run
    self._target(*self._args, **self._kwargs)
  File "<ipython-input-3-d3a5057c9e97>", line 16, in serial_run
    out_df = df_test.filter(['id_x','label','random','def_tok_overlap_ranking',
AttributeError: 'NoneType' object has no attribute 'filter'


--- START job number 25
Index(['sense_id', 'lemma_definition', 'definition', 'word_id', 'lemma',
       'quotation_id', 'source', 'text', 'year'],
      dtype='object')
# senses before filtering by date = 517
# senses after filtering by date = 352


# of seed senses 22 
# of synonyms 310 
# of branch senses 0


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_selected_senses['definition'] = df_selected_senses.apply(merge_definitions, axis=1)


Using lemma_etal as evaluation mode.


# of seeds selected 1 
# of synonyms selected 33 
# of branches selected 0
Index(['sense_id', 'lemma_definition', 'definition', 'word_id', 'lemma',
       'quotation_id', 'source', 'text', 'year'],
      dtype='object')
# senses before filtering by date = 517
# senses after filtering by date = 352


# of seed senses 22 
# of synonyms 310 
# of branch senses 0


# of seeds selected 0 
# of synonyms selected 8 
# of branches selected 0


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_selected_senses['definition'] = df_selected_senses.apply(merge_definitions, axis=1)


Using lemma_etal as evaluation mode.


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_selected_senses['definition'] = df_selected_senses.apply(merge_definitions, axis=1)


Using lemma_etal as evaluation mode.


  definition_df["sent_embedding"] = definition_df.apply (lambda row: sent.similarity(row["nlp_definition"]), axis=1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_selected_senses['definition'] = df_selected_senses.apply(merge_definitions, axis=1)


Using lemma_etal as evaluation mode.


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_selected_senses['definition'] = df_selected_senses.apply(merge_definitions, axis=1)


Using lemma_etal as evaluation mode.


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_selected_senses['definition'] = df_selected_senses.apply(merge_definitions, axis=1)


Using lemma_etal as evaluation mode.


  definition_df["sent_embedding"] = definition_df.apply (lambda row: sent.similarity(row["nlp_definition"]), axis=1)
  definition_df["sent_embedding"] = definition_df.apply (lambda row: sent.similarity(row["nlp_definition"]), axis=1)




All jobs finished.
Total Time: 861.0047121047974
