# Review notebook for Harvesting Senses with Provenance Information

Functions reviewed in this Notebook:

- `get_provenance_by_semantic_class` (helper)
- `extend_from_lemma` (main)


Part of:
- `utils.dataset_download`


Creator: Kaspar Beelen

Reviewer(s):


These functions assume:
    - a pickled dataframe with information harvested from the OED word endpoint for a given lemma id

What these functions should do:
    - for a given lemma id (e.g. `machine_nn01` saved in pickled data)
    - get all senses
    - for each of the senses get synonyms
    - for each of the senses + synonyms, get all branches (siblings and descendants)
    - keep track of the relation between the initial lemma and each harvested sense (this is saved in the `provenance` and `provenance_type` columns)
    - for more documentation please refer to the code and this notebook

In [2]:
!git branch

  1-dataframe[m
  19-machine-tagger[m
  3-group-senses[m
* [32m4-semantic-provenance[m
  dev[m
  master[m
  oed-experiments[m


In [3]:
%load_ext autoreload

In [4]:
%autoreload 2

In [5]:
from utils.dataset_download import *
import pickle
import json
from pathlib import Path, PosixPath
import pandas as pd

# Load credentials, set paths and arguments

In [6]:
# read the OED API credentials (stored as JSON) into `auth`
credentials_path = Path('oed_experiments/oed_credentials.json')
auth = json.loads(credentials_path.read_text())

In [7]:
# define lemma
lemma_id = "machine_nn01"

In [8]:
dp = "../data"

In [9]:
# create the data directory if it does not exist yet
# NOTE(review): `dp` points to "../data", but the cells below read and write
# their pickles under "./data/" -- confirm which location is intended
save_path = Path(dp)
save_path.mkdir(exist_ok=True)

In [10]:
# period of interest (years); passed to the API queries below
# as the flag current_in='{start}-{end}'
# (the target lemma is already set once in the "define lemma" cell above,
#  it is not repeated here so that there is a single place to change it)
start, end = 1750, 1950

## Run function

In [None]:
extended_df = extend_from_lemma(auth,lemma_id,start,end)

In [None]:
extended_df.head(3)

In [None]:
extended_df.shape

# Inspect functions

- `get_provenance_by_semantic_class`

In [None]:
def get_provenance_by_semantic_class(row: pd.Series) -> list:
    """
    Decide on the relation between a sense and the target query.

    A sense can belong to several semantic classes; each entry of
    `row.semantic_class_ids` is one list of semantic class ids. The last
    (lowest) id of every such list is compared with `row.provenance_pivot`,
    the semantic class id via which the sense was retrieved:

    - last id == provenance_pivot: the sense is a "sibling" of the pivot
    - provenance_pivot occurs in the list, but not as last id: the sense
      is a "descendant" of the pivot
    - otherwise the list is ignored

    Empty lists of semantic class ids have no lowest id and are skipped.

    Argument:
        row (pd.Series): row of dataframe obtained from branchsenses endpoint;
            the fields `semantic_class_ids`, `provenance_pivot` and `id` are read

    Returns:
        nested list in the format
        [[lowest semantic class id, relation, provenance semantic class id], ...]
        with one entry per list of semantic class ids that stands in the
        relation "sibling" or "descendant" to the provenance semantic class id.
        The result is empty if no relation was found (a warning is printed).
    """
    pivot = row.provenance_pivot
    provenance = []

    # one sense can belong to multiple semantic class ids
    for sc_ids in row.semantic_class_ids:
        # guard: an empty list has no lowest id, sc_ids[-1] would raise
        if len(sc_ids) == 0:
            continue

        lowest_id = sc_ids[-1]

        if lowest_id == pivot:
            # scenario 1: the lowest id is the pivot itself
            relation = 'sibling'
        elif pivot in sc_ids:
            # scenario 2: the pivot is higher up in the same list
            relation = 'descendant'
        else:
            # exclude other relations
            continue

        provenance.append([lowest_id, relation, pivot])

    # double check, each sense SHOULD have a provenance
    # if not this will print a warning message
    if not provenance:
        print(f'Warning: No descendants or siblings found for {row.id}')

    return provenance


Inspect function `extend_from_lemma`

Below we put the function in separate cells, to facilitate scrutinizing individual steps.

In [None]:
senses_df = pd.read_pickle(f"./data/senses_{lemma_id}.pickle")
senses_df.head()

In [None]:
def get_last_id(nested_list):
    """Helper: return the last element of each inner list of `nested_list`."""
    return [inner[-1] for inner in nested_list]


# load seed query dataframe or download from api
lemma_path = f"./data/senses_{lemma_id}.pickle"
if Path(lemma_path).is_file():
    print(f'Loading senses for {lemma_id} from pickle.')
    query_df = pd.read_pickle(lemma_path)
else:
    print(f'Dowloading senses for {lemma_id} from OED API.')
    sense_json = query_oed(auth,'word',lemma_id,flags='include_senses=true&include_quotations=true')
    # convert the json in a dataframe
    query_df = convert_json_to_dataframe(sense_json)
    # make sure the target folder exists, then save the dataframe as pickle
    # (same path that is checked above, so the next run loads from disk)
    Path(lemma_path).parent.mkdir(parents=True, exist_ok=True)
    query_df.to_pickle(lemma_path)

# use the sense endpoint to ensure all information
# can be properly concatenated in one dataframe

# retrieve all sense ids
query_sense_ids = query_df.id.unique()

In [None]:
# query the sense endpoint once for every seed sense id
print(f"Get all sense for the lemma {lemma_id}")

# probably "current_in" is not needed here, see API
seed_flags = f"current_in='{start}-{end}'&limit=1000"

seeds = []
for seed_sense_id in tqdm(query_sense_ids):
    # set verbose to True to see the url request
    seed_response = query_oed(auth, 'sense', seed_sense_id,
                              flags=seed_flags,
                              verbose=False)
    seeds.append((seed_sense_id, seed_response))

In [None]:
# convert to dataframe: one row per seed sense, built from the 'data'
# part of each response
seed_records = [seed_response['data'] for seed_id, seed_response in seeds]
seeds_df = pd.DataFrame(seed_records)

# seeds_df contains all the senses of the word stored in lemma_id
# two columns describe where a sense comes from:
#   provenance      refers to a specific word, sense or semantic class id
#   provenance_type distinguishes between the different types of extension
# the senses in this dataframe are the "seed" senses

# for a seed sense the provenance points to the id of the lemma itself
# a (nested) list is used here, the reason is explained later,
# see provenance of synonyms
seeds_df['provenance'] = [[[sense_id, 'seed', lemma_id]] for sense_id in seeds_df['id']]

# categorize these lemmas as seed
seeds_df['provenance_type'] = 'seed'

In [None]:
# get all synonyms for the seed senses
# reminder synonyms uses same function as the /senses/ endpoint, flags should work here
print(f"Get all synonyms of the senses listed in {lemma_id}")
synonyms = [(s,query_oed(auth,'sense',s,
                level='synonyms',
                flags=f"current_in='{start}-{end}'&limit=1000"))
                        for s in tqdm(query_sense_ids)]

In [None]:
# pair every synonym with the seed sense id via which it was retrieved
synonym_pairs = [
    (seed_sense_id, synonym)
    for seed_sense_id, response in synonyms
    for synonym in response['data']
]

# transform list of synonyms to a dataframe
synonyms_df = pd.DataFrame([synonym for seed_sense_id, synonym in synonym_pairs])

# for synonyms the provenance_type is set to "synonym"
synonyms_df['provenance_type'] = 'synonym'

# for synonyms the provenance refers to the sense id via which
# the synonym was retrieved
synonyms_df['provenance'] = [
    [[synonym['id'], 'synonym', seed_sense_id]]
    for seed_sense_id, synonym in synonym_pairs
]

In [None]:
# seed senses + synonyms constitute the nucleus of our query
# both are collected in core_df
# shape should be 485 (synonyms senses) + 26 (seed senses)
core_frames = [seeds_df, synonyms_df]
core_df = pd.concat(core_frames, sort=True)

# branch out from there
# for every list of semantic class ids of a sense we keep the last
# (lowest level) id in the column semantic_class_last_id
last_ids = core_df['semantic_class_ids'].apply(get_last_id)
core_df['semantic_class_last_id'] = last_ids

In [None]:
# collect the distinct _lowest_ (or last) semantic class ids
# of all core senses gathered so far
semantic_class_ids = {
    class_id
    for last_ids_of_sense in core_df.semantic_class_last_id.to_list()
    for class_id in last_ids_of_sense
}

In [None]:
# now, for each lowest semantic class id, we query the semanticclass
# endpoint at the "branchsenses" level
# according to the API documentation this returns an array of senses
# that belong to the semantic class specified by ID, plus senses
# that belong to its child and descendant classes.
print("Get all branches for seed senses and synonyms")

branch_flags = f"current_in='{start}-{end}'&limit=1000"

branches = []
for class_id in tqdm(semantic_class_ids):
    branch_response = query_oed(auth, 'semanticclass', class_id,
                                level='branchsenses',
                                flags=branch_flags)
    branches.append((class_id, branch_response))

In [None]:
# pair every retrieved sense with the semantic class id that was queried
branch_pairs = [
    (class_id, sense)
    for class_id, response in branches
    for sense in response['data']
]

# convert API response to dataframe
branches_df = pd.DataFrame([sense for class_id, sense in branch_pairs])

# ISSUE: again we have duplicate senses here, as some appear
# multiple times in the same semantic class (or as descendant)

# all senses retrieved this way get the provenance_type "branch"
branches_df['provenance_type'] = 'branch'

# the provenance_pivot column shows the semantic class id
# via which the sense was retrieved
branches_df['provenance_pivot'] = [class_id for class_id, sense in branch_pairs]

In [None]:
# now there are two scenarios to specify for the provenance
# both scenarios can apply to one sense
# if last semantic class id (sc_ids[-1]) == provenance id: then sense is sibling of provenance id
# elif provenance semantic class id in the list of semantic class last ids
# (but provenance not the last one): then sense is descendant of provenance id
branch_provenance = branches_df.apply(get_provenance_by_semantic_class, axis=1)

# add the provenance and drop the provenance_pivot column
# this is done on a new dataframe, branches_df itself is left untouched,
# so this cell can be run again without failing on a missing
# provenance_pivot column
branches_prov_df = (
    branches_df
    .assign(provenance=branch_provenance)
    .drop(columns='provenance_pivot')
)

# concatenate core and branch senses
# ISSUE: have a closer look at the warning message
extended_df = pd.concat([core_df, branches_prov_df], sort=True)

# check if rows match
assert extended_df.shape[0] == core_df.shape[0] + branches_prov_df.shape[0]

# save dataframe as pickle
extended_df.to_pickle(f"./data/extended_{lemma_id}.pickle")

## Fin.

In [11]:
# TODO: wrap the exploratory steps below into `add_unrelated_senses(path)`.
# The function header is kept as a comment for now: a `def` line followed by
# an unindented statement raises an IndentationError, so the cell cannot run.
# def add_unrelated_senses(path):

# load the extended dataframe that was pickled for this lemma
extended_df = pd.read_pickle(f"./data/extended_{lemma_id}.pickle")

In [17]:
# distinct word ids of all harvested senses (missing values are left out)
# sorted, so that positional lookups such as word_id[100] return the same
# word on every run: the iteration order of a set of strings is not stable
# across interpreter sessions
word_id = sorted(set(extended_df.word_id.dropna()))

In [20]:
word_id[100]

'carry_nn01'

In [21]:
# fetch the quotations of one word taken from the extended dataframe
# (the flags f"current_in='{start}-{end}'&limit=1000" are disabled for this request)
quotation_word = word_id[100]
res = query_oed(auth, 'word', quotation_word, level='quotations')

In [23]:
pd.DataFrame(res['data'])

Unnamed: 0,id,text,year,lemma,source,oed_url,word_id,sense_id,datestring,first_in_word,oed_reference,first_in_sense
0,carry_nn01-9991081,"{'keyword': 'carry', 'full_text': 'On the last...",1605,carry,"{'title': 'Annales', 'author': 'J. Stow', 'gen...",https://www.oed.com/view/Entry/28251#eid9991081,carry_nn01,carry_nn01-9991075,1605,True,"carry, n., sense 1b",True
1,carry_nn01-9991130,"{'keyword': 'carry', 'full_text': 'Shee is a b...",1618,carry,"{'title': 'New & 2nd Bk. Falconrie', 'author':...",https://www.oed.com/view/Entry/28251#eid9991130,carry_nn01,carry_nn01-9991125,1618,False,"carry, n., sense 2",True
2,carry_nn01-9991292,"{'keyword': 'carry', 'full_text': 'I min'..sin...",1788,carry,"{'title': 'Poems & Epist.', 'author': 'E. Pick...",https://www.oed.com/view/Entry/28251#eid9991292,carry_nn01,carry_nn01-9991289,1788,False,"carry, n., sense 6b",True
3,carry_nn01-9991300,"{'keyword': 'carry', 'full_text': 'Mirk and ra...",1807,carry,"{'title': 'Sleeping, Maggie', 'author': 'R. Ta...",https://www.oed.com/view/Entry/28251#eid9991300,carry_nn01,carry_nn01-9991289,1807–10,False,"carry, n., sense 6b",False
4,carry_nn01-9991263,"{'keyword': 'carries', 'full_text': 'Still tow...",1819,carry,"{'title': 'Vestriad', 'author': 'H. Busk', 'ge...",https://www.oed.com/view/Entry/28251#eid9991263,carry_nn01,carry_nn01-9991259,1819,False,"carry, n., sense 6a",True
5,carry_nn01-9991089,"{'keyword': 'carrie', 'full_text': 'Alexander ...",1820,carry,"{'title': 'Caldeonian Merc.', 'author': None, ...",https://www.oed.com/view/Entry/28251#eid9991089,carry_nn01,carry_nn01-9991075,1820,False,"carry, n., sense 1b",False
6,carry_nn01-9991271,"{'keyword': 'carry', 'full_text': 'The clouds ...",1828,carry,"{'title': 'Blackwood's Edinb. Mag.', 'author':...",https://www.oed.com/view/Entry/28251#eid9991271,carry_nn01,carry_nn01-9991259,1828,False,"carry, n., sense 6a",False
7,carry_nn01-9991147,"{'keyword': 'Carry', 'full_text': 'At the halt...",1833,carry,"{'title': 'Regulations Instr. Cavalry', 'autho...",https://www.oed.com/view/Entry/28251#eid9991147,carry_nn01,carry_nn01-9991139,1833,False,"carry, n., sense 3",True
8,carry_nn01-307616029,"{'keyword': 'carry', 'full_text': 'From this p...",1857,carry,"{'title': 'Knickerbocker', 'author': None, 'ge...",https://www.oed.com/view/Entry/28251#eid307616029,carry_nn01,carry_nn01-9991233,1857,False,"carry, n., sense 5",True
9,carry_nn01-9991281,"{'keyword': 'carry', 'full_text': 'The directi...",1857,carry,"{'title': 'Madeira', 'author': 'R. White', 'ge...",https://www.oed.com/view/Entry/28251#eid9991281,carry_nn01,carry_nn01-9991259,1857,False,"carry, n., sense 6a",False
