# Review Notebook for Sense Filtering by Time and Provenance

This notebook builds on 4.1 (harvest senses with provenance) and 4.2 (harvest quotations for senses). The code in this notebook allows you to filter senses by (a) selecting seed senses, (b) defining the relations that other senses should have to those seed senses, and (c) restricting the senses to a target period.

Functions reviewed in this notebook:
- `filter_by_year_range` (helper)
- `select_senses_by_provenance` (helper)
- `filter_senses` (main)

Part of:
- `utils.dataset_download`

In [None]:
!git branch

## Load libraries, data and set parameters

In [None]:
%load_ext autoreload

In [None]:
%autoreload 2

In [None]:
import pandas as pd
import json
from utils.dataset_download import *

In [None]:
# lemma identifier; used below to build the paths of the pickled dataframes
lemma_id = 'machine_nn01'

In [None]:
# import API credentials
# NOTE(review): `auth` is not referenced by any other cell visible in this notebook
with open('oed_experiments/oed_credentials.json') as f:
    auth = json.load(f)

In [None]:
# load the pickled senses dataframe for this lemma
# (presumably the output of notebook 4.1 - confirm the origin of the file)
# NOTE(review): unpickling can execute arbitrary code; only load files you created yourself
df = pd.read_pickle(f'./data/extended_{lemma_id}.pickle')

In [None]:
# preview the first three rows
df.head(3)

In [None]:
# load the pickled quotations dataframe for this lemma
# (presumably the output of notebook 4.2 - confirm the origin of the file)
df_quotations = pd.read_pickle(f'./data/quotations_{lemma_id}.pickle')

In [None]:
# preview the first rows
df_quotations.head()

# Run code

In [None]:
# show the distinct definitions of the two senses that are used as seeds below
set(df[df.id.isin(['machine_nn01-38475835','machine_nn01-38475923'])].definition)

In [None]:
# filter the senses: keep those related to the two seed senses through one of
# the listed relations and whose date range overlaps 1760-1920
senses = filter_senses(df,
                       {'machine_nn01-38475835','machine_nn01-38475923'},
                       ['seed','synonym','descendant'],
                       start=1760, 
                       end=1920
                      )

In [None]:
# display the selected senses
senses

In [None]:
# collect the quotations that belong to the selected senses and show the table size
quotations = obtain_quotations_for_senses(df_quotations,senses)
quotations.shape

In [None]:
# preview the first quotations
quotations.head()

# Inspect code

### Inspect `filter_by_year_range`

In [None]:
def filter_by_year_range(dr: dict, target_start: int, target_end: int) -> bool:
    """
    Helper function that expects a daterange dictionary from the OED.
    It is used to filter out senses that fall outside the historical scope
    of the research. The target period is defined by the target_start and
    target_end arguments (both inclusive). If the date range of the sense has
    NO overlap with the target period, return False, otherwise return True.

    A missing (or None) 'start' is treated as year 0 and a missing (or None)
    'end' as year 2021. A period whose start lies after its end contains no
    years and therefore never overlaps.

    Arguments:
        dr (dict): daterange dict of OED, read via its 'start' and 'end' keys
        target_start (int): start year of target period
        target_end (int): end year of target period

    Returns:
        return a boolean, True if there is overlap between
        the target period and the date range of the sense
    """
    # if there is no start date, set to 0
    sense_start = dr.get('start')
    if sense_start is None:
        sense_start = 0

    # if there is no end date, set to 2021
    sense_end = dr.get('end')
    if sense_end is None:
        sense_end = 2021

    # an inverted period contains no years, so it cannot overlap anything
    if sense_start > sense_end or target_start > target_end:
        return False

    # two inclusive intervals overlap iff each one starts before the other ends;
    # comparing the bounds avoids materialising every year of both periods
    return bool(sense_start <= target_end and target_start <= sense_end)

In [None]:
# number of senses selected
# flag every sense whose date range overlaps 1760-1920, then count the flagged rows
df['in_period'] = df.daterange.apply(filter_by_year_range, target_start=1760, target_end=1920)
print(sum(df.in_period))

In [None]:
#print(df['in_period'][110:120])

In [None]:
# spot-check a few rows: compare the date range with the computed flag
print(df.iloc[0][['daterange','in_period']])
print(df.iloc[2][['daterange','in_period']])
print(df.iloc[70][['daterange','in_period']])
print(df.iloc[88][['daterange','in_period']])
print(df.iloc[89][['daterange','in_period']])

### Inspect `select_senses_by_provenance`

In [None]:
def select_senses_by_provenance(sub_df: pd.DataFrame, 
                                item_ids: set, 
                                relations: list) -> tuple:
    """Helper function that, given a subsection of a dataframe, filters senses based
    on a set of target item ids and relations. This function requires a dataframe created
    by the extend_from_lemma function, i.e. one with a `provenance` column in which
    every cell is an iterable of (oed_id, relation, prov_id) triples.

    A row is selected as soon as one of its provenance triples has a prov_id
    contained in `item_ids` AND a relation contained in `relations`.

    Arguments:
        sub_df (pd.DataFrame): slice of a pd.DataFrame
        item_ids (set): include senses related to these items
                        these can be sense ids or semantic class ids
        relations (list): filter based on these relations
                          options are: seed, synonym, sibling, descendant

    Returns:
        a tuple that contains a list with position indices (usable with
        `sub_df.iloc`, independent of the index labels of `sub_df`) and a
        list with the matching items; both lists are free of duplicates and
        keep the order in which the matches were first encountered
    """

    # dict keys act as ordered sets: no duplicates, deterministic order
    positions, items = {}, {}

    # enumerate yields true row positions; the index labels of a slice
    # generally do not match the positions that .iloc expects
    for position, provenance in enumerate(sub_df['provenance']):
        for oed_id, relation, prov_id in provenance:
            # if the provenance and relation match the arguments,
            # record the position and the item
            if (prov_id in item_ids) and (relation in relations):
                positions[position] = None
                items[oed_id] = None

    return list(positions), list(items)

In [None]:
# select the senses that are linked to one of the two seed senses
# through the 'synonym' relation only
sel_indices, selected = select_senses_by_provenance(df, {'machine_nn01-38475835','machine_nn01-38475923'},
                                                        ['synonym'])

In [None]:
# display the selected items
selected

### Inspect `filter_senses`

In [None]:
def filter_senses(df, sense_ids:set, 
                      relations:list, 
                      start:int, 
                      end:int,
                      verbose=True) -> set:
    """
    Main function that filters senses by a given date range
    and a set of seed senses with provenance relations.
    The seed senses are selected from the lemma dataframe
    used as starting point for harvesting. Builds on the dataframe created
    by the extend_from_lemma function.

    Steps:
        1. drop senses whose date range does not overlap [start, end]
        2. split the remaining senses by provenance_type (seed, synonym, branch)
        3. select seeds by id and synonyms by provenance (only when the
           corresponding relation is requested; otherwise nothing is selected)
        4. select branches by the semantic class ids of the selected
           seeds and synonyms

    Arguments:
        df (pd.DataFrame): main dataframe created by the extend_from_lemma
        sense_ids (set): seed senses from the lemma used for filtering
        relations (list): filter based on these relations
        start (int): beginning of target period
        end (int): end of target period
        verbose (bool): print outcomes of intermediate steps

    Returns:
        set with the ids of the selected senses
    """
    if verbose:
        print("# senses before filtering by date =", df.shape[0])
    df = df[df.daterange.apply(filter_by_year_range, target_start=start, target_end=end)]
    if verbose:
        print("# senses after filtering by date =", df.shape[0])

    # reset the index of every subset so that row labels equal row positions,
    # which the .iloc lookups further down rely on
    seeds = df[df['provenance_type'] == "seed"].reset_index(inplace=False)
    # select words retrieved as synonyms
    # exclude those that already appear in the seed dataframe
    # reset index after selection
    synonyms = df[(df['provenance_type'] == "synonym") & (~df.id.isin(seeds.id))
                     ].reset_index(inplace=False)

    # select words retrieved as a branch of the synonym or a seed sense
    # exclude those that already appear as seed or synonym
    branches = df[(df['provenance_type'] == "branch") & (~df.id.isin(set(seeds.id).union(set(synonyms.id))))
                      ].reset_index(inplace=False)

    if verbose:
        print("\n\n# of seed senses", seeds.shape[0],
              "\n# of synonyms", synonyms.shape[0],
              "\n# of branch senses", branches.shape[0])

    # start from empty selections so that leaving "seed" or "synonym" out of
    # `relations` selects nothing for that group instead of raising a NameError
    seeds_selected = set()
    syn_sel_indices, synonyms_selected = [], []

    if "seed" in relations:
        seeds_selected = set(seeds[seeds.id.isin(sense_ids)].id)

    if "synonym" in relations:
        syn_sel_indices, synonyms_selected = select_senses_by_provenance(synonyms,sense_ids,relations)

    # as branches are retrieved by semantic class id, we get the semantic class ids 
    # of the seed AND synonyms senses
    # every semantic_class_last_id cell is an iterable of ids; flatten them into one set
    select_seed_semantic_class_id = seeds[seeds.id.isin(seeds_selected)].semantic_class_last_id
    select_seed_semantic_class_id = set().union(*map(set,select_seed_semantic_class_id))

    select_synonyms_semantic_class_id = synonyms[synonyms.id.isin(synonyms_selected)].semantic_class_last_id
    select_synonyms_semantic_class_id = set().union(*map(set,select_synonyms_semantic_class_id))

    selected_semantic_class_id = select_seed_semantic_class_id.union(select_synonyms_semantic_class_id)

    branch_sel_indices, branches_selected = select_senses_by_provenance(branches,selected_semantic_class_id,relations)

    senses = set(branches.iloc[branch_sel_indices].id # for the branches we return the sense ids not the semantic class ids
               ).union(set(synonyms.iloc[syn_sel_indices].id)
                        ).union(set(seeds_selected))
    if verbose:
        print('\n\n# of seeds selected', len(seeds_selected),
              '\n# of synonyms selected', len(syn_sel_indices),
              '\n# of branches selected', len(branches_selected))
    return senses

In [None]:
# run the notebook-local definition of filter_senses with the same
# arguments as in the "Run code" section above
senses = filter_senses(df,
                       {'machine_nn01-38475835','machine_nn01-38475923'},
                       ['seed','synonym','descendant'],
                       start=1760, 
                       end=1920
                      )

In [None]:
# display the selected senses
senses

### Inspect `obtain_quotations_for_senses`

In [None]:
def obtain_quotations_for_senses(
                      df_quotations:  pd.DataFrame,
                      senses: set) -> pd.DataFrame:
    """Create a dataframe with quotations and their metadata for 
    a selected set of senses. This function builds on
    harvest_quotations_by_sense_id.

    The record-like values in the `text` and `source` columns are expanded
    into separate columns and placed side by side; `year` and `sense_id`
    are copied over row by row. Only rows whose sense_id is in `senses` are
    kept and exact duplicate rows are removed. The input dataframe is not
    modified.

    Arguments:
        df_quotations: dataframe with quotations, created using harvest_quotations_by_sense_id
                       must provide the columns text, source, year and sense_id
        senses (set): set of senses for which we want to obtain quotations

    Returns:
        pd.DataFrame with quotations

    """
    df = pd.concat([
        pd.DataFrame.from_records(df_quotations.text.tolist()),
        pd.DataFrame.from_records(df_quotations.source.tolist())
            ], axis=1)
    # the expanded frame has a fresh 0..n-1 index; copy the columns by position
    # so that a non-default index on df_quotations does not misalign the
    # values (index alignment would fill them with NaN)
    df['year'] = df_quotations['year'].to_numpy()
    df['sense_id'] = df_quotations['sense_id'].to_numpy()

    # drop_duplicates returns a new frame, so we never mutate a slice of df
    # in place (which triggers pandas' SettingWithCopyWarning)
    return df[df.sense_id.isin(senses)].drop_duplicates()



In [None]:
# collect the quotations for the selected senses with the notebook-local
# definition of obtain_quotations_for_senses and preview the result
quotations = obtain_quotations_for_senses(df_quotations,senses)
quotations.head()

## Fin.