# Review Notebook for Sense Filtering by Time and Provenance

This notebook builds on 4.1 (harvest senses with provenance) and 4.2 (harvest quotations for senses). The code in this notebook lets you filter senses by (a) selecting seed senses and (b) defining the relations that words should have to those seed senses.

Functions reviewed in this notebook:
- `filter_by_year_range` (helper)
- `select_senses_by_provenance` (helper)
- `filter_senses` (main)

Part of:
- `utils.dataset_download`

Creator: Kaspar Beelen

Reviewer(s):


In [1]:
!git branch

  19-machine-tagger[m
* [32m4-semantic-provenance[m
  44-kNN-BERT-baseline[m
  dev[m
  master[m


## Load libraries, data and set parameters

In [17]:
%load_ext autoreload

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [18]:
%autoreload 2

In [19]:
import pandas as pd
import json
from utils.dataset_download import *

In [20]:
lemma_id = 'machine_nn01'

In [21]:
# Load the API credentials from a local JSON file, so no secrets are written in the notebook itself.
# NOTE(review): `auth` is not referenced by any later cell visible here — confirm it is still needed.
with open('./oed_experiments/oed_credentials.json') as f:
    auth = json.load(f)

In [32]:
# Load the pickled "extended" sense dataframe for `lemma_id` and display its (rows, columns) shape.
# Presumably this file is the output of notebook 4.1 — TODO confirm.
# Pickle files can execute code on load: only read files from a trusted source.
df = pd.read_pickle(f'./data/extended_{lemma_id}.pickle')
df.shape

(8383, 19)

In [24]:
df.head(3)

Unnamed: 0,categories,daterange,definition,first_use,id,lemma,main_current_sense,meta,notes,oed_reference,oed_url,part_of_speech,provenance,provenance_type,quotation_ids,semantic_class_ids,semantic_class_last_id,transitivity,word_id
0,"{'topic': [], 'usage': [['rare']], 'region': []}","{'end': None, 'start': 1545, 'obsolete': False...","A material or immaterial structure, esp. the f...",J. Schäfer,machine_nn01-38473945,machine,False,"{'created': 1904, 'revised': True, 'updated': ...",[],"machine, n., sense I.1a",https://www.oed.com/view/Entry/111850#eid38473945,NN,"[[machine_nn01-38473945, seed, machine_nn01]]",seed,"[machine_nn01-38473950, machine_nn01-38473961,...","[[1, 111290, 118635, 119024, 120162, 120172], ...","[120172, 120173]",,machine_nn01
1,"{'topic': [['Military', 'Weaponry']], 'usage':...","{'end': None, 'start': 1583, 'obsolete': False...",A military engine or siege-tower. Cf. war mach...,Brian Melbancke,machine_nn01-38474233,machine,False,"{'created': 1904, 'revised': True, 'updated': ...",[],"machine, n., sense II.3",https://www.oed.com/view/Entry/111850#eid38474233,NN,"[[machine_nn01-38474233, seed, machine_nn01]]",seed,"[machine_nn01-38474243, machine_nn01-38474252,...","[[153072, 160439, 163207, 163208, 163377, 1633...",[163378],,machine_nn01
2,"{'topic': [], 'usage': [], 'region': []}","{'end': 1707, 'start': 1595, 'obsolete': True,...",spec. A scheme or plot. Obsolete.,Elizabeth I,machine_nn01-38474097,machine,False,"{'created': 1904, 'revised': True, 'updated': ...",[],"machine, n., sense I.1b",https://www.oed.com/view/Entry/111850#eid38474097,NN,"[[machine_nn01-38474097, seed, machine_nn01]]",seed,"[machine_nn01-38474102, machine_nn01-38474122,...","[[1, 84689, 87987, 87988, 87989, 88083, 88109,...",[88126],,machine_nn01


In [25]:
# Load the pickled quotations dataframe for `lemma_id` and display its (rows, columns) shape.
# Presumably this file is the output of notebook 4.2 — TODO confirm.
df_quotations = pd.read_pickle(f'./data/quotations_all_{lemma_id}.pickle')
df_quotations.shape

(203560, 12)

In [26]:
df_quotations.head()

Unnamed: 0,id,text,year,lemma,source,oed_url,word_id,sense_id,datestring,first_in_word,oed_reference,first_in_sense
0,pigmeat_nn01-13163366,"{'keyword': 'pig-meat', 'full_text': 'I was at...",1754,pigmeat,"{'title': 'Connoisseur', 'author': 'G. Colman'...",https://www.oed.com/view/Entry/237320#eid13163366,pigmeat_nn01,pigmeat_nn01-13163363,1754,True,"pigmeat, n., sense 1",True
1,pigmeat_nn01-13163379,"{'keyword': 'pig-meat', 'full_text': 'In short...",1784,pigmeat,"{'title': 'Year's Journey through Paix Bâs', '...",https://www.oed.com/view/Entry/237320#eid13163379,pigmeat_nn01,pigmeat_nn01-13163363,1784,False,"pigmeat, n., sense 1",False
2,pigmeat_nn01-13163399,"{'keyword': 'pig meat', 'full_text': 'It preve...",1817,pigmeat,"{'title': 'Parl. Deb.', 'author': None, 'gende...",https://www.oed.com/view/Entry/237320#eid13163399,pigmeat_nn01,pigmeat_nn01-13163363,1817,False,"pigmeat, n., sense 1",False
3,pigmeat_nn01-13163416,"{'keyword': 'pig meat', 'full_text': 'In most ...",1897,pigmeat,"{'title': 'Syst. Med.', 'author': 'T. C. Allbu...",https://www.oed.com/view/Entry/237320#eid13163416,pigmeat_nn01,pigmeat_nn01-13163363,1897,False,"pigmeat, n., sense 1",False
4,pigmeat_nn01-13163425,"{'keyword': 'pig meat', 'full_text': 'Beef tak...",1918,pigmeat,"{'title': 'Times', 'author': None, 'gender': N...",https://www.oed.com/view/Entry/237320#eid13163425,pigmeat_nn01,pigmeat_nn01-13163363,1918,False,"pigmeat, n., sense 1",False


# Run code

In [27]:
# Show the distinct definitions of the two sense ids that are passed to `filter_senses` in the next cell.
seed_mask = df['id'].isin(['machine_nn01-38475835', 'machine_nn01-38475923'])
set(df.loc[seed_mask, 'definition'])

{'A conceptual, abstract, or theoretical mechanism or device; spec. a model or a mathematical abstraction of an existing or hypothetical computer. Cf. Turing machine n.',
 'Mechanics. Anything that transmits force or directs its application.'}

In [28]:
# Filter the sense dataframe using the two sense ids whose definitions were displayed in the previous cell,
# keeping only the listed relation types and passing a 1760–1920 time window.
# NOTE(review): the exact meaning of `expand_seeds` / `expand_synonyms`, and whether `start`/`end`
# are inclusive, is defined in `utils.dataset_download` (not visible here) — confirm against `filter_senses`.
senses = filter_senses(df,
                       {'machine_nn01-38475835','machine_nn01-38475923'},
                       relations = ['seed','synonym','descendant','sibling'], # commented-out alternative: 'all' (presumably selects every relation type — confirm)
                       expand_seeds=True,
                       expand_synonyms=False,
                       start=1760, 
                       end=1920
                      )

# senses before filtering by date = 8383
# senses after filtering by date = 5918


# of seed senses 22 
# of synonyms 310 
# of branch senses 4990


# of seeds selected 1 
# of synonyms selected 8 
# of branches selected 6


In [29]:
len(senses)

19

In [33]:
# Load the sense dataframe that `obtain_quotations_for_senses` uses as its source table.
# The path is built from `lemma_id` (as in the other load cells) instead of hard-coding
# 'machine_nn01', so changing the lemma in the parameter cell keeps all inputs consistent.
# NOTE(review): this reads the same file as `df`; presumably it is re-read to get an
# unmodified copy — confirm whether `df` could simply be reused.
df_source = pd.read_pickle(f'./data/extended_{lemma_id}.pickle')

In [34]:
# Collect the quotations for the selected `senses` and display the resulting (rows, columns) shape.
# NOTE(review): `start`/`end` repeat the 1760/1920 values passed to `filter_senses` above;
# keep the two calls in sync if the time window changes.
quotations = obtain_quotations_for_senses(df_quotations,
                                df_source,                  
                                senses,
                                start=1760,end=1920)
quotations.shape

(138, 13)

In [35]:
quotations.head()

Unnamed: 0,keyword,full_text,keyword_offset,title,author,gender,year,sense_id,daterange,provenance,provenance_type,relation_to_core_senses,relation_to_seed_senses
0,mover,The treasury became literally moneyless and al...,75.0,Writings,T. Jefferson,male,1788,mover_nn01-35820685,"{'end': None, 'start': 1626, 'obsolete': False...","[[mover_nn01-35820685, synonym, machine_nn01-3...",synonym,"{primummobile_nn01-28369573, machine_nn01-3847...",{machine_nn01-38475923}
1,mover,The treasury became literally moneyless and al...,75.0,Writings,T. Jefferson,male,1788,mover_nn01-35820685,"{'end': None, 'start': 1626, 'obsolete': False...","[[84483, sibling, 84483]]",branch,"{primummobile_nn01-28369573, machine_nn01-3847...",{machine_nn01-38475923}
2,mover,The treasury became literally moneyless and al...,75.0,Writings,T. Jefferson,male,1788,mover_nn01-35820685,"{'end': None, 'start': 1626, 'obsolete': False...","[[214198, sibling, 214198]]",branch,"{primummobile_nn01-28369573, machine_nn01-3847...",{machine_nn01-38475923}
3,first mover,"When a fly is used merely as a regulator, it s...",64.0,Panorama Sci. & Art,J. Smith,male,1815,mover_nn01-35820685,"{'end': None, 'start': 1626, 'obsolete': False...","[[mover_nn01-35820685, synonym, machine_nn01-3...",synonym,"{primummobile_nn01-28369573, machine_nn01-3847...",{machine_nn01-38475923}
4,first mover,"When a fly is used merely as a regulator, it s...",64.0,Panorama Sci. & Art,J. Smith,male,1815,mover_nn01-35820685,"{'end': None, 'start': 1626, 'obsolete': False...","[[84483, sibling, 84483]]",branch,"{primummobile_nn01-28369573, machine_nn01-3847...",{machine_nn01-38475923}


In [36]:
# Merge the per-row collections in `relation_to_core_senses` into one set of distinct ids.
set().union(*quotations.relation_to_core_senses)

{'energizer_nn01-1246574970',
 'generator_nn01-135539783',
 'machine_nn01-38475923',
 'machinepower_nn01-38476982',
 'mechanicaladvantage_nn01-37502752',
 'mover_nn01-35820685',
 'power_nn01-28687898',
 'primemover_nn01-28348676',
 'primummobile_nn01-28369573',
 'vice_nn02-15526247'}

In [37]:
# Merge the per-row collections in `relation_to_seed_senses` into one set of distinct ids.
set().union(*quotations.relation_to_seed_senses)

{'machine_nn01-38475923'}

# Test Demo Dataframe

In [38]:
# Load the pickled demo version of the quotations dataframe for `lemma_id` and display its shape.
df_quotations_demo = pd.read_pickle(f'./data/quotations_all_demo_{lemma_id}.pickle')
df_quotations_demo.shape

(235, 12)

In [40]:
df_quotations_demo.head()

Unnamed: 0,id,text,year,lemma,source,oed_url,word_id,sense_id,datestring,first_in_word,oed_reference,first_in_sense
0,brike_nn01-13524886,"{'keyword': 'bryke', 'full_text': 'bryke.', 'k...",1400,brike,"{'title': 'On the Seven Deadly Sins', 'author'...",https://www.oed.com/view/Entry/23326#eid13524886,brike_nn01,brike_nn01-13524883,c1400,True,"brike, n.",True
1,brike_nn01-13524897,"{'keyword': 'brike', 'full_text': 'Genylon Oly...",1405,brike,"{'title': 'Monk's Tale', 'author': 'G. Chaucer...",https://www.oed.com/view/Entry/23326#eid13524897,brike_nn01,brike_nn01-13524883,c1405,False,"brike, n.",False
2,brike_nn01-1024597551,"{'keyword': 'brike', 'full_text': 'I that am i...",1499,brike,"{'title': 'Poems from Pilgrimage of Soul', 'au...",https://www.oed.com/view/Entry/23326#eid102459...,brike_nn01,brike_nn01-13524883,a1500,False,"brike, n.",False
3,lorelei_nn01-38866541,"{'keyword': 'Loreley', 'full_text': 'The Lorel...",1878,Lorelei,"{'title': 'Chambers's Encycl.', 'author': None...",https://www.oed.com/view/Entry/110339#eid38866541,lorelei_nn01,lorelei_nn01-38866532,1878,True,"Lorelei, n.",True
4,lorelei_nn01-38866549,"{'keyword': 'loreleis', 'full_text': 'Instead ...",1908,Lorelei,"{'title': 'Strictly Business', 'author': '‘O. ...",https://www.oed.com/view/Entry/110339#eid38866549,lorelei_nn01,lorelei_nn01-38866532,1908,False,"Lorelei, n.",False


In [39]:
# Check that the demo and full quotation dataframes have the same column names in the same order.
# This compares the schema only; it does not compare row contents or dtypes.
list(df_quotations_demo.columns) == list(df_quotations.columns)

True

# Fin.