# Review Notebook for Obtaining Quotations from the OED API

Review code for downloading quotations and saving them as a pickle file.

Function to review:
- `harvest_quotations` (main)
    
Part of:
- `utils.dataset_download`

Creator: Kaspar Beelen

Reviewer(s):


In [1]:
%load_ext autoreload

In [2]:
%autoreload 2

In [3]:
import pandas as pd
import json
from tqdm.notebook import tqdm
from utils.dataset_download import *

## Download quotations

Function to download quotations given a dataframe with information obtained via the OED senses endpoint.
- `harvest_quotations(auth, lemma_id, level)`

In [4]:
# Seed lemma id; it is interpolated into the output pickle filename further down.
lemma_id = 'machine_nn01'

In [5]:
# Load the OED API credentials from the local JSON file into `auth`.
with open('oed_experiments/oed_credentials.json') as credentials_file:
    auth = json.load(credentials_file)

# Run code

**Warning**: please don't run the code over the complete dataframe, as doing so risks exceeding the monthly API limit.

In [6]:
# don't run if not necessary, e.g. for testing purposes
# use Kernel > Interrupt
# harvest_quotations_by_sense(auth,'machine_nn01')

# Inspect code

**Warning**: please don't run the code over the complete dataframe, as doing so risks exceeding the monthly API limit.

In [27]:
# Pick the ids to query and the output-file suffix that belong to the requested level.
if level == 'sense':
    ids = set(df.id)
    suffix = 'related'
elif level == 'word':
    ids = set(df.word_id)
    suffix = 'all'
else:
    # Only 'sense' and 'word' are handled above, so name exactly those in the message.
    raise ValueError("Choose 'word' or 'sense' as values for the 'level' argument")

In [28]:
len(ids)

6035

In [29]:
# Call query_oed once per id, showing progress with tqdm.
# NOTE(review): the positional `level` is the second argument of query_oed, while the
# keyword `level='quotations'` is a separate parameter — confirm against its signature.
responses = [
    query_oed(auth, level, idx, level='quotations')
    for idx in tqdm(ids)
]
# Flatten the 'data' entries of all responses into one row per quotation.
quotation_df = pd.DataFrame([
    quotation
    for response in responses
    for quotation in response['data']
])
quotation_df.to_pickle(f'./data/quotations_{suffix}_{lemma_id}.pickle')

HBox(children=(FloatProgress(value=0.0, max=6035.0), HTML(value='')))




In [26]:
quotation_df

Unnamed: 0,id,text,year,lemma,source,oed_url,word_id,sense_id,datestring,first_in_word,oed_reference,first_in_sense
0,pigmeat_nn01-13163366,"{'keyword': 'pig-meat', 'full_text': 'I was at...",1754,pigmeat,"{'title': 'Connoisseur', 'author': 'G. Colman'...",https://www.oed.com/view/Entry/237320#eid13163366,pigmeat_nn01,pigmeat_nn01-13163363,1754,True,"pigmeat, n., sense 1",True
1,pigmeat_nn01-13163379,"{'keyword': 'pig-meat', 'full_text': 'In short...",1784,pigmeat,"{'title': 'Year's Journey through Paix Bâs', '...",https://www.oed.com/view/Entry/237320#eid13163379,pigmeat_nn01,pigmeat_nn01-13163363,1784,False,"pigmeat, n., sense 1",False
2,pigmeat_nn01-13163399,"{'keyword': 'pig meat', 'full_text': 'It preve...",1817,pigmeat,"{'title': 'Parl. Deb.', 'author': None, 'gende...",https://www.oed.com/view/Entry/237320#eid13163399,pigmeat_nn01,pigmeat_nn01-13163363,1817,False,"pigmeat, n., sense 1",False
3,pigmeat_nn01-13163416,"{'keyword': 'pig meat', 'full_text': 'In most ...",1897,pigmeat,"{'title': 'Syst. Med.', 'author': 'T. C. Allbu...",https://www.oed.com/view/Entry/237320#eid13163416,pigmeat_nn01,pigmeat_nn01-13163363,1897,False,"pigmeat, n., sense 1",False
4,pigmeat_nn01-13163425,"{'keyword': 'pig meat', 'full_text': 'Beef tak...",1918,pigmeat,"{'title': 'Times', 'author': None, 'gender': N...",https://www.oed.com/view/Entry/237320#eid13163425,pigmeat_nn01,pigmeat_nn01-13163363,1918,False,"pigmeat, n., sense 1",False
...,...,...,...,...,...,...,...,...,...,...,...,...
134,shebang_nn01-23236591,"{'keyword': 'whole shebang', 'full_text': 'I'v...",1948,shebang,"{'title': 'Golconda', 'author': 'V. Palmer', '...",https://www.oed.com/view/Entry/177736#eid23236591,shebang_nn01,shebang_nn01-23236545,1948,False,"shebang, n., sense 2",False
135,shebang_nn01-23236535,"{'keyword': 'shebangs', 'full_text': 'Less pic...",1963,shebang,"{'title': 'Pioneer Farmer', 'author': 'E. C. G...",https://www.oed.com/view/Entry/177736#eid23236535,shebang_nn01,shebang_nn01-23236507,1963,False,"shebang, n., sense 1c",False
136,shebang_nn01-23236599,"{'keyword': 'shebang', 'full_text': 'You can't...",1967,shebang,"{'title': 'Boston Sunday Herald', 'author': No...",https://www.oed.com/view/Entry/177736#eid23236599,shebang_nn01,shebang_nn01-23236545,1967,False,"shebang, n., sense 2",False
137,shebang_nn01-23236608,"{'keyword': 'whole shebang', 'full_text': 'The...",1977,shebang,"{'title': 'Introd. Risk Anal.', 'author': 'R. ...",https://www.oed.com/view/Entry/177736#eid23236608,shebang_nn01,shebang_nn01-23236545,1977,False,"shebang, n., sense 2",False


Function:

In [None]:
def harvest_quotations(auth: dict, lemma_id: str, level: str) -> pd.DataFrame:
    """
    Given a dataframe obtained via the OED sense endpoints
    retrieve all quotations for the included words or senses and save them
    as a dataframe, path ./data/quotations_{suffix}_{lemma_id}.pickle
    where suffix is 'related' for level='sense' and 'all' for level='word'.

    The input dataframe is read from ./data/extended_{lemma_id}.pickle.

    Arguments:
        auth (dict): API credentials, passed on unchanged to query_oed
        lemma_id (str): lemma of the seed query; used to locate the input
                    pickle and to name the output pickle
        level (str): endpoint for harvesting quotations (sense or word)
                    when using the sense endpoint we only get quotations relevant to
                    the initial lemma
    Returns:
        saves and returns a pd.DataFrame with quotations
    Raises:
        ValueError: if level is neither 'sense' nor 'word'
    """
    # Validate `level` first so a wrong value fails immediately, before any
    # file is read or any API call is made.
    if level == 'sense':
        id_column, suffix = 'id', 'related'
    elif level == 'word':
        id_column, suffix = 'word_id', 'all'
    else:
        raise ValueError("Choose 'word' or 'sense' as values for the 'level' argument")

    df = pd.read_pickle(f'./data/extended_{lemma_id}.pickle')
    # De-duplicate so that query_oed is called only once per id.
    ids = set(df[id_column])

    responses = [query_oed(auth, level, idx, level='quotations') for idx in tqdm(ids)]
    # Each response carries its quotations under the 'data' key; flatten them
    # into one row per quotation.
    quotation_df = pd.DataFrame([q for r in responses for q in r['data']])
    quotation_df.to_pickle(f'./data/quotations_{suffix}_{lemma_id}.pickle')
    return quotation_df

## Fin.