## OED Data Selection

Notebook that retrieves the relevant data from the Oxford English Dictionary (OED) Researcher API.

For details on endpoints and query options, see the [OED Researcher API documentation](https://languages.oup.com/research/oed-researcher-api/#see-documentation).

In [59]:
import requests
import pickle
import pandas as pd
from tqdm.notebook import tqdm

In [11]:
# Load the API credentials from a local pickle file.
# The resulting object is passed as HTTP headers by query_oed, which expects
# a dict with the keys 'app_id' and 'app_key'.
# NOTE: unpickling can execute arbitrary code; only load files you created yourself.
with open('./dev/oed_keys.pkcl', "rb") as key_file:  # context manager closes the handle
    auth = pickle.load(key_file)

In [60]:
def query_oed(
        auth: dict,
        endpoint: str,
        query: str,
        flags: str = '',
        level: str = '',
        verbose: bool = True,
        timeout: float = 60.0):
    """
    Get data from the Oxford English Dictionary Researcher API.
    The function requires an endpoint _and_ a query as arguments.

    Arguments:

        auth (dict): a dictionary with authentication information, sent as the
                     HTTP headers of the request; needs details for 'app_id' and 'app_key'

        endpoint (str): select which endpoint to query, examples are word, sense, semanticclass etc

        query (str): query for the specific endpoint, most often a specific id,
                     such as 'machine_nn01' or '120172'

        flags (str): options appended to the URL after a question mark to include,
                     for example, quotations instead of quotation ids
                     example "include_senses=false&include_quotations=false"

        level (str): at which level to query the endpoint,
                     e.g. get senses of the query word, get siblings for a semantic class etc
                     standard value is an empty string (no level is added to the URL)

        verbose (bool): print the URL used for retrieving information from the API

        timeout (float): number of seconds passed to `requests.get` as its `timeout`,
                         so a stalled connection raises instead of blocking forever;
                         pass None to wait indefinitely

    Returns:

        The parsed JSON body of the response

    Raises:

        Exception: if the response status code is not 200; the message contains
                   the status code and the URL that was requested

    Example uses:

        query_oed(auth, 'word', 'machine_nn01')
            -> Retrieves information for the word machine_nn01.

        query_oed(auth, 'word', 'machine_nn01', level='quotations')
            -> Retrieves all quotations for machine_nn01.

        query_oed(auth, 'word', 'machine_nn01', flags="include_senses=true&include_quotations=true")
            -> Retrieves all senses and quotations for the word machine_nn01.

        query_oed(auth, 'semanticclass', '163378')
            -> Retrieves the semantic class with id 163378.

        query_oed(auth, 'semanticclass', '163378', level='children')
            -> Retrieves all children for the semantic class with id 163378.

        query_oed(auth, 'semanticclass', '163378', level='branchsenses', flags="current_in='1750-1950'")
            -> Gets all senses (siblings _and_ descendants) branching out from the semantic class
               with id 163378, restricted to senses observed between 1750 and 1950.

    """

    base_url = "https://oed-researcher-api.oxfordlanguages.com/oed/api/v0.2"

    url = f"{base_url}/{endpoint}/{query}"  # build url

    if level:  # if a level has been specified add this to the url
        url = f"{url}/{level}/"

    if flags:  # add flags to the url with a question mark
        url = f"{url}?{flags}"

    if verbose:
        # Print before sending, so the URL is visible even when the request hangs or raises.
        print(url)

    response = requests.get(url, headers=auth, timeout=timeout)

    if response.status_code != 200:  # anything but 200 is treated as a failure
        raise Exception(
            f"Error while accessing the API\nResponse code={response.status_code}\nURL={url}"
        )

    return response.json()  # return the data as parsed json
    


## Retrieve and save all senses and quotations for the word *machine*

In [4]:
# Fetch the entry for the noun "machine", asking the API to embed
# the senses and quotations in the response.
machine = query_oed(
    auth,
    endpoint='word',
    query='machine_nn01',
    flags="include_senses=true&include_quotations=true",
)

https://oed-researcher-api.oxfordlanguages.com/oed/api/v0.2/word/machine_nn01?include_senses=true&include_quotations=true


In [5]:
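# One-off cache write: uncomment to (re)save the API response fetched above,
# so that later runs can load it from disk instead of querying the API again.
# with open('./dev/machine_senses_quotations.pckl','wb') as out_pickle:
#     pickle.dump(machine,out_pickle)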

In [7]:
# Restore the previously saved API response for "machine" from the local pickle cache.
with open('./dev/machine_senses_quotations.pckl', 'rb') as in_pickle:
    machine = pickle.loads(in_pickle.read())

## Retrieve and store all descendants and siblings of the *machine* senses

In [65]:
def get_semantic_class_idx(data):
    """
    Collect the final id of every semantic-class path attached to the senses of a word.

    Arguments:

        data (dict): a word response that exposes its senses under data['data']['senses'];
                     each sense holds a list of id paths under 'semantic_class_ids'

    Returns:

        list with the last element of every path, in order of appearance
        (ids shared by several senses appear more than once)
    """
    return [
        path[-1]  # last element of the path
        for sense in data['data']['senses']
        for path in sense['semantic_class_ids']
    ]

def get_branches(sc_idx, flags="current_in='1750-1950'"):
    """
    Query the 'branchsenses' level of the semanticclass endpoint for a collection of ids.

    Relies on the notebook-level `auth` credentials and on `query_oed`.

    Arguments:

        sc_idx (iterable): semantic class ids; repeated ids are queried only once

        flags (str): query options forwarded to query_oed,
                     by default restricts results with current_in='1750-1950'

    Returns:

        dict mapping each distinct id (in first-seen order) to the JSON returned for it
    """
    # dict.fromkeys drops repeats while keeping first-seen order, which avoids
    # sending the same request several times only to overwrite the same key.
    unique_idx = list(dict.fromkeys(sc_idx))

    return {
        idx: query_oed(auth, 'semanticclass', idx,
                       level='branchsenses',
                       flags=flags,
                       verbose=False)
        for idx in tqdm(unique_idx)
    }





In [34]:
# Gather the last id of every semantic-class path found in the senses of "machine".
sem_class_idx = get_semantic_class_idx(machine)

In [66]:
# Query the API for the 'branchsenses' of each collected semantic class id.
branches = get_branches(sem_class_idx)

HBox(children=(FloatProgress(value=0.0, max=32.0), HTML(value='')))




In [67]:
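# One-off cache write: uncomment to (re)save the branches fetched above,
# so that later runs can load them from disk instead of querying the API again.
# with open('./dev/machine_sc_branches.pckl','wb') as out_pickle:
#     pickle.dump(branches,out_pickle)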

In [68]:
# Restore the previously saved branch senses from the local pickle cache.
with open('./dev/machine_sc_branches.pckl', 'rb') as in_pickle:
    branches = pickle.loads(in_pickle.read())

In [72]:
# Inspect the 'data' part of the response stored for semantic class id '120172'.
branches['120172']['data']

[{'id': 'arrangement_nn01-39073466',
  'meta': {'created': 1885,
   'revised': False,
   'updated': None,
   'sense_group': 'arrangement_nn01-g04',
   'position_in_entry': 4},
  'lemma': 'arrangement',
  'notes': [],
  'oed_url': 'https://www.oed.com/view/Entry/10968#eid39073466',
  'word_id': 'arrangement_nn01',
  'daterange': {'end': None,
   'start': 1800,
   'obsolete': False,
   'rangestring': '1800—'},
  'first_use': 'William Herschel',
  'categories': {'topic': [], 'region': [], 'register': []},
  'definition': 'concrete. A structure or combination of things arranged in a particular way or for any purpose; hence loosely, like affair, concern, production.',
  'transitivity': None,
  'oed_reference': 'arrangement, n., sense 4',
  'quotation_ids': ['arrangement_nn01-39073473',
   'arrangement_nn01-39073483',
   'arrangement_nn01-39073492'],
  'part_of_speech': 'NN',
  'main_current_sense': False,
  'semantic_class_ids': [['1',
    '97163',
    '101153',
    '104182',
    '104802',
