### Load up test samples

Let's load up some live sample identifiers (IGSNs) from SESAR.

In [1]:
import json

import pandas as pd
import requests
from tqdm import tqdm

USER_CODES = {'ODP', 'IEUHM', 'IELCZ', 'NHB', 'HRV'}

def get_igsns(limit=10, pagesize=100, user_code='ODP'):
    """
    Get me some IGSNs from SESAR. 
    
    Hits the endpoint to discover the list of IGSNs for a given user code (this is reaaaaaly slow....)
    
    Parameters:
        limit - the total number of IGSN samples to get
        pagesize - the pagesize to generate these at. SESAR's server is reaaaaallly slow so 
            make this a bigger number if you can.
        user_code - a valid user code - see USER_CODES for valid values

    Returns:
        A list of at most `limit` IGSNs, in the order the server returned them.
        Fewer are returned when the server has no more samples for the user code
        or when a request fails.

    Raises:
        ValueError - if user_code is not one of USER_CODES
    """
    # check user code
    user_code = user_code or 'ODP'
    if user_code not in USER_CODES:
        raise ValueError(f'Unknown user code {user_code}, valid values are {USER_CODES}')
    
    # Ask for the same number of items on every page. Shrinking the page size for
    # the final request would move the window that `page_no` points at (assuming
    # the server computes the offset from page_no and limit), so the last page is
    # fetched at full size and trimmed locally instead.
    per_page = min(limit, pagesize)
    
    # Paginate over results and accumulate values
    igsns, page = [], 1
    with tqdm(desc='Getting IGSNs...', total=limit) as pbar:
        while len(igsns) < limit:
            response = requests.get(
                f'https://app.geosamples.org/samples/user_code/{user_code}', 
                params={'page_no': page, 'limit': per_page}, 
                headers={'Accept': 'application/json'}
            )
            if not response.ok:
                # Keep what we already have, but say why we stopped early
                print(f'Failed to get page {page} (HTTP status {response.status_code}), stopping')
                break
            
            # Trim so we never return more than `limit` items
            new_igsns = response.json()['igsn_list'][:limit - len(igsns)]
            if not new_igsns:
                # An empty page means there is nothing left to fetch; without this
                # check the loop would never terminate
                break
            
            igsns.extend(new_igsns)
            pbar.update(len(new_igsns))
            page += 1
    return igsns

In [73]:
# Fetch a small batch of IGSNs for the IELCZ user code, then peek at the first two
igsns = get_igsns(limit=25, user_code='IELCZ')
igsns[:2]

Getting IGSNs...: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████| 25/25 [00:16<00:00,  1.53it/s]


['IELCZ197D', 'IELCZ196D']

Now we can resolve the IGSNs to get the actual data for them

In [99]:
def resolve(*igsns, host=None, accept=None):
    """
    Resolve some IGSNs and get their data
    
    Parameters:
        *igsns - the IGSNs to resolve, passed as positional arguments
        host - the host to use for resolution (e.g. igsn.org). If None, defaults to 
            using handle.net (hdl.handle.net/10273)
        accept - the format to accept. If None defaults to application/json.
    
    Returns:
        if accept is application/json (including the default), then this will
        return a list of Python objects (using requests' `.json()` method),
        otherwise a list of text strings containing the content of the responses.
        IGSNs whose request fails are reported and left out of the list.
    """
    # If no host, default to handle.net
    host = host or 'hdl.handle.net/10273'
    
    # Resolve the default up front so the request header and the parsing
    # decision below always agree
    accept = accept or 'application/json'
    wants_json = accept == 'application/json'
    
    # Loop through and load IGSNs
    data = []
    for igsn in tqdm(igsns, total=len(igsns), ncols=50):
        response = requests.get(
            f'http://{host}/{igsn}',
            headers={"Accept": accept}
        )
        if not response.ok:
            print(f'Failed to get {igsn} (HTTP status {response.status_code}), skipping')
            continue
        data.append(response.json() if wants_json else response.text)
    
    return data

def igsn_to_dataframe(igsn_jsons):
    """
    Convert IGSN data to a dataframe
    
    Parameters:
        igsn_jsons - an iterable of parsed IGSN JSON documents (dicts). Each one
            must have a 'sample' key holding that sample's record.
    
    Returns:
        a pandas DataFrame built with `pd.DataFrame.from_records`, with one
        row per document.
    
    Raises:
        KeyError - if a document has no 'sample' key
    """
    # `pd` is pandas, imported in the notebook's import cell
    samples = [content['sample'] for content in igsn_jsons]
    return pd.DataFrame.from_records(samples)

Let's download some IGSN data from SESAR and do something with it

In [104]:
## Uncomment to download data
# xml_data = resolve(*igsns, accept='text/xml')
# json_data = resolve(*igsns, accept='application/json')

## Dump to file for later
# NOTE(review): writelines() adds no separators between entries, so reading the
# file back with readlines() gives one entry per line of text rather than one per
# downloaded document - confirm that is what downstream code expects.
# with open('../data/sesar_igsns.json', 'w') as sink:
#     json.dump(json_data, sink)
# with open('../data/sesar_igsns.xml', 'w') as sink:
#     sink.writelines(xml_data)

# Read the cached XML back as a list of lines
with open('../data/sesar_igsns.xml', 'r') as xml_file:
    xml_data = xml_file.readlines()

# Read the cached JSON documents back as Python objects
with open('../data/sesar_igsns.json', 'r') as json_file:
    json_data = json.load(json_file)

df = igsn_to_dataframe(json_data)

Let's generate some data with our schema

In [123]:
import json

# Read the registry JSONSchema from disk and pretty-print it
schema_file = '../data/schema.igsn.org/json/v0.0.1/registry.json'
with open(schema_file, mode='r', encoding='utf-8') as schema_src:
    schema = json.load(schema_src)
pjson(schema)

In [133]:
from faker import Faker

In [134]:
def _fake_value(subschema, faker):
    """Build one fake value for a single JSONSchema (sub)schema, or None if unsupported."""
    if not isinstance(subschema, dict):
        return None
    if subschema.get('enum'):
        return faker.random_element(subschema['enum'])
    kind = subschema.get('type')
    if kind == 'object':
        # Recurse so nested objects get the same treatment as the top level
        return {
            name: _fake_value(child, faker)
            for name, child in subschema.get('properties', {}).items()
        }
    if kind == 'string':
        return faker.pystr()
    if kind == 'integer':
        return faker.pyint()
    if kind == 'number':
        return faker.pyfloat()
    if kind == 'boolean':
        return faker.pybool()
    # Anything else (e.g. `$ref`, arrays, combinators) is not resolved here
    return None


def generate_fakes(schema, limit=None, faker=None):
    """
    Creates a data generator based on a JSONSchema instance
    
    Parameters:
        schema - a parsed JSONSchema (dict). One value is generated for every
            entry under its 'properties' key.
        limit - how many fake records to yield. If None, the generator never
            stops on its own, so the caller has to stop iterating.
        faker - the Faker instance used to produce values. If None, a new
            `Faker()` is created.
    
    Yields:
        a dict per record, keyed by property name. Supported subschemas are
        `enum`, and `type` of string, integer, number, boolean or object
        (recursively). Every other subschema, including unresolved `$ref`s,
        is given the value None.
    """
    faker = faker or Faker()
    properties = schema.get('properties', {})
    
    produced = 0
    while limit is None or produced < limit:
        result = {}
        for k, v in properties.items():
            result[k] = _fake_value(v, faker)
        yield result
        produced += 1

In [135]:
# generate_fakes contains a `yield`, so this call only creates a generator object;
# no records are produced until it is iterated.
generate_fakes(schema)

In [132]:
# NOTE(review): `faker` is not assigned at top level in any cell shown here, so this
# presumably relies on kernel state from elsewhere - TODO confirm. The recorded
# output of this cell is an AttributeError naming
# 'http://json-schema.org/draft-07/schema#'.
faker.generate_fake(schema)

AttributeError: 'Generator' object has no attribute 'http://json-schema.org/draft-07/schema#'

In [126]:
# NOTE(review): same call, given only the schema's 'properties'. The recorded output
# of this cell is an AttributeError naming 'identifiers.json#/igsn'.
faker.generate_fake(schema['properties'])

AttributeError: 'Generator' object has no attribute 'identifiers.json#/igsn'

Now let's use these against our schema to see how well they work

### Validating IGSN metadata against our JSONSchema

This section tests IGSN metadata against the hand-rolled JSONSchema we have for IGSN.

In [85]:
import json

from highlight import pprint, pjson  # a couple of pretty-printers for code 

# Load the registry schema and show the properties it declares
with open('../data/schema.igsn.org/json/v0.0.1/registry.json', mode='r') as registry_src:
    registry_schema = json.load(registry_src)

pjson(registry_schema['properties'])

In [86]:
from jsonschema import validate

In [89]:
# NOTE(review): `data` is not assigned at top level in any cell shown here (it only
# appears as a local inside resolve()) - presumably this is meant to be one IGSN
# record such as json_data[0]; TODO confirm.
pjson(data[0])