In [None]:
# Import requests and set the OBIS API base URL. 
import requests
import json
import pandas as pd
import urllib

# Convenience function to pretty print JSON objects
def print_json(myjson):
    """Pretty-print a JSON-serializable object to stdout.

    Keys are sorted and each nesting level is indented by four spaces.
    """
    rendered = json.dumps(myjson, sort_keys=True, indent=4, separators=(',', ': '))
    print(rendered)
    

# Base URL of the OBIS v3 REST API. Every API call in this notebook is built from this prefix.
OBIS_URL = "https://api.obis.org/v3"

In [None]:
# We are not sure which node ID to query, so let's get all of the OBIS nodes.

# GET /node
req = requests.get(f'{OBIS_URL}/node')
# Fail fast on an HTTP error instead of hitting a confusing JSON/KeyError further down.
req.raise_for_status()
nodes_json = req.json()

# Display the number of OBIS nodes reported by the API
f"Total Nodes: {nodes_json['total']}"

In [None]:
# List every node as "Name: <name> - ID: <id>", one per line
for node in nodes_json['results']:
    label, identifier = node["name"], node["id"]
    print(f'Name: {label} - ID: {identifier}')

In [None]:
# OBIS-USA is one of the OBIS nodes, so fetch just its record using the id value:
nodeID = 'b7c47783-a020-4173-b390-7b57c4fa1426'
# GET /node/{nodeID}
req = requests.get(f'{OBIS_URL}/node/{nodeID}')
# Surface HTTP errors (e.g. an unknown node ID) here rather than as a JSON decoding failure.
req.raise_for_status()
obis_usa_json = req.json()

# Show OBIS-USA node record
print_json(obis_usa_json)

In [None]:
# GET /dataset filtered by node: the datasets published through the OBIS-USA node.
# Passing the filter via `params` lets requests build and URL-encode the query string.
req = requests.get(f'{OBIS_URL}/dataset', params={'nodeid': nodeID})
# Stop on an HTTP error instead of failing later on a missing 'total' key.
req.raise_for_status()
datasets = req.json()
print('Number of datasets in OBIS-USA:', datasets['total'])

Let's print out the metadata from one of the datasets.

In [None]:
# Pretty-print the first dataset record in the response (raises IndexError if 'results' is empty)
print_json(datasets['results'][0])

Now, let's iterate through all the datasets and collect their metadata into a pandas DataFrame. We skip the dataset hosted on ipt.geome-db.org because its website doesn't load: https://ipt.geome-db.org/resource?r=dipnet

In [None]:
# Let's grab some metadata about each dataset: title, landing-page URL and archive size.

from bs4 import BeautifulSoup

columns = ['title','url','size_raw','size_MB']

# Multipliers converting a size expressed in the given unit to MB. Decimal factors are used
# to stay consistent with the KB -> MB factor of 0.001.
# NOTE(review): only KB and MB were handled before; the other unit spellings are assumptions.
SIZE_UNIT_TO_MB = {'bytes': 1e-6, 'B': 1e-6, 'KB': 0.001, 'MB': 1.0, 'GB': 1000.0}


def _scrape_size(page_html):
    """Return ``(size_raw, size_MB)`` parsed from a dataset landing page.

    The size is read from the parenthesised text of the first ``<td>`` element,
    which is expected to look like ``"<number> <unit>"`` (thousands separators allowed).

    Returns ``(None, nan)`` when the page has no such text, and ``(size_raw, nan)``
    when the unit is not one of ``SIZE_UNIT_TO_MB`` -- a size is never reported in
    the wrong unit.
    """
    first_cell = BeautifulSoup(page_html, 'html.parser').find('td')
    try:
        size_raw = first_cell.text.split('(')[1].split(')')[0]
        number, unit = size_raw.split(" ")[:2]
        value = float(number.replace(",", ""))
    except (AttributeError, IndexError, ValueError):
        # No <td>, no "(...)" part, or a non-numeric size: nothing usable on this page.
        print('  could not parse a package size from this page')
        return None, float('nan')

    factor = SIZE_UNIT_TO_MB.get(unit)
    if factor is None:
        print(f'  unknown size unit {unit!r}; size_MB set to NaN')
        return size_raw, float('nan')
    return size_raw, value * factor


# Build plain records first and create the DataFrame once at the end; growing a
# DataFrame with pd.concat inside the loop copies all previous rows on every iteration.
records = []
for dataset in datasets['results']:
    if 'ipt.geome-db.org' in dataset['url']:
        continue  # this IPT does not load, see the note above this cell

    print(dataset['title'])
    print(dataset['url'])
    html_text = requests.get(dataset['url']).text
    size_raw, size = _scrape_size(html_text)

    records.append(
        {"title": dataset['title'],
         "url": dataset['url'],
         "size_raw": size_raw,
         "size_MB": size,
         })

df = pd.DataFrame(records, columns=columns)

Print out statistics about the package sizes (in MB).

In [None]:
# Total and summary statistics of the scraped package sizes
size_mb = df['size_MB']
print('sum\t', size_mb.sum())
print(size_mb.describe())

## Download each Darwin Core Archive package
Skip any package that is already on the local machine.

In [None]:
import os
import urllib.request  # `import urllib` alone does not guarantee the `request` submodule is loaded
from urllib.parse import urljoin

# urlretrieve does not create missing directories, so make sure the target folder exists.
os.makedirs('OBIS_data', exist_ok=True)

for url in df['url']:
    print(url)
    # The archive is named after the text following the last "=" in the landing-page URL.
    fname = 'OBIS_data/'+url.split('=')[-1]+'.zip'

    # Check for a local copy first so already-downloaded datasets cost no HTTP request.
    if os.path.exists(fname):
        continue

    html_text = requests.get(url).text
    soup = BeautifulSoup(html_text, 'html.parser')
    size_raw = soup.find('td')  # first table cell; the download link is looked up inside it

    link = size_raw.find('a') if size_raw is not None else None
    if link is None or not link.get('href'):
        # Report and move on instead of failing the whole loop with an AttributeError.
        print('No download link found on '+url)
        continue

    # Resolve against the page URL so a relative href also works (absolute hrefs are unchanged).
    url_download = urljoin(url, link.get('href'))
    print('Downloading '+url)
    urllib.request.urlretrieve(url_download, fname)
    print('Downloaded to '+fname)

Read the occurrence data from all the packages you just downloaded.

In [None]:
# occurrence1 = pd.DataFrame(
#         columns = ['id', 'type', 'basisOfRecord', 'occurrenceID', 'occurrenceStatus',
#        'eventID', 'eventDate', 'decimalLatitude', 'decimalLongitude',
#        'scientificNameID', 'scientificName', 'kingdom', 'phylum', 'class',
#        'order', 'family', 'genus', 'taxonRank', 'scientificNameAuthorship'])

# from zipfile import ZipFile
# for obis_zip in os.listdir('OBIS_data/'):
#     if not obis_zip == 'unzipped':
#         with ZipFile('OBIS_data/'+obis_zip,'r') as zip:
#             df_init = pd.read_csv(obis_zip.open('occurrence.txt'), sep='\t') # not every occurrence file has eventDate
#             # extract all zip packages
#             # zip.extract_all(path='OBIS_data/unzipped/'+obis_zip.replace('.zip','/'))
#             # zip.ZIP_STORED
#             # might be able to read into pandas
#             occurrence1 = pd.concat([occurrence1, df_init], ignore_index=True)

Try the Darwin Core Archive reader package for Python, python-dwca-reader: https://python-dwca-reader.readthedocs.io/en/latest/index.html

In [None]:
# Import in this cell so it also works on a fresh kernel (Restart & Run All);
# without it, DwCAReader is undefined at this point in the notebook.
from dwca.read import DwCAReader

# Open one archive and print the west bounding coordinate from its metadata.
with DwCAReader('OBIS_data/ambon_cetaceans_2015.zip') as dwca:
    print(dwca.archive_path)
    root = dwca.metadata
    node = root.find('.//westBoundingCoordinate')
    if node is None:
        # find() returns None when the element is absent; avoid an AttributeError on .tag
        print('westBoundingCoordinate: not present in metadata')
    else:
        print('%s: %s' % (node.tag, node.text))

Now let's do an automated ingest of all the data:

In [None]:
from dwca.read import DwCAReader
from dwca.darwincore.utils import qualname as qn
import pandas as pd
import os

# Read the core data file of every downloaded archive and stack them into `core_df`.
# occurrence only = OBIS_data/wod_2009.zip
# event = OBIS_data/ambon_cetaceans_2015.zip
core_frames = []
# sorted() makes the row order of core_df reproducible; os.listdir order is arbitrary.
for obis_zip in sorted(os.listdir('OBIS_data/')):
    # Only the downloaded .zip archives are ingested; this skips the 'unzipped'
    # folder as well as stray entries such as hidden files or checkpoints.
    if not obis_zip.lower().endswith('.zip'):
        continue

    with DwCAReader('OBIS_data/'+obis_zip) as dwca:
        #eml = dwca.metadata
        print("\nReading: %s" % dwca.archive_path)
        print("Core type is: %s" % dwca.descriptor.core.type)
        print("Core data file is: %s" % dwca.descriptor.core.file_location)
        for ex in dwca.descriptor.extensions:
            print('Extensions: ',ex.type)

        core_frames.append(dwca.pd_read(dwca.core_file_location, parse_dates = True))

# Concatenate once: calling pd.concat inside the loop re-copies all earlier rows each time.
# pd.concat raises on an empty list, so fall back to an empty frame when nothing was read.
if core_frames:
    core_df = pd.concat(core_frames, axis = 0, ignore_index = True)
else:
    core_df = pd.DataFrame()

In [None]:
#import xml.etree.ElementTree as ET
# Print the path of the archive still bound to `dwca`, then the tag and text of
# every <role> element found in its metadata.
print(dwca.archive_path)
root = dwca.metadata
for role_elem in root.findall('.//role'):
    print(role_elem.tag, role_elem.text)

In [None]:
# Column names of the combined core table, as a plain list
list(core_df.columns)

In [None]:
#core_df['eventDate'].filter(like='0001', axis=0)
import numpy as np

## non-convertible dates (each value listed once; repeats add nothing to replace()):
bad_dates = [
    "0001-05-17", "0001-04-11", "0193-09-10", "0001-05-18", "1027-10-24",
    "0001-07-30", "2000-07-18", "2006-10-05T11:55", "3291-01-27",
    "1975-10-05T20:15:00Z", "1981-05-10T23:10:12Z", "1985-07-09T12:00:00Z",
    "1977-11-03T12:00:00Z", "1988-10-09T02:15:00Z", "1989-09-24T08:58:12Z",
    "1995-01-27T03:01:48Z", "1967-02-07T07:10:12Z",
]
# Replace those with NaN. The result is assigned back to the column instead of calling
# replace(..., inplace=True) on core_df['eventDate']: that form is chained assignment,
# which pandas warns about and which leaves core_df unchanged under Copy-on-Write.
core_df['eventDate'] = core_df['eventDate'].replace(bad_dates, np.nan)
# 164,341 observations have null dates
pd.to_datetime(core_df['eventDate'])

In [None]:
# Display the rows whose eventDate is null (this shows the rows themselves, not just a count;
# use core_df['eventDate'].isnull().sum() for the number alone)
core_df.loc[core_df['eventDate'].isnull()]

In [None]:
# (rows, columns) of the combined core table
core_df.shape

In [None]:
# Earliest event date in the core table. The `infer_datetime_format=True` argument was
# dropped: it is deprecated since pandas 2.0, where consistent format inference is the
# default and passing the flag only triggers a warning.
pd.to_datetime(core_df['eventDate']).min()

In [None]:
# Scan eventDate one value at a time and print the first one pd.to_datetime cannot parse.
for value in core_df['eventDate'].values:
    try:
        pd.to_datetime(value)
    except Exception:
        # `Exception` instead of a bare `except:` so that interrupting the kernel
        # (KeyboardInterrupt) during this long loop is not reported as a bad date.
        print(value)
        break

In [None]:
# Hard-coded snapshot of package sizes, in ascending order
# (presumably MB, like the scraped sizes earlier in the notebook -- TODO confirm).
package_sizes = [
    0.015835, 0.018933, 0.049817, 0.087867, 0.110903,
    0.159178, 0.197299, 0.241113, 0.297959, 0.415823,
    0.457184, 0.510765, 0.590182, 0.601713, 0.703488,
    0.703526, 0.703676, 0.703744, 0.703871, 0.703871,
    0.777377, 0.924155, 1.10372, 1.44756, 1.56119,
    1.56151, 1.5618, 1.56186, 1.90727, 2.2512,
    2.25399, 2.25418, 2.25419, 2.25465, 2.25466,
    2.2547, 2.25517, 2.25533, 2.25653, 2.27472,
    2.38664, 2.77899, 3.26323, 3.49174, 3.73588,
    3.91314, 4.10337, 4.11706, 4.37327, 4.73277,
    5.53673, 5.80743, 6.01187, 6.01363, 6.01561,
    6.02871, 6.03303, 6.0342, 6.21464, 6.33658,
    6.50698, 6.60439, 6.67536, 6.83223, 7.04321,
    9.09705, 9.58163, 9.70774, 9.84124, 9.84241,
    10.5076, 10.598, 10.7722, 11.7014, 12.8499,
    12.8959, 14.8624, 16.0524, 16.512, 17.2536,
    18.5005, 19.8645, 23.5451, 23.961, 26.2298,
    26.3219, 29.0074, 29.0466, 31.8277, 34.5672,
    34.9802, 36.9862, 41.1389, 46.6154, 51.1379,
    54.0095, 60.7156, 61.032, 78.5715, 78.7834,
    82.6558, 94.6736, 96.1423, 98.5113, 116.51,
    121.919, 127.061, 161.56, 167.974, 173.018,
    173.473, 182.459, 190.971, 198.847, 203.816,
    206.923, 212.72, 221.919, 230.564, 234.129,
    239.094, 245.25, 245.448, 251.73, 254.732,
    257.656, 266.012, 272.378, 286.562, 287.254,
    296.639, 303.077, 306.597, 324.479, 325.387,
    337.982, 354.244, 373.063, 416.506, 477.162,
    612.461, 804.789, 844.7, 1058.94, 3681.96,
]

# Single-column frame named 'size', one row per package
df_sizes = pd.DataFrame({'size': package_sizes})

In [None]:
# Column total, then summary statistics, of the hard-coded sizes
for report in (df_sizes.sum(), df_sizes.describe()):
    print(report)

Read in the unpackaged occurrence data from `OBIS_data/unzipped`.

In [None]:
# Columns that always lead the combined table, even when a package lacks some of them
# (not every occurrence file has eventDate, for example).
occurrence_columns = [
    'id', 'type', 'basisOfRecord', 'occurrenceID', 'occurrenceStatus',
    'eventID', 'eventDate', 'decimalLatitude', 'decimalLongitude',
    'scientificNameID', 'scientificName', 'kingdom', 'phylum', 'class',
    'order', 'family', 'genus', 'taxonRank', 'scientificNameAuthorship']

# Start from an empty template frame so the column order above is preserved, collect one
# frame per package, and concatenate once at the end (pd.concat inside the loop would
# re-copy every previously read row on each iteration).
occurrence_frames = [pd.DataFrame(columns=occurrence_columns)]

for package in sorted(os.listdir('OBIS_data/unzipped/')):
    occurrence_path = os.path.join('OBIS_data/unzipped', package, 'occurrence.txt')
    if not os.path.isfile(occurrence_path):
        # Stray files or folders without an occurrence table would otherwise crash read_csv.
        print('skipping', package, '(no occurrence.txt)')
        continue

    print('reading',package)
    occurrence_frames.append(pd.read_csv(occurrence_path, sep = '\t'))

occurrence = pd.concat(occurrence_frames, ignore_index=True)

In [None]:
# Earliest eventDate in the occurrence table after parsing the column as datetimes
pd.to_datetime(occurrence['eventDate']).min()

In [None]:
# Inspect the eventDate column as stored (no datetime conversion applied here)
occurrence['eventDate']

In [None]:
# Summary statistics of the latitude/longitude columns of the core table
print(core_df[['decimalLatitude','decimalLongitude']].describe())

In [None]:
# Column-wise minimum of latitude and longitude
core_df[['decimalLatitude','decimalLongitude']].min()

In [None]:
# Column-wise maximum of latitude and longitude
core_df[['decimalLatitude','decimalLongitude']].max()