In [None]:
# Import requests and set the OBIS API base URL. 
import requests
import json
import pandas as pd
import urllib

# Convenience function to pretty print JSON objects
def print_json(myjson):
    """Print a JSON-serializable object as indented, key-sorted JSON text.

    Parameters
    ----------
    myjson : object
        Any value accepted by ``json.dumps`` (dict, list, str, number, ...).
    """
    formatted = json.dumps(myjson, sort_keys=True, indent=4, separators=(',', ': '))
    print(formatted)
    

# Base URL of the OBIS v3 API. Each request in this notebook is built by
# appending an endpoint path (e.g. "/node", "/dataset") to this value.
OBIS_URL = "https://api.obis.org/v3"

We are not sure which node ID to query, so let's get all of the OBIS nodes.

In [None]:
# node -- request the list of all OBIS nodes
# A timeout keeps the cell from hanging forever on an unresponsive server.
req = requests.get(f'{OBIS_URL}/node', timeout=60)
# Surface HTTP errors here instead of as a confusing JSON/KeyError further down.
req.raise_for_status()
nodes_json = req.json()

# Number of OBIS nodes reported by the API
f"Total Nodes: {nodes_json['total']}"

Print all the names and IDs for each node

In [None]:
# One line per node: its human-readable name followed by its identifier.
for obis_node in nodes_json['results']:
    node_name = obis_node["name"]
    node_id = obis_node["id"]
    print(f'Name: {node_name} - ID: {node_id}')

Let's return just the OBIS-USA record, using its `id` value:

In [None]:
# ID used for the OBIS-USA node (one of the IDs printed by the node listing).
nodeID = 'b7c47783-a020-4173-b390-7b57c4fa1426'
# node/{nodeID}
# A timeout keeps the cell from hanging forever on an unresponsive server.
req = requests.get(f'{OBIS_URL}/node/{nodeID}', timeout=60)
# Surface HTTP errors (e.g. an unknown node ID) before trying to decode JSON.
req.raise_for_status()
obis_usa_json = req.json()

# Show OBIS-USA node record
print_json(obis_usa_json)

Find the number of datasets currently in OBIS-USA

In [None]:
# dataset?nodeid={nodeID} -- datasets that belong to the selected node.
# Passing the query through `params` lets requests handle URL encoding.
req = requests.get(f'{OBIS_URL}/dataset', params={'nodeid': nodeID}, timeout=60)
# Surface HTTP errors before trying to decode JSON.
req.raise_for_status()
datasets = req.json()
# NOTE(review): if this endpoint paginates, `datasets['results']` may hold fewer
# entries than `datasets['total']` -- confirm against the OBIS API docs.
print('Number of datasets in OBIS-USA:', datasets['total'])

Let's print out the metadata from one of the datasets.

In [None]:
# Use the first returned dataset as an example of the metadata structure.
first_dataset = datasets['results'][0]
print_json(first_dataset)

Now, let's iterate through all the datasets and collect metadata into a Pandas DataFrame. We're skipping the datasets hosted on the `ipt.geome-db.org` IPT because the website doesn't load: https://ipt.geome-db.org/resource?r=dipnet

To do this, we use the urls for each dataset in the OBIS-USA node and parse the html page for the **size** and **title** of the dataset. We also convert the size to MB. We return a pandas DataFrame with four columns: **title**, **url**, **size_raw**, **size_MB**

In [None]:
# Lets grab out some metadata about each dataset

from bs4 import BeautifulSoup

columns = ['title','url','size_raw','size_MB']

# Factors that convert a size in the given unit to (decimal) megabytes.
SIZE_UNIT_TO_MB = {'bytes': 1e-6, 'B': 1e-6, 'KB': 1e-3, 'MB': 1.0, 'GB': 1e3}


def size_to_mb(size_raw):
    """Convert a human-readable size such as ``'1,234.5 KB'`` to megabytes.

    Parameters
    ----------
    size_raw : str
        A number (thousands separators allowed) followed by a unit.

    Returns
    -------
    float
        The size in MB, or ``nan`` when the unit is missing or is not a key
        of ``SIZE_UNIT_TO_MB``.

    Raises
    ------
    ValueError
        If the leading token is not a number.
    """
    parts = size_raw.split()
    value = float(parts[0].replace(",", ""))
    unit = parts[1] if len(parts) > 1 else ''
    # An unrecognised unit yields NaN instead of being silently counted as MB.
    return value * SIZE_UNIT_TO_MB.get(unit, float('nan'))


# Collect one record per dataset and build the DataFrame once at the end;
# concatenating inside the loop copies the whole frame on every iteration.
records = []

for dataset in datasets['results']:
    # Datasets hosted on the geome-db IPT are skipped on purpose.
    if 'ipt.geome-db.org' in dataset['url']:
        continue

    print(dataset['title'])
    print(dataset['url'])
    html_text = requests.get(dataset['url'], timeout=60).text
    soup = BeautifulSoup(html_text, 'html.parser')

    # The size is read from the parenthesised text in the page's first <td>.
    first_cell = soup.find('td')
    if first_cell is None or '(' not in first_cell.text:
        print('  no package size found on this page; skipping')
        continue
    size_raw = first_cell.text.split('(')[1].split(')')[0]

    records.append(
        {"title": dataset['title'],
         "url": dataset['url'],
         "size_raw": size_raw,
         "size_MB": size_to_mb(size_raw),
         })

df = pd.DataFrame(records, columns=columns)

Print out statistics about the package sizes (in MB).

In [None]:
# Total and distribution of the package sizes collected in `df`.
size_mb = df['size_MB']
print('sum\t', size_mb.sum())
print(size_mb.describe())

## Download each Darwin Core Archive package
For each dataset, we download the [DwC-A](https://github.com/gbif/ipt/wiki/DwCAHowToGuide#what-is-darwin-core-archive-dwc-a) zip package. To do that we:

1. Collect the DwC-A zip url by parsing the **IPT** dataset html page, looking for the **Data as a DwC-A file** `download` link.  
1. We download the zip package to the file `OBIS_data/{dataset short name}.zip` (eg. `OBIS_data/habsos.zip`) 
   1. Don't download if it's already on local machine.

In [None]:
import os
import urllib.request  # import the submodule explicitly; `import urllib` alone does not guarantee it
from urllib.parse import urljoin

# urlretrieve does not create missing directories.
os.makedirs('OBIS_data', exist_ok=True)

for url in df['url']:
    print(url)
    fname = 'OBIS_data/'+url.split('=')[-1]+'.zip'

    # Check for a local copy first so an existing package costs no HTTP request.
    if os.path.exists(fname):
        continue

    html_text = requests.get(url, timeout=60).text
    soup = BeautifulSoup(html_text, 'html.parser')

    # The download link is the first <a> inside the page's first <td>.
    first_cell = soup.find('td')
    link = first_cell.find('a') if first_cell is not None else None
    if link is None or not link.get('href'):
        print('No DwC-A download link found on '+url)
        continue
    # urljoin leaves absolute links untouched and resolves relative ones.
    zip_download = urljoin(url, link.get('href'))

    print('Downloading '+url)
    # Download to a temporary name and rename on success, so an interrupted
    # download is never mistaken for a complete package on the next run.
    partial_name = fname+'.part'
    urllib.request.urlretrieve(zip_download, partial_name)
    os.replace(partial_name, fname)
    print('Downloaded to '+fname)

Manually read the occurrence data from all the packages you just downloaded.

In [None]:
# occurrence1 = pd.DataFrame(
#         columns = ['id', 'type', 'basisOfRecord', 'occurrenceID', 'occurrenceStatus',
#        'eventID', 'eventDate', 'decimalLatitude', 'decimalLongitude',
#        'scientificNameID', 'scientificName', 'kingdom', 'phylum', 'class',
#        'order', 'family', 'genus', 'taxonRank', 'scientificNameAuthorship'])

# from zipfile import ZipFile
# for obis_zip in os.listdir('OBIS_data/'):
#     if not obis_zip == 'unzipped':
#         with ZipFile('OBIS_data/'+obis_zip,'r') as zip:
#             df_init = pd.read_csv(obis_zip.open('occurrence.txt'), sep='\t') # not every occurrence file has eventDate
#             # extract all zip packages
#             # zip.extract_all(path='OBIS_data/unzipped/'+obis_zip.replace('.zip','/'))
#             # zip.ZIP_STORED
#             # might be able to read into pandas
#             occurrence1 = pd.concat([occurrence1, df_init], ignore_index=True)

Use the [darwin core python reader package](https://python-dwca-reader.readthedocs.io/en/latest/index.html) to print out some metadata about the DwC-A package.

In [None]:
# Imported here because this is the first cell that uses DwCAReader; without it
# the cell raises NameError on a fresh kernel.
from dwca.read import DwCAReader

with DwCAReader('OBIS_data/ambon_cetaceans_2015.zip') as dwca:
    print(dwca.archive_path)
    # `dwca.metadata` is searched like an XML element tree.
    root = dwca.metadata
    node = root.find('.//westBoundingCoordinate')
    # find() returns None when the element is absent from the metadata.
    if node is None:
        print('westBoundingCoordinate: not present in the metadata')
    else:
        print('%s: %s' % (node.tag, node.text))

Now let's do some automated ingest of all the data:
1. For each zip package
   1. Read the core file into a Pandas DataFrame.
   1. Concatenate all the core data into one large data frame.
   1. Print out some useful information as each package is processed.

In [1]:
from dwca.read import DwCAReader
from dwca.darwincore.utils import qualname as qn
import pandas as pd
import os

# Core tables are collected here and concatenated once after the loop;
# concatenating inside the loop re-copies all previous rows for every package.
core_frames = []
# occurrence only = OBIS_data/wod_2009.zip
# event = OBIS_data/ambon_cetaceans_2015.zip
# sorted() makes the processing order (and so the row order) deterministic.
for obis_zip in sorted(os.listdir('OBIS_data/')):
    # Only zip packages are archives; this skips the 'unzipped' directory
    # and any other stray entry.
    if not obis_zip.endswith('.zip'):
        continue
    with DwCAReader('OBIS_data/'+obis_zip) as dwca:
        #eml = dwca.metadata
        print("\nReading: %s" % dwca.archive_path)
        print("Core type is: %s" % dwca.descriptor.core.type)
        print("Core data file is: %s" % dwca.descriptor.core.file_location)
        for ex in dwca.descriptor.extensions:
            print('Extensions: ',ex.type)

        core_frames.append(dwca.pd_read(dwca.core_file_location, parse_dates = True))

# pd.concat raises on an empty list, so fall back to an empty frame.
if core_frames:
    core_df = pd.concat(core_frames, axis = 0, ignore_index = True)
else:
    core_df = pd.DataFrame()


Reading: OBIS_data/2009floridakeysrvc.zip
Core type is: http://rs.tdwg.org/dwc/terms/Event
Core data file is: event.txt
Extensions:  http://rs.iobis.org/obis/terms/ExtendedMeasurementOrFact
Extensions:  http://rs.tdwg.org/dwc/terms/Occurrence

Reading: OBIS_data/2012floridakeys.zip
Core type is: http://rs.tdwg.org/dwc/terms/Event
Core data file is: event.txt
Extensions:  http://rs.tdwg.org/dwc/terms/Occurrence
Extensions:  http://rs.iobis.org/obis/terms/ExtendedMeasurementOrFact

Reading: OBIS_data/afsc_northpacificgroundfishobserver.zip
Core type is: http://rs.tdwg.org/dwc/terms/Occurrence
Core data file is: occurrence.txt
Extensions:  http://rs.iobis.org/obis/terms/ExtendedMeasurementOrFact

Reading: OBIS_data/ambon_benthicepifauna_2015.zip
Core type is: http://rs.tdwg.org/dwc/terms/Event
Core data file is: event.txt
Extensions:  http://rs.iobis.org/obis/terms/ExtendedMeasurementOrFact
Extensions:  http://rs.tdwg.org/dwc/terms/Occurrence

Reading: OBIS_data/ambon_cetaceans_2015.zip



Reading: OBIS_data/crempdrytortugas2003.zip
Core type is: http://rs.tdwg.org/dwc/terms/Event
Core data file is: event.txt
Extensions:  http://rs.iobis.org/obis/terms/ExtendedMeasurementOrFact
Extensions:  http://rs.tdwg.org/dwc/terms/Occurrence

Reading: OBIS_data/crempdrytortugas2004.zip
Core type is: http://rs.tdwg.org/dwc/terms/Event
Core data file is: event.txt
Extensions:  http://rs.tdwg.org/dwc/terms/Occurrence
Extensions:  http://rs.iobis.org/obis/terms/ExtendedMeasurementOrFact

Reading: OBIS_data/crempdrytortugas2005.zip
Core type is: http://rs.tdwg.org/dwc/terms/Event
Core data file is: event.txt
Extensions:  http://rs.tdwg.org/dwc/terms/Occurrence
Extensions:  http://rs.iobis.org/obis/terms/ExtendedMeasurementOrFact

Reading: OBIS_data/crempdrytortugas2006.zip
Core type is: http://rs.tdwg.org/dwc/terms/Event
Core data file is: event.txt
Extensions:  http://rs.iobis.org/obis/terms/ExtendedMeasurementOrFact
Extensions:  http://rs.tdwg.org/dwc/terms/Occurrence

Reading: OBIS_d


Reading: OBIS_data/floridakeysreefvisualcensus2001.zip
Core type is: http://rs.tdwg.org/dwc/terms/Event
Core data file is: event.txt
Extensions:  http://rs.iobis.org/obis/terms/ExtendedMeasurementOrFact
Extensions:  http://rs.tdwg.org/dwc/terms/Occurrence

Reading: OBIS_data/floridakeysreefvisualcensus2003.zip
Core type is: http://rs.tdwg.org/dwc/terms/Event
Core data file is: event.txt
Extensions:  http://rs.tdwg.org/dwc/terms/Occurrence
Extensions:  http://rs.iobis.org/obis/terms/ExtendedMeasurementOrFact

Reading: OBIS_data/floridakeysreefvisualcensus2004.zip
Core type is: http://rs.tdwg.org/dwc/terms/Event
Core data file is: event.txt
Extensions:  http://rs.tdwg.org/dwc/terms/Occurrence
Extensions:  http://rs.iobis.org/obis/terms/ExtendedMeasurementOrFact

Reading: OBIS_data/floridakeysreefvisualcensus2005.zip
Core type is: http://rs.tdwg.org/dwc/terms/Event
Core data file is: event.txt
Extensions:  http://rs.tdwg.org/dwc/terms/Occurrence
Extensions:  http://rs.iobis.org/obis/term

  if (await self.run_code(code, result,  async_=asy)):



Reading: OBIS_data/kubi_ichthyologycollection_marine.zip
Core type is: http://rs.tdwg.org/dwc/terms/Occurrence
Core data file is: occurrence.txt

Reading: OBIS_data/marinegeo_reeflifesurvey.zip
Core type is: http://rs.tdwg.org/dwc/terms/Event
Core data file is: event.txt
Extensions:  http://rs.tdwg.org/dwc/terms/Occurrence
Extensions:  http://rs.iobis.org/obis/terms/ExtendedMeasurementOrFact

Reading: OBIS_data/mbenthos_penobscotbay.zip
Core type is: http://rs.tdwg.org/dwc/terms/Occurrence
Core data file is: occurrence.txt

Reading: OBIS_data/ncrmp_blt_fish_pacific.zip
Core type is: http://rs.tdwg.org/dwc/terms/Event
Core data file is: event.txt
Extensions:  http://rs.iobis.org/obis/terms/ExtendedMeasurementOrFact
Extensions:  http://rs.tdwg.org/dwc/terms/Occurrence

Reading: OBIS_data/ncrmp_nspc_fish_pacific.zip
Core type is: http://rs.tdwg.org/dwc/terms/Event
Core data file is: event.txt
Extensions:  http://rs.iobis.org/obis/terms/ExtendedMeasurementOrFact
Extensions:  http://rs.tdw

  if (await self.run_code(code, result,  async_=asy)):



Reading: OBIS_data/noaa_coralreefmonitoring_lpipercentcover.zip
Core type is: http://rs.tdwg.org/dwc/terms/Event
Core data file is: event.txt
Extensions:  http://rs.iobis.org/obis/terms/ExtendedMeasurementOrFact
Extensions:  http://rs.tdwg.org/dwc/terms/Occurrence

Reading: OBIS_data/noaa_dsc_rtp.zip
Core type is: http://rs.tdwg.org/dwc/terms/Occurrence
Core data file is: occurrence.txt


  if (await self.run_code(code, result,  async_=asy)):



Reading: OBIS_data/noaa_micronesia_reef_monitoring_benthic.zip
Core type is: http://rs.tdwg.org/dwc/terms/Event
Core data file is: event.txt
Extensions:  http://rs.tdwg.org/dwc/terms/Occurrence
Extensions:  http://rs.iobis.org/obis/terms/ExtendedMeasurementOrFact

Reading: OBIS_data/noaa_micronesia_reef_monitoring_fish.zip
Core type is: http://rs.tdwg.org/dwc/terms/Event
Core data file is: event.txt
Extensions:  http://rs.iobis.org/obis/terms/ExtendedMeasurementOrFact
Extensions:  http://rs.tdwg.org/dwc/terms/Occurrence

Reading: OBIS_data/noaa_micronesia_reef_monitoring_invert.zip
Core type is: http://rs.tdwg.org/dwc/terms/Event
Core data file is: event.txt
Extensions:  http://rs.tdwg.org/dwc/terms/Occurrence
Extensions:  http://rs.iobis.org/obis/terms/ExtendedMeasurementOrFact

Reading: OBIS_data/noaa_nbi.zip
Core type is: http://rs.tdwg.org/dwc/terms/Occurrence
Core data file is: occurrence.txt

Reading: OBIS_data/ogl_msc_northeasternuniversity.zip
Core type is: http://rs.tdwg.org/

  if (await self.run_code(code, result,  async_=asy)):



Reading: OBIS_data/tpwd_harc_texascorpuschristibay_gillnet_20150130.zip
Core type is: http://rs.tdwg.org/dwc/terms/Occurrence
Core data file is: occurrence.txt

Reading: OBIS_data/tpwd_harc_texascorpuschristibay_trawl_20150130.zip
Core type is: http://rs.tdwg.org/dwc/terms/Occurrence
Core data file is: occurrence.txt

Reading: OBIS_data/tpwd_harc_texascorpuschristie_bagseine_20150130.zip
Core type is: http://rs.tdwg.org/dwc/terms/Occurrence
Core data file is: occurrence.txt

Reading: OBIS_data/tpwd_harc_texaslowerlagunamadre_bagseine_20150130.zip
Core type is: http://rs.tdwg.org/dwc/terms/Occurrence
Core data file is: occurrence.txt

Reading: OBIS_data/tpwd_harc_texaslowerlagunamadre_gillnet_20150130.zip
Core type is: http://rs.tdwg.org/dwc/terms/Occurrence
Core data file is: occurrence.txt

Reading: OBIS_data/tpwd_harc_texaslowerlagunamadre_trawl_20150130.zip
Core type is: http://rs.tdwg.org/dwc/terms/Occurrence
Core data file is: occurrence.txt

Reading: OBIS_data/tpwd_harc_texasmat

  if (await self.run_code(code, result,  async_=asy)):



Reading: OBIS_data/tpwd_harc_texasupperlagunamadre_gillnet_20150202.zip
Core type is: http://rs.tdwg.org/dwc/terms/Occurrence
Core data file is: occurrence.txt


  if (await self.run_code(code, result,  async_=asy)):



Reading: OBIS_data/tpwd_harc_upperlagunamadre_trawl_20150202.zip
Core type is: http://rs.tdwg.org/dwc/terms/Occurrence
Core data file is: occurrence.txt

Reading: OBIS_data/usgs_asc_polarbearmaternaldens_20150129.zip
Core type is: http://rs.tdwg.org/dwc/terms/Occurrence
Core data file is: occurrence.txt

Reading: OBIS_data/usgs_glsc_rvcat_trawl.zip
Core type is: http://rs.tdwg.org/dwc/terms/Event
Core data file is: event.txt
Extensions:  http://rs.iobis.org/obis/terms/ExtendedMeasurementOrFact
Extensions:  http://rs.tdwg.org/dwc/terms/Occurrence

Reading: OBIS_data/usgs_nas.zip
Core type is: http://rs.tdwg.org/dwc/terms/Occurrence
Core data file is: occurrence.txt

Reading: OBIS_data/usgs_nas_nonfish.zip
Core type is: http://rs.tdwg.org/dwc/terms/Occurrence
Core data file is: occurrence.txt

Reading: OBIS_data/usgs_nas_tigershrimp.zip
Core type is: http://rs.tdwg.org/dwc/terms/Occurrence
Core data file is: occurrence.txt

Reading: OBIS_data/usgs_pwrc_seabirdscompendium.zip
Core type i

  if (await self.run_code(code, result,  async_=asy)):



Reading: OBIS_data/usgs_southflorida_fian_crustaceans.zip
Core type is: http://rs.tdwg.org/dwc/terms/Occurrence
Core data file is: occurrence.txt

Reading: OBIS_data/usgs_southflorida_fian_fish.zip
Core type is: http://rs.tdwg.org/dwc/terms/Occurrence
Core data file is: occurrence.txt


  if (await self.run_code(code, result,  async_=asy)):



Reading: OBIS_data/usgs_southflorida_fian_harvest_20150129.zip
Core type is: http://rs.tdwg.org/dwc/terms/Occurrence
Core data file is: occurrence.txt

Reading: OBIS_data/usgs_stcroix_marinefishes_20150129.zip
Core type is: http://rs.tdwg.org/dwc/terms/Occurrence
Core data file is: occurrence.txt

Reading: OBIS_data/usgs_waterquality_sanfranciscobay_phytoplankton_1992_2014.zip
Core type is: http://rs.tdwg.org/dwc/terms/Event
Core data file is: event.txt
Extensions:  http://rs.tdwg.org/dwc/terms/Occurrence
Extensions:  http://rs.iobis.org/obis/terms/ExtendedMeasurementOrFact

Reading: OBIS_data/uwfc.zip
Core type is: http://rs.tdwg.org/dwc/terms/Occurrence
Core data file is: occurrence.txt


  if (await self.run_code(code, result,  async_=asy)):



Reading: OBIS_data/vims_chesmmap.zip
Core type is: http://rs.tdwg.org/dwc/terms/Occurrence
Core data file is: occurrence.txt

Reading: OBIS_data/vims_neamap.zip
Core type is: http://rs.tdwg.org/dwc/terms/Event
Core data file is: event.txt
Extensions:  http://rs.iobis.org/obis/terms/ExtendedMeasurementOrFact
Extensions:  http://rs.tdwg.org/dwc/terms/Occurrence

Reading: OBIS_data/whoi_epr_deepseavents_speciescounts.zip
Core type is: http://rs.tdwg.org/dwc/terms/Event
Core data file is: event.txt
Extensions:  http://rs.tdwg.org/dwc/terms/Occurrence
Extensions:  http://rs.iobis.org/obis/terms/ExtendedMeasurementOrFact

Reading: OBIS_data/wod_2009.zip
Core type is: http://rs.tdwg.org/dwc/terms/Occurrence
Core data file is: occurrence.txt


Print out some additional metadata about the last package.

In [2]:
#import xml.etree.ElementTree as ET
# `dwca` still refers to the last archive opened by the ingest loop.
print(dwca.archive_path)
root = dwca.metadata
role_elements = root.findall('.//role')
for role_element in role_elements:
    print(role_element.tag, role_element.text)

OBIS_data/wod_2009.zip
role publisher


In [30]:
# Show the name of every column in `core_df` as a plain Python list.
core_df.columns.to_list()

['id',
 'type',
 'modified',
 'language',
 'license',
 'references',
 'datasetID',
 'datasetName',
 'ownerInstitutionCode',
 'eventID',
 'samplingProtocol',
 'samplingEffort',
 'eventDate',
 'habitat',
 'waterBody',
 'country',
 'stateProvince',
 'minimumElevationInMeters',
 'maximumElevationInMeters',
 'minimumDepthInMeters',
 'maximumDepthInMeters',
 'decimalLatitude',
 'decimalLongitude',
 'geodeticDatum',
 'institutionCode',
 'collectionCode',
 'basisOfRecord',
 'occurrenceID',
 'catalogNumber',
 'occurrenceRemarks',
 'individualCount',
 'sex',
 'occurrenceStatus',
 'year',
 'coordinateUncertaintyInMeters',
 'scientificNameID',
 'scientificName',
 'countryCode',
 'parentEventID',
 'eventRemarks',
 'footprintWKT',
 'bibliographicCitation',
 'dynamicProperties',
 'materialSampleID',
 'recordedBy',
 'associatedReferences',
 'verbatimEventDate',
 'georeferenceProtocol',
 'identificationRemarks',
 'kingdom',
 'phylum',
 'class',
 'order',
 'family',
 'taxonRank',
 'scientificNameAuthors

Find rows where the dates are not interpretable or where the coordinates are outside the valid range. A row is flagged if it fails any of the following checks:

eventDate is not ISO 8601

-90 < Lat < 90

-180 < Lon < 180

In [32]:
pd.set_option('display.max_columns', None)
bad_dates = ["0001-05-17","0001-04-11","0001-05-17","0001-04-11","0001-04-11","0001-04-11","0193-09-10","0193-09-10","0001-05-18","1027-10-24","0001-04-11","0001-04-11","0001-05-18","0001-04-11","0001-05-18","0001-04-11","0001-05-17","0001-04-11","0001-04-11","0001-04-11","0001-07-30","0001-07-30","0001-04-11","0001-05-17","0001-04-11","0001-05-17","3291-01-27"]
#core_df.loc[core_df['eventDate']=="0001-05-17"].to_csv('bad_dates.csv',index=False)

# Alternation of the distinct bad dates. The group is non-capturing because
# str.contains warns when a pattern contains capturing groups.
bad_date_pattern = '(?:' + '|'.join(dict.fromkeys(bad_dates)) + ')'

# A row is flagged when its eventDate contains a known bad date OR either
# coordinate is out of range. NaN coordinates compare False and are not flagged.
core_df.loc[  core_df['eventDate'].str.contains(bad_date_pattern, na=False) |
            ( core_df['decimalLatitude'] > 90 ) |
            ( core_df['decimalLatitude'] < -90 ) |
            ( core_df['decimalLongitude'] > 180 ) |
            ( core_df['decimalLongitude'] < -180 ),
           ['id','datasetName','datasetID','eventDate','decimalLatitude','decimalLongitude']]#.to_csv('bad_dates.csv',index=False)

  return func(self, *args, **kwargs)


Unnamed: 0,id,datasetName,datasetID,eventDate,decimalLatitude,decimalLongitude
534336,15770D23-0378-4F02-9A03-0F67EF498345,University of Florida Museum of Natural Histor...,UF_Invertebrates,0001-05-17,13.597,144.830667
542429,0575082E-2E5E-4AD7-BC30-1A22218AA932,University of Florida Museum of Natural Histor...,UF_Invertebrates,0001-04-11,29.003167,-83.1777
554304,0BE71302-3A15-46A7-BA0F-21BD1D67AA2E,University of Florida Museum of Natural Histor...,UF_Invertebrates,0001-05-17,13.597,144.830667
568747,2D167A89-C753-4CDE-A8B1-4F9C09BD2732,University of Florida Museum of Natural Histor...,UF_Invertebrates,0001-04-11,29.003167,-83.1777
569040,EF7EABD4-2D8B-4091-AA4D-44A594F0CEFD,University of Florida Museum of Natural Histor...,UF_Invertebrates,0001-04-11,29.003167,-83.1777
569058,423573E6-171B-4D65-9EF1-176E33D5D100,University of Florida Museum of Natural Histor...,UF_Invertebrates,0001-04-11,29.003167,-83.1777
574732,8EB97E6C-8BE1-4BBC-A38C-CF3DCFD86BE1,University of Florida Museum of Natural Histor...,UF_Invertebrates,0193-09-10,25.0,-80.0
574791,7D1BCA97-0C21-482D-9DB6-334564DE8AF3,University of Florida Museum of Natural Histor...,UF_Invertebrates,0193-09-10,26.0,-80.0
577888,EA85C53F-4041-4398-8C04-4D3B4203C2C0,University of Florida Museum of Natural Histor...,UF_Invertebrates,0001-05-18,13.4335,144.6325
577901,291AC732-5E1D-43F0-97BD-46ACEA7D8CE0,University of Florida Museum of Natural Histor...,UF_Invertebrates,1027-10-24,26.0,-82.0


Create a mask for those observations that are problematic.

In [37]:
# Boolean Series, True for rows whose eventDate contains one of `bad_dates`
# or whose latitude/longitude is out of range.
# The regex group is non-capturing because str.contains warns when a pattern
# contains capturing groups.
mask = (
    core_df['eventDate'].str.contains('(?:' + '|'.join(bad_dates) + ')', na=False)
    | ( core_df['decimalLatitude'] > 90 )
    | ( core_df['decimalLatitude'] < -90 )
    | ( core_df['decimalLongitude'] > 180 )
    | ( core_df['decimalLongitude'] < -180 )
)

  return func(self, *args, **kwargs)


Apply the mask and print the first flagged `eventDate` value that `pd.to_datetime` cannot parse.

In [49]:
# Print the first flagged eventDate that pandas cannot convert, then stop.
# .loc selects rows and column in one step instead of chained indexing.
for value in core_df.loc[mask, 'eventDate']:
    try:
        pd.to_datetime(value)
    except Exception:
        # `Exception` rather than a bare `except`, so that interrupting the
        # kernel (KeyboardInterrupt) is not swallowed and reported as a bad date.
        print(value)
        break

0001-05-17


In [None]:
#core_df['eventDate'].filter(like='0001', axis=0)
import numpy as np

## non-convertable dates:
bad_dates = ["0001-05-17","0001-04-11","0001-05-17","0001-04-11","0001-04-11","0001-04-11","0193-09-10","0193-09-10","0001-05-18","1027-10-24","0001-04-11","0001-04-11","0001-05-18","0001-04-11","0001-05-18","0001-04-11","0001-05-17","0001-04-11","0001-04-11","0001-04-11","0001-07-30","0001-07-30","0001-04-11","0001-05-17","0001-04-11","0001-05-17","2000-07-18","2006-10-05T11:55","3291-01-27","1975-10-05T20:15:00Z","1981-05-10T23:10:12Z","1985-07-09T12:00:00Z","1977-11-03T12:00:00Z","1988-10-09T02:15:00Z","1989-09-24T08:58:12Z","1995-01-27T03:01:48Z","1967-02-07T07:10:12Z"]
# replace those w/ nan
# Assign the result back to the column: `inplace=True` on a column selection
# is chained assignment and does not update `core_df` under Copy-on-Write.
core_df['eventDate'] = core_df['eventDate'].replace(bad_dates, np.nan)
# 164,341 observations have null dates
pd.to_datetime(core_df['eventDate'])

In [None]:
# print number of bad dates
# Count the rows with a null eventDate rather than displaying every such row.
core_df['eventDate'].isnull().sum()

In [None]:
# (number of rows, number of columns) of `core_df`.
core_df.shape

In [48]:
# Earliest eventDate among the rows NOT flagged by `mask`.
# NOTE(review): the saved output of this cell is
# "RuntimeError: No active exception to reraise", so this call did not complete
# when the notebook was last run -- TODO find which eventDate values trigger it.
# NOTE(review): `infer_datetime_format` is reportedly deprecated in newer pandas
# releases -- confirm against the installed version.
pd.to_datetime(core_df[~mask]['eventDate'],infer_datetime_format=True).min()

RuntimeError: No active exception to reraise

In [None]:
# Print the first eventDate in the whole table that pandas cannot convert,
# then stop.
for value in core_df['eventDate'].values:
    try:
        pd.to_datetime(value)
    except Exception:
        # `Exception` rather than a bare `except`, so that interrupting the
        # kernel (KeyboardInterrupt) is not swallowed and reported as a bad date.
        print(value)
        break

In [None]:
# Hard-coded single-column table ('size') of numeric values, listed in
# ascending order.
# NOTE(review): presumably dataset package sizes in MB (compare `size_MB`
# earlier in the notebook); the source of these numbers is not shown here --
# TODO confirm provenance and units.
df_sizes = pd.DataFrame({
    'size':[0.015835,
0.018933,
0.049817,
0.087867,
0.110903,
0.159178,
0.197299,
0.241113,
0.297959,
0.415823,
0.457184,
0.510765,
0.590182,
0.601713,
0.703488,
0.703526,
0.703676,
0.703744,
0.703871,
0.703871,
0.777377,
0.924155,
1.10372,
1.44756,
1.56119,
1.56151,
1.5618,
1.56186,
1.90727,
2.2512,
2.25399,
2.25418,
2.25419,
2.25465,
2.25466,
2.2547,
2.25517,
2.25533,
2.25653,
2.27472,
2.38664,
2.77899,
3.26323,
3.49174,
3.73588,
3.91314,
4.10337,
4.11706,
4.37327,
4.73277,
5.53673,
5.80743,
6.01187,
6.01363,
6.01561,
6.02871,
6.03303,
6.0342,
6.21464,
6.33658,
6.50698,
6.60439,
6.67536,
6.83223,
7.04321,
9.09705,
9.58163,
9.70774,
9.84124,
9.84241,
10.5076,
10.598,
10.7722,
11.7014,
12.8499,
12.8959,
14.8624,
16.0524,
16.512,
17.2536,
18.5005,
19.8645,
23.5451,
23.961,
26.2298,
26.3219,
29.0074,
29.0466,
31.8277,
34.5672,
34.9802,
36.9862,
41.1389,
46.6154,
51.1379,
54.0095,
60.7156,
61.032,
78.5715,
78.7834,
82.6558,
94.6736,
96.1423,
98.5113,
116.51,
121.919,
127.061,
161.56,
167.974,
173.018,
173.473,
182.459,
190.971,
198.847,
203.816,
206.923,
212.72,
221.919,
230.564,
234.129,
239.094,
245.25,
245.448,
251.73,
254.732,
257.656,
266.012,
272.378,
286.562,
287.254,
296.639,
303.077,
306.597,
324.479,
325.387,
337.982,
354.244,
373.063,
416.506,
477.162,
612.461,
804.789,
844.7,
1058.94,
3681.96]
})

In [None]:
# Column total, then summary statistics, of the hard-coded size table.
size_total = df_sizes.sum()
size_summary = df_sizes.describe()
print(size_total)
print(size_summary)

Read in the unpackaged occurrence data from `OBIS_data/unzipped`.

In [None]:
# Columns that lead the combined table, whether or not every package has them.
occurrence_columns = ['id', 'type', 'basisOfRecord', 'occurrenceID', 'occurrenceStatus',
       'eventID', 'eventDate', 'decimalLatitude', 'decimalLongitude',
       'scientificNameID', 'scientificName', 'kingdom', 'phylum', 'class',
       'order', 'family', 'genus', 'taxonRank', 'scientificNameAuthorship']

# Read every package first and concatenate once; concatenating inside the loop
# re-copies all previously read rows on each iteration.
occurrence_frames = []
# sorted() makes the read order (and so the row order) deterministic.
for package in sorted(os.listdir('OBIS_data/unzipped/')):
    occurrence_path = 'OBIS_data/unzipped/'+package+'/occurrence.txt'
    # Skip entries without an occurrence table rather than aborting the loop.
    if not os.path.isfile(occurrence_path):
        print('skipping',package,'(no occurrence.txt)')
        continue
    print('reading',package)
    occurrence_frames.append(pd.read_csv(occurrence_path, sep = '\t'))

if occurrence_frames:
    combined = pd.concat(occurrence_frames, ignore_index=True)
    # Expected columns first (added as all-NaN when absent), then any extras
    # in the order they were first seen.
    extra_columns = [col for col in combined.columns if col not in occurrence_columns]
    occurrence = combined.reindex(columns=occurrence_columns + extra_columns)
else:
    occurrence = pd.DataFrame(columns=occurrence_columns)

In [None]:
# Earliest eventDate in `occurrence` after converting the column to datetimes.
pd.to_datetime(occurrence['eventDate']).min()

In [None]:
# Display the eventDate column of `occurrence` as read from the files.
occurrence['eventDate']

In [None]:
# Summary statistics for the two coordinate columns of `core_df`.
coordinate_columns = ['decimalLatitude','decimalLongitude']
coordinate_summary = core_df[coordinate_columns].describe()
print(coordinate_summary)

In [None]:
# Smallest latitude and longitude in `core_df`; values below -90 / -180 are out of range.
core_df[['decimalLatitude','decimalLongitude']].min()

In [None]:
# Largest latitude and longitude in `core_df`; values above 90 / 180 are out of range.
core_df[['decimalLatitude','decimalLongitude']].max()