In [None]:
import pandas as pd

In [None]:
# Occurrence CSV files to post-process. A corrected copy of each one is written
# next to the original with '_DWC' replaced by '_DWC_mb' in the path.
occurrence_files = ['./AMBON/AMBON2015_zooplankton/AMBON2015_zooplankton_Occurrence_DWC.csv',
                   './AMBON/AMBON2017_zooplankton/AMBON2017_zooplankton_Occurrence_DWC.csv']

for file in occurrence_files:
    df = pd.read_csv(file)
    # Show which lifeStage values the file contains before anything is changed.
    print("File:%s\n%s\n"%(file,df['lifeStage'].unique()))
    # errors='ignore' keeps the cell runnable on files that do not carry the
    # 'Unnamed: 0' column (presumably a saved index -- confirm against the CSVs).
    df = df.drop(columns='Unnamed: 0', errors='ignore')

    # Rows where lifeStage actually holds a sex value ('male' or 'female').
    is_sex = df['lifeStage'].isin(['male', 'female'])
    # Copy those values into a 'sex' column (NaN everywhere else) ...
    df['sex'] = df['lifeStage'].where(is_sex)
    # ... and blank them out of lifeStage.
    df.loc[is_sex, 'lifeStage'] = ''

    fname = file.replace('_DWC','_DWC_mb')
    df.to_csv(fname, index=False)

Get the data from the OBIS API and try to fix the occurrenceID, among other things.

See also, `../AMBON/2017zooplanton_to_dwc(2)_mmb.ipynb`

In [3]:
# Import requests and set the OBIS API base URL. 
import requests
import json
import pandas as pd
import urllib

# Convenience function to pretty print JSON objects
def print_json(myjson):
    """Print ``myjson`` as JSON text with sorted keys and 4-space indentation.

    Args:
        myjson: Any object that ``json.dumps`` can serialize.
    """
    rendered = json.dumps(myjson, indent=4, sort_keys=True, separators=(',', ': '))
    print(rendered)
    

# Initialize the base URL for OBIS. This variable will be used for every API call
OBIS_URL = "https://api.obis.org/v3"

# List the OBIS nodes so the 'OBIS USA' node id can be looked up by name.
req = requests.get(f'{OBIS_URL}/node')
# Fail here on an HTTP error instead of on a confusing KeyError/JSON error below.
req.raise_for_status()
nodes_json = req.json()

df_nodes = pd.DataFrame(nodes_json['results'])
# NOTE: a bare expression in the middle of a cell is evaluated but not displayed.
df_nodes[['name','id']]

# First id whose node name is exactly 'OBIS USA' (IndexError if there is none).
nodeID = df_nodes.loc[df_nodes['name']=='OBIS USA','id'].tolist()[0]
# node/{nodeID}
req = requests.get(f'{OBIS_URL}/node/{nodeID}')
req.raise_for_status()
obis_usa_json = req.json()

# Show OBIS-USA node record
print_json(obis_usa_json)

{
    "results": [
        {
            "contacts": [
                {
                    "email": "albenson@usgs.gov",
                    "givenname": "Abby",
                    "oceanexpert_id": 25483,
                    "surname": "Benson"
                },
                {
                    "email": "sbristol@usgs.gov",
                    "givenname": "Sky",
                    "oceanexpert_id": 25731,
                    "surname": "Bristol"
                }
            ],
            "description": "Ocean Biodiversity Information System USA (OBIS-USA) brings together marine biological observation data \u2013 recorded observations of identifiable marine species at a known time and place, collected primarily from U.S. Waters or with U.S. funding.",
            "feeds": [
                {
                    "id": "753ce293-c4c0-4ee5-bf22-361db7b89e3a",
                    "url": "https://ipt.geome-db.org/rss.do"
                },
                {
                    

In [73]:
# Request the datasets for the node identified by nodeID.
req = requests.get(f'{OBIS_URL}/dataset?nodeid={nodeID}')
# Stop on an HTTP error rather than decoding an error body as if it were data.
req.raise_for_status()
datasets = req.json()

In [78]:
# dataset
GBIF_URL = 'https://api.gbif.org/v1'
GBIF_UUID = '7afb6dbd-6485-42c2-aee7-c730bd81d2cd'


# Ask GBIF for the endpoints registered for this dataset.
req = requests.get(f'{GBIF_URL}/dataset/{GBIF_UUID}/endpoint')
# Surface HTTP errors before trying to decode the response body.
req.raise_for_status()

# Last expression of the cell, so the decoded endpoint list is displayed.
req.json()


# create a random id for occurrence
# put occurrenceID into emof.

[{'key': 627708,
  'type': 'DWC_ARCHIVE',
  'url': 'https://www1.usgs.gov/obis-usa/ipt/archive.do?r=ambon_zooplankton_2017',
  'createdBy': 'c3ad790a-d426-4ac1-8e32-da61f81f0117',
  'modifiedBy': 'c3ad790a-d426-4ac1-8e32-da61f81f0117',
  'created': '2021-08-27T20:56:42.963+00:00',
  'modified': '2021-08-27T20:56:42.963+00:00',
  'machineTags': []},
 {'key': 627707,
  'type': 'EML',
  'url': 'https://www1.usgs.gov/obis-usa/ipt/eml.do?r=ambon_zooplankton_2017',
  'createdBy': 'c3ad790a-d426-4ac1-8e32-da61f81f0117',
  'modifiedBy': 'c3ad790a-d426-4ac1-8e32-da61f81f0117',
  'created': '2021-08-27T20:56:42.867+00:00',
  'modified': '2021-08-27T20:56:42.867+00:00',
  'machineTags': []}]

In [80]:
# Download the payload behind every endpoint of type 'EML' and save it locally.
for endpoint in req.json():
    if endpoint['type'] == 'EML':
        r = requests.get(endpoint['url'], allow_redirects=True)
        # Do not write an HTTP error page to disk as if it were the payload.
        r.raise_for_status()
        # File name = text after the last '=' in the URL, plus '.zip'.
        # NOTE(review): this saves the 'EML' endpoint under a '.zip' suffix, while
        # the endpoint list also has a 'DWC_ARCHIVE' entry -- confirm which one
        # was meant to be downloaded.
        out_name = endpoint['url'].split("=")[-1]+'.zip'
        # Context manager guarantees the file is flushed and closed.
        with open(out_name, 'wb') as fh:
            fh.write(r.content)

In [63]:
obis_uuid = 'bc01451e-d990-4ad1-8315-e3fb6e9cf461'
ipt_id = 'ambon_zooplankton_2017'

# Dataset-level record. Kept under its own name so the occurrence request
# below no longer overwrites it before it can be inspected.
dataset_req = requests.get(f'{OBIS_URL}/dataset/{obis_uuid}')
dataset_req.raise_for_status()

# Occurrence records for the same dataset; later cells read them from `req`.
# mof=true presumably asks for the measurement-or-fact records to be included
# -- confirm against the OBIS API docs. size=5000 is above the total of 4727
# reported when this cell was last run.
req = requests.get(f'{OBIS_URL}/occurrence?datasetid={obis_uuid}&mof=true&size=5000')
req.raise_for_status()

req.json()

{'total': 4727,
 'results': [{'date_year': 2017,
   'scientificNameID': 'urn:lsid:marinespecies.org:taxname:103413',
   'scientificName': 'Oikopleura (Vexillaria) labradoriensis',
   'dropped': False,
   'aphiaID': 103413,
   'decimalLatitude': 68.8984,
   'phylumid': 1821,
   'familyid': 103356,
   'occurrenceStatus': 'present',
   'basisOfRecord': 'HumanObservation',
   'tribeid': 343701,
   'maximumDepthInMeters': 29,
   'id': '001bc0c8-d7c0-420c-9b61-94cb1ae6f6c9',
   'order': 'Copelata',
   'dataset_id': 'bc01451e-d990-4ad1-8315-e3fb6e9cf461',
   'decimalLongitude': -166.4237,
   'date_end': 1502150400000,
   'speciesid': 103413,
   'occurrenceID': 'AMBON_Zooplankton_2017_CL0_4336',
   'date_start': 1502150400000,
   'genus': 'Oikopleura',
   'tribe': 'Labiata',
   'samplingProtocol': 'TWINRING_150UM_MICROSCOPY',
   'eventDate': '2017-08-08 15:46:00',
   'eventID': 'AMBON_Zooplankton_2017_CL0_2017-08-08T15:46:00Z',
   'absence': False,
   'subfamily': 'Oikopleurinae',
   'genusid'

In [82]:
# Display the GBIF dataset URL built from GBIF_URL and GBIF_UUID.
f'{GBIF_URL}/dataset/{GBIF_UUID}'

'https://api.gbif.org/v1/dataset/7afb6dbd-6485-42c2-aee7-c730bd81d2cd'

In [67]:
import pandas as pd

# Build one small frame per occurrence record. Wrapping every value in a Series
# lets list-valued fields of different lengths sit side by side: a record gets
# as many rows as its longest list, and shorter fields are padded with NaN.
# NOTE(review): this means one occurrence can span several rows -- confirm
# that is intended.
frames = [
    pd.DataFrame({k: pd.Series(v) for k, v in result.items()})
    for result in req.json()['results']
]

# Concatenate once instead of re-copying the growing frame on every iteration.
# pd.concat raises on an empty list, so fall back to an empty frame.
df_all = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()

  df = pd.DataFrame(dict([ (k,pd.Series(v)) for k,v in result.items() ]))


In [65]:
# List the column labels of df_all.
df_all.columns

Index(['date_year', 'scientificNameID', 'scientificName', 'dropped', 'aphiaID',
       'decimalLatitude', 'phylumid', 'familyid', 'occurrenceStatus',
       'basisOfRecord', 'tribeid', 'maximumDepthInMeters', 'id', 'order',
       'dataset_id', 'decimalLongitude', 'date_end', 'speciesid',
       'occurrenceID', 'date_start', 'genus', 'tribe', 'samplingProtocol',
       'eventDate', 'eventID', 'absence', 'subfamily', 'genusid', 'taxonID',
       'originalScientificName', 'marine', 'minimumDepthInMeters',
       'subphylumid', 'subfamilyid', 'countryCode', 'date_mid', 'subgenus',
       'class', 'identificationReferences', 'orderid', 'geodeticDatum',
       'kingdom', 'subgenusid', 'classid', 'depth', 'phylum', 'species',
       'subphylum', 'datasetID', 'family', 'kingdomid', 'node_id', 'flags',
       'sss', 'shoredistance', 'sst', 'bathymetry', 'mof', 'subclassid',
       'terrestrial', 'subclass', 'superclass', 'superclassid', 'brackish',
       'superorder', 'superorderid', 'infracl

In [68]:
# Inspect the 'mof' column of df_all.
df_all['mof']

0       NaN
1       NaN
2       NaN
3       NaN
4       NaN
       ... 
4763    NaN
4764    NaN
4765    NaN
4766    NaN
4767    NaN
Name: mof, Length: 4768, dtype: object

In [70]:
# (rows, columns) of df_all.
df_all.shape

(4768, 93)