In [1]:
import pandas as pd
import numpy as np

In [2]:
# Barcodes are identifiers, not quantities: the EDAN records carry them as
# zero-padded strings (e.g. '00479031'). Force string parsing so leading zeros
# are never lost to numeric type inference and the later membership test and
# merge against the EDAN 'Barcode' values compare like with like.
stain_status = pd.read_csv('barcodes_from_figshare.tsv', sep='\t',
                           dtype={'barcode': str})
stain_status.head()

Unnamed: 0,barcode,stain_status
0,140,stained
1,162,stained
2,185,stained
3,209,stained
4,231,stained


In [3]:
# Plain Python list of every barcode in the stain table; the trailing
# expression reports how many there are.
barcode_list = stain_status.loc[:, 'barcode'].tolist()
len(barcode_list)

15553

In [4]:
from dask.distributed import Client
import dask.bag as db
import json
import s3fs
from PIL import Image

In [5]:
# Start a Dask distributed client backed by 8 workers with 4 threads each.
# Ending the cell on `client` displays its scheduler/dashboard summary.
client = Client(n_workers=8, threads_per_worker=4)
client

0,1
Client  Scheduler: tcp://127.0.0.1:60046  Dashboard: http://127.0.0.1:8787/status,Cluster  Workers: 8  Cores: 32  Memory: 17.18 GB


In [6]:
# Anonymous (no credentials) S3 filesystem handle, then list the top level of
# the 'smithsonian-open-access' bucket.
fs = s3fs.S3FileSystem(anon=True)
fs.ls('smithsonian-open-access')

['smithsonian-open-access/media', 'smithsonian-open-access/metadata']

In [7]:
# List the folders under metadata/edan/ (these appear to be one folder per
# Smithsonian unit -- the botany records used below live in 'nmnhbotany').
metadata = fs.ls('smithsonian-open-access/metadata/edan/')
metadata

['smithsonian-open-access/metadata/edan/aaa',
 'smithsonian-open-access/metadata/edan/acah',
 'smithsonian-open-access/metadata/edan/acm',
 'smithsonian-open-access/metadata/edan/cfchfolklife',
 'smithsonian-open-access/metadata/edan/chndm',
 'smithsonian-open-access/metadata/edan/eepa',
 'smithsonian-open-access/metadata/edan/fbr',
 'smithsonian-open-access/metadata/edan/fs',
 'smithsonian-open-access/metadata/edan/fsa',
 'smithsonian-open-access/metadata/edan/fsg',
 'smithsonian-open-access/metadata/edan/hac',
 'smithsonian-open-access/metadata/edan/hmsg',
 'smithsonian-open-access/metadata/edan/hsfa',
 'smithsonian-open-access/metadata/edan/naa',
 'smithsonian-open-access/metadata/edan/nasm',
 'smithsonian-open-access/metadata/edan/nmaahc',
 'smithsonian-open-access/metadata/edan/nmafa',
 'smithsonian-open-access/metadata/edan/nmah',
 'smithsonian-open-access/metadata/edan/nmai',
 'smithsonian-open-access/metadata/edan/nmnhanthro',
 'smithsonian-open-access/metadata/edan/nmnhbirds',

In [8]:
# Build a Dask bag over every NMNH Botany metadata text file (read
# anonymously from S3) and parse each line of text as a JSON document.
# NOTE(review): this assumes the files hold one JSON record per line.
b = db.read_text('s3://smithsonian-open-access/metadata/edan/nmnhbotany/*.txt',
                storage_options={'anon': True}).map(json.loads)

In [9]:
# Pull 1000 parsed records into memory and pretty-print one of them
# (index 229) to inspect the nested layout that extract_ids has to walk.
botany_example = b.take(1000)
print(json.dumps(botany_example[229], indent=2))

{
  "id": "edanmdm-nmnhbotany_14961062",
  "version": "",
  "unitCode": "NMNHBOTANY",
  "linkedId": "0",
  "type": "edanmdm",
  "content": {
    "descriptiveNonRepeating": {
      "record_ID": "nmnhbotany_14961062",
      "online_media": {
        "mediaCount": 1,
        "media": [
          {
            "thumbnail": "https://ids.si.edu/ids/deliveryService/id/ark:/65665/m3f5411afef94248bca902ff5e4ed900f8/90",
            "idsId": "ark:/65665/m3f5411afef94248bca902ff5e4ed900f8",
            "usage": {
              "access": "CC0"
            },
            "guid": "http://n2t.net/ark:/65665/m3f5411afe-f942-48bc-a902-ff5e4ed900f8",
            "type": "Images",
            "content": "https://ids.si.edu/ids/deliveryService/id/ark:/65665/m3f5411afef94248bca902ff5e4ed900f8",
            "resources": [
              {
                "label": "High-resolution JPEG (6745x6745)",
                "url": "https://ids.si.edu/ids/download?id=NMNH-03420627.jpg"
              },
              {


In [10]:
def extract_ids(record):
    """Flatten one NMNH Botany EDAN record down to its identifiers.

    Parameters
    ----------
    record : dict
        A single NMNH Botany metadata record in highly-nested dictionary
        format. It must have the top-level keys ``id``, ``title`` and
        ``content``.

    Returns
    -------
    flattened_record : dict
        An un-nested dictionary holding, in this order:

        * ``edan_id`` and ``title`` -- copied from the top level of the record.
        * ``Barcode`` -- ``np.nan`` unless a freetext identifier labelled
          ``Barcode`` replaces it (see the last item).
        * ``specimen_guid`` -- ``content.descriptiveNonRepeating.guid``, or
          ``np.nan`` when absent.
        * ``media_count`` -- ``online_media.mediaCount`` as a float, or
          ``np.nan`` when absent.
        * ``media_guid`` -- ``guid`` of the first media entry; the key is
          omitted when the record has no media or that entry has no guid.
        * ``ids_id`` -- for the first media entry, the text between the first
          ``=`` and the following ``.`` in the URL of a resource whose label
          contains ``JPEG`` (the last such resource wins); omitted when no
          usable JPEG resource exists.
        * one extra key per ``content.freetext.identifier`` entry, named by
          the entry's ``label`` and holding its ``content``. These can
          overwrite any of the keys above.

    Raises
    ------
    KeyError
        If ``id``, ``title`` or ``content`` is missing from ``record``, or a
        freetext identifier lacks ``label`` or ``content``.
    """
    flattened_record = dict()
    flattened_record['edan_id'] = record['id']
    flattened_record['title'] = record['title']

    # Default so the key always exists; a freetext identifier labelled
    # 'Barcode' overwrites it further down.
    flattened_record['Barcode'] = np.nan

    # Resolve the shared nested sections once instead of re-walking them
    # for every field.
    content = record['content']
    non_repeating = content.get('descriptiveNonRepeating', {})
    online_media = non_repeating.get('online_media', {})

    flattened_record['specimen_guid'] = non_repeating.get('guid', np.nan)
    flattened_record['media_count'] = float(online_media.get('mediaCount', np.nan))

    media = online_media.get('media', [])
    if len(media):
        # Only the first media entry is inspected.
        first_media = media[0]
        if 'guid' in first_media:
            flattened_record['media_guid'] = first_media['guid']
        for resource in first_media.get('resources', []):
            if 'JPEG' not in resource.get('label', ''):
                continue
            # URLs look like '...download?id=NMNH-03420627.jpg'; skip any
            # resource whose URL has no '=' rather than failing the record.
            url_parts = resource.get('url', '').split('=')
            if len(url_parts) < 2:
                continue
            flattened_record['ids_id'] = url_parts[1].split('.')[0]

    for identifier in content.get('freetext', {}).get('identifier', []):
        flattened_record[identifier['label']] = identifier['content']

    return flattened_record

In [11]:
# Flatten each record once, then keep only those whose barcode is in the
# stain list. The previous form ran extract_ids twice per record (once in the
# filter, once in the map) and tested membership against a Python list, which
# is a linear scan of every barcode for every record in the bag.
# Assumes barcode values are hashable (they are strings in the EDAN records).
barcode_set = set(barcode_list)
just_barcodes = (b.map(extract_ids)
                  .filter(lambda ids: ids['Barcode'] in barcode_set)
                  .compute())
barcode_df = pd.DataFrame(just_barcodes)
barcode_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15627 entries, 0 to 15626
Data columns (total 9 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   edan_id        15627 non-null  object 
 1   title          15627 non-null  object 
 2   Barcode        15627 non-null  object 
 3   specimen_guid  15627 non-null  object 
 4   media_count    14061 non-null  float64
 5   media_guid     14061 non-null  object 
 6   ids_id         14038 non-null  object 
 7   USNM Number    15580 non-null  object 
 8   Other Numbers  8328 non-null   object 
dtypes: float64(1), object(8)
memory usage: 1.1+ MB


In [12]:
# Preview the first rows of the matched-barcode table.
barcode_df.head()

Unnamed: 0,edan_id,title,Barcode,specimen_guid,media_count,media_guid,ids_id,USNM Number,Other Numbers
0,edanmdm-nmnhbotany_2095387,Ageratum elachycarpum B.L. Rob.,512770,http://n2t.net/ark:/65665/3d361b9b9-91d0-4a3d-...,1.0,http://n2t.net/ark:/65665/m3890da9ed-3b6d-40da...,NMNH-00512770,1404283,
1,edanmdm-nmnhbotany_2135634,Horsfieldia bartlettii Merr.,513412,http://n2t.net/ark:/65665/319872980-bd6d-409a-...,1.0,http://n2t.net/ark:/65665/m374b20557-3460-4d9d...,NMNH-00513412-000001,2275439,
2,edanmdm-nmnhbotany_2102106,Hypericum crenulatum var. major Boiss.,588520,http://n2t.net/ark:/65665/353447a40-80df-4892-...,1.0,http://n2t.net/ark:/65665/m3566c5f4e-c485-4c1b...,NMNH-00588520,129657,
3,edanmdm-nmnhbotany_2167183,Asclepias brachystephana Engelm. ex Torr. in E...,588654,http://n2t.net/ark:/65665/3cd0feeeb-abcd-415c-...,1.0,http://n2t.net/ark:/65665/m3330aaa7b-0613-4374...,NMNH-00588654-000001,18691,
4,edanmdm-nmnhbotany_2683657,Waltheria indica L.,595768,http://n2t.net/ark:/65665/3cbca3c96-e7ec-4eff-...,2.0,http://n2t.net/ark:/65665/m328e137c2-1b2e-4768...,NMNH-00595768,13147,


In [13]:
# Show rows whose Barcode occurs more than once (keep=False flags every copy,
# not just the repeats), sorted so the copies sit next to each other.
barcode_df[barcode_df.duplicated(subset='Barcode', keep=False)].sort_values('Barcode').head()

Unnamed: 0,edan_id,title,Barcode,specimen_guid,media_count,media_guid,ids_id,USNM Number,Other Numbers
12055,edanmdm-nmnhbotany_2140155,Acacia emoryana Benth.,209,http://n2t.net/ark:/65665/350435d2c-8228-4f1c-...,1.0,http://n2t.net/ark:/65665/m3325dc959-7428-4973...,NMNH-00000209-000001,1242171,fiche number : 0364/C04
10874,edanmdm-nmnhbotany_2112595,Acacia emoryana Benth.,209,http://n2t.net/ark:/65665/3ce233bf5-d0e1-4967-...,1.0,http://n2t.net/ark:/65665/m3325dc959-7428-4973...,NMNH-00000209-000001,1242172,
6241,edanmdm-nmnhbotany_10085590,Rhynchosia pringlei Rose,4828,http://n2t.net/ark:/65665/318376e79-5027-4111-...,1.0,http://n2t.net/ark:/65665/m3e433107f-ca89-4bd7...,NMNH-00004828-000001,235115,
4787,edanmdm-nmnhbotany_10078069,Rhynchosia pringlei Rose,4828,http://n2t.net/ark:/65665/34e44b875-4b53-4988-...,1.0,http://n2t.net/ark:/65665/m3e433107f-ca89-4bd7...,NMNH-00004828-000001,235115,fiche number : 0431/E10
1575,edanmdm-nmnhbotany_2789572,Psychotria robynsiana Petit var. robynsiana,8112,http://n2t.net/ark:/65665/379a2f0aa-9747-4c72-...,1.0,http://n2t.net/ark:/65665/m37632c5c8-0120-45a8...,NMNH-00008112-000001,2155406,"""ORNumber"" : 000024"


In [14]:
# Keep only barcodes that map to exactly one EDAN record: every row of a
# repeated barcode is discarded, not just the second and later copies.
unique_barcodes = barcode_df.loc[~barcode_df.duplicated(subset=['Barcode'], keep=False)]
unique_barcodes.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 15478 entries, 0 to 15626
Data columns (total 9 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   edan_id        15478 non-null  object 
 1   title          15478 non-null  object 
 2   Barcode        15478 non-null  object 
 3   specimen_guid  15478 non-null  object 
 4   media_count    13945 non-null  float64
 5   media_guid     13945 non-null  object 
 6   ids_id         13922 non-null  object 
 7   USNM Number    15431 non-null  object 
 8   Other Numbers  8280 non-null   object 
dtypes: float64(1), object(8)
memory usage: 1.2+ MB


In [15]:
# Distribution of the number of media items per uniquely-barcoded record.
unique_barcodes['media_count'].value_counts()

1.0    11515
2.0     2427
3.0        2
7.0        1
Name: media_count, dtype: int64

In [16]:
# Inspect (up to five of) the records that have more than two media items,
# shown as plain dicts so long values are not truncated.
unique_barcodes[unique_barcodes['media_count'] > 2].head().to_dict(orient='records')

[{'edan_id': 'edanmdm-nmnhbotany_2105891',
  'title': 'Waltheria pyrolaefolia A. Gray in Wilkes',
  'Barcode': '00479031',
  'specimen_guid': 'http://n2t.net/ark:/65665/341db4b64-d0b0-45d9-acde-786efe91bb34',
  'media_count': 3.0,
  'media_guid': 'http://n2t.net/ark:/65665/m3dc38e176-4dd2-4b2d-a4be-d218ae9b7e6d',
  'ids_id': 'NMNH-00479031-000001',
  'USNM Number': '13159',
  'Other Numbers': 'fiche number : 0542/E04'},
 {'edan_id': 'edanmdm-nmnhbotany_2157357',
  'title': 'Potamogeton marinus var. occidentalis B.L. Rob. in C. King',
  'Barcode': '00086668',
  'specimen_guid': 'http://n2t.net/ark:/65665/3158acf90-ae33-4e1d-abd0-c6e3d9d4e91e',
  'media_count': 7.0,
  'media_guid': 'http://n2t.net/ark:/65665/m3da5d5478-eb8e-4ef3-a2f2-25572f6a0c3a',
  'ids_id': 'NMNH-00086668-000001',
  'USNM Number': '46896',
  'Other Numbers': 'fiche number : 0074/B05'},
 {'edan_id': 'edanmdm-nmnhbotany_10058390',
  'title': 'Sedum spathulifolium Hook.',
  'Barcode': '00898520',
  'specimen_guid': 'http

In [20]:
# Restrict to records with exactly one media item, and rename the key column
# to 'barcode' so it matches the column name used in stain_status.
unique_single = (
    unique_barcodes
    .loc[lambda frame: frame['media_count'] == 1]
    .copy()
    .rename(columns={'Barcode': 'barcode'})
)
unique_single.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 11515 entries, 0 to 15626
Data columns (total 9 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   edan_id        11515 non-null  object 
 1   title          11515 non-null  object 
 2   barcode        11515 non-null  object 
 3   specimen_guid  11515 non-null  object 
 4   media_count    11515 non-null  float64
 5   media_guid     11515 non-null  object 
 6   ids_id         11498 non-null  object 
 7   USNM Number    11469 non-null  object 
 8   Other Numbers  7565 non-null   object 
dtypes: float64(1), object(8)
memory usage: 899.6+ KB


In [21]:
# Attach the stain label to each single-image record by joining on the shared
# 'barcode' column (inner join, which is the default for merge).
merged = unique_single.merge(right=stain_status, how='inner', on='barcode')
merged.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 11515 entries, 0 to 11514
Data columns (total 10 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   edan_id        11515 non-null  object 
 1   title          11515 non-null  object 
 2   barcode        11515 non-null  object 
 3   specimen_guid  11515 non-null  object 
 4   media_count    11515 non-null  float64
 5   media_guid     11515 non-null  object 
 6   ids_id         11498 non-null  object 
 7   USNM Number    11469 non-null  object 
 8   Other Numbers  7565 non-null   object 
 9   stain_status   11515 non-null  object 
dtypes: float64(1), object(9)
memory usage: 989.6+ KB


In [22]:
# Class balance of the merged table (stained vs. unstained).
merged['stain_status'].value_counts()

unstained    6890
stained      4625
Name: stain_status, dtype: int64

In [23]:
# Save the merged table as a tab-separated file, without the DataFrame index.
merged.to_csv('mercury_list_from_s3.tsv', sep='\t', index=False)

In [25]:
# Read only the first 1000 rows of the gzipped, tab-separated occurrence
# export to get a feel for its columns without loading the whole file.
specimen_df_sample = pd.read_csv('nmnh_occurrence_1_35.tsv.gz',
                                 nrows=1000,
                                 sep='\t', compression='gzip')
# Fixed seed so the five rows displayed are the same on every
# Restart & Run All instead of changing with each execution.
specimen_df_sample.sample(5, random_state=0).to_dict(orient='records')

[{'id': 'http://n2t.net/ark:/65665/300004966-f89f-4f65-9d09-93c9f34b355b',
  'type': 'PhysicalObject',
  'references': nan,
  'institutionID': 'http://biocol.org/urn:lsid:biocol.org:col:34871',
  'institutionCode': 'US',
  'collectionCode': 'Botany',
  'datasetName': 'NMNH Extant Biology',
  'basisOfRecord': 'PreservedSpecimen',
  'occurrenceID': 'http://n2t.net/ark:/65665/300004966-f89f-4f65-9d09-93c9f34b355b',
  'catalogNumber': '3614881',
  'occurrenceRemarks': nan,
  'recordNumber': '14',
  'recordedBy': 'J. Macoun',
  'individualCount': 1,
  'sex': nan,
  'lifeStage': nan,
  'preparations': nan,
  'otherCatalogNumbers': nan,
  'associatedMedia': '12009231',
  'associatedSequences': nan,
  'associatedOccurrences': nan,
  'startDayOfYear': 224.0,
  'endDayOfYear': 224.0,
  'year': 1890.0,
  'month': 8.0,
  'day': 12.0,
  'verbatimEventDate': nan,
  'habitat': nan,
  'fieldNumber': nan,
  'fieldNotes': nan,
  'locationID': nan,
  'higherGeography': 'North America, Canada',
  'contine