In [1]:
import json
import pandas as pd

In [2]:
# Read the exported records. The encoding is pinned to UTF-8 (the standard
# encoding for JSON interchange) so decoding does not depend on the platform's
# default locale encoding -- assumes the export is UTF-8; confirm if it came
# from another tool.
with open('garden_edan_trees.json', 'r', encoding='utf-8') as json_in:
    garden_json = json.load(json_in)
# Number of records loaded.
len(garden_json)

1702

In [3]:
# Pretty-print the record at index 119 to inspect its nested structure.
print(json.dumps(obj=garden_json[119], indent=2))

{
  "id": "edanmdm-ofeo-sg_2011-0917A",
  "title": "Ulmus americana",
  "unitCode": "OFEO-SG",
  "linkedId": "0",
  "type": "edanmdm",
  "url": "edanmdm:ofeo-sg_2011-0917A",
  "content": {
    "descriptiveNonRepeating": {
      "record_ID": "ofeo-sg_2011-0917A",
      "online_media": {
        "mediaCount": 5,
        "media": [
          {
            "thumbnail": "https://ids.si.edu/ids/deliveryService?id=SG-2011-0917A-FAL1-HL",
            "idsId": "SG-2011-0917A-FAL1-HL",
            "usage": {
              "access": "Usage conditions apply",
              "text": ""
            },
            "caption": "Photographed by: Hannele Lahti",
            "type": "Images",
            "content": "https://ids.si.edu/ids/deliveryService?id=SG-2011-0917A-FAL1-HL"
          },
          {
            "thumbnail": "https://ids.si.edu/ids/deliveryService?id=SG-2011-0917A-SPR1-HL",
            "idsId": "SG-2011-0917A-SPR1-HL",
            "usage": {
              "access": "Usage conditions ap

In [4]:
def extract_images(record):
    """Build one row per online media item attached to a record.

    Args:
        record: A record dict carrying an 'id' and a
            'content' -> 'descriptiveNonRepeating' mapping.

    Returns:
        A list of dicts with the keys 'object_id', 'ids_id' and 'image_url',
        one for each entry under 'online_media' -> 'media'. The list is empty
        when the record has no 'online_media' section.
    """
    owner_id = record['id']
    non_repeating = record['content']['descriptiveNonRepeating']
    # Guard clause: records without media contribute no rows.
    if 'online_media' not in non_repeating:
        return []
    return [
        {
            'object_id': owner_id,
            'ids_id': media['idsId'],
            'image_url': media['content'],
        }
        for media in non_repeating['online_media']['media']
    ]

In [5]:
def flatten(record):
    """Flatten one nested record into a single-level dict (one table row).

    Keys are written in this order, and a later write to an existing key
    replaces the earlier value:

    1. 'object_id', 'title', 'unit_code' -- copied from the record's 'id',
       'title' and 'unitCode'.
    2. 'media_count' -- only when 'descriptiveNonRepeating' has an
       'online_media' section.
    3. One key per entry in the 'freetext' sections 'identifier', 'notes' and
       'physicalDescription', named by the entry's 'label' and holding its
       'content'.
    4. 'common_name' / 'scientific_name' -- the 'indexedStructured' lists
       joined with ','.
    5. Every key/value pair of each dict in 'indexedStructured' ->
       'exhibition'.

    Args:
        record: A record dict with 'id', 'title', 'unitCode' and 'content'.
            Every section inside 'content' is optional.

    Returns:
        The flat dict described above.

    Raises:
        KeyError: If 'id', 'title', 'unitCode' or 'content' is missing, or a
            freetext entry lacks 'label' or 'content'.
    """
    record_dict = {
        'object_id': record['id'],
        'title': record['title'],
        'unit_code': record['unitCode'],
    }
    content = record['content']

    # Treated as optional, like the other sections, so a record without it
    # yields a row with no media count instead of raising KeyError.
    non_repeating = content.get('descriptiveNonRepeating', {})
    if 'online_media' in non_repeating:
        record_dict['media_count'] = non_repeating['online_media']['mediaCount']

    # The three freetext sections share the same label/content entry shape,
    # so one loop handles them in the original order.
    freetext = content.get('freetext', {})
    for section in ('identifier', 'notes', 'physicalDescription'):
        for entry in freetext.get(section, []):
            record_dict[entry['label']] = entry['content']

    indexed = content.get('indexedStructured', {})
    for name_field in ('common_name', 'scientific_name'):
        if name_field in indexed:
            record_dict[name_field] = ','.join(indexed[name_field])
    for exhibition in indexed.get('exhibition', []):
        record_dict.update(exhibition)
    return record_dict

In [6]:
# Flatten every record, then build the table in one step from the row dicts.
flattened_records = list(map(flatten, garden_json))
garden_df = pd.DataFrame(data=flattened_records)
garden_df.head(n=5)

Unnamed: 0,object_id,title,unit_code,media_count,Accession Number,Provenance,Life Form,Flower Color,common_name,scientific_name,building,Parentage
0,edanmdm-ofeo-sg_2011-0516A,Prunus subhirtella 'Pendula',OFEO-SG,5,2011-0516A,Uncertain,Deciduous tree,Pink,Weeping Higan Cherry,Prunus subhirtella,,
1,edanmdm-ofeo-sg_2011-0569A,Tilia cordata 'June Bride',OFEO-SG,20,2011-0569A,Uncertain,Deciduous tree,Yellow,June Bride Littleleaf Linden,Tilia cordata,,
2,edanmdm-ofeo-sg_2011-1259A,Prunus subhirtella 'Pendula',OFEO-SG,5,2011-1259A,Uncertain,Deciduous tree,Pink,Weeping Higan Cherry,Prunus subhirtella,NASM,
3,edanmdm-ofeo-sg_2011-1257A,Prunus subhirtella 'Pendula',OFEO-SG,4,2011-1257A,Uncertain,Deciduous tree,Pink,Weeping Higan Cherry,Prunus subhirtella,NASM,
4,edanmdm-ofeo-sg_2014-1217A,Betula papyrifera 'Renci' Renaissance Reflection,OFEO-SG,8,2014-1217A,From a cultivated plant not of known wild origin,Deciduous tree,,"Paper birch,White birch,Canoe birch,Renaissanc...",Betula papyrifera,NMNH,


In [15]:
# Frequency of each distinct 'Life Form' value across the records.
garden_df.loc[:, 'Life Form'].value_counts()

Deciduous tree               1160
Evergreen tree                432
Evergreen shrub/sub-shrub      31
Tree                           30
Deciduous shrub/sub-shrub      26
Semi-evergreen tree            21
Shrub/sub-shrub                 1
Palm                            1
Name: Life Form, dtype: int64

In [19]:
# Source column -> output column. The dict order is also the column order of
# the exported table.
column_names = {
    'Accession Number': 'accession_number',
    'title': 'scientific_name',
    'common_name': 'common_name',
    'building': 'building',
    'Life Form': 'life_form',
    'media_count': 'media_count',
}
# Keep only the mapped columns, then apply the new names.
output_df = garden_df.loc[:, list(column_names)].rename(columns=column_names)
output_df

Unnamed: 0,accession_number,scientific_name,common_name,building,life_form,media_count
0,2011-0516A,Prunus subhirtella 'Pendula',Weeping Higan Cherry,,Deciduous tree,5
1,2011-0569A,Tilia cordata 'June Bride',June Bride Littleleaf Linden,,Deciduous tree,20
2,2011-1259A,Prunus subhirtella 'Pendula',Weeping Higan Cherry,NASM,Deciduous tree,5
3,2011-1257A,Prunus subhirtella 'Pendula',Weeping Higan Cherry,NASM,Deciduous tree,4
4,2014-1217A,Betula papyrifera 'Renci' Renaissance Reflection,"Paper birch,White birch,Canoe birch,Renaissanc...",NMNH,Deciduous tree,8
...,...,...,...,...,...,...
1697,2011-0107A,Chamaecyparis obtusa 'Gracilis Compacta',Slender Hinoki False Cypress,HMSG,Evergreen shrub/sub-shrub,3
1698,2011-0078A,Thuja occidentalis 'Degroot's Spire',"American Arborvitae cultivar,Arborvitae cultivar",HMSG,Evergreen tree,3
1699,2011-0108A,Chamaecyparis obtusa 'Gracilis Compacta',Slender Hinoki False Cypress,HMSG,Evergreen shrub/sub-shrub,3
1700,2013-0629A,Acer saccharum 'Monumentale',Sugar Maple,HMSG,Deciduous tree,13


In [20]:
# Write the trimmed table as tab-separated text, without the row index.
output_df.to_csv(path_or_buf='garden_trees.tsv', sep='\t', index=False)

In [7]:
# One flat list holding every image row from every record, in record order.
garden_images = [
    image_row
    for garden_record in garden_json
    for image_row in extract_images(garden_record)
]
len(garden_images)

8097

In [8]:
# Build the image table from the collected row dicts.
images_df = pd.DataFrame(data=garden_images)
images_df.head(n=5)

Unnamed: 0,object_id,ids_id,image_url
0,edanmdm-ofeo-sg_2011-0516A,SG-2011-0516A-FAL1-HL-000003,https://ids.si.edu/ids/deliveryService?id=SG-2...
1,edanmdm-ofeo-sg_2011-0516A,SG-2011-0516A-SPR1-HL,https://ids.si.edu/ids/deliveryService?id=SG-2...
2,edanmdm-ofeo-sg_2011-0516A,SG-2011-0516A-SUM1-HL,https://ids.si.edu/ids/deliveryService?id=SG-2...
3,edanmdm-ofeo-sg_2011-0516A,SG-2011-0516A-BAR1-HL,https://ids.si.edu/ids/deliveryService?id=SG-2...
4,edanmdm-ofeo-sg_2011-0516A,SG-2011-0516A-BAR2-HL,https://ids.si.edu/ids/deliveryService?id=SG-2...


In [9]:
# The label is the fourth dash-separated token (index 3) of the IDS id,
# e.g. 'SG-2011-0516A-FAL1-HL' -> 'FAL1'.
ids_tokens = images_df['ids_id'].str.split('-')
images_df['image_label'] = ids_tokens.str[3]
images_df.head(n=5)

Unnamed: 0,object_id,ids_id,image_url,image_label
0,edanmdm-ofeo-sg_2011-0516A,SG-2011-0516A-FAL1-HL-000003,https://ids.si.edu/ids/deliveryService?id=SG-2...,FAL1
1,edanmdm-ofeo-sg_2011-0516A,SG-2011-0516A-SPR1-HL,https://ids.si.edu/ids/deliveryService?id=SG-2...,SPR1
2,edanmdm-ofeo-sg_2011-0516A,SG-2011-0516A-SUM1-HL,https://ids.si.edu/ids/deliveryService?id=SG-2...,SUM1
3,edanmdm-ofeo-sg_2011-0516A,SG-2011-0516A-BAR1-HL,https://ids.si.edu/ids/deliveryService?id=SG-2...,BAR1
4,edanmdm-ofeo-sg_2011-0516A,SG-2011-0516A-BAR2-HL,https://ids.si.edu/ids/deliveryService?id=SG-2...,BAR2


In [10]:
# Frequency of each distinct image label.
images_df.loc[:, 'image_label'].value_counts()

FAL1    1282
SUM1    1072
WIN1    1067
WIN      506
SPR1     496
        ... 
LEB3       1
FRU5       1
FAL3       1
SNP        1
FUN1       1
Name: image_label, Length: 70, dtype: int64

In [11]:
# The category is the first three characters of the label ('FAL1' -> 'FAL').
images_df['image_category'] = images_df['image_label'].str[:3]
images_df.head(n=5)

Unnamed: 0,object_id,ids_id,image_url,image_label,image_category
0,edanmdm-ofeo-sg_2011-0516A,SG-2011-0516A-FAL1-HL-000003,https://ids.si.edu/ids/deliveryService?id=SG-2...,FAL1,FAL
1,edanmdm-ofeo-sg_2011-0516A,SG-2011-0516A-SPR1-HL,https://ids.si.edu/ids/deliveryService?id=SG-2...,SPR1,SPR
2,edanmdm-ofeo-sg_2011-0516A,SG-2011-0516A-SUM1-HL,https://ids.si.edu/ids/deliveryService?id=SG-2...,SUM1,SUM
3,edanmdm-ofeo-sg_2011-0516A,SG-2011-0516A-BAR1-HL,https://ids.si.edu/ids/deliveryService?id=SG-2...,BAR1,BAR
4,edanmdm-ofeo-sg_2011-0516A,SG-2011-0516A-BAR2-HL,https://ids.si.edu/ids/deliveryService?id=SG-2...,BAR2,BAR


In [12]:
# Frequency of each distinct three-character image category.
images_df.loc[:, 'image_category'].value_counts()

WIN    1590
FAL    1345
SUM    1172
STE     845
BAR     749
SPR     512
LEF     348
BUD     328
SNP     327
FLO     279
FRU     245
LEB     226
CON      98
MIS      15
HL        4
GAL       3
204       2
201       2
202       2
203       1
206       1
THR       1
SG1       1
FUN       1
Name: image_category, dtype: int64

In [13]:
# Write the image table as tab-separated text, without the row index.
images_df.to_csv(path_or_buf='garden_edan_image_data.tsv', index=False, sep='\t')

In [14]:
# Spot-check two randomly chosen image URLs (unseeded, so they differ per run).
images_df.loc[:, 'image_url'].sample(n=2).to_list()

['https://ids.si.edu/ids/deliveryService?id=SG-2013-0623A-SPR1-HL',
 'https://ids.si.edu/ids/deliveryService?id=SG-2018-0761A-FAL1-HL']