# Import Pomological Dataset

This notebook imports a subset of the USDA Pomological Watercolors Dataset (<https://usdawatercolors.nal.usda.gov/pom/home.xhtml>).

In [None]:
import zmlp
from zmlp import app_from_env, FileImport

import json

## Read input data

In [None]:
# Load the fruit records; later cells read each record's 'url' field to derive
# the image identifier.
# The encoding is given explicitly because open() otherwise falls back to the
# platform's locale encoding, while JSON files are expected to be UTF-8.
with open('/data/fruits.json', 'r', encoding='utf-8') as file:
    fruits = json.load(file)

## Build list of URIs and import

In [None]:
# Create the client through zmlp's app_from_env().
app = app_from_env()

# Number of files handed to each batch_import_files() call.
BATCH_SIZE = 100

# The image identifier is whatever follows 'id=' in a record's 'url'; it is
# used as the directory name in the screen-resolution JPEG URL.
files = []
for f in fruits:
    name = f['url'].split('id=')[1]
    imageURL = 'http://naldc-legacy.nal.usda.gov/pom/' + name + '/screen.jpg'
    files.append(FileImport(imageURL))

# Import all the files. For a partial import, comment out this part and uncomment the code below
# The list is stepped through by index instead of being re-sliced on every
# pass, so `files` is still intact once the loop finishes.
for start in range(0, len(files), BATCH_SIZE):
    app.assets.batch_import_files(files[start:start + BATCH_SIZE])

# Import just 50 files:
#app.assets.batch_import_files(files[:50])

## Assign metadata

Once the import is done, you can attach each record's metadata to its imported asset. The metadata is stored under the `aux.pomological` attribute:

In [None]:
# Record fields that are left out of the metadata attached to each asset.
attrs_to_delete = ['id', 'nal_note', 'notes_on_original', 'rights', 'url', 'wikimedia_url']

# Map each image identifier (the text after 'id=' in the record's 'url') to a
# filtered copy of its record. A copy is built instead of deleting keys from
# the entries of `fruits`, so 'url' stays available and this cell can be
# re-run without raising KeyError.
fruit_dict = {}
for f in fruits:
    fruit_id = f['url'].split('id=')[1]
    fruit_dict[fruit_id] = {
        key: value for key, value in f.items() if key not in attrs_to_delete
    }

app = app_from_env()

# NOTE(review): 1550 is presumably large enough to cover every imported
# asset -- confirm against the size of the import.
search = app.assets.search({"size": 1550})

for i, a in enumerate(search):
    # The second-to-last segment of the source path is looked up as the
    # image identifier.
    fruit_id = a.document['source']['path'].split('/')[-2]
    metadata = fruit_dict.get(fruit_id)
    if metadata is None:
        # No record for this asset: report it and carry on, so one unmatched
        # asset does not abort the remaining updates.
        print('skipping asset with no matching record:', fruit_id)
        continue
    a.set_attr('aux.pomological', metadata)
    print(i)
    app.assets.index(a)