# J. Paul Getty Museum Collection Download
The museum is sharing an art collection dataset that includes CC0-licensed images, which we can use to train our CC0 ResNet backbone for other projects!
Their docs page, https://data.getty.edu/museum/collection/docs/, documents how to download the images through their API.
We can use an image as long as its data contains a valid '"id": "https://creativecommons.org/publicdomain/zero/1.0/"' field, which is what we filter on. The dataset may contain up to 150000 objects, and we will need a large share of them to train a ResNet backbone with transferable knowledge.

Note that their API is just awful: SPARQL queries time out all the time, and there is no way to simply list the object IDs. Fortunately, the Activity Stream links to most of the objects, so we scrape the IDs from there.

### Constants and imports

In [1]:
import requests
import os
import random
import json
from tqdm.notebook import tqdm
import pandas as pd
from atomicwrites import atomic_write
import rdflib

# Object endpoint: `<BASE_METADATA_URL>/<id>` is requested for an object's JSON
# metadata, and the same prefix is used to recognise object links in the
# activity stream.
BASE_METADATA_URL = 'https://data.getty.edu/museum/collection/object'
# Activity-stream pages are requested as `<ACTIVITY_STREAM_URL>/<page number>`.
ACTIVITY_STREAM_URL = 'https://data.getty.edu/museum/collection/activity-stream/page'
# Rights identifier an image record must carry for the image to be downloaded.
# NOTE(review): the introduction quotes this URI with an https:// scheme while
# this constant uses http:// - confirm which form the API actually returns.
CC0_IDENTIFIER = 'http://creativecommons.org/publicdomain/zero/1.0/'
# Directory that holds the JSON checkpoint files.
METADATA_DIRECTORY = 'metadata'
# Prefix of the checkpoint file names inside METADATA_DIRECTORY.
DATASET_NAME = 'get'
# Directory the downloaded images are written to.
RAW_IMAGE_DIRECTORY = os.path.join('dataset', 'raw')
# NOTE(review): not referenced by any code visible in this notebook - confirm
# whether the image limit is meant to be enforced somewhere.
DATASET_IMAGES_LIMIT = 70000
# ID scraping stops once this many object IDs have been collected.
DATASET_IDS_LIMIT = 150000
# Width inserted into the image request URL.
IMAGE_WIDTH = 843
# Number of newly added entries after which each stage re-saves its checkpoint.
ID_DOWNLOADING_SAVE_PERIOD = 10
METADATA_DOWNLOADING_SAVE_PERIOD = 500
METADATA_PARSING_SAVE_PERIOD = 200
# Header values sent with every metadata request.
ACCEPT = "application/json"
ACCEPT_LANGUAGE = "en-US,en;q=0.5"

### Preparing directories

In [2]:
# Create the output directories up front. `exist_ok=True` makes the call
# idempotent and removes the check-then-create race of a separate
# `os.path.exists` test.
for directory in (RAW_IMAGE_DIRECTORY, METADATA_DIRECTORY):
    os.makedirs(directory, exist_ok=True)

### Checking existing metadata

In [3]:
# Locations of the three checkpoint files kept in METADATA_DIRECTORY.
BASIC_METADATA_PATH = os.path.join(METADATA_DIRECTORY, f'{DATASET_NAME}.json')
RAW_METADATA_PATH = os.path.join(METADATA_DIRECTORY, f'{DATASET_NAME}-raw.json')
OBJECT_IDS_PATH = os.path.join(METADATA_DIRECTORY, f'{DATASET_NAME}-ids.json')


def _load_checkpoint(path, default):
    """Return the JSON content stored at *path*, or *default* if no such file exists."""
    if os.path.exists(path):
        with open(path, 'r') as checkpoint:
            return json.load(checkpoint)
    return default


# Resume from earlier runs where possible; otherwise start empty.
existing_metadata = _load_checkpoint(BASIC_METADATA_PATH, {})
raw_metadata = _load_checkpoint(RAW_METADATA_PATH, {})

# The ID checkpoint stores both the scraped IDs and the activity-stream page
# at which scraping should continue.
object_ids = set()
current_page = 1
if os.path.exists(OBJECT_IDS_PATH):
    with open(OBJECT_IDS_PATH, 'r') as ids_file:
        saved_ids = json.load(ids_file)
    object_ids = set(saved_ids['object_ids'])
    current_page = saved_ids['current_page']



In [4]:
def save_metadata():
    """Write `existing_metadata` to BASIC_METADATA_PATH through `atomic_write`.

    `default=str` makes the encoder fall back to `str()` for any value it
    cannot serialise natively.
    """
    with atomic_write(BASIC_METADATA_PATH, overwrite=True) as checkpoint:
        json.dump(obj=existing_metadata, fp=checkpoint, default=str)

def save_raw_metadata():
    """Write `raw_metadata` to RAW_METADATA_PATH through `atomic_write`."""
    with atomic_write(RAW_METADATA_PATH, overwrite=True) as checkpoint:
        json.dump(obj=raw_metadata, fp=checkpoint)

def save_object_ids():
    """Write the scraped IDs and the current page to OBJECT_IDS_PATH through `atomic_write`."""
    # The set is converted to a list because JSON has no set type.
    snapshot = {
        'object_ids': list(object_ids),
        'current_page': current_page,
    }
    with atomic_write(OBJECT_IDS_PATH, overwrite=True) as checkpoint:
        json.dump(snapshot, checkpoint)

In [5]:
def extract_id(value):
    """Return the seventh '/'-separated segment of *value*, stripped of whitespace.

    For `https://host/a/b/c/<id>` style URLs this is the `<id>` part.
    Raises IndexError when *value* has fewer than seven segments.
    """
    # Splitting stops after the segment of interest; anything after it is
    # left unsplit in the final element and ignored.
    segments = value.split('/', 7)
    identifier = segments[6]
    return identifier.strip()

### Iterate through the activity stream, gathering object ids

In [6]:
def flatten(d):
    """Yield every string found in *d*, descending into nested dicts and lists.

    For objects with a `values` method (dicts) the values are walked;
    anything else is iterated directly. Non-string leaves are skipped.
    """
    children = d.values() if hasattr(d, 'values') else d
    for child in children:
        if isinstance(child, str):
            yield child
            continue
        if isinstance(child, (dict, list)):
            yield from flatten(child)

def gather_object_ids(data):
    """Add every object ID referenced in *data* to the global `object_ids` set.

    *data* is a parsed activity-stream page; all strings inside it are
    inspected. Only strings that start with `<BASE_METADATA_URL>/` are treated
    as object links. A plain substring test would also accept the bare base
    URL (on which `extract_id` raises IndexError and aborts the whole page)
    and URLs embedded in other text (for which it returns a wrong segment).
    """
    object_url_prefix = f'{BASE_METADATA_URL}/'
    for value in flatten(data):
        if not value.startswith(object_url_prefix):
            continue
        object_id = extract_id(value)
        # A trailing slash with nothing after it yields an empty ID; skip it.
        if object_id:
            object_ids.add(object_id)



In [7]:
import time

last_object_ids_length = len(object_ids)
page_limit = 39181 # Taken from https://data.getty.edu/museum/collection/activity-stream

# Give up on a page after this many consecutive failures instead of retrying
# it forever, and never wait indefinitely for a single response.
MAX_PAGE_ATTEMPTS = 5
PAGE_REQUEST_TIMEOUT_SECONDS = 60

failed_attempts = 0
skipped_pages = []

with tqdm(total=DATASET_IDS_LIMIT) as pbar:
    # NOTE(review): `<=` assumes page_limit is the number of the last page, so
    # that page is scraped too - confirm against the activity-stream root.
    while current_page <= page_limit and len(object_ids) < DATASET_IDS_LIMIT:
        url = f'{ACTIVITY_STREAM_URL}/{current_page}'
        try:
            response = requests.get(
                url,
                headers={'Accept': ACCEPT, "Accept-Language": ACCEPT_LANGUAGE},
                timeout=PAGE_REQUEST_TIMEOUT_SECONDS,
            )
            # Treat HTTP errors as failures rather than parsing an error body.
            response.raise_for_status()
            gather_object_ids(response.json())
        except Exception as e:
            failed_attempts += 1
            print(f'Failed at url "{url}" - {e}')
            if failed_attempts < MAX_PAGE_ATTEMPTS:
                # Back off a little longer after every failed attempt.
                time.sleep(failed_attempts)
                continue
            print(f'Skipping page {current_page} after {failed_attempts} failed attempts')
            skipped_pages.append(current_page)

        # The page was either processed or given up on: move to the next one.
        failed_attempts = 0
        current_page += 1

        if len(object_ids) - last_object_ids_length >= ID_DOWNLOADING_SAVE_PERIOD:
            save_object_ids()
            last_object_ids_length = len(object_ids)

        pbar.update(len(object_ids) - pbar.n)
        pbar.set_description(f'current_page={current_page}')

# Persist the IDs gathered since the last periodic save together with the
# final page cursor, so that a later run resumes from the right place.
save_object_ids()

if skipped_pages:
    print(f'Pages skipped after repeated failures: {skipped_pages}')

  0%|          | 0/150000 [00:00<?, ?it/s]

### Gather metadata and images for the scraped IDs

In [8]:
def download_image(url, path, timeout=60):
    """Download *url* to *path* unless a file already exists there.

    Args:
        url: Address of the image to fetch.
        path: Destination file; written through `atomic_write`.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        True when the file exists after the call (already present or freshly
        downloaded), False when the server did not answer with status 200.
    """
    if os.path.exists(path):
        return True
    # The context manager releases the streamed connection even when the body
    # is not read (non-200 answer) or writing fails half-way.
    with requests.get(url, stream=True, timeout=timeout) as response:
        if response.status_code != 200:
            return False
        with atomic_write(path, overwrite=True, mode='wb') as f:
            for chunk in response.iter_content(2048):
                f.write(chunk)
    return True

In [11]:
# Size of `raw_metadata` at the last checkpoint; the download loop compares it
# with the current size to decide when to save again.
last_metadata_length = len(raw_metadata)

def find_id_in(array, id):
    """Return True if any element of *array* has an 'id' entry equal to *id*."""
    return any('id' in item and item['id'] == id for item in array)

# Headers and timeout shared by every metadata request of this stage.
METADATA_REQUEST_HEADERS = {'Accept': ACCEPT, "Accept-Language": ACCEPT_LANGUAGE}
METADATA_REQUEST_TIMEOUT_SECONDS = 60


def _without_scheme(uri):
    """Return *uri* without a leading `scheme://`; non-strings pass through."""
    return uri.split('://', 1)[-1] if isinstance(uri, str) else uri


def _fetch_json(url):
    """GET *url* with the shared headers and timeout and return the parsed JSON body."""
    response = requests.get(
        url,
        headers=METADATA_REQUEST_HEADERS,
        timeout=METADATA_REQUEST_TIMEOUT_SECONDS,
    )
    return response.json()


def _download_object(object_id):
    """Fetch one object's metadata and, if its image is CC0, the image itself.

    Returns `{'unavailable': True}` when the object shows no image or the
    image is not CC0; otherwise the object's metadata extended with the image
    record (`image_data`) and the local file location (`path`).
    Raises when a request fails, the response lacks an expected field, or the
    image file is missing after the download attempt.
    """
    data = _fetch_json(f'{BASE_METADATA_URL}/{object_id}')
    if 'shows' not in data or len(data['shows']) < 1 or 'id' not in data['shows'][0]:
        return {'unavailable': True}

    image_data = _fetch_json(data['shows'][0]['id'])
    if 'subject_to' not in image_data:
        return {'unavailable': True}

    # Compare without the scheme so that both the http:// and the https://
    # spelling of the CC0 URI are accepted.
    rights_id = image_data['subject_to'][0]['classified_as'][0]['id']
    if _without_scheme(rights_id) != _without_scheme(CC0_IDENTIFIER):
        return {'unavailable': True}

    path = os.path.join(RAW_IMAGE_DIRECTORY, f'get_{object_id}.jpg')
    image_download_url = image_data['digitally_shown_by'][0]['access_point'][0]['id']
    download_image(f'{image_download_url}/full/!{IMAGE_WIDTH},/0/default.jpg', path)

    # `download_image` stays silent on a non-200 answer. Without this check
    # the entry would claim an image that was never written and would not be
    # retried on the next run.
    if not os.path.exists(path):
        raise RuntimeError(f'image for object {object_id} could not be downloaded')

    data['image_data'] = image_data
    data['path'] = path
    return data


# Entries carrying a `path` are the ones whose image was downloaded earlier.
downloaded_images = sum(1 for entry in raw_metadata.values() if 'path' in entry)

with tqdm(total=len(object_ids)) as pbar:
    # Sorted so that every run walks the IDs in the same order.
    for object_id in sorted(object_ids):
        if object_id in raw_metadata:
            pbar.update(len(raw_metadata) - pbar.n)
            continue

        url = f'{BASE_METADATA_URL}/{object_id}'
        try:
            entry = _download_object(object_id)
        except Exception as e:
            # Nothing is recorded, so the object is attempted again next run.
            print(f'Failed at url "{url}" - {e}')
        else:
            raw_metadata[object_id] = entry
            if 'path' in entry:
                downloaded_images += 1
            if len(raw_metadata) - last_metadata_length >= METADATA_DOWNLOADING_SAVE_PERIOD:
                save_raw_metadata()
                last_metadata_length = len(raw_metadata)

        pbar.update(len(raw_metadata) - pbar.n)
        pbar.set_description(f'images={downloaded_images}')
save_raw_metadata()

  0%|          | 0/156511 [00:00<?, ?it/s]

### Processing the metadata file to ready the data, as parsing it takes a long time

In [10]:
# IRIs are stored already wrapped in angle brackets so that they can be
# interpolated directly into the SPARQL query strings below.

# Properties that carry literal values (content, label, time-span bounds).
HAS_CONTENT = '<http://www.cidoc-crm.org/cidoc-crm/P190_has_symbolic_content>'
HAS_LABEL = '<http://www.w3.org/2000/01/rdf-schema#label>'
HAS_BEGIN_DATE = '<http://www.cidoc-crm.org/cidoc-crm/P82a_begin_of_the_begin>'
HAS_END_DATE = '<http://www.cidoc-crm.org/cidoc-crm/P82b_end_of_the_end>'

# Properties that link one node to another.
IS_OF_TYPE = '<http://www.w3.org/1999/02/22-rdf-syntax-ns#type>'
CLASSIFIED_AS = '<http://www.cidoc-crm.org/cidoc-crm/P2_has_type>'
IDENTIFIED_BY = '<http://www.cidoc-crm.org/cidoc-crm/P1_is_identified_by>'
CURRENT_KEEPER = '<http://www.cidoc-crm.org/cidoc-crm/P50_has_current_keeper>'

# Classification and class IRIs used to select a statement; the meaning of
# each vocabulary ID is taken from the constant's name - TODO confirm.
CREDIT_LINE = '<http://vocab.getty.edu/aat/300435418>'
WORK_TYPE = '<http://vocab.getty.edu/aat/300435443>'
MATERIALS_DESCRIPTION = '<http://vocab.getty.edu/aat/300435429>'
TIMESPAN = '<http://www.cidoc-crm.org/cidoc-crm/E52_Time-Span>'

# Size of `existing_metadata` at the last checkpoint; the parsing loop
# compares it with the current size to decide when to save again.
last_metadata_length = len(existing_metadata)

def get_first(data, query):
    """Run *query* on *data* and return the first result converted with `toPython()`.

    Every result row must hold exactly one value. Returns None when the query
    produces no rows.
    """
    no_row = object()
    first_row = next(iter(data.query(query)), no_row)
    if first_row is no_row:
        return None
    (term,) = first_row
    return term.toPython()

def _parse_entry(object_id, raw_entry):
    """Build the flat metadata entry of one object from its raw JSON-LD record.

    The record is loaded into an rdflib graph and every field is taken from
    the first result of a small SPARQL query; fields without a match are None.
    """
    graph = rdflib.Graph().parse(data=json.dumps(raw_entry), format="json-ld")
    return {
        'id': object_id,
        # `.get` keeps a record without a label consistent with the other
        # fields, which are None when nothing is found.
        'title': raw_entry.get('_label'),
        'type': get_first(graph, f"SELECT DISTINCT ?content WHERE {{ ?sub {CLASSIFIED_AS} {WORK_TYPE} . ?sub {HAS_CONTENT} ?content . }} "),
        'path': raw_entry['path'],
        'department': get_first(graph, f"SELECT DISTINCT ?label WHERE {{ ?sub {CURRENT_KEEPER} ?sub2 . ?sub2 {HAS_LABEL} ?label . }} "),
        'collection': get_first(graph, f"SELECT DISTINCT ?content WHERE {{ ?sub {CLASSIFIED_AS} {CREDIT_LINE} . ?sub {HAS_CONTENT} ?content . }} "),
        'culture': get_first(graph, f"SELECT DISTINCT ?content WHERE {{ ?sub {HAS_LABEL} \"Culture Statement\" . ?sub {HAS_CONTENT} ?content . }} "),
        'technique': get_first(graph, f"SELECT DISTINCT ?content WHERE {{ ?sub {CLASSIFIED_AS} {MATERIALS_DESCRIPTION} . ?sub {HAS_CONTENT} ?content . }} "),
        'date': get_first(graph, f"SELECT DISTINCT ?content WHERE {{ ?sub {IS_OF_TYPE} {TIMESPAN} . ?sub {IDENTIFIED_BY} ?sub2 . ?sub2 {HAS_CONTENT} ?content . }} "),
        'begin_date': get_first(graph, f"SELECT DISTINCT ?date WHERE {{ ?sub {IS_OF_TYPE} {TIMESPAN} . ?sub {HAS_BEGIN_DATE} ?date . }} "),
        'end_date': get_first(graph, f"SELECT DISTINCT ?date WHERE {{ ?sub {IS_OF_TYPE} {TIMESPAN} . ?sub {HAS_END_DATE} ?date . }} "),
    }


for object_id in tqdm(raw_metadata):
    # Entries parsed by an earlier run are kept as they are.
    if object_id in existing_metadata:
        continue

    raw_entry = raw_metadata[object_id]
    if 'unavailable' in raw_entry:
        existing_metadata[object_id] = { 'unavailable': True }
        continue

    try:
        existing_metadata[object_id] = _parse_entry(object_id, raw_entry)
    except Exception as e:
        # One unparsable record must not abort the whole run; it stays out of
        # `existing_metadata` and is attempted again next time.
        print(f'Failed to parse metadata of object "{object_id}" - {e}')
        continue

    if len(existing_metadata) - last_metadata_length >= METADATA_PARSING_SAVE_PERIOD:
        save_metadata()
        last_metadata_length = len(existing_metadata)

save_metadata()
        


  0%|          | 0/156511 [00:00<?, ?it/s]

Failed to convert Literal lexical form to value. Datatype=http://www.w3.org/2001/XMLSchema#dateTime, Converter=<function parse_datetime at 0x7f1715f156c0>
Traceback (most recent call last):
  File "/home/macron/.pyenv/versions/3.11.6/lib/python3.11/site-packages/rdflib/term.py", line 2119, in _castLexicalToPython
    return conv_func(lexical)  # type: ignore[arg-type]
           ^^^^^^^^^^^^^^^^^^
  File "/home/macron/.pyenv/versions/3.11.6/lib/python3.11/site-packages/isodate/isodatetime.py", line 55, in parse_datetime
    tmpdate = parse_date(datestring)
              ^^^^^^^^^^^^^^^^^^^^^^
  File "/home/macron/.pyenv/versions/3.11.6/lib/python3.11/site-packages/isodate/isodates.py", line 203, in parse_date
    raise ISO8601Error('Unrecognised ISO 8601 date format: %r' % datestring)
isodate.isoerror.ISO8601Error: Unrecognised ISO 8601 date format: '-0299-01-01'
Failed to convert Literal lexical form to value. Datatype=http://www.w3.org/2001/XMLSchema#dateTime, Converter=<function par