# J. Paul Getty Museum Collection Download
The museum shares an art collection dataset, part of which is under a CC0 license, which we can use to train our CC0 ResNet backbone for other projects!
Their docs page (https://data.getty.edu/museum/collection/docs/) documents how to download the images through their API.
We can use an image as long as its data contains a valid '"id": "https://creativecommons.org/publicdomain/zero/1.0/"' field, which is what we will filter on. The dataset may contain up to 80000 images, and we will need a large share of them to train a ResNet backbone with transferable knowledge.

Note that their API is just awful: the SPARQL server randomly times out, but we do need the ids, so I used https://data.getty.edu/museum/collection/sparql-ui and filled 'metadata/GET_crawler_ids.txt' with them by hand, using this query:

```
    PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
    PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
    SELECT DISTINCT ?sub  WHERE {
      ?sub ?pred ?obj .
      FILTER (regex(str(?sub), "collection/object") && ?ctt = 6).
      bind(((strlen(str(?sub)) - strlen(replace(str(?sub), "/", ""))) / strlen("/")) as ?ctt)
   } LIMIT 10;
```

### Constants and imports

In [1]:
import requests
import os
import random
import json
from tqdm.notebook import tqdm
import pandas as pd
from atomicwrites import atomic_write

# URLs containing this prefix are object records; the crawler stores their
# JSON payload in raw_metadata.
BASE_METADATA_URL = 'https://data.getty.edu/museum/collection/object'
# Any string in a crawled payload that contains this prefix is treated as a
# further URL to crawl.
COLLECTION_URL = 'https://data.getty.edu/museum/collection'
# Directory holding the id list, the saved crawl queue and the metadata JSONs.
METADATA_DIRECTORY = 'metadata'
# Prefix of the metadata file names (e.g. 'get.json', 'get-raw.json').
DATASET_NAME = 'get'
# Created during directory preparation; NOTE(review): not otherwise used in
# the visible cells - presumably filled by later download steps.
RAW_METADATA_DIRECTORY = os.path.join(METADATA_DIRECTORY, 'raw')
RAW_IMAGE_DIRECTORY = os.path.join('dataset', 'raw')
# NOTE(review): not referenced in the visible cells - presumably caps the
# number of images downloaded later; confirm.
DATASET_IMAGES_LIMIT = 70000
# The crawl loop stops once raw_metadata holds this many records.
DATASET_IDS_LIMIT = 100000
# Raw metadata and the crawl queue are written to disk every this many
# crawl iterations.
DOWNLOADING_SAVE_PERIOD = 5
# Text file with the starting object URLs, one per line (filled by hand from
# the SPARQL UI).
BASE_CRAWL_FILE = os.path.join(METADATA_DIRECTORY, 'GET_crawler_ids.txt')
# Header values sent with every crawl request (presumably mimicking a
# browser - confirm whether the API requires them).
ACCEPT = "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8"
ACCEPT_ENCODING = "gzip, deflate, br"
ACCEPT_LANGUAGE = "en-US,en;q=0.5"

### Preparing directories

In [2]:
# Create every working directory the notebook writes into.
# exist_ok=True makes each call idempotent without the racy
# "check, then create" pattern, and raises if a regular file is in the way
# instead of silently accepting it.
for directory in (RAW_IMAGE_DIRECTORY, RAW_METADATA_DIRECTORY, METADATA_DIRECTORY):
    os.makedirs(directory, exist_ok=True)

### Loading starting ids for crawling

In [3]:
# Read the hand-collected object URLs, one per line. The SPARQL UI wraps each
# URL in angle brackets, so those are stripped together with whitespace.
# Blank lines (e.g. a trailing newline at the end of the file) are dropped:
# an empty entry has no id segment and would break the queue-building step.
with open(BASE_CRAWL_FILE, 'r', encoding='utf-8') as f:
    starting_crawl_queue = [
        cleaned
        for cleaned in (line.replace('<', '').replace('>', '').strip() for line in f)
        if cleaned
    ]

### Checking existing metadata

In [4]:
BASIC_METADATA_PATH = os.path.join(METADATA_DIRECTORY, f'{DATASET_NAME}.json')
RAW_METADATA_PATH = os.path.join(METADATA_DIRECTORY, f'{DATASET_NAME}-raw.json')
CRAWL_QUEUE_PATH = os.path.join(METADATA_DIRECTORY, f'{DATASET_NAME}-ids.json')


def _load_json_or(path, fallback):
    """Return the parsed JSON stored at ``path``, or ``fallback`` if no such file exists."""
    if not os.path.exists(path):
        return fallback
    with open(path, 'r') as stored:
        return json.load(stored)


# Resume from whatever an earlier run persisted; start empty otherwise.
existing_metadata = _load_json_or(BASIC_METADATA_PATH, {})
raw_metadata = _load_json_or(RAW_METADATA_PATH, {})
# A saved crawl queue takes precedence over the ids read from the text file.
starting_crawl_queue = _load_json_or(CRAWL_QUEUE_PATH, starting_crawl_queue)

def save_metadata():
    """Write ``existing_metadata`` as JSON to ``BASIC_METADATA_PATH``.

    The file is written through ``atomic_write`` with ``overwrite=True``,
    replacing any previous version.
    """
    writer = atomic_write(BASIC_METADATA_PATH, overwrite=True)
    with writer as target:
        json.dump(obj=existing_metadata, fp=target)

def save_raw_metadata():
    """Write ``raw_metadata`` as JSON to ``RAW_METADATA_PATH``.

    The file is written through ``atomic_write`` with ``overwrite=True``,
    replacing any previous version.
    """
    writer = atomic_write(RAW_METADATA_PATH, overwrite=True)
    with writer as target:
        json.dump(obj=raw_metadata, fp=target)

def save_crawl_queue():
    """Write the pending ``crawl_queue`` as JSON to ``CRAWL_QUEUE_PATH``.

    The file is written through ``atomic_write`` with ``overwrite=True``,
    replacing any previous version.
    """
    writer = atomic_write(CRAWL_QUEUE_PATH, overwrite=True)
    with writer as target:
        json.dump(obj=crawl_queue, fp=target)

In [5]:
# Pending URLs still to be fetched, and every id that has ever been queued
# (used to avoid queueing the same record twice).
crawl_queue = []
all_ids = set()

for queued_url in starting_crawl_queue:
    # 'https://data.getty.edu/museum/collection/<type>/<id>' -> the id is the
    # 7th '/'-separated segment.
    segments = queued_url.split('/')
    if len(segments) <= 6:
        # Blank or malformed entry without an id segment: nothing to crawl.
        continue
    queued_id = segments[6].strip()
    all_ids.add(queued_id)
    # raw_metadata is keyed by the trimmed id, not by the full URL, so the
    # membership test has to use the id to recognise already-crawled records.
    if queued_id not in raw_metadata:
        crawl_queue.append(queued_url)

### Crawl through the site, readying raw metadata and further IDs to crawl

In [6]:
def flatten(d):
    """Yield every string found in ``d``, depth first.

    ``d`` is walked through its ``values()`` when it has that method (a dict)
    and iterated directly otherwise (a list). Nested dicts and lists are
    descended into; every other non-string item is skipped.
    """
    children = d.values() if hasattr(d, 'values') else d
    for child in children:
        if isinstance(child, str):
            yield child
            continue
        if isinstance(child, (dict, list)):
            yield from flatten(child)

def fill_crawl_queue(result):
    """Queue every not-yet-seen collection URL found in a crawled payload.

    All strings inside ``result`` are inspected; those containing
    ``COLLECTION_URL`` are treated as URLs whose 7th '/'-separated segment is
    the record id. A URL is appended to ``crawl_queue`` (and its id recorded
    in ``all_ids``) only if that id is neither in ``raw_metadata`` nor in
    ``all_ids`` already.

    Args:
        result: Parsed JSON payload (dict or list) returned by the API.
    """
    for value in flatten(result):
        if COLLECTION_URL not in value:
            continue
        segments = value.split('/')
        if len(segments) <= 6:
            # e.g. the bare collection URL: there is no id segment, and
            # indexing it would abort the whole crawl with an IndexError.
            continue
        trimmed_value = segments[6].strip()
        if not trimmed_value:
            # A trailing slash leaves an empty id; nothing meaningful to fetch.
            continue
        if trimmed_value in raw_metadata or trimmed_value in all_ids:
            continue
        all_ids.add(trimmed_value)
        crawl_queue.append(value)

In [None]:
# Headers are identical for every request, so build them once.
REQUEST_HEADERS = {
    'Accept': ACCEPT,
    'Accept-Encoding': ACCEPT_ENCODING,
    'Accept-Language': ACCEPT_LANGUAGE,
}
# Without a timeout a single hung connection blocks the crawl forever.
REQUEST_TIMEOUT_SECONDS = 60
MAX_FETCH_ATTEMPTS = 3

# URLs that could not be fetched after all attempts, kept for inspection.
failed_urls = []


def _fetch_json(session, url):
    """Return the decoded JSON body of ``url``, or ``None`` if every attempt fails.

    Network errors, timeouts, HTTP error statuses and undecodable bodies are
    all retried up to ``MAX_FETCH_ATTEMPTS`` times and reported on stdout.
    """
    for attempt in range(1, MAX_FETCH_ATTEMPTS + 1):
        try:
            response = session.get(url, timeout=REQUEST_TIMEOUT_SECONDS)
            # Do not store an error page as if it were record metadata.
            response.raise_for_status()
            return response.json()
        except (requests.RequestException, ValueError) as error:
            print(f'attempt {attempt}/{MAX_FETCH_ATTEMPTS} failed for {url}: {error}')
    return None


i = 0

with requests.Session() as session, tqdm(total=DATASET_IDS_LIMIT) as pbar:
    session.headers.update(REQUEST_HEADERS)
    try:
        while crawl_queue and len(raw_metadata) < DATASET_IDS_LIMIT:
            # Peek first and pop only after the fetch finished, so that an
            # interrupted request leaves the URL in the saved queue.
            url = crawl_queue[0].strip()
            data = _fetch_json(session, url)
            crawl_queue.pop(0)
            if data is None:
                failed_urls.append(url)
            else:
                # Only object records are stored; other collection URLs are
                # crawled just to discover further ids.
                if BASE_METADATA_URL in url:
                    current_id = url.split('/')[6].strip()
                    raw_metadata[current_id] = data
                fill_crawl_queue(data)
            if i % DOWNLOADING_SAVE_PERIOD == 0:
                save_raw_metadata()
                save_crawl_queue()
            i += 1
            pbar.update(len(raw_metadata) - pbar.n)
            pbar.set_description(f'queue_size={len(crawl_queue)}')
    finally:
        # Persist the final state whether the loop finished, was interrupted
        # or failed; otherwise the results since the last periodic save are lost.
        save_raw_metadata()
        save_crawl_queue()

  0%|          | 0/100000 [00:00<?, ?it/s]