# Downloading Dataset from The Metropolitan Museum of Art
The museum shares its art collection dataset under CC0, which we can use to train our CC0 ResNet backbone for other projects!
Their website, https://metmuseum.github.io/, lists various endpoints; we'll mostly use two of them: one to list the available object IDs and one to fetch the metadata of a single object, from which we then download its primary image.

### Constants and imports


In [1]:
import requests
import os
from tqdm.notebook import tqdm
import json
# Root URL of The Met's collection API; endpoint paths are appended to it.
BASE_URL = 'https://collectionapi.metmuseum.org'
# Directory that holds the JSON file with the metadata of downloaded objects.
METADATA_DIRECTORY = 'metadata'
# Directory that receives the downloaded image files.
RAW_IMAGE_DIRECTORY = os.path.join('dataset', 'raw')
# Prefix used for the metadata file name and for every image file name.
DATASET_NAME = 'met'

### Preparing directories

In [None]:
# Create the output directories. exist_ok=True makes the cell safe to re-run
# and avoids the check-then-create race of a separate os.path.exists() test.
for directory in (RAW_IMAGE_DIRECTORY, METADATA_DIRECTORY):
    os.makedirs(directory, exist_ok=True)

### Downloading ObjectIDs

In [2]:
# Fetch the list of object IDs exposed by the collection API.
# The timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() surfaces HTTP errors here instead of failing later with a
# confusing JSON/KeyError.
object_ids_response = requests.get(f'{BASE_URL}/public/collection/v1/objects', timeout=60)
object_ids_response.raise_for_status()
# Defensive: fall back to an empty list should the API ever return null here,
# so the len() and iteration in later cells keep working.
available_ids = object_ids_response.json()['objectIDs'] or []

In [3]:
# Report how many object IDs the API returned.
object_id_count = len(available_ids)
print(f'Object IDs count: {object_id_count}')

Object IDs count: 486419


### Checking existing metadata

In [4]:
# Metadata collected by earlier runs, keyed by integer object ID.
existing_metadata = {}
BASIC_METADATA_PATH = os.path.join(METADATA_DIRECTORY, f'{DATASET_NAME}.json')

if os.path.exists(BASIC_METADATA_PATH):
    with open(BASIC_METADATA_PATH, 'r') as f:
        # JSON object keys are always strings, while the API reports object IDs
        # as integers. Convert the keys back so membership tests against the
        # API's IDs recognise what has already been downloaded; without this
        # every object would be fetched again on each run.
        existing_metadata = {
            (int(key) if key.isdigit() else key): value
            for key, value in json.load(f).items()
        }

def save_metadata():
    """Write ``existing_metadata`` to ``BASIC_METADATA_PATH`` as JSON.

    The data is first written to a temporary sibling file and then moved over
    the real file with ``os.replace``. Opening the real file with ``'w'``
    truncates it immediately, so an interrupt or a serialisation error in the
    middle of the dump would otherwise destroy all previously saved progress.

    Raises:
        Whatever ``open``, ``json.dump`` or ``os.replace`` raise; the previous
        metadata file is left untouched in that case.
    """
    tmp_path = f'{BASIC_METADATA_PATH}.tmp'
    try:
        with open(tmp_path, 'w') as f:
            json.dump(existing_metadata, f)
        os.replace(tmp_path, BASIC_METADATA_PATH)
    except BaseException:
        # Also covers KeyboardInterrupt: drop the partial temp file and re-raise.
        if os.path.exists(tmp_path):
            os.remove(tmp_path)
        raise

In [5]:
# Map every already-known primary image URL to the local file it was saved to,
# so the download loop can reuse the file instead of fetching the URL again.
image_path_cache = {
    record['primaryImage']: record['path']
    for record in existing_metadata.values()
    if record.get('primaryImage', '') != ''
}

### Filtering what we have to download

In [6]:
# Object IDs reported by the API that have no entry in the stored metadata yet.
to_download = [
    object_id
    for object_id in available_ids
    if object_id not in existing_metadata
]

### Downloading the objects and updating the existing metadata file

In [7]:
def download_image(url, path, timeout=30):
    """Stream the image at ``url`` into the file at ``path``.

    Args:
        url: Address of the image to fetch.
        path: Destination file; it is only created when the server answers 200.
        timeout: Seconds to wait for the server before giving up, so a stalled
            connection cannot block the whole download loop.

    Returns:
        True when the image was written, False when the server answered with a
        status other than 200 (nothing is written in that case).

    Raises:
        requests.RequestException: On connection problems or timeouts.
    """
    # With stream=True the connection is only released once the body has been
    # consumed or the response is closed. The context manager closes it on
    # every path, including the non-200 one where the body is never read.
    with requests.get(url, stream=True, timeout=timeout) as image_data:
        if image_data.status_code != 200:
            return False
        with open(path, 'wb') as f:
            for chunk in image_data.iter_content(2048):
                f.write(chunk)
    return True

In [None]:
# Number of processed objects between two checkpoint writes. Rewriting the
# whole (ever-growing) metadata file after every single object makes the total
# I/O quadratic in the number of objects.
METADATA_SAVE_INTERVAL = 100

try:
    for processed, object_id in enumerate(tqdm(to_download), start=1):
        try:
            response = requests.get(
                f'{BASE_URL}/public/collection/v1/objects/{object_id}',
                timeout=30,
            )
            # Do not store an API error payload as if it were object metadata;
            # a failed object stays out of the metadata and is retried next run.
            response.raise_for_status()
            obj = response.json()
            if 'primaryImage' in obj and obj['primaryImage'] != '':
                image = obj['primaryImage']
                if image in image_path_cache:
                    # Same image URL seen before: reuse the file on disk.
                    obj['path'] = image_path_cache[image]
                else:
                    path = os.path.join(RAW_IMAGE_DIRECTORY, f'{DATASET_NAME}_{object_id}.jpg')
                    download_image(image, path)
                    image_path_cache[image] = path
                    obj['path'] = path
            existing_metadata[object_id] = obj
        except Exception as er:
            # Best effort: report the failure and carry on with the next object.
            print(f'Failed to download {object_id} image - {er}!')
        if processed % METADATA_SAVE_INTERVAL == 0:
            save_metadata()
finally:
    # Persist whatever was collected, also when the loop is interrupted.
    save_metadata()

  0%|          | 0/486419 [00:00<?, ?it/s]