# Downloading Dataset from The Art Institute of Chicago
This museum also shares an art collection dataset, partially under a CC0 license, which we can use to train our CC0 ResNet backbone for other projects!
Their GitHub page (https://github.com/art-institute-of-chicago/api-data) contains metadata that we can download, and they encourage using it when downloading a larger dataset - exactly what we want to do. Keep in mind that this metadata takes up almost 2 GB after decompression.

We'll download the images at a width of 843 pixels, as this size supposedly has a high chance of resulting in a cache hit. In the end, we scale the images to 224x224, so that quality should be enough.

As requested, we wait a second before each request and we don't multithread them.

### Constants and imports

In [1]:
import requests
import os
import random
import json
from tqdm.notebook import tqdm
import pandas as pd
import shutil
from time import sleep
from atomicwrites import atomic_write

# URL of the compressed metadata archive that is downloaded below.
AIC_DATA_TARBZ = 'https://artic-api-data.s3.amazonaws.com/artic-api-data.tar.bz2'
# Root directory for everything metadata-related.
METADATA_DIRECTORY = 'metadata'
# Short dataset identifier; used in the metadata file name and as the image file name prefix.
DATASET_NAME = 'aic'
RAW_METADATA_DIRECTORY = os.path.join(METADATA_DIRECTORY, 'raw')
# Local path of the downloaded archive.
RAW_METADATA_ZIPPED_FILENAME = os.path.join(RAW_METADATA_DIRECTORY, 'aic.tar.bz2')
# Directory the archive is unpacked into.
RAW_METADATA_UNZIP_LOCATION = os.path.join(RAW_METADATA_DIRECTORY, DATASET_NAME)
# Where the per-record JSON files are expected after unpacking.
RAW_METADATA_JSON_LOCATION = os.path.join(RAW_METADATA_UNZIP_LOCATION, 'artic-api-data', 'json')
RAW_ARTWORKS_METADATA_DIRECTORY = os.path.join(RAW_METADATA_JSON_LOCATION, 'artworks')
RAW_IMAGES_METADATA_DIRECTORY = os.path.join(RAW_METADATA_JSON_LOCATION, 'images')
# Directory the downloaded image files are written to.
RAW_IMAGE_DIRECTORY = os.path.join('dataset', 'raw')

# Prefix of every image URL; the image id and the size parameters are appended to it.
BASE_URL = 'https://www.artic.edu/iiif/2/'
# The download loop stops once this many distinct images are known.
DATASET_IMAGES_LIMIT = 70000
# Requested image width, passed in the image URL.
IMAGE_WIDTH = 843
# The metadata file is rewritten whenever the loop index is a multiple of this value.
DOWNLOADING_SAVE_PERIOD = 5

### Preparing directories

In [2]:
 
# Create every directory that later cells write into. `exist_ok=True` keeps this
# cell idempotent on re-runs. The image directory is created here as well,
# because the download step writes image files into it but never creates it.
for directory in (RAW_METADATA_DIRECTORY, RAW_IMAGE_DIRECTORY):
    os.makedirs(directory, exist_ok=True)


### Downloading the compressed file onto the disk

In [3]:
# Download the metadata archive once; an existing file is treated as complete.
if not os.path.exists(RAW_METADATA_ZIPPED_FILENAME):
    # The timeout prevents the cell from hanging forever on a stalled connection,
    # and the context manager releases the connection when we are done.
    with requests.get(AIC_DATA_TARBZ, stream=True, timeout=60) as metadata_data:
        # Fail loudly here instead of silently skipping the download and
        # failing later, when the missing archive is unpacked.
        metadata_data.raise_for_status()
        # Write atomically: an interrupted download must not leave a truncated
        # archive behind, because the existence check above would then skip
        # the download on the next run.
        with atomic_write(RAW_METADATA_ZIPPED_FILENAME, overwrite=True, mode='wb') as f:
            # The archive is large, so stream it in 1 MiB chunks.
            for chunk in metadata_data.iter_content(1024 * 1024):
                f.write(chunk)


### Unzipping the compressed file

In [4]:
# Unpack the archive when the target directory is missing or empty.
# The emptiness check has to compare the *number* of entries with zero;
# comparing the list returned by os.listdir() with 0 is never true.
if not os.path.exists(RAW_METADATA_UNZIP_LOCATION) or len(os.listdir(RAW_METADATA_UNZIP_LOCATION)) == 0:
    shutil.unpack_archive(RAW_METADATA_ZIPPED_FILENAME, RAW_METADATA_UNZIP_LOCATION)

### Loading JSONs into dictionaries

In [5]:
# Parsed artwork records, keyed by the JSON file name with '.json' removed.
main_metadata = {}

# A missing directory simply yields no records.
if os.path.exists(RAW_ARTWORKS_METADATA_DIRECTORY):
    artwork_files = os.listdir(RAW_ARTWORKS_METADATA_DIRECTORY)
else:
    artwork_files = []

for artwork_file in artwork_files:
    artwork_path = os.path.join(RAW_ARTWORKS_METADATA_DIRECTORY, artwork_file)
    with open(artwork_path, 'r') as handle:
        artwork_key = artwork_file.replace('.json', '')
        main_metadata[artwork_key] = json.load(handle)

In [6]:
# Image records, keyed by the (stringified) id of each artwork they belong to.
images = {}

if os.path.exists(RAW_IMAGES_METADATA_DIRECTORY):
    # os.listdir() returns entries in arbitrary order. When several images
    # reference the same artwork the last one processed wins, so iterate in
    # sorted order to make that choice reproducible across runs and machines.
    for file in sorted(os.listdir(RAW_IMAGES_METADATA_DIRECTORY)):
        with open(os.path.join(RAW_IMAGES_METADATA_DIRECTORY, file), 'r') as f:
            image_metadata = json.load(f)
        # Tolerate records without (or with a null) 'artwork_ids' entry instead
        # of aborting the whole load.
        for artwork_id in image_metadata.get('artwork_ids') or []:
            images[str(artwork_id)] = image_metadata

### Filtering the data to public-domain artworks that have an image

In [7]:
# Keep only the artworks that have an image record and are explicitly marked
# as public domain.
valid_artworks = [
    artwork_id
    for artwork_id, artwork in main_metadata.items()
    if artwork_id in images
    and 'is_public_domain' in artwork
    and artwork['is_public_domain'] == True
]

### Checking existing metadata

In [9]:
# Location of the metadata file that records what has already been processed.
BASIC_METADATA_PATH = os.path.join(METADATA_DIRECTORY, f'{DATASET_NAME}.json')

# Resume from a previous run when the file exists; otherwise start empty.
if os.path.exists(BASIC_METADATA_PATH):
    with open(BASIC_METADATA_PATH, 'r') as metadata_file:
        existing_metadata = json.loads(metadata_file.read())
else:
    existing_metadata = {}

def save_metadata():
    """Write `existing_metadata` as JSON to `BASIC_METADATA_PATH`, replacing any previous file."""
    serialized = json.dumps(existing_metadata)
    with atomic_write(BASIC_METADATA_PATH, overwrite=True) as metadata_file:
        metadata_file.write(serialized)

### Filtering what we have to download

In [10]:
# Artworks that still have to be processed.
to_download = []
# Maps an image id to the local path of its already downloaded file.
image_path_cache = {}

for artwork_id in valid_artworks:
    if artwork_id not in existing_metadata:
        to_download.append(artwork_id)
    else:
        # `existing_metadata` is keyed by artwork id, so the stored record has
        # to be looked up first. The cache key is the id of the image record,
        # the same key the download loop uses, so entries restored here are
        # recognised there.
        image_path_cache[images[artwork_id]['id']] = existing_metadata[artwork_id]['path']

len(to_download)

56961

### Downloading it and updating the existing metadata file

In [11]:
def download_image(url, path, timeout=30):
    """Download `url` to `path` unless `path` already exists.

    Args:
        url: Address of the image to fetch.
        path: Destination file; nothing is requested when it already exists.
        timeout: Seconds to wait for the server before giving up.

    Raises:
        requests.RequestException: On connection problems, timeouts, or when
            the server answers with a status other than 200. Raising (instead
            of returning silently) lets the caller avoid recording a path for
            a file that was never written.
    """
    if os.path.exists(path):
        return
    # The context manager releases the connection even when writing fails.
    with requests.get(url, stream=True, timeout=timeout) as image_data:
        if image_data.status_code != 200:
            raise requests.HTTPError(
                f'Unexpected HTTP status {image_data.status_code} for {url}',
                response=image_data,
            )
        # Atomic write: an interrupted download never leaves a partial file
        # that the existence check above would later mistake for a full one.
        with atomic_write(path, overwrite=True, mode='wb') as f:
            for chunk in image_data.iter_content(2048):
                f.write(chunk)

In [13]:
# Download the missing images one by one and record each processed artwork in
# `existing_metadata`. The progress bar tracks the number of distinct images
# known so far.
with tqdm(total=DATASET_IMAGES_LIMIT) as pbar:
    try:
        for i, artwork_id in enumerate(to_download):
            if len(image_path_cache) >= DATASET_IMAGES_LIMIT:
                break
            try:
                obj = main_metadata[artwork_id]
                image_data = images[artwork_id]
                image_id = image_data['id']
                if image_id not in image_path_cache:
                    path = os.path.join(RAW_IMAGE_DIRECTORY, f'{DATASET_NAME}_{artwork_id}.jpg')
                    if not os.path.exists(path):
                        # Wait before every real request, including the one
                        # that follows a failed attempt, so errors never turn
                        # into a burst of back-to-back requests.
                        sleep(1)
                    download_image(f'{BASE_URL}{image_id}/full/{IMAGE_WIDTH},/0/default.jpg', path)
                    if not os.path.exists(path):
                        # Never record a path for a file that was not written.
                        raise RuntimeError('no image file was written')
                    image_path_cache[image_id] = path
                else:
                    # The image was already fetched for another artwork.
                    path = image_path_cache[image_id]
                obj['path'] = path
                existing_metadata[artwork_id] = obj
                if i % DOWNLOADING_SAVE_PERIOD == 0:
                    save_metadata()
            except Exception as er:
                # Best effort: report the failure and continue with the next artwork.
                print(f'Failed to download {artwork_id} image - {er}!')
            pbar.update(len(image_path_cache) - pbar.n)
    finally:
        # Persist whatever was collected since the last periodic save, also
        # when the loop ends early or the cell is interrupted.
        save_metadata()

  0%|          | 0/70000 [00:00<?, ?it/s]

Failed to download 9016 image - ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))!
Failed to download 116372 image - ('Connection aborted.', ConnectionResetError(104, 'Connection reset by peer'))!
Failed to download 67924 image - ('Connection aborted.', ConnectionResetError(104, 'Connection reset by peer'))!
Failed to download 128703 image - ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))!
Failed to download 117249 image - ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))!


KeyboardInterrupt: 