# Downloading Dataset from The Cleveland Museum of Art
The Cleveland Museum of Art shares its art collection dataset, part of which is released under the CC0 license, so we can use it to train our CC0 ResNet backbone for other projects!
Their GitHub page, https://github.com/ClevelandMuseumArt/openaccess, contains up-to-date metadata for the collection. We'll download that metadata, filter it by license, and then download the images from the museum's server. We'll use only the web-quality images so that the dataset doesn't take up too much disk space.

### Constants and imports

In [1]:
import requests
import os
import random
import json
from tqdm.notebook import tqdm
import pandas as pd
from atomicwrites import atomic_write

# Location of the museum's open-access metadata CSV on GitHub.
CMA_GITHUB_DATA_CSV = 'https://github.com/ClevelandMuseumArt/openaccess/raw/master/data.csv'

# Short dataset identifier: used as the stem of the metadata files and as the
# prefix of every downloaded image filename.
DATASET_NAME = 'cma'

# Processed metadata lives in METADATA_DIRECTORY; the untouched CSV goes in its
# 'raw' subdirectory.
METADATA_DIRECTORY = 'metadata'
RAW_METADATA_DIRECTORY = os.path.join(METADATA_DIRECTORY, 'raw')
# Despite the name, this is the full path of the raw CSV file, not a directory.
RAW_METADATA_DIRECTORY_FILENAME = os.path.join(RAW_METADATA_DIRECTORY, f'{DATASET_NAME}.csv')

# Directory that receives the downloaded image files.
RAW_IMAGE_DIRECTORY = os.path.join('dataset', 'raw')

# The download loop stops once the metadata holds this many entries.
DATASET_IMAGES_LIMIT = 70_000

### Preparing directories

In [2]:
# Create every directory the notebook writes to. `exist_ok=True` makes the
# call idempotent and avoids the check-then-create race of testing
# `os.path.exists` first. `makedirs` also creates missing parents, so
# METADATA_DIRECTORY is listed only for explicitness.
for directory in (RAW_IMAGE_DIRECTORY, RAW_METADATA_DIRECTORY, METADATA_DIRECTORY):
    os.makedirs(directory, exist_ok=True)

### Downloading the raw metadata file onto the disk

In [3]:
# Fetch the raw metadata CSV only once; later runs reuse the copy on disk.
if not os.path.exists(RAW_METADATA_DIRECTORY_FILENAME):
    # The context manager releases the streamed connection when done, and the
    # timeout keeps a stalled connection from hanging the notebook forever.
    with requests.get(CMA_GITHUB_DATA_CSV, stream=True, timeout=60) as metadata_data:
        # Fail loudly on an HTTP error instead of silently leaving no file
        # behind for the next cell to trip over.
        metadata_data.raise_for_status()
        # Write atomically so an interrupted download never leaves a truncated
        # CSV that the existence check above would accept as complete.
        with atomic_write(RAW_METADATA_DIRECTORY_FILENAME, overwrite=True, mode='wb') as f:
            for chunk in metadata_data.iter_content(2048):
                f.write(chunk)

### Loading the raw metadata file using pandas

In [4]:
# Columns that are read with pandas' 'string' dtype instead of letting
# `read_csv` infer a type for them.
# NOTE(review): presumably these are sparse / mixed-type columns that would
# otherwise trigger dtype inference warnings - confirm.
string_columns = (
    'series',
    'current_location',
    'series_in_original_language',
    'title_in_original_language',
    'sketchfab_id',
    'sketchfab_url',
    'gallery_donor_text',
    'find_spot',
)

# Load the whole raw CSV into memory.
raw_metadata = pd.read_csv(
    RAW_METADATA_DIRECTORY_FILENAME,
    dtype=dict.fromkeys(string_columns, 'string'),
)

# Preview the first rows.
raw_metadata.head()

Unnamed: 0,id,accession_number,share_license_status,tombstone,current_location,title,title_in_original_language,series,series_in_original_language,creation_date,...,alternate_images,creditline,sketchfab_id,sketchfab_url,gallery_donor_text,creators,image_web,image_print,image_full,updated_at
0,74228,2020.113,CC0,"Fishmarket, 1902. Camille Pissarro (French, 18...",,Fishmarket,,,,1902,...,[],Nancy F. and Joseph P. Keithley Collection Gift,,,,"Camille Pissarro (French, 1830–1903), artist",https://openaccess-cdn.clevelandart.org/2020.1...,https://openaccess-cdn.clevelandart.org/2020.1...,https://openaccess-cdn.clevelandart.org/2020.1...,2024-01-21 09:54:14.728000
1,74539,2015.449,CC0,"A Miller's Carriage, c. 1895. Albert-Charles L...",,A Miller's Carriage,,,,c. 1895,...,[],Bequest of Muriel Butkin,,,,"Albert-Charles Lebourg (French, 1849–1928), ar...",https://openaccess-cdn.clevelandart.org/2015.4...,https://openaccess-cdn.clevelandart.org/2015.4...,https://openaccess-cdn.clevelandart.org/2015.4...,2024-01-21 09:54:14.734000
2,74540,2015.451,CC0,"Leda and the Swan, c. 1846–83. Adolphe Yvon (F...",,Leda and the Swan,,,,c. 1846–83,...,[],Bequest of Muriel Butkin,,,,"Adolphe Yvon (French, 1817–1893), artist",https://openaccess-cdn.clevelandart.org/2015.4...,https://openaccess-cdn.clevelandart.org/2015.4...,https://openaccess-cdn.clevelandart.org/2015.4...,2024-01-21 09:54:14.739000
3,74551,2018.1059,CC0,"The Monks, c. 1802–30. François Marius Granet ...",,The Monks,,,,c. 1802–30,...,[],Bequest of Muriel Butkin,,,,"François Marius Granet (French, 1775–1849), ar...",https://openaccess-cdn.clevelandart.org/2018.1...,https://openaccess-cdn.clevelandart.org/2018.1...,https://openaccess-cdn.clevelandart.org/2018.1...,2024-01-21 09:54:14.744000
4,74553,2018.106,CC0,"Study Sheet, c. 1870–80. Alfred Dehodencq (Fre...",,Study Sheet,,,,c. 1870–80,...,[],Bequest of Muriel Butkin,,,,"Alfred Dehodencq (French, 1822–1882), artist",https://openaccess-cdn.clevelandart.org/2018.1...,https://openaccess-cdn.clevelandart.org/2018.1...,https://openaccess-cdn.clevelandart.org/2018.1...,2024-01-21 09:54:14.750000


In [5]:
# Show the dtype pandas assigned to every column of the loaded metadata.
raw_metadata.dtypes

id                                      int64
accession_number                       object
share_license_status                   object
tombstone                              object
current_location               string[python]
title                                  object
title_in_original_language     string[python]
series                         string[python]
series_in_original_language    string[python]
creation_date                          object
creation_date_earliest                float64
creation_date_latest                  float64
artists_tags                           object
culture                                object
technique                              object
support_materials                      object
department                             object
collection                             object
type                                   object
measurements                           object
state_of_the_work                      object
edition_of_the_work               

### Checking existing metadata

In [6]:
# Path of the JSON file that records every image already downloaded.
BASIC_METADATA_PATH = os.path.join(METADATA_DIRECTORY, f'{DATASET_NAME}.json')

# Start empty and, when an earlier run left a metadata file behind, resume
# from it. Note that JSON object keys are always strings, so every key loaded
# here is a `str`.
existing_metadata = {}
if os.path.exists(BASIC_METADATA_PATH):
    with open(BASIC_METADATA_PATH, 'r') as metadata_file:
        existing_metadata = json.loads(metadata_file.read())

def save_metadata():
    """Persist the module-level ``existing_metadata`` dict as JSON.

    The file at ``BASIC_METADATA_PATH`` is written through ``atomic_write``
    with ``overwrite=True``, replacing any previous metadata file.
    """
    serialized = json.dumps(existing_metadata)
    with atomic_write(BASIC_METADATA_PATH, overwrite=True) as metadata_file:
        metadata_file.write(serialized)

#### Rescuing all downloaded images that got their metadata lost 
Sometimes, when the script is run several separate times and interrupted by a SIGINT, the metadata file may end up truncated while the images still exist. To combat this, we first rebuild the metadata for all image files that are already on disk.

In [7]:
# Rebuild metadata entries for images that exist on disk but are missing from
# `existing_metadata`.
image_prefix = f'{DATASET_NAME}_'
for image in os.listdir(RAW_IMAGE_DIRECTORY):
    stem, extension = os.path.splitext(image)
    # Only files named '<DATASET_NAME>_<id>.jpg' belong to this dataset.
    if extension != '.jpg' or not stem.startswith(image_prefix):
        continue
    try:
        image_id = int(stem[len(image_prefix):])
    except ValueError:
        # Unexpected filename (non-numeric id): not one of our downloads.
        continue

    # Keys loaded from the JSON file are strings, while entries added during
    # this session are keyed by the integer id, so check both forms.
    if image_id in existing_metadata or str(image_id) in existing_metadata:
        continue

    matching_rows = raw_metadata[raw_metadata["id"] == image_id]
    if matching_rows.empty:
        # The image's id is absent from the current CSV; nothing to restore.
        print(f'No metadata found for {image} - skipping!')
        continue

    obj = matching_rows.iloc[0].to_dict()
    obj['path'] = os.path.join(RAW_IMAGE_DIRECTORY, image)
    existing_metadata[image_id] = obj

save_metadata()

### Filtering what we have to download

In [8]:
# Ids that already have a metadata entry. Keys loaded from the JSON file are
# strings while the 'id' column is int64, so normalise every key to `int`;
# otherwise `isin` would never match and everything would be downloaded again.
known_ids = {int(key) for key in existing_metadata}

# Keep only rows that are not downloaded yet, are CC0-licensed and actually
# have a web-quality image URL.
pending_mask = (
    ~raw_metadata['id'].isin(known_ids)
    & (raw_metadata['share_license_status'] == 'CC0')
    & raw_metadata['image_web'].notna()
)

# Shuffle the remaining ids (`sample(frac=1)` returns all of them in random
# order).
to_download = raw_metadata.loc[pending_mask, 'id'].sample(frac=1)

to_download.shape

(2191,)

### Downloading it and updating the existing metadata file

In [9]:
def download_image(url, path, timeout=60):
    """Download ``url`` and store the response body at ``path``.

    The body is streamed in 2048-byte chunks and written through
    ``atomic_write`` with ``overwrite=True``, so an existing file at ``path``
    is replaced.

    Args:
        url: Address of the image to fetch.
        path: Destination file path.
        timeout: Seconds passed to ``requests.get`` as its ``timeout``.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
            Previously such responses were skipped silently, which made a
            failed download indistinguishable from a successful one.
        requests.RequestException: On connection problems or timeouts.
    """
    # The context manager releases the streamed connection even when the
    # status check or the write fails.
    with requests.get(url, stream=True, timeout=timeout) as image_data:
        image_data.raise_for_status()
        with atomic_write(path, overwrite=True, mode='wb') as f:
            for chunk in image_data.iter_content(2048):
                f.write(chunk)
    

In [None]:
# Download the pending images one by one until the dataset limit is reached.
# The progress bar tracks the number of metadata entries, not loop iterations.
with tqdm(total=DATASET_IMAGES_LIMIT) as pbar:
    for image_id in to_download:
        if len(existing_metadata) >= DATASET_IMAGES_LIMIT:
            break
        try:
            obj = raw_metadata[raw_metadata["id"] == image_id].iloc[0].to_dict()
            image_url = obj['image_web']
            path = os.path.join(RAW_IMAGE_DIRECTORY, f'{DATASET_NAME}_{image_id}.jpg')
            download_image(image_url, path)
            if os.path.exists(path):
                obj['path'] = path
                existing_metadata[image_id] = obj
                # Persist after every image so an interruption loses at most
                # the entry currently being processed.
                save_metadata()
            else:
                # `download_image` may return without writing anything (e.g.
                # on a non-200 response); never record an entry whose file
                # does not exist.
                print(f'Failed to download {image_id} image - no file was written!')
        except Exception as er:
            # Best effort: report the failure and carry on with the next id.
            print(f'Failed to download {image_id} image - {er}!')
        # Advance the bar to the current number of entries.
        pbar.update(len(existing_metadata) - pbar.n)

  0%|          | 0/70000 [00:00<?, ?it/s]