# Downloading Dataset from The National Gallery of Art
The gallery is sharing an art collection dataset with CC0, what we can use to train our CC0 ResNet backbone for other projects!

Their github page: https://github.com/NationalGalleryOfArt/opendata contains .csv files for the artworks, and we'll be downloading two of them and processing, downloading images of some of the artworks.

General data is stored in objects.csv data, and links to the images in published_images.csv data, they're joined together by objectid.

### Constants and imports

In [1]:
import requests
import os
import random
import json
from tqdm.notebook import tqdm
import pandas as pd

CMA_GITHUB_OBJECT_DATA_CSV = 'https://github.com/NationalGalleryOfArt/opendata/raw/main/data/objects.csv'
CMA_GITHUB_IMAGES_DATA_CSV = 'https://github.com/NationalGalleryOfArt/opendata/raw/main/data/published_images.csv'
METADATA_DIRECTORY = 'metadata'
DATASET_NAME = 'nga'
RAW_METADATA_DIRECTORY = os.path.join(METADATA_DIRECTORY, 'raw')
RAW_METADATA_OBJECTS_FILENAME = os.path.join(RAW_METADATA_DIRECTORY, f'{DATASET_NAME}_objects.csv')
RAW_METADATA_IMAGES_FILENAME = os.path.join(RAW_METADATA_DIRECTORY, f'{DATASET_NAME}_images.csv')

RAW_IMAGE_DIRECTORY = os.path.join('dataset', 'raw')
DATASET_IMAGES_LIMIT = 30000
MAX_IMAGE_SIZE = 1000

### Preparing directories

In [2]:
if not os.path.exists(RAW_IMAGE_DIRECTORY):
    os.makedirs(RAW_IMAGE_DIRECTORY)

if not os.path.exists(RAW_METADATA_DIRECTORY):
    os.makedirs(RAW_METADATA_DIRECTORY)
    
if not os.path.exists(METADATA_DIRECTORY):
    os.makedirs(METADATA_DIRECTORY)

### Downloading the raw metadata file onto the disk

In [3]:
if not os.path.exists(RAW_METADATA_OBJECTS_FILENAME):
    metadata_data = requests.get(CMA_GITHUB_OBJECT_DATA_CSV, stream=True)
    if metadata_data.status_code == 200:
        with open(RAW_METADATA_OBJECTS_FILENAME, 'wb') as f:
            for chunk in metadata_data.iter_content(2048):
                f.write(chunk)
                
if not os.path.exists(RAW_METADATA_IMAGES_FILENAME):
    metadata_data = requests.get(CMA_GITHUB_IMAGES_DATA_CSV, stream=True)
    if metadata_data.status_code == 200:
        with open(RAW_METADATA_IMAGES_FILENAME, 'wb') as f:
            for chunk in metadata_data.iter_content(2048):
                f.write(chunk)

### Loading the raw metadata file using pandas

In [4]:
raw_objects_metadata = pd.read_csv(RAW_METADATA_OBJECTS_FILENAME,
                                  dtype={"volume": "string",
                                         "customprinturl": "string"})

raw_objects_metadata.head()

Unnamed: 0,objectid,accessioned,accessionnum,locationid,title,displaydate,beginyear,endyear,visualbrowsertimespan,medium,...,parentid,isvirtual,departmentabbr,portfolio,series,volume,watermarks,lastdetectedmodification,wikidataid,customprinturl
0,30104,1,1943.8.18015,,Technique Demo (Wood Grain),c. 1938,1938.0,1938.0,1926 to 1950,watercolor and graphite on paper,...,,0,CG-W,,,,,2023-05-09 17:01:03.48-04,Q64573339,
1,30110,1,1943.8.18021,,Rooster Weather Vane,1935/1942,1935.0,1942.0,1926 to 1950,watercolor and graphite on paper,...,,0,CG-W,,,,,2023-05-09 17:01:03.48-04,Q64535352,
2,30204,1,1943.10.89,,Veduta dell' ... Colosseo,,1720.0,1778.0,1701 to 1725,etching,...,,0,CG-E,Vedute di Roma,Vedute di Roma [from Keppel volume 1943.10.49-95],,,2023-05-10 13:28:58.583-04,Q65020446,
3,30919,1,1945.5.142,,"Vue D'optique, or A Treat for the Curious",1784,1784.0,1784.0,1776 to 1800,,...,,0,CG-E,,,,,2023-05-09 17:01:03.48-04,Q65025819,
4,30219,1,1943.11.9,,Seated Figure,,1885.0,1930.0,1876 to 1900,graphite on wove paper,...,,0,CG-W,,,,NORMANDY VELLUM-FRANCE,2023-05-09 17:01:03.48-04,Q64591625,


In [5]:
raw_images_metadata = pd.read_csv(RAW_METADATA_IMAGES_FILENAME)

raw_images_metadata.head() 

Unnamed: 0,uuid,iiifurl,iiifthumburl,viewtype,sequence,width,height,maxpixels,created,modified,depictstmsobjectid,assistivetext
0,00007f61-4922-417b-8f27-893ea328206c,https://api.nga.gov/iiif/00007f61-4922-417b-8f...,https://api.nga.gov/iiif/00007f61-4922-417b-8f...,primary,0.0,3365,4332,,2013-07-05 15:41:08-04,2023-07-27 12:06:38-04,17387,
1,0000bd8c-39de-4453-b55d-5e28a9beed38,https://api.nga.gov/iiif/0000bd8c-39de-4453-b5...,https://api.nga.gov/iiif/0000bd8c-39de-4453-b5...,primary,0.0,3500,4688,,2013-08-05 14:31:59-04,2023-07-27 12:11:57-04,19245,
2,0001668a-dd1c-48e8-9267-b6d1697d43c8,https://api.nga.gov/iiif/0001668a-dd1c-48e8-92...,https://api.nga.gov/iiif/0001668a-dd1c-48e8-92...,primary,0.0,3446,4448,,2014-01-02 14:50:50-05,2023-07-27 12:39:11-04,23830,
3,00032658-8a7a-44e3-8bb8-df8c172f521d,https://api.nga.gov/iiif/00032658-8a7a-44e3-8b...,https://api.nga.gov/iiif/00032658-8a7a-44e3-8b...,primary,0.0,2674,3798,,2010-10-13 15:37:25-04,2023-07-27 15:51:54-04,713,
4,0003d4e4-d7fd-4835-8d27-1e9e20672e39,https://api.nga.gov/iiif/0003d4e4-d7fd-4835-8d...,https://api.nga.gov/iiif/0003d4e4-d7fd-4835-8d...,primary,0.0,3000,2648,640.0,2014-11-19 14:24:42-05,2023-11-07 14:13:17-05,71457,


### Joining the two datasources

In [6]:
raw_metadata = raw_objects_metadata.merge(raw_images_metadata, left_on='objectid', right_on='depictstmsobjectid')

raw_metadata.head()

Unnamed: 0,objectid,accessioned,accessionnum,locationid,title,displaydate,beginyear,endyear,visualbrowsertimespan,medium,...,iiifthumburl,viewtype,sequence,width,height,maxpixels,created,modified,depictstmsobjectid,assistivetext
0,30104,1,1943.8.18015,,Technique Demo (Wood Grain),c. 1938,1938.0,1938.0,1926 to 1950,watercolor and graphite on paper,...,https://api.nga.gov/iiif/4602931f-9dc6-4bcd-8d...,primary,0.0,3376,4764,,2014-07-30 11:50:37-04,2023-07-27 13:24:06-04,30104,
1,30110,1,1943.8.18021,,Rooster Weather Vane,1935/1942,1935.0,1942.0,1926 to 1950,watercolor and graphite on paper,...,https://api.nga.gov/iiif/bd65e0f6-49e8-4419-bd...,primary,0.0,3246,4621,,2014-07-31 14:19:10-04,2023-07-27 13:24:24-04,30110,
2,30219,1,1943.11.9,,Seated Figure,,1885.0,1930.0,1876 to 1900,graphite on wove paper,...,https://api.nga.gov/iiif/a79aa2e5-a2cd-4ecb-95...,primary,0.0,4306,3460,640.0,2009-12-03 17:02:53-05,2023-07-27 08:43:00-04,30219,
3,30218,1,1943.11.8,,Peach Blossom,c. 1890-1894,1890.0,1894.0,1876 to 1900,oil on wood,...,https://api.nga.gov/iiif/98459286-8b99-42e6-8f...,primary,0.0,6444,11091,,2019-09-03 18:45:46-04,2024-01-11 22:28:04.103-05,30218,
4,30247,1,1944.2.19,,Salute to the Sun,,1911.0,1944.0,1901 to 1925,lithograph,...,https://api.nga.gov/iiif/60b4acbe-827b-43c9-a0...,primary,0.0,4411,3531,640.0,2012-01-23 12:43:48-05,2023-07-31 12:20:35-04,30247,


### Checking existing metadata

In [7]:
existing_metadata = {}
BASIC_METADATA_PATH = os.path.join(METADATA_DIRECTORY, f'{DATASET_NAME}.json')

if os.path.exists(BASIC_METADATA_PATH):
    with open(BASIC_METADATA_PATH, 'r') as f:
        existing_metadata = json.load(f)

def save_metadata():
    with open(BASIC_METADATA_PATH, 'w') as f:
        json.dump(existing_metadata, f)

### Filtering what we have to download

In [8]:
to_download = raw_metadata[~raw_metadata['objectid'].isin(existing_metadata)]
to_download = to_download[~to_download['iiifthumburl'].isna()]
to_download['iiifthumburl'] = to_download['iiifthumburl'].str.replace('!200,200', f'!{MAX_IMAGE_SIZE},{MAX_IMAGE_SIZE}')
to_download = to_download['objectid'].sample(frac=1)

to_download.shape

(116232,)

### Downloading it and updating the existing metadata file

In [9]:
def download_image(url, path):
    image_data = requests.get(url, stream=True)
    if image_data.status_code == 200:
        with open(path, 'wb') as f:
            for chunk in image_data.iter_content(2048):
                f.write(chunk)

In [None]:
with tqdm(total=DATASET_IMAGES_LIMIT) as pbar:
    for id in to_download:
        if len(existing_metadata) >= DATASET_IMAGES_LIMIT:
            break
        try:
            obj = raw_metadata[raw_metadata["objectid"] == id].iloc[0].to_dict()
            image = obj['iiifthumburl']
            path = os.path.join(RAW_IMAGE_DIRECTORY, f'{DATASET_NAME}_{id}.jpg')
            download_image(image, path)
            obj['path'] = path
            existing_metadata[id] = obj
            save_metadata()
        except Exception as er:
            print(f'Failed to download {id} image - {er}!')
        pbar.update(len(existing_metadata) - pbar.n)

  0%|          | 0/30000 [00:00<?, ?it/s]