In [1]:
import requests
from PIL import Image
import pandas as pd
import os
from io import BytesIO
import numpy as np
from ratelimit import limits, sleep_and_retry

Loading the .json file from MoMas API, cleaning it and saving it as a .csv file to be used later:

In [2]:
moma_data = pd.read_json('moma_artworks.json')

In [3]:
moma_df=pd.DataFrame(moma_data)

In [4]:
moma_df.shape

(140848, 28)

In [5]:
moma_df.columns

Index(['Title', 'Artist', 'ConstituentID', 'ArtistBio', 'Nationality',
       'BeginDate', 'EndDate', 'Gender', 'Date', 'Medium', 'Dimensions',
       'CreditLine', 'AccessionNumber', 'Classification', 'Department',
       'DateAcquired', 'Cataloged', 'ObjectID', 'URL', 'ThumbnailURL',
       'Height (cm)', 'Width (cm)', 'Depth (cm)', 'Weight (kg)',
       'Diameter (cm)', 'Length (cm)', 'Circumference (cm)',
       'Duration (sec.)'],
      dtype='object')

In [6]:
moma_df.head()

Unnamed: 0,Title,Artist,ConstituentID,ArtistBio,Nationality,BeginDate,EndDate,Gender,Date,Medium,...,URL,ThumbnailURL,Height (cm),Width (cm),Depth (cm),Weight (kg),Diameter (cm),Length (cm),Circumference (cm),Duration (sec.)
0,"Ferdinandsbrücke Project, Vienna, Austria (Ele...",[Otto Wagner],[6210],"[Austrian, 1841–1918]",[Austrian],[1841],[1918],[Male],1896,Ink and cut-and-pasted painted pages on paper,...,http://www.moma.org/collection/works/2,http://www.moma.org/media/W1siZiIsIjUyNzc3MCJd...,48.6,168.9,,,,,,
1,"City of Music, National Superior Conservatory ...",[Christian de Portzamparc],[7470],"[French, born 1944]",[French],[1944],[0],[Male],1987,Paint and colored pencil on print,...,http://www.moma.org/collection/works/3,http://www.moma.org/media/W1siZiIsIjUyNzM3NCJd...,40.6401,29.8451,,,,,,
2,"Villa near Vienna Project, Outside Vienna, Aus...",[Emil Hoppe],[7605],"[Austrian, 1876–1957]",[Austrian],[1876],[1957],[Male],1903,"Graphite, pen, color pencil, ink, and gouache ...",...,http://www.moma.org/collection/works/4,http://www.moma.org/media/W1siZiIsIjUyNzM3NSJd...,34.3,31.8,,,,,,
3,"The Manhattan Transcripts Project, New York, N...",[Bernard Tschumi],[7056],"[French and Swiss, born Switzerland 1944]",[],[1944],[0],[Male],1980,Photographic reproduction with colored synthet...,...,http://www.moma.org/collection/works/5,http://www.moma.org/media/W1siZiIsIjUyNzQ3NCJd...,50.8,50.8,,,,,,
4,"Villa, project, outside Vienna, Austria, Exter...",[Emil Hoppe],[7605],"[Austrian, 1876–1957]",[Austrian],[1876],[1957],[Male],1903,"Graphite, color pencil, ink, and gouache on tr...",...,http://www.moma.org/collection/works/6,http://www.moma.org/media/W1siZiIsIjUyNzQ3NSJd...,38.4,19.1,,,,,,


In [7]:
moma_df_update=moma_df[['ObjectID', 'Title', 'Artist', 'Date', 'Medium', 'URL', 'ThumbnailURL']]

In [8]:
moma_df_update[['Medium']].shape

(140848, 1)

In [9]:
moma_df_update[['Medium']].isna().sum()

Medium    9631
dtype: int64

In [10]:
moma_df_update=moma_df_update[moma_df_update['Medium'].notna()]

In [11]:
moma_df_update.shape

(131217, 7)

In [12]:
moma_df_update=moma_df_update[moma_df_update['ThumbnailURL'].notna()]

In [13]:
moma_df_update.shape

(83317, 7)

In [16]:
moma_df_update=moma_df_update[moma_df_update['Date'].notna()]

In [17]:
moma_df_update.shape

(83246, 7)

In [18]:
moma_df_update['intDate']= moma_df_update['Date'].apply(lambda x: int(x[:4]) if len(x)>3 and x[:3].isdigit() else 0)

In [19]:
moma_df_update.head()

Unnamed: 0,ObjectID,Title,Artist,Date,Medium,URL,ThumbnailURL,intDate
0,2,"Ferdinandsbrücke Project, Vienna, Austria (Ele...",[Otto Wagner],1896,Ink and cut-and-pasted painted pages on paper,http://www.moma.org/collection/works/2,http://www.moma.org/media/W1siZiIsIjUyNzc3MCJd...,1896
1,3,"City of Music, National Superior Conservatory ...",[Christian de Portzamparc],1987,Paint and colored pencil on print,http://www.moma.org/collection/works/3,http://www.moma.org/media/W1siZiIsIjUyNzM3NCJd...,1987
2,4,"Villa near Vienna Project, Outside Vienna, Aus...",[Emil Hoppe],1903,"Graphite, pen, color pencil, ink, and gouache ...",http://www.moma.org/collection/works/4,http://www.moma.org/media/W1siZiIsIjUyNzM3NSJd...,1903
3,5,"The Manhattan Transcripts Project, New York, N...",[Bernard Tschumi],1980,Photographic reproduction with colored synthet...,http://www.moma.org/collection/works/5,http://www.moma.org/media/W1siZiIsIjUyNzQ3NCJd...,1980
4,6,"Villa, project, outside Vienna, Austria, Exter...",[Emil Hoppe],1903,"Graphite, color pencil, ink, and gouache on tr...",http://www.moma.org/collection/works/6,http://www.moma.org/media/W1siZiIsIjUyNzQ3NSJd...,1903


In [20]:
moma_df_update=moma_df_update[moma_df_update['intDate'].astype('int')>1900]

In [21]:
moma_df_update.head()

Unnamed: 0,ObjectID,Title,Artist,Date,Medium,URL,ThumbnailURL,intDate
1,3,"City of Music, National Superior Conservatory ...",[Christian de Portzamparc],1987,Paint and colored pencil on print,http://www.moma.org/collection/works/3,http://www.moma.org/media/W1siZiIsIjUyNzM3NCJd...,1987
2,4,"Villa near Vienna Project, Outside Vienna, Aus...",[Emil Hoppe],1903,"Graphite, pen, color pencil, ink, and gouache ...",http://www.moma.org/collection/works/4,http://www.moma.org/media/W1siZiIsIjUyNzM3NSJd...,1903
3,5,"The Manhattan Transcripts Project, New York, N...",[Bernard Tschumi],1980,Photographic reproduction with colored synthet...,http://www.moma.org/collection/works/5,http://www.moma.org/media/W1siZiIsIjUyNzQ3NCJd...,1980
4,6,"Villa, project, outside Vienna, Austria, Exter...",[Emil Hoppe],1903,"Graphite, color pencil, ink, and gouache on tr...",http://www.moma.org/collection/works/6,http://www.moma.org/media/W1siZiIsIjUyNzQ3NSJd...,1903
5,7,"The Manhattan Transcripts Project, New York, N...",[Bernard Tschumi],1976-77,Gelatin silver photograph,http://www.moma.org/collection/works/7,http://www.moma.org/media/W1siZiIsIjUyNzUyMCJd...,1976


In [22]:
moma_df_update.shape

(66400, 8)

In [23]:
moma_df_update.to_csv('moma_df_update.csv')

From here on, I read the csv to my code and try to download images from it. 

In [77]:
moma_img_df = pd.read_csv('moma_df_update.csv')

  moma_img_df = pd.read_csv('moma_df_update.csv')


In [78]:
def image_preprocessor(image):
    """
    takes an image and changes its size to 224x224 pixels for feeding it to the model.
    """
    resized_image = image.resize((224,224))
    return resized_image

In [83]:
moma_img_df.tail(10)

Unnamed: 0.1,Unnamed: 0,ObjectID,Title,Artist,Date,Medium,URL,ThumbnailURL,intDate
97034,139817,434779,Church Hill from the portfolio Our Present Inv...,['Stanley Wolukau-Wanambwa'],2012-14,Inkjet print,http://www.moma.org/collection/works/434779,http://www.moma.org/media/W1siZiIsIjUyMDYwOSJd...,2012.0
97035,139818,434780,Church Hill from the portfolio Our Present Inv...,['Stanley Wolukau-Wanambwa'],2012-14,Inkjet print,http://www.moma.org/collection/works/434780,http://www.moma.org/media/W1siZiIsIjUyMDYxMCJd...,2012.0
97036,139819,434781,Scott's Addition from the portfolio Our Presen...,['Stanley Wolukau-Wanambwa'],2012-14,Inkjet print,http://www.moma.org/collection/works/434781,http://www.moma.org/media/W1siZiIsIjUyMDYxMSJd...,2012.0
97037,139820,434782,W. Baker Street from the portfolio Our Present...,['Stanley Wolukau-Wanambwa'],2012-14,Inkjet print,http://www.moma.org/collection/works/434782,http://www.moma.org/media/W1siZiIsIjUyMDYxMiJd...,2012.0
97038,139821,434783,Southwood from the portfolio Our Present Inven...,['Stanley Wolukau-Wanambwa'],2012-14,Inkjet print,http://www.moma.org/collection/works/434783,http://www.moma.org/media/W1siZiIsIjUyMDYxMyJd...,2012.0
97039,139822,434784,Colonial Heights from the portfolio Our Presen...,['Stanley Wolukau-Wanambwa'],2012-14,Inkjet print,http://www.moma.org/collection/works/434784,http://www.moma.org/media/W1siZiIsIjUyMDYxNCJd...,2012.0
97040,139823,434785,Swift Creek from the portfolio Our Present Inv...,['Stanley Wolukau-Wanambwa'],2012-14,Inkjet print,http://www.moma.org/collection/works/434785,http://www.moma.org/media/W1siZiIsIjUyMDYxNSJd...,2012.0
97041,139824,434786,Briarcliffe Court from the portfolio Our Prese...,['Stanley Wolukau-Wanambwa'],2012-14,Inkjet print,http://www.moma.org/collection/works/434786,http://www.moma.org/media/W1siZiIsIjUyMDYxNiJd...,2012.0
97042,139825,434787,Dupuy Road from the portfolio Our Present Inve...,['Stanley Wolukau-Wanambwa'],2012-14,Inkjet print,http://www.moma.org/collection/works/434787,http://www.moma.org/media/W1siZiIsIjUyMDYxNyJd...,2012.0
97043,139856,435713,Untitled,['American Artist'],2019,Curtains and bleachers\r\n,http://www.moma.org/collection/works/435713,http://www.moma.org/media/W1siZiIsIjUyMTk0NyJd...,2019.0


In [127]:
moma_img_df.sample(20)

Unnamed: 0.1,Unnamed: 0,ObjectID,Title,Artist,Date,Medium,URL,ThumbnailURL,intDate
1693,2848,3723,Egg Cups,['Wilhelm Wagenfeld'],1938,Pressed glass,http://www.moma.org/collection/works/3723,http://www.moma.org/media/W1siZiIsIjIxMTIyNCJd...,1938.0
60016,81188,89303,New York City Transit Authority,,,,,,
30876,74304,80000,Gas,['Edward Hopper'],1940,Oil on canvas,http://www.moma.org/collection/works/80000,http://www.moma.org/media/W1siZiIsIjQ1NzQxNyJd...,1940.0
53282,81188,89303,New York City Transit Authority,,,,,,
92427,132090,284066,The Seven Ways to Achieve Composure,['Jerome Neuner'],2007,Artist's book,http://www.moma.org/collection/works/284066,http://www.moma.org/media/W1siZiIsIjQzMDc5OCJd...,2007.0
86644,120443,187574,MaKey MaKey,"['Jay Silver', 'Eric Rosenbaum']",2012-2014,Electronic components,http://www.moma.org/collection/works/187574,http://www.moma.org/media/W1siZiIsIjMxMTY0NSJd...,2012.0
9786,32612,34199,Seated Nude,['Fernand Léger'],1913,Ink on paper,http://www.moma.org/collection/works/34199,http://www.moma.org/media/W1siZiIsIjUyMDI4MyJd...,1913.0
45547,81188,89303,New York City Transit Authority,,,,,,
15721,45702,48469,Spain,['Josef Koudelka'],1971,Gelatin silver print,http://www.moma.org/collection/works/48469,http://www.moma.org/media/W1siZiIsIjUyMDE1NCJd...,1971.0
96363,138710,423400,Studies for an Antifascist Memorial,['Bogdan Bogdanović'],1960–1970,Ink on mylar,http://www.moma.org/collection/works/423400,http://www.moma.org/media/W1siZiIsIjUwMDQ5OSJd...,1960.0


In [31]:
url=moma_img_df.iloc[25, 7]

In [32]:
url

'http://www.moma.org/media/W1siZiIsIjUyNzU0MSJdLFsicCIsImNvbnZlcnQiLCItcmVzaXplIDMwMHgzMDBcdTAwM2UiXV0.jpg?sha=67be608d42bd645a'

In [20]:
import matplotlib.pyplot as plt

Gunthers Code:

In [80]:
# url = 'https://www.moma.org/media/W1siZiIsIjIyNjUxMiJdLFsicCIsImNvbnZlcnQiLCItcmVzaXplIDMwMHgzMDBcdTAwM2UiXV0.jpg?sha=481b05786494f6eb'

file_name = url.split('/')[-1].split('?')[0]

response = requests.get(
    url,
    headers={
        'authority': 'www.moma.org',
        'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
        'accept-language': 'en-GB,en;q=0.9',
        'cache-control': 'no-cache',
        'cookie': '_gorilla_csrf=MTY3ODM2NDY3NnxJa2xQYkV0b1NrSnJjaXREVnpWMGQwVTVMemx2TjFKdFNYWk9SREF4ZDFSc0syOXNhWFV5V1hONUszTTlJZ289fDzirBgF_op8ZBg9OrQRosZs96_OoyzzTRK4N2Z1Nm9U; viewedCookieBanner=true; sessionHighlightColor=0; global=MTY3ODM2NTA5NHxOd3dBTkVneVVWRkJXRmhZVGsxRE4wbGFWMUpPVUZCWFNUTTBWa3hCUWxkTlFVUmFVVnBJU1V4RFMwaENXRkkxV2swME5Vb3pXbEU9fMTTVMx8jUCEMc0rOJAX_SI4D3aqPgtr-geehbeVT9Pr',
        'dnt': '1',
        'pragma': 'no-cache',
        'sec-ch-ua': '"Chromium";v="110", "Not A(Brand";v="24", "Google Chrome";v="110"',
        'sec-ch-ua-mobile': '?0',
        'sec-ch-ua-platform': '"macOS"',
        'sec-fetch-dest': 'document',
        'sec-fetch-mode': 'navigate',
        'sec-fetch-site': 'none',
        'sec-fetch-user': '?1',
        'upgrade-insecure-requests': '1',
        'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Safari/537.36'
    }
)

# print status code
# print(response.status_code)
# save image
with open(file_name, 'wb') as f:
    f.write(response.content)

In [81]:
# @sleep_and_retry
# @limits(calls=CALLS, period=RATE_LIMIT)
def momaimage_downloader(df):
    """
    takes the path to a csv-file and downloads the images from the URLs provided in its imageURL-column.
    Then saves the downloaded images to the output path.
    """
    for index, row in df[['ObjectID', 'ThumbnailURL']].iterrows():
        id= row['ObjectID']
        if pd.notna(row['ThumbnailURL']):
            url= row['ThumbnailURL']
            file_name= f'{id}.jpg'            
            response = requests.get(
            url,
            headers={
                'authority': 'www.moma.org',
                'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
                'accept-language': 'en-GB,en;q=0.9',
                'cache-control': 'no-cache',
                'cookie': '_gorilla_csrf=MTY3ODM2NDY3NnxJa2xQYkV0b1NrSnJjaXREVnpWMGQwVTVMemx2TjFKdFNYWk9SREF4ZDFSc0syOXNhWFV5V1hONUszTTlJZ289fDzirBgF_op8ZBg9OrQRosZs96_OoyzzTRK4N2Z1Nm9U; viewedCookieBanner=true; sessionHighlightColor=0; global=MTY3ODM2NTA5NHxOd3dBTkVneVVWRkJXRmhZVGsxRE4wbGFWMUpPVUZCWFNUTTBWa3hCUWxkTlFVUmFVVnBJU1V4RFMwaENXRkkxV2swME5Vb3pXbEU9fMTTVMx8jUCEMc0rOJAX_SI4D3aqPgtr-geehbeVT9Pr',
                'dnt': '1',
                'pragma': 'no-cache',
                'sec-ch-ua': '"Chromium";v="110", "Not A(Brand";v="24", "Google Chrome";v="110"',
                'sec-ch-ua-mobile': '?0',
                'sec-ch-ua-platform': '"macOS"',
                'sec-fetch-dest': 'document',
                'sec-fetch-mode': 'navigate',
                'sec-fetch-site': 'none',
                'sec-fetch-user': '?1',
                'upgrade-insecure-requests': '1',
                'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Safari/537.36'
            }
            )

            if response.status_code == 200:
                image = Image.open(BytesIO(response.content))
                resized_image= image_preprocessor(image)
                file_name= f'{id}.jpg'
                resized_image.save(f"images_moma/{file_name}")
        if index%50==0:
            print(f'{index} images downloaded.')
        
        

To start downloading again where we stopped, I use this code: 

image 89302.jpg is the last image that corresponds to an index in the dataframe!

In [138]:
moma_img_df.shape

(97044, 9)

In [137]:
selected_row = moma_img_df.loc[moma_img_df['ObjectID'] == 106758]
selected_index = selected_row.index
selected_index
selected_row
#print(selected_index[0])

Unnamed: 0.1,Unnamed: 0,ObjectID,Title,Artist,Date,Medium,URL,ThumbnailURL,intDate


In [134]:
new_df=moma_img_df.loc[selected_index[0]:]

Unnamed: 0.1,Unnamed: 0,ObjectID,Title,Artist,Date,Medium,URL,ThumbnailURL,intDate
34891,81187,89302,OPPOSITIONS 3,"['Massimo Vignelli', 'Lella Vignelli']",1974.0,Lithograph with screenprinted cover,http://www.moma.org/collection/works/89302,http://www.moma.org/media/W1siZiIsIjE3MjIwNyJd...,1974.0
34892,81188,89303,New York City Transit Authority,,,,,,
34893,81188,89303,New York City Transit Authority,,,,,,
34894,81188,89303,New York City Transit Authority,,,,,,
34895,81188,89303,New York City Transit Authority,,,,,,
34896,81188,89303,New York City Transit Authority,,,,,,
34897,81188,89303,New York City Transit Authority,,,,,,
34898,81188,89303,New York City Transit Authority,,,,,,
34899,81188,89303,New York City Transit Authority,,,,,,
34900,81188,89303,New York City Transit Authority,,,,,,


In [135]:
momaimage_downloader(new_df)

34900 images downloaded.
34950 images downloaded.
35000 images downloaded.
35050 images downloaded.
35100 images downloaded.
35150 images downloaded.
35200 images downloaded.
35250 images downloaded.
35300 images downloaded.
35350 images downloaded.
35400 images downloaded.
35450 images downloaded.
35500 images downloaded.
35550 images downloaded.
35600 images downloaded.
35650 images downloaded.
35700 images downloaded.
35750 images downloaded.
35800 images downloaded.
35850 images downloaded.
35900 images downloaded.
35950 images downloaded.
36000 images downloaded.
36050 images downloaded.
36100 images downloaded.
36150 images downloaded.
36200 images downloaded.
36250 images downloaded.
36300 images downloaded.
36350 images downloaded.
36400 images downloaded.
36450 images downloaded.
36500 images downloaded.
36550 images downloaded.
36600 images downloaded.
36650 images downloaded.
36700 images downloaded.
36750 images downloaded.
36800 images downloaded.
36850 images downloaded.


52600 images downloaded.
52650 images downloaded.
52700 images downloaded.
52750 images downloaded.
52800 images downloaded.
52850 images downloaded.
52900 images downloaded.
52950 images downloaded.
53000 images downloaded.
53050 images downloaded.
53100 images downloaded.
53150 images downloaded.
53200 images downloaded.
53250 images downloaded.
53300 images downloaded.
53350 images downloaded.
53400 images downloaded.
53450 images downloaded.
53500 images downloaded.
53550 images downloaded.
53600 images downloaded.
53650 images downloaded.
53700 images downloaded.
53750 images downloaded.
53800 images downloaded.
53850 images downloaded.
53900 images downloaded.
53950 images downloaded.
54000 images downloaded.
54050 images downloaded.
54100 images downloaded.
54150 images downloaded.
54200 images downloaded.
54250 images downloaded.
54300 images downloaded.
54350 images downloaded.
54400 images downloaded.
54450 images downloaded.
54500 images downloaded.
54550 images downloaded.


KeyboardInterrupt: 