In [42]:
import requests
from PIL import Image
import pandas as pd
import os
from io import BytesIO
import numpy as np
from ratelimit import limits, sleep_and_retry

In [53]:
moma_data = pd.read_json('moma_artworks.json')

In [65]:
moma_df=pd.DataFrame(moma_data)

In [66]:
moma_df.shape

(140848, 28)

In [67]:
moma_df.columns

Index(['Title', 'Artist', 'ConstituentID', 'ArtistBio', 'Nationality',
       'BeginDate', 'EndDate', 'Gender', 'Date', 'Medium', 'Dimensions',
       'CreditLine', 'AccessionNumber', 'Classification', 'Department',
       'DateAcquired', 'Cataloged', 'ObjectID', 'URL', 'ThumbnailURL',
       'Height (cm)', 'Width (cm)', 'Depth (cm)', 'Weight (kg)',
       'Diameter (cm)', 'Length (cm)', 'Circumference (cm)',
       'Duration (sec.)'],
      dtype='object')

In [75]:
moma_df.head()

Unnamed: 0,Title,Artist,ConstituentID,ArtistBio,Nationality,BeginDate,EndDate,Gender,Date,Medium,...,URL,ThumbnailURL,Height (cm),Width (cm),Depth (cm),Weight (kg),Diameter (cm),Length (cm),Circumference (cm),Duration (sec.)
0,"Ferdinandsbrücke Project, Vienna, Austria (Ele...",[Otto Wagner],[6210],"[Austrian, 1841–1918]",[Austrian],[1841],[1918],[Male],1896,Ink and cut-and-pasted painted pages on paper,...,http://www.moma.org/collection/works/2,http://www.moma.org/media/W1siZiIsIjUyNzc3MCJd...,48.6,168.9,,,,,,
1,"City of Music, National Superior Conservatory ...",[Christian de Portzamparc],[7470],"[French, born 1944]",[French],[1944],[0],[Male],1987,Paint and colored pencil on print,...,http://www.moma.org/collection/works/3,http://www.moma.org/media/W1siZiIsIjUyNzM3NCJd...,40.6401,29.8451,,,,,,
2,"Villa near Vienna Project, Outside Vienna, Aus...",[Emil Hoppe],[7605],"[Austrian, 1876–1957]",[Austrian],[1876],[1957],[Male],1903,"Graphite, pen, color pencil, ink, and gouache ...",...,http://www.moma.org/collection/works/4,http://www.moma.org/media/W1siZiIsIjUyNzM3NSJd...,34.3,31.8,,,,,,
3,"The Manhattan Transcripts Project, New York, N...",[Bernard Tschumi],[7056],"[French and Swiss, born Switzerland 1944]",[],[1944],[0],[Male],1980,Photographic reproduction with colored synthet...,...,http://www.moma.org/collection/works/5,http://www.moma.org/media/W1siZiIsIjUyNzQ3NCJd...,50.8,50.8,,,,,,
4,"Villa, project, outside Vienna, Austria, Exter...",[Emil Hoppe],[7605],"[Austrian, 1876–1957]",[Austrian],[1876],[1957],[Male],1903,"Graphite, color pencil, ink, and gouache on tr...",...,http://www.moma.org/collection/works/6,http://www.moma.org/media/W1siZiIsIjUyNzQ3NSJd...,38.4,19.1,,,,,,


In [86]:
moma_df[['Medium']].shape

(140848, 1)

In [112]:
isnull_var= pd.isnull(moma_df_update['Medium'])

In [97]:
moma_df_update=moma_df[['ObjectID', 'Title', 'Artist', 'Date', 'Medium', 'URL', 'ThumbnailURL']]

moma_df_update=moma_df_update[moma_df_update['Medium']!= None]


In [98]:
moma_df_update.shape

(140848, 7)

In [80]:
moma_df_update[moma_df_update['Medium'].str.contains('paint')]

ValueError: Cannot mask with non-boolean array containing NA / NaN values

In [None]:
CALLS = 79
RATE_LIMIT = 1

@sleep_and_retry
@limits(calls=CALLS, period=RATE_LIMIT)
def imagelink_collector(df):
    """
    adds the 'imageURL' column to the input dataframe.
    """

    df['imageURL'] = ''
    for index, row in df.iterrows():

        result = requests.get(f"https://collectionapi.metmuseum.org/public/collection/v1/objects/{row['objectID']}").json()

        if 'primaryImage' in result and result['primaryImage']:
            df.loc[index, 'imageURL'] = result['primaryImage']

        if index%50 == 0:
            print(f"processing row {index}")
    #dropnas
    df['imageURL'].replace('', np.nan, inplace=True)
    df = df[df['imageURL'].notna()]

    return df

In [None]:
def imagelink_csv_maker(df):
    """
    takes a dataframe that includes the imageURL-column and saves it as a csv-file on the hard drive.
    """
    df.to_csv('image_links.csv')

In [None]:
def image_preprocessor(image):
    """
    takes an image and changes its size to 224x224 pixels for feeding it to the model.
    """
    resized_image = image.resize((224,224))
    return resized_image

In [None]:
@sleep_and_retry
@limits(calls=CALLS, period=RATE_LIMIT)
def image_downloader(input_path, output_path):
    """
    takes the path to a csv-file and downloads the images from the URLs provided in its imageURL-column.
    Then saves the downloaded images to the output path.
    """
    df= pd.read_csv(input_path)
    for index, row in df[['objectID', 'imageURL']].iterrows():
        id= row['objectID']
        url= row['imageURL']

        response = requests.get(url)
        if response.status_code == 200:
            image = Image.open(BytesIO(response.content))
            resized_image= image_preprocessor(image)
            file_name= f'{id}.jpg'
            resized_image.save(f"{output_path}/{file_name}")

In [3]:
df = pd.read_json('moma_artworks.json')

In [4]:
df.head()

Unnamed: 0,Title,Artist,ConstituentID,ArtistBio,Nationality,BeginDate,EndDate,Gender,Date,Medium,...,URL,ThumbnailURL,Height (cm),Width (cm),Depth (cm),Weight (kg),Diameter (cm),Length (cm),Circumference (cm),Duration (sec.)
0,"Ferdinandsbrücke Project, Vienna, Austria (Ele...",[Otto Wagner],[6210],"[Austrian, 1841–1918]",[Austrian],[1841],[1918],[Male],1896,Ink and cut-and-pasted painted pages on paper,...,http://www.moma.org/collection/works/2,http://www.moma.org/media/W1siZiIsIjUyNzc3MCJd...,48.6,168.9,,,,,,
1,"City of Music, National Superior Conservatory ...",[Christian de Portzamparc],[7470],"[French, born 1944]",[French],[1944],[0],[Male],1987,Paint and colored pencil on print,...,http://www.moma.org/collection/works/3,http://www.moma.org/media/W1siZiIsIjUyNzM3NCJd...,40.6401,29.8451,,,,,,
2,"Villa near Vienna Project, Outside Vienna, Aus...",[Emil Hoppe],[7605],"[Austrian, 1876–1957]",[Austrian],[1876],[1957],[Male],1903,"Graphite, pen, color pencil, ink, and gouache ...",...,http://www.moma.org/collection/works/4,http://www.moma.org/media/W1siZiIsIjUyNzM3NSJd...,34.3,31.8,,,,,,
3,"The Manhattan Transcripts Project, New York, N...",[Bernard Tschumi],[7056],"[French and Swiss, born Switzerland 1944]",[],[1944],[0],[Male],1980,Photographic reproduction with colored synthet...,...,http://www.moma.org/collection/works/5,http://www.moma.org/media/W1siZiIsIjUyNzQ3NCJd...,50.8,50.8,,,,,,
4,"Villa, project, outside Vienna, Austria, Exter...",[Emil Hoppe],[7605],"[Austrian, 1876–1957]",[Austrian],[1876],[1957],[Male],1903,"Graphite, color pencil, ink, and gouache on tr...",...,http://www.moma.org/collection/works/6,http://www.moma.org/media/W1siZiIsIjUyNzQ3NSJd...,38.4,19.1,,,,,,


In [13]:
df.shape

(140848, 28)

In [14]:
df.columns

Index(['Title', 'Artist', 'ConstituentID', 'ArtistBio', 'Nationality',
       'BeginDate', 'EndDate', 'Gender', 'Date', 'Medium', 'Dimensions',
       'CreditLine', 'AccessionNumber', 'Classification', 'Department',
       'DateAcquired', 'Cataloged', 'ObjectID', 'URL', 'ThumbnailURL',
       'Height (cm)', 'Width (cm)', 'Depth (cm)', 'Weight (kg)',
       'Diameter (cm)', 'Length (cm)', 'Circumference (cm)',
       'Duration (sec.)'],
      dtype='object')

In [21]:
df.iloc[0, 18]

'http://www.moma.org/collection/works/2'

Trying to download images from MOMA the old way: 

In [22]:
url= df.iloc[0, 18]
response = requests.get(url)
if response.status_code == 200:
    image = Image.open(BytesIO(response.content))
    #resized_image= image_preprocessor(image)
    file_name='TEST.jpg'
    image.save(file_name)
    


In [32]:
url= df.iloc[0, 18]
response = requests.get(url)
response

<Response [403]>

In [37]:
import urllib.request
req = urllib.request.Request(url, headers={'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'})

with urllib.request.urlopen(req) as response:
    img_data = response.read()

# Open the image from the data and save it to a file
image = Image.open(BytesIO(img_data))
image.save('TEST.jpg')


#img_data = urllib.request.urlopen(response).read()
#with open('TEST.jpg', 'wb') as f:
#    f.write(img_data.content)

UnidentifiedImageError: cannot identify image file <_io.BytesIO object at 0x7f882a431c60>