In [1]:
import os
import random
import pandas as pd
import urllib.request
from bs4 import BeautifulSoup

In [2]:
import socket
socket.setdefaulttimeout(30)

In [30]:
ARTS_LIST = 'query_results.csv'
DIR = 'data'

ID, CULTURE, URL = range(0, 3)

In [8]:
with open(ARTS_LIST) as f:
    arts_to_download = f.readlines()
    arts_to_download = [line.strip() for line in arts_to_download]

In [14]:
data = []

for line in arts_to_download:
    object_id, *culture, url = line.split(',')
    culture = ' '.join(culture)
    data.append((object_id, culture, url))

In [15]:
data[-3:]

[('157369',
  'American or European',
  'http://www.metmuseum.org/art/collection/search/157369'),
 ('81727',
  'American or European',
  'http://www.metmuseum.org/art/collection/search/81727'),
 ('106645',
  'American or European',
  'http://www.metmuseum.org/art/collection/search/106645')]

In [17]:
data_df = pd.DataFrame(data, columns=['object_id', 'culture', 'url'])
data_df.tail()

Unnamed: 0,object_id,culture,url
245,157610,American or European,http://www.metmuseum.org/art/collection/search...
246,157668,American or European,http://www.metmuseum.org/art/collection/search...
247,157369,American or European,http://www.metmuseum.org/art/collection/search...
248,81727,American or European,http://www.metmuseum.org/art/collection/search...
249,106645,American or European,http://www.metmuseum.org/art/collection/search...


In [19]:
data_df.groupby('culture').count().sort_values(by='object_id', ascending=False)

Unnamed: 0_level_0,object_id,url
culture,Unnamed: 1_level_1,Unnamed: 2_level_1
Japan,125,125
American or European,61,61
China,50,50
Tibet,3,3
"""French, Paris""",2,2
Indian (Gujerat),2,2
Japan (Ainu),2,2
"""Japan (Okinawa, Ry奴ky奴 Islands)""",1,1
French,1,1
Japan (Kyoto),1,1


In [20]:
test_url = data[-1][-1]
test_url

'http://www.metmuseum.org/art/collection/search/106645'

In [23]:
ua_list = [
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv2.0.1) Gecko/20100101 Firefox/4.0.1",
    "Mozilla/5.0 (Windows NT 6.1; rv2.0.1) Gecko/20100101 Firefox/4.0.1",
    "Opera/9.8.0 (Macintosh; Intel Mac OS X 10.6.8; U; en) Presto/2.8.131 Version/11.11",
    "Opera/9.8.0 (Windows NT 6.1; U; en) Presto/2.8.131 Version/11.11",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 (LHTML, like Gecko) Chrome/17"
]

user_agent = random.choice(ua_list)
request = urllib.request.Request(test_url)
request.add_header("User-Agent", user_agent)

response = urllib.request.urlopen(request, timeout=5)
encoding = response.headers.get_content_charset()

In [24]:
html = response.read().decode(encoding)
soup = BeautifulSoup(html)

In [25]:
finds_list = soup.find_all("a", class_="gtm__download__image")

In [26]:
for a in finds_list:
    print(a['href'])

https://images.metmuseum.org/CRDImages/ci/original/C.I.39.112.3_F.jpg


In [27]:
def get_download_link(art_page):
    '''
    find the download links of the art given the museum page
    
    :type str: art_page
    :rtype list of str
    '''
    
    ua_list = [
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv2.0.1) Gecko/20100101 Firefox/4.0.1",
        "Mozilla/5.0 (Windows NT 6.1; rv2.0.1) Gecko/20100101 Firefox/4.0.1",
        "Opera/9.8.0 (Macintosh; Intel Mac OS X 10.6.8; U; en) Presto/2.8.131 Version/11.11",
        "Opera/9.8.0 (Windows NT 6.1; U; en) Presto/2.8.131 Version/11.11",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 (LHTML, like Gecko) Chrome/17"
    ]
    
    user_agent = random.choice(ua_list)
    request = urllib.request.Request(art_page)
    request.add_header("User-Agent", user_agent)

    try:
        response = urllib.request.urlopen(request, timeout=5)
        encoding = response.headers.get_content_charset()
    
        if not encoding:
            return []
    
        html = response.read().decode(encoding)
        soup = BeautifulSoup(html)
        finds_list = soup.find_all("a", class_="gtm__download__image")
    
        return [a['href'] for a in finds_list]
    
    except:
        return []

In [28]:
data[-1][-1]

'http://www.metmuseum.org/art/collection/search/106645'

In [29]:
get_download_link(data[-1][-1])

[]

In [35]:
for index, row in enumerate(data[88:]):
    print("processing image {} ...".format(index))
    i = 0
    
    for img_link in get_download_link(row[URL]):
        print('image linked retrived, ', img_link)
        
        save_folder = os.path.join(DIR, row[CULTURE])
        file_name = os.path.join(save_folder, "{}-{}.jpg".format(row[ID], i))
        if not os.path.exists(save_folder):
            os.makedirs(save_folder)
        print(file_name)
    
        try:
            urllib.request.urlretrieve(img_link, file_name)
            print('file saved.')
            print('-' * 20)
            i += 1
        
        except:
            pass
        

processing image 0 ...
image linked retrived,  https://images.metmuseum.org/CRDImages/as/original/RT751.jpg
data\Japan\44946-0.jpg
processing image 1 ...
image linked retrived,  https://images.metmuseum.org/CRDImages/as/original/DP330788.jpg
data\Japan\45387-0.jpg
file saved.
--------------------
processing image 2 ...
image linked retrived,  https://images.metmuseum.org/CRDImages/as/original/184622.jpg
data\Japan\55963-0.jpg
file saved.
--------------------
processing image 3 ...
image linked retrived,  https://images.metmuseum.org/CRDImages/as/original/178214.jpg
data\Japan\55965-0.jpg
file saved.
--------------------
processing image 4 ...
processing image 5 ...
image linked retrived,  https://images.metmuseum.org/CRDImages/as/original/DT204734.jpg
data\Japan\50803-0.jpg
file saved.
--------------------
processing image 6 ...
image linked retrived,  https://images.metmuseum.org/CRDImages/as/original/DT11467.jpg
data\Japan\55960-0.jpg
file saved.
--------------------
processing image

file saved.
--------------------
processing image 54 ...
image linked retrived,  https://images.metmuseum.org/CRDImages/as/original/LC-2001_428_39-001.jpg
data\Japan\61834-0.jpg
file saved.
--------------------
processing image 55 ...
image linked retrived,  https://images.metmuseum.org/CRDImages/as/original/LC-2001_428_40-001.jpg
data\Japan\61835-0.jpg
file saved.
--------------------
processing image 56 ...
processing image 57 ...
image linked retrived,  https://images.metmuseum.org/CRDImages/as/original/LC-2001_428_43-001.jpg
data\Japan\61838-0.jpg
file saved.
--------------------
processing image 58 ...
image linked retrived,  https://images.metmuseum.org/CRDImages/as/original/LC-2001_428_44-001.jpg
data\Japan\61839-0.jpg
file saved.
--------------------
processing image 59 ...
image linked retrived,  https://images.metmuseum.org/CRDImages/as/original/LC-2001_428_45-001.jpg
data\Japan\61840-0.jpg
processing image 60 ...
image linked retrived,  https://images.metmuseum.org/CRDImages

image linked retrived,  https://images.metmuseum.org/CRDImages/ci/original/CI49.33_F.jpg
data\American or European\106964-0.jpg
file saved.
--------------------
processing image 112 ...
image linked retrived,  https://images.metmuseum.org/CRDImages/ci/original/CI52.44_F.jpg
data\American or European\106966-0.jpg
file saved.
--------------------
processing image 113 ...
image linked retrived,  https://images.metmuseum.org/CRDImages/ci/original/CI55.48_F.jpg
data\American or European\84722-0.jpg
file saved.
--------------------
processing image 114 ...
image linked retrived,  https://images.metmuseum.org/CRDImages/ci/original/1980.145.4_F.jpg
data\American or European\91982-0.jpg
file saved.
--------------------
processing image 115 ...
image linked retrived,  https://images.metmuseum.org/CRDImages/ci/original/DT412.jpg
data\American or European\81107-0.jpg
file saved.
--------------------
processing image 116 ...
image linked retrived,  https://images.metmuseum.org/CRDImages/ci/original

processing image 157 ...
image linked retrived,  https://images.metmuseum.org/CRDImages/ci/original/49.212a-b_CP4.jpg
data\American or European\157610-0.jpg
file saved.
--------------------
processing image 158 ...
image linked retrived,  https://images.metmuseum.org/CRDImages/ci/original/55.207.43a-b_front_CP4.jpg
data\American or European\157668-0.jpg
file saved.
--------------------
processing image 159 ...
image linked retrived,  https://images.metmuseum.org/CRDImages/ci/original/65.181.3_CP1.jpg
data\American or European\157369-0.jpg
file saved.
--------------------
processing image 160 ...
image linked retrived,  https://images.metmuseum.org/CRDImages/ci/original/C.I.69.24.1_F.jpg
data\American or European\81727-0.jpg
processing image 161 ...
image linked retrived,  https://images.metmuseum.org/CRDImages/ci/original/C.I.39.112.3_F.jpg
data\American or European\106645-0.jpg
file saved.
--------------------
