In [1]:
import requests

def get_image_urls(url, items=[]):
    '''
    Retrieves the image URLs for items that have public URLs available. 
    Skips over items that are for the colletion as a whole or web pages about the collection.
    Handles pagination. 
    '''
    
    
    # @TODO clean out anomalies
    # ex: {
    #  'work': 'http://www.loc.gov/item/48036706/',
    #  'volume': 'https://www.loc.gov/resource/muspre1800.101266/'
    # }
    # Also: 'https://www.loc.gov/resource/rbctos.2017rosen1137/'
    
    # request pages of 100 results at a time
    params = {"fo": "json", "c": 100, "at": "results,pagination"}
    call = requests.get(url, params=params)
    data = call.json()
    results = data['results']
    
#     print(results)
    
    for result in results:
        if "collection" not in result.get("original_format") and "web page" not in result.get("original_format"):
            
            id = result.get("id");
            for segment in result.get("segments") or result.get("resources"):
                
                image_list = segment["url"].split('/')[-2].split('.')
    
                cdn = f"https://cdn.loc.gov/service/rbc/{image_list[0]}/{image_list[1][:4]}/{image_list[1]}/"
                
                item = {"work" : id, "volume" : segment["url"], "cdn" : cdn }
                items.append(item)     
                            

    if data["pagination"]["next"] is not None: # make sure we haven't hit the end of the pages
        next_url = data["pagination"]["next"]
        print("getting next page: {0}".format(next_url))
        get_image_urls(next_url, items)
        
    print(items)
    return items


In [2]:
# ?sp=2
work_volume_list = get_image_urls("https://www.loc.gov/collections/lessing-j-rosenwald/", items=[])

getting next page: https://www.loc.gov/collections/lessing-j-rosenwald/?at=results,pagination&c=100&fo=json&sp=2
getting next page: https://www.loc.gov/collections/lessing-j-rosenwald/?at=results,pagination&c=100&fo=json&sp=3
getting next page: https://www.loc.gov/collections/lessing-j-rosenwald/?at=results,pagination&c=100&fo=json&sp=4
[{'work': 'http://www.loc.gov/item/51006716/', 'volume': 'https://www.loc.gov/resource/rbc0001.2009rosen1938v3/', 'cdn': 'https://cdn.loc.gov/service/rbc/rbc0001/2009/2009rosen1938v3/'}, {'work': 'http://www.loc.gov/item/51006716/', 'volume': 'https://www.loc.gov/resource/rbc0001.2009rosen1938v1/', 'cdn': 'https://cdn.loc.gov/service/rbc/rbc0001/2009/2009rosen1938v1/'}, {'work': 'http://www.loc.gov/item/51006716/', 'volume': 'https://www.loc.gov/resource/rbc0001.2009rosen1938v4/', 'cdn': 'https://cdn.loc.gov/service/rbc/rbc0001/2009/2009rosen1938v4/'}, {'work': 'http://www.loc.gov/item/51006716/', 'volume': 'https://www.loc.gov/resource/rbc0001.2009rose

In [3]:
len(work_volume_list)

357

In [4]:
work_volume_list[:5]

[{'work': 'http://www.loc.gov/item/51006716/',
  'volume': 'https://www.loc.gov/resource/rbc0001.2009rosen1938v3/',
  'cdn': 'https://cdn.loc.gov/service/rbc/rbc0001/2009/2009rosen1938v3/'},
 {'work': 'http://www.loc.gov/item/51006716/',
  'volume': 'https://www.loc.gov/resource/rbc0001.2009rosen1938v1/',
  'cdn': 'https://cdn.loc.gov/service/rbc/rbc0001/2009/2009rosen1938v1/'},
 {'work': 'http://www.loc.gov/item/51006716/',
  'volume': 'https://www.loc.gov/resource/rbc0001.2009rosen1938v4/',
  'cdn': 'https://cdn.loc.gov/service/rbc/rbc0001/2009/2009rosen1938v4/'},
 {'work': 'http://www.loc.gov/item/51006716/',
  'volume': 'https://www.loc.gov/resource/rbc0001.2009rosen1938v2/',
  'cdn': 'https://cdn.loc.gov/service/rbc/rbc0001/2009/2009rosen1938v2/'},
 {'work': 'http://www.loc.gov/item/65059076/',
  'volume': 'https://www.loc.gov/resource/rbctos.2017rosen1137/',
  'cdn': 'https://cdn.loc.gov/service/rbc/rbctos/2017/2017rosen1137/'}]

In [5]:
# change to download somewhere other than inside this directory
!mkdir images

mkdir: cannot create directory ‘images’: File exists


In [6]:
import os
from requests.exceptions import HTTPError

# https://cdn.loc.gov/service/rbc/rbc0001/2009/2009rosen1938v3/0001v.jpg
# https://cdn.loc.gov/service/rbc/rbc0001/2009/2009rosen1938v3/0002v.jpg

def get_image_files(image_urls_list, dir_name):
    '''
    Takes as input a list of URLs for loc.gov item pages and 
    a path to a directory in which to save image files, e.g. "data". 
    '''    
    for count, url in enumerate(image_urls_list):
                
        if count % 100 == 0:
            print("at item {0}".format(count))
        
        # for each volume, initialize first image
        i = 1
        
        # function checks if image exists
        def url_validator(url):
            try:
                r = requests.get(url)
                r.raise_for_status()
                
            except HTTPError:
                
                return "404"
        
        # assume each volume has at least one image, "0001"
        is_valid_url = "200"
            
        while (is_valid_url != "404"):
                         
            try:
                # create a string in format "0001", ..."0254"
                image = str(i).zfill(4)

                # http://cdn.loc.gov/service/rbc/rbc0001/2009/2009rosen1938v3/0254v.jpg
                full_url = f"{url['cdn']}{image}v.jpg"

                # do some ugly stuff to generate an image name for download
                string_array = full_url.split("/")
                
                image_name = "-".join(string_array[5:])

                filename = os.path.join(dir_name, image_name)       

                # @TODO refactor to use image_response to check url validity
                image_response = requests.get(full_url, stream=True)
                
                with open(filename, 'wb') as fd:
                    for chunk in image_response.iter_content(chunk_size=100000):
                        fd.write(chunk)
                
                # increment image number
                i += 1
                
                # don't repeat yourself!
                image = str(i).zfill(4)

                full_url = f"{url['cdn']}{image}v.jpg"
                
                # check if the new image url is valid
                is_valid_url = url_validator(full_url)
            
                print(full_url)

            except ConnectionError as e:
                print(e)

In [7]:
get_image_files(work_volume_list, "images")

at item 0
https://cdn.loc.gov/service/rbc/rbc0001/2009/2009rosen1938v3/0002v.jpg
https://cdn.loc.gov/service/rbc/rbc0001/2009/2009rosen1938v3/0003v.jpg
https://cdn.loc.gov/service/rbc/rbc0001/2009/2009rosen1938v3/0004v.jpg
https://cdn.loc.gov/service/rbc/rbc0001/2009/2009rosen1938v3/0005v.jpg
https://cdn.loc.gov/service/rbc/rbc0001/2009/2009rosen1938v3/0006v.jpg
https://cdn.loc.gov/service/rbc/rbc0001/2009/2009rosen1938v3/0007v.jpg
https://cdn.loc.gov/service/rbc/rbc0001/2009/2009rosen1938v3/0008v.jpg
https://cdn.loc.gov/service/rbc/rbc0001/2009/2009rosen1938v3/0009v.jpg
https://cdn.loc.gov/service/rbc/rbc0001/2009/2009rosen1938v3/0010v.jpg
https://cdn.loc.gov/service/rbc/rbc0001/2009/2009rosen1938v3/0011v.jpg
https://cdn.loc.gov/service/rbc/rbc0001/2009/2009rosen1938v3/0012v.jpg
https://cdn.loc.gov/service/rbc/rbc0001/2009/2009rosen1938v3/0013v.jpg
https://cdn.loc.gov/service/rbc/rbc0001/2009/2009rosen1938v3/0014v.jpg
https://cdn.loc.gov/service/rbc/rbc0001/2009/2009rosen1938v3/0015v.

KeyboardInterrupt: 

In [None]:
! ls -la images

### Connecting the image file to the metadata
The filename the code creates is the item's identifier, so you reconstruct a URL for the item's metadata. For example, to examine at the metadata for the first item in the list, 2007685715.jpg, you can add ``https://www.loc.gov/item/`` before the identifier. 

``https://www.loc.gov/item/2007685715``

You can also request the metadata in JSON format by adding ``?fo=json&at=item`` at the end. 

In [None]:
r = requests.get("https://www.loc.gov/item/2007685715/?fo=json")
r_data = r.json()
print(json.dumps(r_data["item"], indent=2))