In [None]:
import requests
import os
from urllib.parse import urlparse

# URL of the loc.gov collection whose images are downloaded; passed as the
# starting results URL to get_and_save_images() at the bottom of this cell.
base_url = 'https://www.loc.gov/collections/clara-barton-papers'
# Local folder path handed to get_and_save_images() as the download destination.
images_folder = 'clara-barton-papers'


def write_image_file(image, filename):
    '''
    Download the image at the URL `image` and write its bytes to `filename`.

    The download is streamed in chunks so large images are never held fully
    in memory. If the server answers with an HTTP error status, a message is
    printed and no file is written, so an error page is never saved under an
    image filename.

    image    -- URL of the image to request
    filename -- path of the file to create (opened in binary write mode)
    '''
    # Using the response as a context manager releases the streamed
    # connection even when writing fails; the timeout keeps a stalled server
    # from hanging the whole crawl.
    with requests.get(image, stream=True, timeout=60) as image_response:
        if not image_response.ok:
            print("skipping {0}: HTTP status {1}".format(
                image, image_response.status_code))
            return

        with open(filename, 'wb') as fd:
            for chunk in image_response.iter_content(chunk_size=100000):
                fd.write(chunk)


def get_item_images(item_url, path):
    '''
    Download every image file of a single item into the folder `path`.

    The item's JSON record is requested (`fo=json`), the file list of its
    first resource is read, and each file is saved as `<n>.jpg`, where n is
    the 1-based position of the file in that list.

    item_url -- URL of the item to fetch
    path     -- existing folder the numbered images are written into

    Items without resources or files are skipped instead of raising.
    '''
    params = {"fo": "json"}
    item_call = requests.get(item_url, params=params, timeout=60)
    item_result = item_call.json()

    # Only the first resource of the item is used.
    resources = item_result.get("resources") or []
    if not resources:
        print("no resources found for {0}".format(item_url))
        return
    image_files = resources[0].get("files") or []

    # Loop through all images in this item and save them all to the folder.
    # enumerate() keeps the filename tied to the file's position in the list
    # even when an entry has to be skipped.
    for counter, item_image in enumerate(image_files, start=1):
        # Each entry is a sequence of renditions of the same image and the
        # second-to-last one is downloaded.
        # NOTE(review): presumably that is the largest JPEG rendition --
        # confirm against the loc.gov API response.
        if len(item_image) < 2:
            print("skipping file {0} of {1}: unexpected rendition list".format(
                counter, item_url))
            continue
        image = item_image[-2].get("url")
        if not image:
            print("skipping file {0} of {1}: no url".format(counter, item_url))
            continue

        # create a filename that's the image number
        filename = os.path.join(path, "{0}.jpg".format(counter))
        write_image_file(image, filename)


def get_and_save_images(results_url, path):
    '''
    Download the images of every item in a collection or results set.

    Takes as input the url for the collection or results set
    e.g. https://www.loc.gov/collections/baseball-cards
    and the folder the images are saved into (created if it does not exist).

    Results are requested 25 at a time and every page is processed by
    following the "next" link of the pagination data until there is none.
    Items without segments are saved as `<identifier>.jpg`; items with
    segments get their own `<identifier>` sub-folder filled by
    get_item_images().
    '''
    params = {"fo": "json", "c": 25, "at": "results,pagination"}

    # The destination must exist before the first image file is opened.
    os.makedirs(path, exist_ok=True)

    # Iterate over the pages instead of recursing once per page, so large
    # collections cannot exhaust the recursion limit.
    while results_url:
        call = requests.get(results_url, params=params, timeout=60)
        data = call.json()

        for result in data.get('results') or []:
            # don't try to get images from the collection-level result or
            # web page results; a missing format is treated as "no formats"
            original_format = result.get("original_format") or []
            if "collection" in original_format or "web page" in original_format:
                continue

            if not (result.get("image_url") and result.get("id")):
                continue

            # the identifier is the last path segment of the item URL
            identifier = urlparse(result["id"])[2].rstrip('/')
            identifier = identifier.split('/')[-1]

            if not result.get("hassegments"):
                # the last entry of image_url is the one downloaded
                image = result["image_url"][-1]
                # only protocol-relative URLs need a scheme; prefixing a URL
                # that already has one would corrupt it
                if image.startswith("//"):
                    image = "https:" + image

                # create a filename that's the identifier portion of the item URL
                filename = os.path.join(path, "{0}.jpg".format(identifier))
                write_image_file(image, filename)
            else:
                dest_folder = os.path.join(path, identifier)
                os.makedirs(dest_folder, exist_ok=True)
                get_item_images(result.get("id"), dest_folder)

        # make sure we haven't hit the end of the pages
        results_url = (data.get("pagination") or {}).get("next")
        if results_url:
            print("getting next page: {0}".format(results_url))

# Start the download: fetch the images of the collection at base_url into
# images_folder. This runs as soon as the cell/module is executed.
get_and_save_images(base_url, images_folder)


{'source_collection': 'Clara Barton papers, 1805-1958', 'display_offsite': True, 'contributors': [{'barton, clara': 'https://www.loc.gov/search/?fa=contributor:barton,+clara&fo=json'}], 'access_restricted': False, 'site': [], 'original_format': ['manuscript/mixed material'], 'genre': ['Manuscripts'], 'subject_headings': ['Manuscripts'], 'created_published': ['1849'], 'extract_urls': ['http://hdl.loc.gov/loc.mss/ms005010.mss11973.0001#mss11973'], 'id': 'http://www.loc.gov/item/mss119730001/', 'partof': [{'count': 69, 'url': 'https://www.loc.gov/search/?fa=partof:clara+barton+papers:++diaries+and+journals,+1849-1911&fo=json', 'title': 'clara barton papers:  diaries and journals, 1849-1911'}, {'count': 935, 'url': 'https://www.loc.gov/collections/clara-barton-papers/?fo=json', 'title': 'clara barton papers'}, {'count': 142627, 'url': 'https://www.loc.gov/search/?fa=partof:manuscript+division&fo=json', 'title': 'manuscript division'}], 'subject': ['manuscripts'], 'index': 1, 'digital_id': 