This script is based on instructions given in [this lesson](https://github.com/HeardLibrary/digital-scholarship/blob/master/code/scrape/pylesson/lesson2-api.ipynb). 

## Import libraries and load API key from file

The API key should be the only item in a text file called `flickr-api-keys-tang-song.txt` located in the user's home directory. The file must not end with a trailing newline, and it must contain only the key itself, not the API "secret".

In [None]:
from pathlib import Path
import requests
import json
import csv
from time import sleep
import webbrowser
import pandas as pd

# Number of seconds to pause after a Flickr API call; read by get_license_history().
api_sleep = 1 # Flickr API limits calls to 1/second, so throttle by inserting a delay of 1 s after each call

# define some canned functions we need to use

def write_dicts_to_csv(table, filename, fieldnames):
    """Write a list of dictionaries to a UTF-8 encoded CSV file.

    table: iterable of dictionaries, one per output row.
    filename: path of the CSV file to create (an existing file is overwritten).
    fieldnames: column names; they form the header row and select which
        dictionary keys are written, in that order.
    """
    # newline='' lets the csv module control the line endings itself.
    with open(filename, 'w', newline='', encoding='utf-8') as out_file:
        dict_writer = csv.DictWriter(out_file, fieldnames=fieldnames)
        dict_writer.writeheader()
        dict_writer.writerows(table)

def get_license_history(photo_id):
    """Fetch the license-change history of one Flickr photo.

    Calls the `flickr.photos.licenses.getLicenseHistory` REST method with the
    module-level `api_key`, then pauses for `api_sleep` seconds so that
    consecutive calls stay within the API rate limit. The pause happens after
    every call, whether or not the call succeeded.

    photo_id: the Flickr photo identifier to look up.

    Returns the `license_history` item of the response when the API reports
    `stat` equal to 'ok'. Returns None when the API reports a failure or when
    the response body is not a JSON object.

    Raises requests.exceptions.RequestException if the HTTP request itself
    fails or does not answer within the timeout.
    """
    endpoint_url = 'https://www.flickr.com/services/rest'
    method = 'flickr.photos.licenses.getLicenseHistory'

    param_dict = {
        'method' : method,
        'photo_id' : photo_id,
        'api_key' : api_key,
        'nojsoncallback' : '1', # this parameter causes the API to return actual JSON instead of its weird default string
        'format' : 'json' # overrides the default XML serialization for the search results
        }

    try:
        # The timeout keeps a stalled connection from hanging the whole run.
        metadata_response = requests.get(endpoint_url, params = param_dict, timeout = 30)
    finally:
        # Throttle after every call, including failed ones, so that a series of
        # bad photo IDs cannot exceed the rate limit.
        sleep(api_sleep)

    # print(metadata_response.url) # uncomment this if testing is needed, but don't reveal the key in the notebook
    try:
        data = metadata_response.json()
    except ValueError:
        # The body was not JSON (for example an HTML error page); report it to
        # the caller the same way as an API-level failure.
        return None

    #print(json.dumps(data, indent=4))
    if not isinstance(data, dict) or data.get('stat') != 'ok':
        return None
    return data['license_history']

# Build the path to the API key file, which is expected in the user's home directory.
home = str(Path.home()) #gets path to home directory; supposed to work for Win and Mac
key_filename = 'flickr-api-keys-tang-song.txt'
api_key_path = str(Path(home) / key_filename)

# Load the key into the module-level name `api_key`. Only the file read is
# inside the try block, so that a problem with the key's content is not
# misreported as a missing file.
try:
    with open(api_key_path, 'rt', encoding='utf-8') as file_object:
        raw_key = file_object.read()
except FileNotFoundError:
    print(key_filename + ' file not found - is it in your home directory?')
except (OSError, UnicodeError) as error:
    print('Could not read ' + key_filename + ': ' + str(error))
else:
    if raw_key.endswith('\n'):
        print('Warning: your key has a trailing newline that needs to be deleted!')
    # Surrounding whitespace is never part of the key; drop it so the API
    # calls still work when the file was saved with a trailing newline.
    api_key = raw_key.strip()
    if api_key == '':
        print('Warning: ' + key_filename + ' is empty - no API key was loaded.')
    # print(api_key) # for debugging only; don't leave the key visible in the notebook

In [None]:
# Load the metadata table with every value read as a string (na_filter=False
# keeps empty cells as empty strings instead of NaN) and index it by record number.
image_metadata = (
    pd.read_csv('act_all_202209291736.csv', na_filter=False, dtype=str)
    .set_index('RecordNumber')
)

# Keep only the copyright statement column, as an independent one-column DataFrame.
copyright_column = 'CopyrightStatement'
image_url_df = image_metadata.loc[:, copyright_column:copyright_column].copy()
image_url_df

In [None]:
# Keep only the records whose copyright statement mentions flickr.com (any letter case).
# regex=False makes the search a literal substring match; otherwise the '.' in the
# pattern is a regular-expression wildcard that matches any character.
mentions_flickr = image_url_df['CopyrightStatement'].str.contains('flickr.com', case=False, regex=False)
flickr_url_df = image_url_df.loc[mentions_flickr]
flickr_url_df

In [None]:
# Look up the license history of every Flickr image and save the outcome to licenses.csv.

# Prefix of the Creative Commons license URLs; the text after it is what gets recorded.
cc_license_prefix = 'https://creativecommons.org/licenses/'


def _extract_photo_id(statement):
    """Return the photo ID segment of a copyright statement, or '' if there is none."""
    # When the statement includes "http" the ID is taken from the sixth
    # '/'-separated item; when "http" is omitted it is taken from the fourth.
    position = 5 if 'http' in statement else 3
    segments = statement.split('/')
    if len(segments) <= position:
        # The statement is too short to contain an image number.
        return ''
    # In cases where there is text following the URL, remove it.
    return segments[position].split(' ')[0]


def _license_code(license_name, license_url):
    """Reduce a license name and URL from the license history to the value stored in the table."""
    if license_name == 'All Rights Reserved':
        return 'All Rights Reserved'
    if license_url == '':
        return ''
    if 'publicdomain' in license_url:
        return 'zero/1.0/'
    if cc_license_prefix in license_url:
        return license_url.split(cc_license_prefix)[1]
    # Not a recognized Creative Commons URL: keep it whole rather than failing.
    return license_url


filename = 'licenses.csv'
# Fixed column order, so the header does not depend on which row was processed last.
fieldnames = ['act_id', 'url', 'flickr_id', 'license', 'new_license']

results = []
for act_id, image in flickr_url_df.iterrows():
    url = image['CopyrightStatement']
    print(act_id, url)

    photo_id = _extract_photo_id(url)
    print(photo_id)

    # Start from the "lookup failed" shape: blank license columns. They are
    # filled in below only when the history can be interpreted.
    out_dict = {'act_id': act_id, 'url': url, 'flickr_id': photo_id, 'license': '', 'new_license': ''}

    # An empty photo_id means there isn't any image number in the URL, so there is nothing to look up.
    if photo_id != '':
        licenses = get_license_history(photo_id)
        #print(licenses)

        if licenses is not None:
            if len(licenses) == 1:
                change = licenses[0]
                out_dict['license'] = _license_code(change['old_license'], change['old_license_url'])
                out_dict['new_license'] = _license_code(change['new_license'], change['new_license_url'])
            elif len(licenses) > 1:
                # More than one change cannot be reduced to a single old/new pair;
                # record the raw history so it can be examined by hand.
                out_dict['license'] = licenses

    results.append(out_dict)

    # write the data to a file after each image in case script crashes
    write_dicts_to_csv(results, filename, fieldnames)

    print()

print('done')


In [None]:
# Display the value of `licenses` left over from the loop above, i.e. the result
# of the most recent get_license_history() call (useful for checking the response format).
licenses

## Create the loops to do the paging

Flickr limits the number of photos that can be returned by a single request to 500. Since we have more than that, we need to request the data one page at a time, with at most 500 photos per page.

In [None]:
# Download the photo metadata one page at a time and save it to a CSV file.
# This cell relies on names that must already be defined before it runs:
# number_photos, method, user_id, endpoint_url, extract_data and filename.
per_page = 5   # use 500 for full download, use smaller number like 5 for testing
pages = number_photos // per_page   # the // operator returns the integer part of the division ("floor")
table = []

#for page_number in range(0, pages + 1):  # need to add one to get the final partial page
for page_number in range(0, 1):  # use this to do only one page for testing
    print('retrieving page ', page_number + 1)
    page_string = str(page_number + 1)  # the API's page numbers start at 1
    param_dict = {
        'method' : method,
        'extras' : 'description,license,original_format,date_taken,original_format,geo,tags,machine_tags,media,url_t,url_o',
        'per_page' : str(per_page),  # default is 100, maximum is 500.
        'page' : page_string,
        'user_id' : user_id,
        'oauth_consumer_key' : api_key,
        'nojsoncallback' : '1', # this parameter causes the API to return actual JSON instead of its weird default string
        'format' : 'json' # overrides the default XML serialization for the search results
        }
    # The timeout keeps a stalled connection from hanging the whole download.
    metadata_response = requests.get(endpoint_url, params = param_dict, timeout = 30)
    data = metadata_response.json()
#    print(json.dumps(data, indent=4))  # uncomment this line for testing

    # Stop with a clear message when the API reports a failure, instead of
    # failing later on a missing 'photos' key.
    if data.get('stat') != 'ok':
        raise RuntimeError('Flickr API request for page ' + page_string + ' failed: ' + str(data.get('message', data)))

    # data['photos']['photo'] is the list of photos for which data was returned
    photos = data['photos']['photo']
    for image_number in range(len(photos)):
        photo_dictionary = extract_data(image_number, data)
        table.append(photo_dictionary)

    # write the data to a file
    # We could just do this for all the data at the end.
    # But if the search fails in the middle, we will at least get partial results
    # Nothing is written while the table is still empty (for example when the
    # first page comes back with no photos), because there is no row to take the
    # column headers from.
    if table:
        fieldnames = table[-1].keys() # use the keys from the last dictionary for column headers; assume all are the same
        write_dicts_to_csv(table, filename, fieldnames)

    sleep(api_sleep) # wait between calls to avoid getting blocked for hitting the API too rapidly

print('Done')