In [None]:
# Importing packages
import base64
import hashlib
import hmac
import json
import os
import random
import string
import sys
import time
import urllib.parse
import urllib.request
import webbrowser
from datetime import datetime
from datetime import timedelta

import numpy as np
import pandas as pd
import requests
from requests.auth import HTTPBasicAuth

In [None]:
# Key and secret for the API calls.
# Prefer the FLICKR_API_KEY / FLICKR_API_SECRET environment variables so that real
# credentials never have to be saved inside the notebook. The placeholders below are
# only used when a variable is not set and must then be replaced by hand.
api_key = os.environ.get('FLICKR_API_KEY', '#########################')
api_secret = os.environ.get('FLICKR_API_SECRET', '################')

## Oauth

OAuth is Flickr's authentication process, which consists of three steps. It is important to complete all three, because without authorisation you will not retrieve all posts.

In [None]:
# Create a random string function for the nonce
def randomString(stringLength=10):
    """Return `stringLength` randomly chosen lowercase ASCII letters as one string."""
    alphabet = string.ascii_lowercase
    picked = []
    for _ in range(stringLength):
        # One independent draw per character.
        picked.append(random.choice(alphabet))
    return ''.join(picked)

In [None]:
# Create timestamp and nonce variables needed for the call
timestamp = str(int(time.time()))  # current Unix time in whole seconds, as text
nonce = randomString()  # random lowercase letters, default length of randomString

#### Request Token

In [None]:
# Create the request string
# The oauth_signature parameter is not part of this string yet; it is appended
# further down once the signature has been computed.
requestString = ('http://www.flickr.com/services/oauth/request_token' + '?oauth_nonce=' + nonce +
                 '&oauth_timestamp=' + timestamp + '&oauth_consumer_key=' + api_key +
                 '&oauth_callback=oob' + '&oauth_signature_method=HMAC-SHA1')

Signature

In [None]:
# Create the base string: HTTP verb, then the percent-encoded endpoint URL, each followed by '&'
baseString = ('GET&' + urllib.parse.quote_plus('http://www.flickr.com/services/oauth/request_token') + '&')

In [None]:
# Create the authentication parameters --> need to be sorted alphabetically
parameters = ('oauth_callback=oob&oauth_consumer_key=' + api_key + '&oauth_nonce=' + nonce
             + '&oauth_signature_method=HMAC-SHA1' + '&oauth_timestamp=' + timestamp)

In [None]:
# Encode the parameters (this also encodes the '&' and '=' separators of the parameter string)
parameters = urllib.parse.quote_plus(parameters)

In [None]:
# quote_plus writes a space as '+'; replace it with '%20' so spaces are percent-encoded
# in the base string as well.
parameters = parameters.replace('+', '%20')

In [None]:
# Update the baseString: append the encoded parameter string as its third part
baseString = baseString + parameters

In [None]:
# Create key: the API secret followed by '&'; nothing is appended after the '&' for this call
key = api_secret + '&'

In [None]:
# HMAC-SHA1 signing helper
def HMAC_SHA1(message, key):
    """Sign `message` with `key` using HMAC-SHA1 and return the digest as base64 text.

    Both arguments are text and are converted to UTF-8 bytes before hashing.
    """
    secret_bytes = bytes(key, 'UTF-8')
    payload_bytes = bytes(message, 'UTF-8')

    raw_digest = hmac.new(secret_bytes, payload_bytes, hashlib.sha1).digest()

    # Base64-encode the 20-byte digest and hand it back as str rather than bytes.
    return base64.b64encode(raw_digest).decode('UTF-8')

In [None]:
# Create the signature (base64 text of the HMAC-SHA1 over the base string)
signature = HMAC_SHA1(baseString, key)

In [None]:
# Encode signature so characters such as '+', '/' and '=' are percent-encoded for the URL
signature = urllib.parse.quote_plus(signature)

In [None]:
# Update the request string to complete it.
# NOTE: this cell appends to requestString, so running it twice adds a second
# oauth_signature; rebuild requestString first when re-running.
requestString = requestString + '&oauth_signature=' + signature

In [None]:
# Show the final request URL
requestString

In [None]:
# Get the response and save it to rawResponse
res= requests.get(requestString)
rawResponse = res.text

In [None]:
# Show the raw reply text; the following cells split it on '&' and '='
rawResponse

In [None]:
# Split response
rawResponseSplit = rawResponse.split('&')
rawResponseSplit

In [None]:
# Get confirmed
confirmed = rawResponseSplit[0]
confirmed = confirmed.split('=')

In [None]:
# Get response token
responseToken = rawResponseSplit[1]
responseToken = responseToken.split('=')

In [None]:
# Get response secret
responseSecret = rawResponseSplit[2]
responseSecret = responseSecret.split('=')

In [None]:
# Get the requestToken and requestSecret
requestToken = responseToken[1]
requestSecret = responseSecret[1]

#### Authorisation

In [None]:
# Open authorisation window
webbrowser.open('http://www.flickr.com/services/oauth/authorize?' + 'oauth_token=' + requestToken)
print('Enter the code here: ')
# Strip surrounding whitespace from the pasted code: it is concatenated unchanged into
# the signed parameter string of the access-token request, so a stray space or newline
# would become part of the oauth_verifier value.
userinput = input().strip()

#### Access token

In [None]:
# Create timestamp and nonce variables needed for the call
# (new values for this request; the ones from the request-token call are overwritten)
timestamp = str(int(time.time()))
nonce = randomString()

In [None]:
# Request url for the access token
requestURL = 'http://www.flickr.com/services/oauth/access_token'

In [None]:
# Parameters for the call, written in alphabetical order of their names.
# requestToken comes from the request-token step, userinput is the code typed in above.
parameters = ('oauth_consumer_key=' + api_key + '&oauth_nonce=' + nonce + '&oauth_signature_method=HMAC-SHA1' +
             '&oauth_timestamp=' + timestamp + '&oauth_token=' + requestToken + '&oauth_verifier=' + userinput)

In [None]:
# Create the base string: verb, encoded URL and encoded parameter string joined by '&'
baseString = ('GET&' + urllib.parse.quote_plus(requestURL) + '&' +urllib.parse.quote_plus(parameters))

In [None]:
# Create key: API secret and request-token secret joined by '&'
key = (api_secret + '&' + requestSecret)

In [None]:
# Create the signature
signature = HMAC_SHA1(baseString, key)

In [None]:
# Encode the signature so it can be placed in the query string
signature = urllib.parse.quote_plus(signature)

In [None]:
# Final request URL: endpoint, the (unencoded) parameter string and the encoded signature
requestString = (requestURL + '?' + parameters + '&oauth_signature=' + signature)

In [None]:
# Get the response and save it to authorisationResponse
auth= requests.get(requestString)
authorisationResponse = auth.text

In [None]:
# Show the raw reply text; it is split into its fields in the next cells
authorisationResponse

In [None]:
# Split to get all the details
# NOTE(review): the cells below pick fields purely by position and assume the reply lists
# fullname, access token, token secret, nsid and username in that order - confirm against
# the actual reply shown above. A reply with fewer fields raises an IndexError here.
authSplit = authorisationResponse.split('&')

In [None]:
# Get fullname (the text after '=' of the first field, kept exactly as received)
responseFullname = authSplit[0].split('=')
responseFullname = responseFullname[1]

In [None]:
# Get access token (second field)
accessToken = authSplit[1].split('=')
accessToken = accessToken[1]

In [None]:
# Get access secret (third field); this rebinds responseSecret to a plain string
responseSecret = authSplit[2].split('=')
responseSecret = responseSecret[1]

In [None]:
# Get nsid (fourth field)
nsid = authSplit[3].split('=')
nsid = nsid[1]

In [None]:
# Get username (fifth field)
username = authSplit[4].split('=')
username = username[1]

In [None]:
# Combine this information to save it
authCredentials = {'Fullname': responseFullname, 'AccessToken': accessToken,
                  'ResponseSecret': responseSecret, 'nsid':nsid, 'username':username}

In [None]:
# Show the collected values (this prints the access token and secret in the output)
authCredentials

In [None]:
# One-row dataframe with one column per credential field
credentials = pd.DataFrame(authCredentials, index=[0])

In [None]:
# Save for later sessions. The access token and secret are written as plain text,
# so keep this file private.
credentials.to_csv('FlickrCredentials.csv', index=False)

#### Reuse the access token

In [None]:
# Get the access token from storage.
credentials = pd.read_csv('FlickrCredentials.csv', sep= ',', low_memory = False, lineterminator='\n')

In [None]:
# Create timestamp and nonce variables needed for the call
timestamp = str(int(time.time()))
nonce = randomString()

In [None]:
# Get access token, nsid and responsesecret from the first (only) row.
# getPhotoList and getPhotoDetails further down read responseSecret as a module-level
# name, so this cell (or the access-token section) must run before they are called.
accessToken = credentials['AccessToken'][0]
responseSecret = credentials['ResponseSecret'][0]
nsid = credentials['nsid'][0]

In [None]:
# Base address
address = 'http://api.flickr.com/services/rest'

In [None]:
# Method
method = 'flickr.prefs.getPrivacy'

In [None]:
# Parameters, sorted alphabetically
parameters = ('method=' + method + '&oauth_consumer_key=' + api_key +  '&oauth_nonce=' + nonce +
             '&oauth_signature_method=HMAC-SHA1&oauth_timestamp=' + timestamp + '&oauth_token=' +
              accessToken + '&user_id=' + nsid
             )

In [None]:
# Create base string: verb, encoded address and encoded parameter string joined by '&'
baseString = ('GET&' + urllib.parse.quote_plus(address) + '&' + urllib.parse.quote_plus(parameters))

In [None]:
# Create key and signature; the key is the API secret and the stored token secret joined by '&'
key = (api_secret + '&' + responseSecret)
signature = HMAC_SHA1(baseString, key)

In [None]:
# Create requestString
requestString = (address + '?' + parameters + '&oauth_signature=' + signature)

In [None]:
# Do the request
request = requests.get(requestString)
request = request.text

# Getting all the information and putting it in a dataframe

This consists of five main steps:

1. Get a list of photos.
2. Parse the JSON file.
3. For each photo in the JSON file, get more information by using the photo ID and secret.
4. Parse the JSON file of the individual photo.
5. Combine everything in a dataframe.

#### Two functions to handle the JSON responses

In [None]:
def handlePhotoList(JSON):
    """Extract the id and secret of every photo listed on one page of search results.

    Parameters
    ----------
    JSON : dict
        Parsed response containing a 'photos' object whose 'photo' entry is a list of
        dicts that each carry an 'id' and a 'secret'.

    Returns
    -------
    pandas.DataFrame
        One row per listed photo with the columns 'ID' and 'Secret'. The frame is empty
        (but still has both columns) when the page lists no photos.

    Raises
    ------
    KeyError
        If 'photos', 'photo', or a photo's 'id' / 'secret' is missing.
    """
    # Walk the photos that are actually present instead of deriving a count from
    # 'perpage', 'total', 'page' and 'pages': the derived count can exceed the number of
    # photos returned (e.g. an empty result whose 'pages' differs from 'page'), and
    # indexing the list with it then raised an IndexError.
    photos = JSON['photos']['photo']

    data = {'ID': [photo['id'] for photo in photos],
            'Secret': [photo['secret'] for photo in photos]}

    # Create dataframe from dict.
    return pd.DataFrame(data=data)
    

In [None]:
def handlePhoto(JSON):
    """Flatten one photo-details response into a flat dict of strings.

    Parameters
    ----------
    JSON : dict
        Parsed response with a 'photo' object holding 'id', 'secret', 'dateuploaded',
        'title', 'description', 'dates' and 'owner'. The 'location' object and each of
        its sub-fields are treated as optional.

    Returns
    -------
    dict
        Keys: photoID, photoSecret, uploadDate, title, description, postedDate, takenDate,
        lastUpdateDate, userID, userName, userLocation, longitude, latitude, locality,
        neighbourhood. Every value is a str; a location field that is not present in the
        response is the string 'null'.

    Raises
    ------
    KeyError
        If one of the non-location fields is missing.
    """
    photo = JSON['photo']

    def locationField(*path):
        """Follow `path` below photo['location']; return 'null' if any step is absent."""
        node = photo.get('location', {})
        for step in path:
            if not isinstance(node, dict) or step not in node:
                return 'null'
            node = node[step]
        return str(node)

    # General ID information
    photoID = str(photo['id'])
    photoSecret = str(photo['secret'])
    uploadDate = str(photo['dateuploaded'])

    # Title and description
    photoTitle = str(photo['title']['_content'])
    photoDescription = str(photo['description']['_content'])

    # Dates
    dates = photo['dates']
    postedDate = str(dates['posted'])
    takenDate = str(dates['taken'])
    lastUpdateDate = str(dates['lastupdate'])

    # User information
    owner = photo['owner']
    userID = str(owner['nsid'])
    username = str(owner['username'])
    userLocation = str(owner['location'])

    # Location information. All four fields use the same tolerant lookup: previously only
    # 'locality' was guarded (by a bare except), so a photo without a 'neighbourhood'
    # entry raised a KeyError.
    longitude = locationField('longitude')
    latitude = locationField('latitude')
    locality = locationField('locality', '_content')
    neighbourhood = locationField('neighbourhood', '_content')

    # Store everything in a dict.
    photoInformation = {'photoID': photoID, 'photoSecret': photoSecret, 'uploadDate': uploadDate,
                        'title': photoTitle, 'description': photoDescription, 'postedDate': postedDate,
                        'takenDate': takenDate, 'lastUpdateDate': lastUpdateDate, 'userID': userID,
                        'userName': username, 'userLocation': userLocation, 'longitude': longitude,
                        'latitude': latitude, 'locality': locality, 'neighbourhood': neighbourhood}

    # Return the information
    return photoInformation
    

#### Two functions that do the API calls

In [None]:
def getPhotoList(api_key, accessToken, minUploadDate, maxUploadDate, pageNumber,
                 bbox='4.419937,51.8415,4.633141,51.97452', perPage=250):
    """Request one page of geotagged photos uploaded within a time window and bounding box.

    Calls 'flickr.photos.search' with an OAuth 1.0a HMAC-SHA1 signed GET request. The
    signing key is built from the module-level names `api_secret` and `responseSecret`,
    which must be defined before this function is called.

    Parameters
    ----------
    api_key : str
        API key, sent both as 'api_key' and as 'oauth_consumer_key'.
    accessToken : str
        OAuth access token.
    minUploadDate, maxUploadDate : str
        Bounds of the upload window, passed through as 'min_upload_date' and
        'max_upload_date'.
    pageNumber : int or str
        Result page to request.
    bbox : str, optional
        Comma-separated bounding box passed as 'bbox'. Defaults to the box that was
        previously hard-coded.
    perPage : int or str, optional
        Number of photos per page, passed as 'per_page' (default 250).

    Returns
    -------
    dict
        The decoded JSON body of the response.

    Raises
    ------
    json.JSONDecodeError
        If the response body is not valid JSON.
    """
    # HTTPS keeps the OAuth token and the signature from travelling in clear text.
    addr = 'https://api.flickr.com/services/rest'

    params = {
        'api_key': api_key,
        'bbox': bbox,
        'format': 'json',
        'has_geo': '1',
        'max_upload_date': maxUploadDate,
        'method': 'flickr.photos.search',
        'min_upload_date': minUploadDate,
        'nojsoncallback': '1',
        'oauth_consumer_key': api_key,
        'oauth_nonce': randomString(),
        'oauth_signature_method': 'HMAC-SHA1',
        'oauth_timestamp': str(int(time.time())),
        'oauth_token': accessToken,
        'page': pageNumber,
        'per_page': perPage,
        'sort': 'date-posted-asc',
    }

    def encode(value):
        """Percent-encode everything except unreserved characters."""
        return urllib.parse.quote(str(value), safe='')

    # The signature base string requires the parameters sorted by name. Sorting here
    # (instead of relying on hand-written order, which had 'nojsoncallback' ahead of
    # 'has_geo') guarantees that, and the same string is reused for the query so the
    # signed and the transmitted parameters cannot drift apart.
    parameters = '&'.join(encode(name) + '=' + encode(value)
                          for name, value in sorted(params.items()))

    # Final base string: verb, encoded URL and encoded parameter string
    baseString = 'GET&' + encode(addr) + '&' + encode(parameters)

    # Define the key (API secret and token secret, both module-level names)
    key = api_secret + '&' + responseSecret

    # Create the signature and encode it for use in the query string
    signature = encode(HMAC_SHA1(baseString, key))

    # Create the final request
    requestString = addr + '?' + parameters + '&oauth_signature=' + signature

    # Do the request and decode the JSON body
    response = requests.get(requestString)
    return json.loads(response.text)

In [None]:
def getPhotoDetails(api_key, accessToken, photoID, photoSecret):
    """Request the detailed information of a single photo.

    Calls 'flickr.photos.getInfo' with an OAuth 1.0a HMAC-SHA1 signed GET request. The
    signing key is built from the module-level names `api_secret` and `responseSecret`,
    which must be defined before this function is called.

    Parameters
    ----------
    api_key : str
        API key, sent both as 'api_key' and as 'oauth_consumer_key'.
    accessToken : str
        OAuth access token.
    photoID : str
        Id of the photo, passed as 'photo_id'.
    photoSecret : str
        Secret of the photo, passed as 'secret'.

    Returns
    -------
    dict
        The decoded JSON body of the response.

    Raises
    ------
    json.JSONDecodeError
        If the response body is not valid JSON.
    """
    # HTTPS keeps the OAuth token and the signature from travelling in clear text.
    addr = 'https://api.flickr.com/services/rest'

    params = {
        'api_key': api_key,
        'format': 'json',
        'method': 'flickr.photos.getInfo',
        'nojsoncallback': '1',
        'oauth_consumer_key': api_key,
        'oauth_nonce': randomString(),
        'oauth_signature_method': 'HMAC-SHA1',
        'oauth_timestamp': str(int(time.time())),
        'oauth_token': accessToken,
        'photo_id': photoID,
        'secret': photoSecret,
    }

    def encode(value):
        """Percent-encode everything except unreserved characters."""
        return urllib.parse.quote(str(value), safe='')

    # The signature base string requires the parameters sorted by name. Sorting here
    # (instead of relying on hand-written order, which had 'nojsoncallback' ahead of
    # 'method') guarantees that, and the same string is reused for the query so the
    # signed and the transmitted parameters cannot drift apart.
    parameters = '&'.join(encode(name) + '=' + encode(value)
                          for name, value in sorted(params.items()))

    # Final base string: verb, encoded URL and encoded parameter string
    baseString = 'GET&' + encode(addr) + '&' + encode(parameters)

    # Define the key (API secret and token secret, both module-level names)
    key = api_secret + '&' + responseSecret

    # Create the signature and encode it for use in the query string
    signature = encode(HMAC_SHA1(baseString, key))

    # Create the final request
    requestString = addr + '?' + parameters + '&oauth_signature=' + signature

    # Do the request and decode the JSON body
    response = requests.get(requestString)
    return json.loads(response.text)

#### Filling in the dataframe in a loop

In [None]:
# Define min upload date and max upload date: the first 5-day window, starting 1 January 2018.
# The scraping loop below moves both dates forward by 5 days after each window.
minUploadDate = datetime(2018,1,1)
maxUploadDate = minUploadDate + timedelta(days=5)

In [None]:
# Define the pagenumber variables
pagenumber = 1  # page to request next
pagenumberMax = -1  # placeholder; overwritten with the page count of each response

In [None]:
# Create a dataframe (empty; the scraping loop fills it with one row per photo).
Flickr2018 = pd.DataFrame()

In [None]:
# Loop over all the pages for one entire year.
# The rows are collected in a plain list and converted to a dataframe once at the end:
# DataFrame.append copied the whole frame for every photo and was removed in pandas 2.0.
# Rows already present in Flickr2018 (e.g. from an interrupted run) are kept.
photoRecords = Flickr2018.to_dict('records')
uniquePhotoIDs = {record['photoID'] for record in photoRecords if 'photoID' in record}

try:
    while minUploadDate.year < 2019:

        # Transform the date to something the API can work with (seconds since 1970-01-01)
        minUploadDateStr = str(int((minUploadDate - datetime(1970, 1, 1)) / timedelta(seconds=1)))
        maxUploadDateStr = str(int((maxUploadDate - datetime(1970, 1, 1)) / timedelta(seconds=1)))

        while True:

            # Do the api call to get all the photos in JSON format.
            photoList = getPhotoList(api_key, accessToken, minUploadDateStr, maxUploadDateStr, pagenumber)

            # Get the number of pages (as int, so the comparison below is always numeric).
            pagenumberMax = int(photoList['photos']['pages'])

            # Extract the information from the JSON and return a dataframe.
            photolistDF = handlePhotoList(photoList)

            # For each photo in the DF, get more information.
            for photoID, photoSecret in zip(photolistDF['ID'], photolistDF['Secret']):

                # Get the photo details through the API.
                photodetails = getPhotoDetails(api_key, accessToken, photoID, photoSecret)

                # Parse the JSON.
                photodetailsDict = handlePhoto(photodetails)

                # Remember the row; the dataframe is built from all rows at the end.
                photoRecords.append(photodetailsDict)
                uniquePhotoIDs.add(photodetailsDict['photoID'])

            # If there are more pages than the current page number, add one and continue
            # with the next iteration.
            if pagenumber < pagenumberMax:
                print('{} - {}'.format(minUploadDate, maxUploadDate))
                print('-----------------------------------------')
                print('Added page {} of {} pages'.format(pagenumber, pagenumberMax))
                print('Total number of photos retrieved: {}'.format(len(photoRecords)))
                print('Total number of unique photos: {}'.format(len(uniquePhotoIDs)))
                print('-----------------------------------------')
                print('-----------------------------------------')
                pagenumber += 1

            # Else, stop the inner while loop and reset everything to go into the next outer loop.
            else:

                print('-------- Fininshed {} - {} ---------'.format(minUploadDate, maxUploadDate))
                print('Total number of photos retrieved: {}'.format(len(photoRecords)))
                print('Total number of unique photos: {}'.format(len(uniquePhotoIDs)))
                # Reset the pagenumber.
                pagenumber = 1
                pagenumberMax = -1

                # Move the days by 5.
                minUploadDate = maxUploadDate
                maxUploadDate = minUploadDate + timedelta(days=5)

                break
finally:
    # Build the dataframe even when the loop stops with an error, so the photos that were
    # already fetched are not lost.
    Flickr2018 = pd.DataFrame(photoRecords)
    

In [None]:
# Write the collected photo details to disk; index=False leaves out the row index column.
Flickr2018.to_csv('Flickr2018.csv', index=False)