In [1]:
# imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt 
import seaborn as sns 
import os, json, math, time
from yelpapi import YelpAPI
from tqdm.notebook import tqdm_notebook

In [3]:
with open('D:\My Documents\GitHub\data-enrichment-wk14-activity-mapping-yelp-api-results\.secret\yelp_api.json') as f:
    login = json.load(f)
login.keys()

dict_keys(['client-id', 'api-key'])

In [27]:
# instantiate YelpAPI variable
yelp_api = YelpAPI(login['api-key'], timeout_s = 5.0)
yelp_api

<yelpapi.yelpapi.YelpAPI at 0x166ef8fd540>

In [5]:
# Set API call parameters 
LOCATION = 'NY,NY'
TERM = 'Pizza'

In [12]:
# Specifying JSON_FILE filename (can include a folder)
# Include the search terms in the filename 
JSON_FILE = 'D:/My Documents/GitHub/Stack-3/results_in_progress_NY_pizza.json'
JSON_FILE

'D:/My Documents/GitHub/Stack-3/results_in_progress_NY_pizza.json'

In [13]:
## Check if JSON_FILE exists
file_exists = os.path.isfile(JSON_FILE)
## If it does not exist: 
if file_exists == False:
    
    ## CREATE ANY NEEDED FOLDERS
    # Get the Folder Name only
    folder = os.path.dirname(JSON_FILE)
    ## If JSON_FILE included a folder:
    if len(folder)>0:
        # create the folder
        os.makedirs(folder,exist_ok=True)
        
        
    ## INFORM USER AND SAVE EMPTY LIST
    print(f'[i] {JSON_FILE} not found. Saving empty list to file.')
    
    
    # save an empty list
    with open(JSON_FILE,'w') as f:
        json.dump([],f)  
# If it exists, inform user
else:
    print(f"[i] {JSON_FILE} already exists.")


[i] D:/My Documents/GitHub/Stack-3/results_in_progress_NY_pizza.json already exists.


In [14]:
## Load previous results and use len of results for offset
with open(JSON_FILE,'r') as f:
    previous_results = json.load(f)
    
## set offset based on previous results
n_results = len(previous_results)
print(f'- {n_results} previous results found.')

- 0 previous results found.


In [15]:
# use our yelp_api variable's search_query method to perform our API call
results = yelp_api.search_query(location=LOCATION,
                                term=TERM,
                               offset=n_results)
results.keys()

dict_keys(['businesses', 'total', 'region'])

In [16]:
## How many results total?
total_results = results['total']
total_results

12500

In [17]:
## How many did we get the details for?
results_per_page = len(results['businesses'])
results_per_page

20

There are over 12500 businesses to retrieve from our API, and we can get 20 results at a time (per "page").
We can calculate the # of results remaining by subtracting our offset (length of our previous results) from our total.
Then we can determine how many pages we will need by dividing the results by 20 (or whatever the value happens to be for results per page)
Note that we need to round up the number of pages in order to get all of the results. Even if there is only 1 result on the last page, we want to include that page! To do this, we will use math.ceil.

In [18]:
# import additional pacakges for controlling our loop
import time, math 
# Use math.ceil to round up for the total number of pages of results 
n_pages = math.ceil((results['total'] - n_results)/results_per_page)
n_pages

625

In [19]:
# join new results with old list with extend and save to file 
previous_results.extend(results['businesses'])
with open(JSON_FILE, 'w') as f: 
    json.dump(previous_results, f)

In [22]:
# TQDM is a pacakge designed for adding animated progress bars to Python processes 
for i in tqdm_notebook(range(n_pages)):
    # adds 200 ms pause
    time.sleep(.2)

  0%|          | 0/625 [00:00<?, ?it/s]

In [25]:
for i in tqdm_notebook( range(1,n_pages+1)):
    
    ## Read in results in progress file and check the length
    with open(JSON_FILE, 'r') as f:
        previous_results = json.load(f)
    ## save number of results for to use as offset
    n_results = len(previous_results)
    ## use n_results as the OFFSET 
    results = yelp_api.search_query(location=LOCATION,
                                    term=TERM, 
                                    offset=n_results)
    
    ## append new results and save to file
    previous_results.extend(results['businesses'])
    
    with open(JSON_FILE,'w') as f:
        json.dump(previous_results,f)
    
    # add a 200ms pause
    time.sleep(.2)

  0%|          | 0/625 [00:00<?, ?it/s]

HTTPError: 400 Client Error: Bad Request for url: https://api.yelp.com/v3/businesses/search?location=NY%2CNY&term=Pizza&offset=1000

In [29]:
for i in tqdm_notebook( range(1,n_pages+1)):
    
    ## Read in results in progress file and check the length
    with open(JSON_FILE, 'r') as f:
        previous_results = json.load(f)
    ## save number of results for to use as offset
    n_results = len(previous_results)
    
    if (n_results + results_per_page) > 1000:
        print('Exceeded 1000 api calls. Stopping loop.')
        break
    
    ## use n_results as the OFFSET 
    results = yelp_api.search_query(location=LOCATION,
                                    term=TERM, 
                                    offset=n_results)
    
    
    
    ## append new results and save to file
    previous_results.extend(results['businesses'])
    
    # display(previous_results)
    with open(JSON_FILE,'w') as f:
        json.dump(previous_results,f)
    
    time.sleep(.2)

  0%|          | 0/625 [00:00<?, ?it/s]

Exceeded 1000 api calls. Stopping loop.
