In [1]:
# Importing libraries
import numpy as np
import pandas as pd
import os, json, math, time
from yelpapi import YelpAPI
from tqdm.notebook import tqdm_notebook

In [2]:
# Connecting to yelp api
# The credentials are kept outside the project in ~/.secret/yelp_api.json.
# Building the path from the current user's home directory avoids hard-coding
# one machine's user name into the notebook.
credentials_path = os.path.join(os.path.expanduser('~'), '.secret', 'yelp_api.json')
with open(credentials_path) as f:
    login = json.load(f)
# The credentials file must provide an 'api-key' entry.
yelp_api = YelpAPI(login['api-key'], timeout_s = 5.0)

In [3]:
# Search parameters: passed as `location` and `term` to every
# yelp_api.search_query call in this notebook.
LOCATION = 'Seattle'
TERM = 'Coffee'

In [5]:
# File in which results are accumulated page by page while the API is queried.
# The name is derived from the search parameters so that changing LOCATION or
# TERM cannot leave results stored under another search's file name.
JSON_FILE = f"Data/results_in_progress_{LOCATION}_{TERM.lower()}.json"

In [6]:
# Method for creating JSON file, provided by Coding Dojo
def create_json_file(JSON_FILE, delete_if_exists = False):
    """Make sure JSON_FILE exists, creating it with an empty list when needed.

    Parameters
    ----------
    JSON_FILE : str
        Path of the JSON file. Missing parent folders are created.
    delete_if_exists : bool, default False
        When the file already exists: if True it is deleted and replaced by a
        new file holding an empty list; if False it is left untouched.

    Returns
    -------
    None
        The function only prints a status message and touches the file system.
    """
    if os.path.isfile(JSON_FILE):
        if not delete_if_exists:
            # Keep whatever is already stored.
            print(f"[INFORMATION] {JSON_FILE} already exists.")
            return
        # Inform the user and delete; the file is recreated just below.
        print(f"[WARNING] {JSON_FILE} already exists. Deleting previous file.")
        os.remove(JSON_FILE)

    # Reached when the file was never there or has just been removed.
    print(f"[INFORMATION] {JSON_FILE} not found. Saving empty list to new file.")
    # Create the necessary folders (dirname is '' for a bare file name).
    folder = os.path.dirname(JSON_FILE)
    if len(folder) > 0:
        os.makedirs(folder, exist_ok = True)
    # Save an empty list to start the json file.
    with open(JSON_FILE, 'w') as f:
        json.dump([], f)

In [18]:
# Creating the json file.
# NOTE: delete_if_exists = True removes an existing results file, so running
# this cell restarts the collection from an empty list.
create_json_file(JSON_FILE, delete_if_exists = True)

[INFORMATION] Data/results_in_progress_Seattle_coffee.json not found. Saving empty list to new file.


In [19]:
# Load the results saved so far so the search can resume where it stopped.
with open(JSON_FILE, 'r') as f:
    previous_results = json.load(f)

# The number of stored results is used as the offset of the next request.
n_results = len(previous_results)
print(f'- {n_results} previous results found')

# One probe request tells us how many matches exist and how many come per page.
results = yelp_api.search_query(location = LOCATION, term = TERM, offset = n_results)
total_results = results['total']
results_per_page = len(results['businesses'])

# A search that returns no businesses would otherwise divide by zero here.
if results_per_page:
    n_pages = math.ceil((total_results - n_results) / results_per_page)
else:
    n_pages = 0
n_pages

- 0 previous results found


180

In [20]:
# Looping the api calls
# Upper bound on the number of results collected. NOTE(review): this matches the
# cap Yelp documents for offset-based paging of business search - confirm it
# still applies to the API plan in use.
MAX_RESULTS = 1000

for i in tqdm_notebook(range(1, n_pages + 1)):

    # Re-read the file on every pass so the offset always reflects what is
    # actually stored on disk.
    with open(JSON_FILE, 'r') as f:
        previous_results = json.load(f)
    # Number of results saved so far, used as the offset of the next request.
    n_results = len(previous_results)

    # Stop before requesting a page that would cross the cap. (The message
    # says "api calls" but the quantity compared is the number of results.)
    if (n_results + results_per_page) > MAX_RESULTS:
        print('Exceeded 1000 api calls. Stopping loop.')
        break

    # use n_results as the OFFSET
    results = yelp_api.search_query(location=LOCATION,
                                    term=TERM,
                                    offset=n_results)
    new_businesses = results['businesses']

    # An empty page leaves the offset unchanged, so every remaining iteration
    # would repeat the same request; stop instead of wasting API calls.
    if not new_businesses:
        print('No more results returned. Stopping loop.')
        break

    # append new results and save to file
    previous_results.extend(new_businesses)
    with open(JSON_FILE, 'w') as f:
        json.dump(previous_results, f)

    # Short pause between consecutive requests.
    time.sleep(.05)

  0%|          | 0/180 [00:00<?, ?it/s]

Exceeded 1000 api calls. Stopping loop.


In [21]:
# Creating a dataframe from the file.
# The file holds the list of business records gathered by the loop; info()
# prints the column names, dtypes and non-null counts.
final_df = pd.read_json(JSON_FILE)
final_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 16 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   id             1000 non-null   object 
 1   alias          1000 non-null   object 
 2   name           1000 non-null   object 
 3   image_url      1000 non-null   object 
 4   is_closed      1000 non-null   bool   
 5   url            1000 non-null   object 
 6   review_count   1000 non-null   int64  
 7   categories     1000 non-null   object 
 8   rating         1000 non-null   float64
 9   coordinates    1000 non-null   object 
 10  transactions   1000 non-null   object 
 11  price          778 non-null    object 
 12  location       1000 non-null   object 
 13  phone          1000 non-null   object 
 14  display_phone  1000 non-null   object 
 15  distance       1000 non-null   float64
dtypes: bool(1), float64(2), int64(1), object(12)
memory usage: 118.3+ KB


In [22]:
# Count the rows whose business id repeats the id of an earlier row.
final_df['id'].duplicated().sum()

0

In [23]:
# Displaying the first rows of the dataframe
final_df.head()

Unnamed: 0,id,alias,name,image_url,is_closed,url,review_count,categories,rating,coordinates,transactions,price,location,phone,display_phone,distance
0,FVzl8rDPiTWEtrNEuCu-Xg,storyville-coffee-company-seattle-9,Storyville Coffee Company,https://s3-media3.fl.yelpcdn.com/bphoto/TecGMk...,False,https://www.yelp.com/biz/storyville-coffee-com...,2105,"[{'alias': 'coffee', 'title': 'Coffee & Tea'},...",4.5,"{'latitude': 47.60895949363687, 'longitude': -...","[delivery, pickup]",$$,"{'address1': '94 Pike St', 'address2': 'Ste 34...",12067805777.0,(206) 780-5777,1867.01945
1,K-X3rRAVbeMZ0VIDWuPX_Q,anchorhead-coffee-seattle,Anchorhead Coffee,https://s3-media2.fl.yelpcdn.com/bphoto/OITyX_...,False,https://www.yelp.com/biz/anchorhead-coffee-sea...,694,"[{'alias': 'coffee', 'title': 'Coffee & Tea'},...",4.5,"{'latitude': 47.6133808022766, 'longitude': -1...",[delivery],$$,"{'address1': '1600 7th Ave', 'address2': 'Ste ...",12062222222.0,(206) 222-2222,1339.997321
2,3jrQdJz7YPGOi6KbDLmnTg,aroom-coffee-seattle,Aroom Coffee,https://s3-media3.fl.yelpcdn.com/bphoto/eUozLm...,False,https://www.yelp.com/biz/aroom-coffee-seattle?...,83,"[{'alias': 'coffee', 'title': 'Coffee & Tea'},...",5.0,"{'latitude': 47.6532043, 'longitude': -122.343...",[],$,"{'address1': '3801 Stone Way N', 'address2': '...",,,3144.803799
3,qKkHnsG-f4-BhdkR6yRrgw,moore-coffee-shop-seattle,Moore Coffee Shop,https://s3-media4.fl.yelpcdn.com/bphoto/FIBeRa...,False,https://www.yelp.com/biz/moore-coffee-shop-sea...,1207,"[{'alias': 'coffee', 'title': 'Coffee & Tea'},...",4.5,"{'latitude': 47.61163752302497, 'longitude': -...",[delivery],$,"{'address1': '1930 2nd Ave', 'address2': None,...",12068837044.0,(206) 883-7044,1591.154613
4,jG9m-nR8doMJHkecls9pDw,203-farenheit-coffee-seattle,203° Farenheit Coffee,https://s3-media3.fl.yelpcdn.com/bphoto/8BwHNN...,False,https://www.yelp.com/biz/203-farenheit-coffee-...,11,"[{'alias': 'coffee', 'title': 'Coffee & Tea'},...",4.5,"{'latitude': 47.6253, 'longitude': -122.33683}",[],$$,"{'address1': '610 Terry Ave N', 'address2': 'S...",14252859382.0,(425) 285-9382,21.23488


In [24]:
# Saving the dataframe.
# The output name is built from the search parameters so the file is always
# labelled with the search that produced it.
FINAL_CSV = f"Data/final_results_{LOCATION}_{TERM.lower()}.csv.gz"
final_df.to_csv(FINAL_CSV, compression = 'gzip', index = False)