## This notebook queries the Yelp API based on geographic coordinates, runs a business search on all the returned IDs and outputs to a csv


## Import packages and define functions

The functions `gen_coordinates` and `gen_coords_helper` come from Egemen's file 'generate coordinates'.

In [1]:
import csv
import os

import geopy.distance
import numpy as np
import pandas as pd

 ## CHANGE to your API key
 Where the key string is passed to YelpAPI(): change that to your own!!

In [2]:
from yelpapi import YelpAPI
# Use the YELP_API_KEY environment variable when it is set, so the real key does not have
# to be saved inside the notebook. When it is not set, the placeholder string below is
# used and must be replaced with your own key.
yelp_api = YelpAPI(os.environ.get('YELP_API_KEY', 'your key'))

In [3]:
def gen_coordinates(topleft=(42.018708, -87.822461), botright=(41.644748, -87.524522), radius=200):
    '''
    Lays a regular grid of circle center points over a rectangle, with neighboring
    centers two radii apart in both directions.

    Inputs:
        topleft: (lat, long) tuple for the northwest corner of the rectangle.
        botright: (lat, long) tuple for the southeast corner of the rectangle.
        radius: circle radius in meters; grid points are spaced 2 * radius apart.

    Outputs:
        tuple (coords, filler_tl, filler_br):
            coords: list of (lat, long) tuples, rounded to 6 decimals, that partially
                cover the rectangle.
            filler_tl: topleft moved by one radius toward the inside of the rectangle.
            filler_br: botright moved by one radius toward the inside of the rectangle.
            The two filler corners are meant for a second call that fills the gaps
            between the circles of the first grid.
    '''
    step = 1/1000000
    meters_between = geopy.distance.distance

    # Meters covered by a 0.000001 degree change in latitude, estimated as the average
    # of the measurement at the two far corners of the rectangle.
    tl_shifted = (topleft[0] + step, topleft[1])
    br_shifted = (botright[0] + step, botright[1])
    lat_unit = (meters_between(tl_shifted, topleft).m +
                meters_between(br_shifted, botright).m) / 2

    # Same estimate for a 0.000001 degree change in longitude.
    tl_shifted = (topleft[0], topleft[1] + step)
    br_shifted = (botright[0], botright[1] + step)
    long_unit = (meters_between(tl_shifted, topleft).m +
                 meters_between(br_shifted, botright).m) / 2

    # Degrees of latitude (v) and longitude (h) that correspond to `radius` meters.
    v = radius / lat_unit / 1000000
    h = radius / long_unit / 1000000

    lat_low = min(topleft[0], botright[0])
    lat_high = max(topleft[0], botright[0])
    long_low = min(topleft[1], botright[1])
    long_high = max(topleft[1], botright[1])

    lats = np.arange(lat_low, lat_high, v * 2)
    longs = np.arange(long_low, long_high, h * 2)

    coords = []
    for lat in lats:
        lat6 = round(lat, 6)
        for long in longs:
            coords.append((lat6, round(long, 6)))

    # Corners for the follow-up grid that covers the areas between the circles.
    filler_tl = (round(topleft[0] - v, 6), round(topleft[1] + h, 6))
    filler_br = (round(botright[0] + v, 6), round(botright[1] - h, 6))

    return (coords, filler_tl, filler_br)

In [4]:
def gen_coords_helper(topleft=(42.018708, -87.822461), botright=(41.644748, -87.524522), radius=200):
    '''
    Builds the full set of search centers by running gen_coordinates twice: once for the
    rectangle itself and once for the filler corners it returns, so the second grid
    covers the areas between the circles of the first one.

    Inputs:
        topleft: tuple that contains the coordinates for the northwest corner of the rectangle.
            Default value is the coordinates for the northwest corner of Chicago.
        botright: tuple that contains the coordinates for the southeast corner of the rectangle.
            Default value is the coordinates for the southeast corner of Chicago.
        radius: interval at which the coordinates will be generated (in meters).

    Outputs:
        list of (lat, long) tuples: the first grid followed by the filler grid.
    '''
    grid, inner_tl, inner_br = gen_coordinates(topleft, botright, radius)
    # second pass: only the coordinate list is needed, not its filler corners
    filler_grid = gen_coordinates(inner_tl, inner_br, radius)[0]
    grid.extend(filler_grid)
    return grid

In [5]:
def get_yelp_query(coords, radius, yelpapi, outputfile):
    '''
    Runs a Yelp search around every coordinate and saves the de-duplicated results.

    coords: list of tuples for coordinates (lat, long)
    radius: radius in m to be used by yelp api
    yelpapi: YelpAPI(YOUR-KEY)
    outputfile: name of the csv file the results are written to

    returns: dataframe with one row per unique business 'id' (also written out to CSV)
    '''
    empty_hits = 0
    found = []

    for position, (lat, long) in enumerate(coords, 1):
        # A coordinate is retried when the API raises an error, up to 10 attempts in total.
        for _ in range(10):
            try:
                page = yelpapi.search_query(latitude=lat, longitude=long, radius=radius, limit=50)['businesses']
                found += page
                page_size = len(page)

                # Progress output: empty results are only counted (summarized at the end),
                # everything else is reported per coordinate.
                if page_size == 0:
                    empty_hits += 1
                elif page_size < 50:
                    print(position, 'is done and returned', page_size, 'businesses')
                elif page_size == 50:
                    # A full page means there may be more businesses here, so one more
                    # page is requested starting at offset 50.
                    print(position, 'returned', page_size, 'businesses. Running again with offset')
                    next_page = yelpapi.search_query(latitude=lat, longitude=long, radius=radius, limit=50, offset=50)['businesses']
                    found += next_page

                break

            except YelpAPI.YelpAPIError:
                print('trying again for coordinate {}'.format(position))
        else:
            # reached only when none of the 10 attempts ended with a break
            print("tried all 10 attempts for", lat, long)

    print("number of coordinates that returned 0 businesses:", empty_hits)

    # Overlapping search circles can return the same business more than once.
    df = pd.DataFrame(found).drop_duplicates('id')
    df.to_csv(outputfile)

    return df

In [6]:
def businesses_query(business_ID_list, yelpapi, outputfile):
    '''
    Looks up the full business record for every ID and saves the de-duplicated results.

    business_ID_list: list of business IDs to query for
    yelpapi: YelpAPI(YOUR-KEY); every request is made through this client
    outputfile: name of your outputfile

    outputs: dataframe of business query results, one row per unique 'id'
             (also written out to CSV)
    '''
    count_no_returns = 0
    businesses = []

    for i, b_id in enumerate(business_ID_list, 1):
        # An ID is retried when the API raises an error, up to 10 attempts in total.
        for attempt in range(10):
            try:
                # Use the client handed in by the caller rather than a notebook-level
                # global, so the function also works with any other client instance.
                response = yelpapi.business_query(b_id)
                businesses.append(response)

                # Progress output: empty responses are only counted (summarized at the
                # end), everything else is reported per business ID.
                if len(response) == 0:
                    count_no_returns += 1
                else:
                    print(i, 'is done and returned', len(response), 'variables')

                break

            except YelpAPI.YelpAPIError:
                print('trying again for coordinate {}'.format(i))
        else:
            # reached only when none of the 10 attempts ended with a break
            print("tried all 10 attempts for", b_id)
            count_no_returns += 1

    df_businesses = pd.DataFrame(businesses).drop_duplicates('id')
    df_businesses.to_csv(outputfile)

    print("number of coordinates that returned empty lists", count_no_returns)
    return df_businesses

There are more than 8,500 businesses listed as being in Chicago.

Complete list of parameters and output can be found here:
https://www.yelp.com/developers/documentation/v3/business_search

List and descriptions of output: https://www.yelp.com/developers/documentation/v3/business

## Create a list of all coordinates from which to grab today's search coordinates


In [7]:
# Build the complete list of search centers using the helper's default corners
# (documented there as the corners of Chicago) and a 150 m radius.
coordinates = gen_coords_helper(radius=150)

take a look at some of the coordinates

In [8]:
len(coordinates) # = 22,853

22853

In [9]:
#break down by groups of 1000 to avoid errors (can then run 5 of these groups a day)
#todays_coords1 = coordinates[0:1000]
# small slice of 10 coordinates used for the quick example run further down
example_coords10 = coordinates[1300:1310]

In [30]:
#testresponse = yelp_api.search_query(latitude = 41.968868, longitude = -87.678001, radius = 150, limit = 50, yelp_api)

## Using radius 150 - assign coordinates: (inclusive on lower bound, non inclusive on upperbound)

Sabina:
total coordinates[0:5714]

Egemen:
total coordinates[5714:11428]

Jade: 
total coordinates[11428:17142]

Max:
total coordinates[17142:22853]

In [15]:
#first1000_df = get_yelp_query(todays_coords1, 150, yelp_api, "first1000.csv")

1 is done and returned 0 businesses
2 is done and returned 0 businesses
3 is done and returned 0 businesses
4 is done and returned 0 businesses
5 is done and returned 0 businesses
6 is done and returned 0 businesses
7 is done and returned 0 businesses
8 is done and returned 0 businesses
9 is done and returned 0 businesses
10 is done and returned 0 businesses
11 is done and returned 0 businesses
12 is done and returned 0 businesses
13 is done and returned 0 businesses
14 is done and returned 0 businesses
15 is done and returned 0 businesses
16 is done and returned 0 businesses
17 is done and returned 0 businesses
18 is done and returned 0 businesses
19 is done and returned 0 businesses
20 is done and returned 0 businesses
21 is done and returned 0 businesses
22 is done and returned 0 businesses
23 is done and returned 0 businesses
24 is done and returned 10 businesses
25 is done and returned 6 businesses
26 is done and returned 0 businesses
27 is done and returned 0 businesses
28 is don

220 is done and returned 0 businesses
221 is done and returned 0 businesses
222 is done and returned 0 businesses
223 is done and returned 0 businesses
224 is done and returned 0 businesses
225 is done and returned 0 businesses
226 is done and returned 0 businesses
227 is done and returned 0 businesses
228 is done and returned 0 businesses
229 is done and returned 0 businesses
230 is done and returned 0 businesses
231 is done and returned 0 businesses
232 is done and returned 0 businesses
233 is done and returned 0 businesses
234 is done and returned 0 businesses
235 is done and returned 0 businesses
236 is done and returned 0 businesses
237 is done and returned 0 businesses
238 is done and returned 0 businesses
239 is done and returned 0 businesses
240 is done and returned 0 businesses
241 is done and returned 0 businesses
242 is done and returned 0 businesses
243 is done and returned 2 businesses
244 is done and returned 0 businesses
245 is done and returned 0 businesses
246 is done 

436 is done and returned 0 businesses
437 is done and returned 0 businesses
438 is done and returned 0 businesses
439 is done and returned 0 businesses
440 is done and returned 0 businesses
441 is done and returned 0 businesses
442 is done and returned 0 businesses
443 is done and returned 0 businesses
444 is done and returned 0 businesses
445 is done and returned 0 businesses
446 is done and returned 0 businesses
447 is done and returned 0 businesses
448 is done and returned 0 businesses
449 is done and returned 0 businesses
450 is done and returned 0 businesses
451 is done and returned 0 businesses
452 is done and returned 0 businesses
453 is done and returned 0 businesses
454 is done and returned 10 businesses
455 is done and returned 6 businesses
456 is done and returned 0 businesses
457 is done and returned 0 businesses
458 is done and returned 0 businesses
459 is done and returned 0 businesses
460 is done and returned 0 businesses
461 is done and returned 1 businesses
462 is done

652 is done and returned 0 businesses
653 is done and returned 0 businesses
654 is done and returned 0 businesses
655 is done and returned 0 businesses
656 is done and returned 0 businesses
657 is done and returned 0 businesses
658 is done and returned 0 businesses
659 is done and returned 0 businesses
660 is done and returned 0 businesses
661 is done and returned 0 businesses
662 is done and returned 0 businesses
663 is done and returned 0 businesses
664 is done and returned 0 businesses
665 is done and returned 0 businesses
666 is done and returned 0 businesses
667 is done and returned 0 businesses
668 is done and returned 0 businesses
669 is done and returned 0 businesses
670 is done and returned 0 businesses
671 is done and returned 0 businesses
672 is done and returned 5 businesses
673 is done and returned 0 businesses
674 is done and returned 0 businesses
675 is done and returned 0 businesses
676 is done and returned 0 businesses
677 is done and returned 0 businesses
678 is done 

868 is done and returned 1 businesses
869 is done and returned 0 businesses
870 is done and returned 2 businesses
871 is done and returned 0 businesses
872 is done and returned 0 businesses
873 is done and returned 0 businesses
874 is done and returned 0 businesses
875 is done and returned 0 businesses
876 is done and returned 0 businesses
877 is done and returned 0 businesses
878 is done and returned 0 businesses
879 is done and returned 0 businesses
880 is done and returned 0 businesses
881 is done and returned 0 businesses
882 is done and returned 0 businesses
883 is done and returned 0 businesses
884 is done and returned 0 businesses
885 is done and returned 0 businesses
886 is done and returned 0 businesses
887 is done and returned 0 businesses
888 is done and returned 0 businesses
889 is done and returned 0 businesses
890 is done and returned 0 businesses
891 is done and returned 0 businesses
892 is done and returned 0 businesses
893 is done and returned 0 businesses
894 is done 

Ran the following as a test on coordinates from somewhere in the middle of the data set (likely close to the city center) to check the density: 2 of the 10 coordinates returned 0 businesses, and the others returned between 1 and 15 (1: 1, 2: 1, 5: 1, 6: 2, 7: 1, 8: 1, 15: 1). So even here it is usually not that dense, which may help explain why a large share of the coordinates in more residential areas do not return any businesses.

In [12]:
# Search a 150 m radius around each example coordinate; the results come back as a
# dataframe and are also written to example.csv.
example_out = get_yelp_query(example_coords10, 150, yelp_api, "example.csv")

2 is done and returned 2 businesses
5 is done and returned 2 businesses
number of coordinates that returned 0 businesses: 8


This next chunk simply makes it so you don't always have to change the variable going forward (just un-comment whichever df you are using)

In [13]:
#current_df = first1000_df
#current_df = second1000_df
#current_df = third1000_df
#current_df = fourth1000_df
#current_df = fifth1000_df
current_df = example_out

Can look at the number of businesses returned here: (length may be a better way to 'look')

In [14]:
len(current_df) 
#below are my results in case that is helpful to compare (note: since I have coordinates 0:5714 it's likely that the earlier in that group the coordinate is, the lower the population of that area likely is (and thus the lower the density of shops likely is))
#242 businesses in first query (coords[0:1000])
#389 businesses in second query (coords[1000:2000])
#527 businesses in third query (coords[2000:3000])
#449 businesses in fourth query (coords[3000:4000])
#711 businesses in fifth query (coords[4000:5000])

4

In [15]:
current_df.head()

Unnamed: 0,id,alias,name,image_url,is_closed,url,review_count,categories,rating,coordinates,transactions,price,location,phone,display_phone,distance
0,S8OjwHGTs_ThbDt76Bkk1Q,mannys-tacos-and-burritos-chicago,Manny's Tacos & Burritos,https://s3-media1.fl.yelpcdn.com/bphoto/ihY4u-...,False,https://www.yelp.com/biz/mannys-tacos-and-burr...,24,"[{'alias': 'pizza', 'title': 'Pizza'}, {'alias...",4.0,"{'latitude': 41.684074, 'longitude': -87.620399}",[delivery],$,"{'address1': '11543 S Michigan', 'address2': '...",17737855663,(773) 785-5663,132.543006
1,USHNhhoWaKR8-23Jyonl2A,roseland-pizza-and-tacos-chicago,Roseland Pizza & Tacos,https://s3-media2.fl.yelpcdn.com/bphoto/NHO63j...,False,https://www.yelp.com/biz/roseland-pizza-and-ta...,11,"[{'alias': 'pizza', 'title': 'Pizza'}, {'alias...",2.5,"{'latitude': 41.685194, 'longitude': -87.6194099}",[delivery],$,"{'address1': '135 E 115th St', 'address2': '',...",17732642000,(773) 264-2000,67.542903
2,VjXf0VUM6N0U5lfo13fiEg,cal-harbor-restaurant-and-lounge-chicago,Cal-Harbor Restaurant & Lounge,https://s3-media1.fl.yelpcdn.com/bphoto/8p1WZ6...,False,https://www.yelp.com/biz/cal-harbor-restaurant...,47,"[{'alias': 'coffee', 'title': 'Coffee & Tea'},...",4.0,"{'latitude': 41.6857, 'longitude': -87.60967}",[delivery],$,"{'address1': '546 E 115th St', 'address2': '',...",17732645435,(773) 264-5435,49.764838
3,3AUUTqy0XDWwLH48x6ntDw,mcdonalds-chicago-188,McDonald's,https://s3-media3.fl.yelpcdn.com/bphoto/bRqBMh...,False,https://www.yelp.com/biz/mcdonalds-chicago-188...,13,"[{'alias': 'hotdogs', 'title': 'Fast Food'}, {...",1.5,"{'latitude': 41.6856867627975, 'longitude': -8...",[delivery],$,"{'address1': '600 E 115th St', 'address2': '',...",17739958586,(773) 995-8586,150.004856


Just out of curiosity, check whether any of the shops are permanently closed. For each of the 5 rounds of a thousand queries, 0 are closed --> this is how we discovered that Yelp does not return closed businesses when only querying coordinates! This prompted our workaround.

In [17]:

# Count the rows of current_df whose 'is_closed' flag is True.
countClosed = int((current_df['is_closed'] == True).sum())

countClosed


0

For further research: let's see what keys we're working with from the results:
['id',
 'alias',
 'name',
 'image_url',
 'is_closed',
 'url',
 'review_count',
 'categories',
 'rating',
 'coordinates',
 'transactions',
 'price',
 'location',
 'phone',
 'display_phone',
 'distance']

In [19]:
list_keys = current_df.columns.tolist()
list_keys

['id',
 'alias',
 'name',
 'image_url',
 'is_closed',
 'url',
 'review_count',
 'categories',
 'rating',
 'coordinates',
 'transactions',
 'price',
 'location',
 'phone',
 'display_phone',
 'distance']

### They provide another endpoint to make matching records from another database (e.g. City) easier too!
https://www.yelp.com/developers/documentation/v3/business_match

## Now conduct a Business Search
Now we can use the IDs from those initial search query results to run the business search (another Yelp API endpoint) and get a few more variables:

In [21]:
# Collect every business id from the search results, in row order.
# The ids are read straight from the column, so this works whatever the index labels are
# (drop_duplicates leaves gaps in the index, which breaks lookups by row number via .loc).
lst_ids = current_df['id'].tolist()

lst_ids

['S8OjwHGTs_ThbDt76Bkk1Q',
 'USHNhhoWaKR8-23Jyonl2A',
 'VjXf0VUM6N0U5lfo13fiEg',
 '3AUUTqy0XDWwLH48x6ntDw']

In [22]:
# Look up the full business record for every id collected above; the resulting
# dataframe is also written to minitest.csv.
b_query_df = businesses_query(lst_ids, yelp_api, "minitest.csv")

1 is done and returned 18 variables
2 is done and returned 18 variables
3 is done and returned 18 variables
4 is done and returned 18 variables
number of coordinates that returned empty lists 0


In [23]:
b_query_df  #as an example to look at the output

Unnamed: 0,id,alias,name,image_url,is_claimed,is_closed,url,phone,display_phone,review_count,categories,rating,location,coordinates,photos,price,hours,transactions
0,S8OjwHGTs_ThbDt76Bkk1Q,mannys-tacos-and-burritos-chicago,Manny's Tacos & Burritos,https://s3-media1.fl.yelpcdn.com/bphoto/ihY4u-...,False,False,https://www.yelp.com/biz/mannys-tacos-and-burr...,17737855663,(773) 785-5663,24,"[{'alias': 'pizza', 'title': 'Pizza'}, {'alias...",4.0,"{'address1': '11543 S Michigan', 'address2': '...","{'latitude': 41.684074, 'longitude': -87.620399}",[https://s3-media1.fl.yelpcdn.com/bphoto/ihY4u...,$,"[{'open': [{'is_overnight': False, 'start': '0...",[delivery]
1,USHNhhoWaKR8-23Jyonl2A,roseland-pizza-and-tacos-chicago,Roseland Pizza & Tacos,https://s3-media2.fl.yelpcdn.com/bphoto/NHO63j...,False,False,https://www.yelp.com/biz/roseland-pizza-and-ta...,17732642000,(773) 264-2000,11,"[{'alias': 'pizza', 'title': 'Pizza'}, {'alias...",2.5,"{'address1': '135 E 115th St', 'address2': '',...","{'latitude': 41.685194, 'longitude': -87.6194099}",[https://s3-media2.fl.yelpcdn.com/bphoto/NHO63...,$,"[{'open': [{'is_overnight': False, 'start': '1...",[delivery]
2,VjXf0VUM6N0U5lfo13fiEg,cal-harbor-restaurant-and-lounge-chicago,Cal-Harbor Restaurant & Lounge,https://s3-media1.fl.yelpcdn.com/bphoto/8p1WZ6...,False,False,https://www.yelp.com/biz/cal-harbor-restaurant...,17732645435,(773) 264-5435,47,"[{'alias': 'coffee', 'title': 'Coffee & Tea'},...",4.0,"{'address1': '546 E 115th St', 'address2': '',...","{'latitude': 41.6857, 'longitude': -87.60967}",[https://s3-media1.fl.yelpcdn.com/bphoto/8p1WZ...,$,"[{'open': [{'is_overnight': False, 'start': '0...",[delivery]
3,3AUUTqy0XDWwLH48x6ntDw,mcdonalds-chicago-188,McDonald's,https://s3-media3.fl.yelpcdn.com/bphoto/bRqBMh...,True,False,https://www.yelp.com/biz/mcdonalds-chicago-188...,17739958586,(773) 995-8586,13,"[{'alias': 'hotdogs', 'title': 'Fast Food'}, {...",1.5,"{'address1': '600 E 115th St', 'address2': '',...","{'latitude': 41.6856867627975, 'longitude': -8...",[https://s3-media3.fl.yelpcdn.com/bphoto/bRqBM...,$,"[{'open': [{'is_overnight': True, 'start': '00...",[delivery]


Notice some new columns! Additional information that might be useful later