In [None]:
"""
    This Python notebook uses Yelp's API to gather data for cafes by latitude and longitude.
    This is used for scraping the Chicago Area.
    Problems with this method:
        Incredibly inefficient (neighboring requests overlap and return many of the same results)
        Slow.
        Unpredictable.
"""

In [None]:
import json
import os

import numpy
import pandas as pd
from pandas.io.json import json_normalize
from yelpapi import YelpAPI

# Read the Yelp Fusion API key from the environment rather than storing it in
# the notebook, so the credential is never committed or shared with the file.
# Export YELP_API_KEY before starting the kernel.
# NOTE: a key that was ever hard-coded in a shared notebook should be treated
# as exposed and rotated in the Yelp developer console.
api_key = os.environ.get('YELP_API_KEY')
if not api_key:
    raise RuntimeError(
        'Set the YELP_API_KEY environment variable to your Yelp Fusion API key.')
yelp_api = YelpAPI(api_key)


In [None]:
# Approximate bounding box for ZIP code 60637.
# Every corner is stored as [latitude, longitude].
north_lat, south_lat = 41.800792, 41.764310
west_lon, east_lon = -87.628796, -87.574454
upper_left_coordinate = [north_lat, west_lon]
upper_right_coordinate = [north_lat, east_lon]
bottom_left_coordinate = [south_lat, west_lon]
bottom_right_coordinate = [south_lat, east_lon]

# Height and width of the box, in degrees.
VDistance = abs(north_lat - south_lat)
HDistance = abs(west_lon - east_lon)

# Width of the box (upper left to upper right) in meters, converted by hand with
# http://boulter.com/gps/distance/?from=41.800792+-87.628796&to=41.800792+-87.574454&units=k
HDistance_m = 4520
number_of_circles = 9
# Yelp only takes integer values for radius, so round to whole meters.
circle_radius = round(HDistance_m / number_of_circles)
circle_radius


In [None]:
# Sample search: a single request centred on the upper-right corner of the box.
sample_latitude, sample_longitude = upper_right_coordinate
search_result = yelp_api.search_query(
    latitude=sample_latitude,
    longitude=sample_longitude,
    radius=circle_radius,
    limit=50,
)
businesses = search_result['businesses']
df = pd.DataFrame.from_dict(businesses, orient='columns')
df.head()


In [None]:
def MoveMap(upper_left_coordinate, upper_right_coordinate, bottom_left_coordinate,
            number_of_circles, circle_radius,
            category):
    '''
        Sweep a grid of Yelp searches across a bounding box.

        Start at the upper left corner of the grid.
        Get the information within circle_radius of the current point.
        Move to the right (longitude increases) by one step.
        Repeat number_of_circles times, then move back to the longitude of
        the upper left point, move downwards (latitude decreases) by one
        step, and repeat.

        The step is the box width/height divided by number_of_circles, so the
        last column/row of search centres sits one step before the
        right/bottom edge.

        Parameters:
            upper_left_coordinate, upper_right_coordinate, bottom_left_coordinate:
                [latitude, longitude] pairs for three corners of the box.
            number_of_circles: searches per row and per column
                (number_of_circles ** 2 requests in total).
            circle_radius: passed to Yelp as the search `radius`.
            category: passed to Yelp as the search `term`.

        Returns a dataframe with one row per unique business id, indexed
        from 0. Returns an empty dataframe when no search returned a business.
    '''
    start_latitude = upper_left_coordinate[0]
    start_longitude = upper_left_coordinate[1]
    distance_between_circles_h = abs(
        upper_right_coordinate[1] - start_longitude) / number_of_circles
    distance_between_circles_v = abs(
        start_latitude - bottom_left_coordinate[0]) / number_of_circles

    # Collect one frame per request and concatenate once at the end; growing a
    # dataframe inside the loop copies it on every iteration, and
    # DataFrame.append no longer exists in pandas 2.x.
    frames = []
    for v_step in range(number_of_circles):
        # Derive each grid point from its index rather than accumulating
        # additions, so floating-point error does not build up along the sweep.
        latitude = start_latitude - v_step * distance_between_circles_v
        for h_step in range(number_of_circles):
            longitude = start_longitude + h_step * distance_between_circles_h
            search_result = yelp_api.search_query(
                term=category,
                latitude=latitude,
                longitude=longitude,
                radius=circle_radius,
                limit=50)
            page = json_normalize(search_result['businesses'])
            if not page.empty:
                frames.append(page)

    if not frames:
        return pd.DataFrame()

    combined = pd.concat(frames, ignore_index=True, sort=False)
    # Neighbouring searches return many of the same businesses; keep the first
    # occurrence of each id.
    if 'id' in combined.columns:
        combined = combined.drop_duplicates(['id'])
    return combined.reset_index(drop=True)


In [None]:
# Sweep the 60637 box with an empty search term (category='').
# NOTE(review): an empty term presumably matches every business type — confirm
# against Yelp's search documentation.
# Reuse the notebook-level number_of_circles so the grid density always matches
# the value circle_radius was derived from.
data_in_60637 = MoveMap(
    upper_left_coordinate, upper_right_coordinate, bottom_left_coordinate,
    number_of_circles=number_of_circles,
    circle_radius=circle_radius,
    category='')


In [None]:
# Preview the first rows of the businesses gathered by the 60637 grid sweep.
data_in_60637.head()

In [None]:
def get_yelp_data_by_location(location, number_of_calls):
    '''
        Given a location (ex. ZipCode),
        use Yelp API to retrieve data.
        Makes up to number_of_calls requests of 50 results each, advancing
        `offset` by 50 per call so each request fetches the next page of
        results rather than repeating the first one. Stops early when a page
        comes back empty.
        Returns a dataframe with rows numbered from 0 across all pages
        (empty when nothing was returned).
    '''
    page_size = 50
    frames = []
    for call in range(number_of_calls):
        # NOTE(review): Yelp limits how deep offset paging may go; very large
        # number_of_calls values may be rejected by the API — confirm the cap
        # in Yelp's search documentation.
        search_result = yelp_api.search_query(
            location=location,
            limit=page_size,
            offset=call * page_size)
        page = json_normalize(search_result['businesses'])
        if page.empty:
            # No further results for this location; later pages would be empty too.
            break
        frames.append(page)

    if not frames:
        return pd.DataFrame()
    # Concatenate once instead of appending in the loop (DataFrame.append was
    # removed in pandas 2.x and copied the whole frame on every call).
    return pd.concat(frames, ignore_index=True, sort=False)

# Query ZIP code 60637 twice, then preview the rows left after dropping
# repeated business ids.
# Observation: repeating an identical request for one location returns the same results.
location_data = get_yelp_data_by_location(location='60637', number_of_calls=2)
unique_location_data = location_data.drop_duplicates(subset=['id'])
unique_location_data.head()

In [None]:
# Save the 60637 business listing to a CSV file (relative path, so it lands in
# the kernel's working directory).
data_in_60637.to_csv ('Businesses in 60637.csv')

In [None]:
# Repeat the 60637 grid sweep, this time searching for the term 'cafe'.
# Reuse the notebook-level number_of_circles so the grid density always matches
# the value circle_radius was derived from.
cafe_in_60637 = MoveMap(
    upper_left_coordinate, upper_right_coordinate, bottom_left_coordinate,
    number_of_circles=number_of_circles,
    circle_radius=circle_radius,
    category='cafe')

In [None]:
# Preview the first rows of the cafe results for the 60637 box.
cafe_in_60637.head()

In [None]:
# All these numbers are overestimates.

# Rough bounding box around Chicago; each corner is [latitude, longitude].
chi_north_lat, chi_south_lat = 42.031355, 41.633678
chi_west_lon, chi_east_lon = -87.946627, -87.512777
chi_u_l = [chi_north_lat, chi_west_lon]
chi_u_r = [chi_north_lat, chi_east_lon]
chi_b_l = [chi_south_lat, chi_west_lon]
chi_b_r = [chi_south_lat, chi_east_lon]
"""
    About 36 km from u_l to u_r
    About 45 km from u_l to b_l
    Area is about: 1620 km^2 (1.62e+9 m^2)
    Based on Yelp(https://www.yelp.com/search?find_desc=cafe&find_loc=Chicago%2C+IL&ns=1):
    There are about 2500 cafes in the Chicago area
    (possibly up to 7500 in the grid I chose because the area is less than 3 times bigger).
    That means about 1 cafe every 216000 m^2 (result seems off),
    or 50 cafes every 10800000 m^2.
    Then the radius I should pick is 1900 meters.
    45/1.9 = 24, so the number of circles should be 24
"""
# 24 x 24 searches of radius 1900 m, following the estimate in the note above.
cafe_in_chicago = MoveMap(
    chi_u_l, chi_u_r, chi_b_l,
    number_of_circles=24,
    circle_radius=1900,
    category='cafe')

In [None]:
# Save the Chicago cafe results to CSV (relative path), then preview the first rows.
cafe_in_chicago.to_csv ('cafe_in_chicago.csv')
cafe_in_chicago.head()

In [None]:
"""
    Cutting down coordinates, 
    increasing number of circles,
    and decreasing circle size
"""
# Chicago bounding box with a slightly lower top edge; each corner is
# [latitude, longitude].
chi_north_lat, chi_south_lat = 42.030116, 41.633678
chi_west_lon, chi_east_lon = -87.946627, -87.512777
chi_u_l = [chi_north_lat, chi_west_lon]
chi_u_r = [chi_north_lat, chi_east_lon]
chi_b_l = [chi_south_lat, chi_west_lon]
chi_b_r = [chi_south_lat, chi_east_lon]

# Denser sweep: 45 x 45 searches, each with a 1000 m radius.
cafe_in_chicago = MoveMap(
    chi_u_l, chi_u_r, chi_b_l,
    number_of_circles=45,
    circle_radius=1000,
    category='cafe')

In [None]:
# Save the denser-grid results, then preview the first rows.
# NOTE: this writes to the same 'cafe_in_chicago.csv' path as the earlier
# Chicago run, so that file is overwritten.
cafe_in_chicago.to_csv ('cafe_in_chicago.csv')
cafe_in_chicago.head()

In [None]:
# Number of unique cafes found by the 45-circle, 1000 m sweep.
cafe_in_chicago.shape[0] # Observed 1595; the earlier 24-circle, 1900 m run had 1300 cafes.