In [1]:
# imports
import pandas as pd
import numpy as np
import requests
import os

# Foursquare

Send a request to Foursquare with a small radius (1000m) for all the bike stations in your city of choice. 

In [2]:
# Load data on our stations from Part 1
berlin_bikes = pd.read_csv('../data/berlin_bikes.csv')

# Whoops, it exported with an extra column that just holds the index. Let's drop it.
berlin_bikes = berlin_bikes.drop('Unnamed: 0',axis=1)

# In Foursquare, lat & long params need to be passed as '<lat>,<long>'
# Create a column of strings formatted as such so we don't have to do this nasty conversion when passing 'll' in params
berlin_bikes['lat_long'] = berlin_bikes['lat'].astype(str) + ',' + berlin_bikes['long'].astype(str)

# Drop the old redundant columns that are cluttering our simple dataframe.
berlin_bikes = berlin_bikes.drop(['lat','long'],axis=1)
print(berlin_bikes.head(3))


# [ Our station data is now set up appropriately ]

   bikes_available  bikes_in_use  total_bikes             lat_long
0                4             0            4  52.504157,13.335328
1                0             4            4   52.496986,13.29121
2                3             2            5  52.498323,13.296157


In [3]:
# Set out API key for Foursquare
api_key = os.environ["FOURSQUARE_API_KEY"]

# Set Foursquare url
url = 'https://api.foursquare.com/v3/places/search'

# Function that sends a request with each passed location
def general_1km_scan(location):
    # Request .Json response and attatch our key.
    headers = {
        'accept': 'application/json',
        'Authorization': api_key
    }
    # only needed parameter is lat and long
    parameters = {
        'll': location,
        'radius':'500'
    }

    response = requests.get(url,headers=headers,params=parameters)

    return response.json()


# [ This for loop calls the above function for every station in city ]
'''
for station in berlin_bikes['lat_long'].values:
    print(general_1km_scan(station))
'''
# There are 1808 stations in Berlin so I'm going to run a variation of the loop that takes the first 3


for station in berlin_bikes['lat_long'][:3].values:
    # print(general_1km_scan(station))
    # uncomment the above print to get an idea
    pass
# This outputs a ton of data

Parse through the response to get the POI (such as restaurants, bars, etc) details you want (ratings, name, location, etc)

In [4]:
# I'm going to try use a class here to make things more readable.
class harvestData:
    def __init__(self, _response,origin):
        # The only thing we need to pass is the .json response
        self.name = pd.Series(_response['name'],name='name')
        self.distance = pd.Series(_response['distance'],name='distance_meters')
        try:self.category = pd.Series(_response['categories'][0]['name'],name='category')
        except:self.category = pd.Series(np.nan,name='category')
        self.lat = pd.Series(_response['geocodes']['main']['latitude'],name='lat')
        self.long = pd.Series(_response['geocodes']['main']['longitude'],name='long')
        self.origin = pd.Series(origin,name='origin')
        # These three fields are standard 

        # However these two 'POI Rich Data' fields are said to not always be populated. For that reason we'll use try and except.
        try: self.popularity = pd.Series(_response['popularity'],name='popularity')
        except: self.popularity = pd.Series(np.nan,name='popularity')

        try:self.rating = pd.Series(_response['rating'],name='rating')
        except:self.rating = pd.Series(np.nan,name='rating')
        # now we don't need to use these ugly nested dictionaries and lists again!


# This will only be called if Foursquare doesn't return any nearby POIs
class emptyStation:
    def __init__(self, _response,origin):
        # Set everything to nan
        self.name = pd.Series(np.nan, name='name')
        self.distance = pd.Series(np.nan, name='distance_meters')
        self.category = pd.Series(np.nan, name='category')
        self.lat = pd.Series(np.nan, name='lat')
        self.long = pd.Series(np.nan, name='long')
        self.origin = pd.Series(origin,name='origin')
        self.popularity = pd.Series(np.nan,name='popularity')
        self.rating = pd.Series(np.nan,name='rating')

In [5]:
# Trim the fat on this response by using the fields parameter
def general_100m_scan(location):
    headers = {
        'accept': 'application/json',
        'Authorization': api_key
    }
    parameters = {
        'll': location,
        'radius':'100', # lower radius is the only way to attempt to avoid overlap that would harm analysis.
        'fields' : 'name,distance,categories,popularity,rating,geocodes',
        'limit':'50' # max limit is 50
    }

    response = requests.get(url,headers=headers,params=parameters)

    return response.json()
poi = general_100m_scan(berlin_bikes['lat_long'][0])
poi = harvestData(poi['results'][0],berlin_bikes['lat_long'][0])

In [6]:
print('name -',poi.name[0])
print('distance -',poi.distance[0],'meters')
print('category -',poi.category[0])
print('lat -',poi.lat[0])
print('long -',poi.long[0])

# These two will be np.nan if the rich data is not available.
print('popularity -',poi.popularity[0])
print('rating -',poi.rating[0])
print('origin -',poi.origin[0])
# This looks way better than when I didn't use a custom class in city_bikes.

name - Curry Wolf
distance - 29 meters
category - Fast Food Restaurant
lat - 52.503902
long - 13.335662
popularity - 0.9036409859379204
rating - 7.8
origin - 52.504157,13.335328


Put your parsed results into a DataFrame

In [7]:
compiled_df = pd.DataFrame()

def add_rows(poi):
    tmp = pd.concat([
        poi.name,
        poi.distance,
        poi.category,
        poi.popularity,
        poi.rating,
        poi.lat,
        poi.long,
        poi.origin
    ], axis=1)
    main = pd.concat([compiled_df,tmp])
    return main

# Iterate through coordinates of each bike station in berlin.
for station in berlin_bikes.lat_long.values: # (add '[:n]' after values for sample.)
    # Scan area for POIs
    response = general_100m_scan(station)

    # If nothing is detected. Enter a row of mostly NaN
    if len(response['results']) == 0:
        poi = emptyStation(response,station)
        compiled_df = add_rows(poi)
    else:
        # Otherwise, add each result as a row
        for result in response['results']:
            poi = harvestData(result,station)
            compiled_df = add_rows(poi)
    
# I did a test with 3 and it went well and my exception catching seems to work.
# I'm now going to run it on the full df and take a break.

In [8]:
# 10 minutes later and we have a nice 46k row dataframe!
print(compiled_df['category'].value_counts().head(5),'\n')
print(compiled_df['category'].value_counts().tail(5),'\n')
print(compiled_df['origin'].value_counts().tail(10))
compiled_df.head(3)

Business and Professional Services    2987
Clothing Store                        1162
Attorney / Law Office                 1067
Hair Salon                             968
Café                                   869
Name: category, dtype: int64 

Sri Lankan Restaurant    1
Party Supply Store       1
Petroleum Supplier       1
Gelato Shop              1
Rental Service           1
Name: category, dtype: int64 

52.634606,13.492751    1
52.541985,13.4535      1
52.500952,13.393774    1
52.480084,13.483233    1
52.54404,13.59604      1
52.468993,13.478447    1
52.471606,13.466764    1
52.517845,13.371651    1
52.472433,13.419843    1
52.512978,13.574224    1
Name: origin, dtype: int64


Unnamed: 0,name,distance_meters,category,popularity,rating,lat,long,origin
0,Curry Wolf,29.0,Fast Food Restaurant,0.903641,7.8,52.503902,13.335662,"52.504157,13.335328"
0,Pull&Bear Ber-Tauentzienstrasse,29.0,Clothing Store,0.988963,7.8,52.504021,13.335963,"52.504157,13.335328"
0,Levi Strauss Germany GmbH,19.0,Clothing Store,0.936147,6.7,52.504051,13.334821,"52.504157,13.335328"


In [12]:
#compiled_df.to_csv('../data/foursquare_bikes.csv')
# avoid that 10 min execution.

# Yelp

Send a request to Yelp with a small radius (1000m) for all the bike stations in your city of choice. 

In [42]:
url = 'https://api.yelp.com/v3/businesses/search'
yelp_api = os.environ["YELP_KEY"]
headers = { 
    'accept':'application/json',
    'Authorization' : 'Bearer '+ yelp_api
    }
params = {
    'latitude' : '52.504157', # long and lat have to be passed seperately for this API.
    'longitude' : '13.335328',
    'radius' : '100', # same radius
    'limit' : '50' # again, 50 is the max limit
}
response = requests.get(url, headers=headers, params=params)
response = response.json()

Parse through the response to get the POI (such as restaurants, bars, etc) details you want (ratings, name, location, etc)

In [62]:
# Same classes as before, slightly modified to work with Yelp's Json structure.

class harvestDataYelp:
    def __init__(self, _response,origin):
        self.name = pd.Series(_response['name'],name='name')
        self.distance = pd.Series(_response['distance'],name='distance_meters')
        try:self.category = pd.Series(_response['categories'][0]['title'],name='category') # Here we'll grab title instead of alias as title should give us better patterns.
        except:self.category = pd.Series(np.nan,name='category')
        self.lat = pd.Series(_response['coordinates']['latitude'],name='lat')
        self.long = pd.Series(_response['coordinates']['longitude'],name='long')
        self.origin = pd.Series(origin,name='origin')

        try:self.rating = pd.Series(_response['rating'],name='rating')
        except:self.rating = pd.Series(np.nan,name='rating')


# This will only be called if Foursquare doesn't return any nearby POIs
class emptyStationYelp:
    def __init__(self, _response,origin):
        # Set everything to nan
        self.name = pd.Series(np.nan, name='name')
        self.distance = pd.Series(np.nan, name='distance_meters')
        self.category = pd.Series(np.nan, name='category')
        self.lat = pd.Series(np.nan, name='lat')
        self.long = pd.Series(np.nan, name='long')
        self.origin = pd.Series(origin,name='origin')
        self.rating = pd.Series(np.nan,name='rating')

def general_100m_yelp_scan(location):
    url = 'https://api.yelp.com/v3/businesses/search'
    ll = location.split(',') # convert location from '123,321' to ['123','321'] for our params.
    headers = { 
        'accept':'application/json',
        'Authorization' : 'Bearer '+ yelp_api
    }
    params = {
        'latitude' : ll[0],
        'longitude' : ll[1],
        'radius' : '100',
        'limit' : '50' # again, 50 is the max limit
    }
    response = requests.get(url,headers=headers,params=params)

    return response.json()

In [67]:
# ugly test response and clean
response = general_100m_yelp_scan('52.504157,13.335328')
poi = harvestDataYelp(response['businesses'][0],'52.504157,13.335328')
print(poi.name[0], poi.distance[0],poi.category[0],poi.lat[0],poi.long[0],poi.origin[0],poi.rating[0])
# Works as expected

Kaiser Wilhelm Memorial Church 73.32558211459008 Churches 52.5048070449392 13.3351458426792 52.504157,13.335328 4.0


Put your parsed results into a DataFrame

In [None]:
# Lets use the same structure as we did for Foursquare

compiled_df2 = pd.DataFrame()

def add_rows(poi):
    tmp = pd.concat([
        poi.name,
        poi.distance,
        poi.category,
        poi.rating,
        poi.lat,
        poi.long,
        poi.origin
    ], axis=1)
    main = pd.concat([compiled_df2,tmp])
    return main

# Iterate through coordinates of each bike station in berlin.
for station in berlin_bikes.lat_long.values: # (add '[:n]' after values for sample.)
    # Scan area for POIs
    response = general_100m_yelp_scan(station)

    # If nothing is detected. Enter a row of mostly NaN
    if len(response['businesses']) == 0:
        poi = emptyStationYelp(response,station)
        compiled_df2 = add_rows(poi)
    else:
        # Otherwise, add each result as a row
        for business in response['businesses']:
            poi = harvestDataYelp(business,station)
            compiled_df2 = add_rows(poi)

In [106]:
compiled_df2.to_csv('../data/yelp_bikes.csv')
compiled_df2.head()

Unnamed: 0,name,distance_meters,category,rating,lat,long,origin
0,Kaiser Wilhelm Memorial Church,73.325582,Churches,4.0,52.504807,13.335146,"52.504157,13.335328"
0,Restaurant Heising,100.350295,French,4.5,52.50322,13.33488,"52.504157,13.335328"
0,Curry Wolf,36.344645,Curry Sausage,4.0,52.50385,13.335511,"52.504157,13.335328"
0,Upper Burger Grill,107.624815,Steakhouses,4.0,52.50339,13.33436,"52.504157,13.335328"
0,Falafel Salam,118.29567,Falafel,4.0,52.5034,13.3341,"52.504157,13.335328"


# Comparing Results

### Which API provided you with more complete data? Provide an explanation. 

Without a doubt the Foursquare API beat the yelp API in every respect.
* Foursquare = 46,208 rows of data // Yelp = 9,720 rows of data.
* Foursquare has 66 stations with no nearby POIs (3.65%) // Yelp has 408 (22.56%).
* Despite returning nearly 5x the information, Foursquare created its database 30 seconds faster.
* I like the selection of information Foursquare provides over the information Yelp provides. (This does come down to preference / application as both have information the other does not. In general I think Foursquare's is more applicable though).

Two positive things about Yelp.
* The autocomplete/general term searching functionality. I haven't experimented with this but I can think of a couple instances where this would be useful.
* You can make a request using more types of locational info. Foursquare is restriced to lat and long or areas declared by lat and long. Yelp can have many different types of locations passed in. I prefer to use lat and long but again this has its uses.

If given the choice I would use Foursquare over Yelp simply due to its ability to return so much more data.

Get the top 10 restaurants according to their rating

In [141]:
top_restaurants = compiled_df[['name','category','rating']]
# We only need these two columns.
top_restaurants = top_restaurants.drop_duplicates()
# Remove overlap from nearby bike stations.
top_restaurants = top_restaurants.dropna()
# We don't need Restaurants without reviews and keeping NaN will impair our ability to use a bool mask.

top_restaurants[top_restaurants['category'].str.endswith('Restaurant')].sort_values(by='rating', ascending=False).head(10)
# I tried more masks but kept getting the same result


# If we run:
# top_restaurants['category'].value_counts().head(60)
# We see that it's very uniform for restaurants to follow the exact mask that I used above.

Unnamed: 0,name,category,rating
0,Nonne & Zwerg,Mediterranean Restaurant,9.2
0,Enzo Sushi Bar,Sushi Restaurant,9.2
0,Picoteo,Spanish Restaurant,9.1
0,Umami,Vietnamese Restaurant,9.1
0,Knödelwirtschaft SÜD,Restaurant,9.1
0,Rüyam Gemüse Kebab,Doner Restaurant,9.1
0,Banh Mi Stable,Fast Food Restaurant,9.1
0,Dan Thai Food,Thai Restaurant,9.1
0,Facil,Eastern European Restaurant,9.0
0,Khao Taan,Thai Restaurant,9.0
