# Setup/Global Dependencies

In [1]:
# Libraries
import json
import requests
import pandas as pd

# I saved my keys as global environment keys on my PC, meaning I need OS access to use them
import os

In [2]:
# Load bike station CSV dataframe
stations_df = pd.read_csv('../data/bike_data.csv', sep=',')
stations_df.head()

Unnamed: 0,id,latitude,longitude,name,est_bike_slots
0,45dbb0009135e465f49f054517cbe74d,43.259126,-79.877212,hess at king,12
1,cd881edad122a75c57d94b381cdfc6c0,43.269288,-79.871327,bayfront park,31
2,024a3edf037cb411d16acc08a7fcb954,43.267859,-79.867923,bay at strachan,25
3,b933317ff2861c45aacbea4cbf4b541f,43.263198,-79.871803,bay at mulberry,14
4,20dc109608315db09a8332d6e6940c75,43.256132,-79.874499,city hall,16


# Foursquare

Send a request to Foursquare with a small radius (1000m) for all the bike stations in your city of choice. 

In [14]:
# Access Setup
api_key = os.environ["FOURSQUARE_API_KEY"]

# query; location commented out because I'm doing all of my queries based on coordinates,
# and keeping it in could limit some from neighbouring municipalities/counties
# location = "Hamilton,Canada"
url = "https://api.foursquare.com/v3/places/search"#?near=" + location

# Headers, API KEY get
headers = {"Accept": "application/json"}
headers['Authorization'] = api_key

# Request Access:
response = requests.get(url, headers=headers)
response

<Response [200]>

Parse through the response to get the POI (such as restaurants, bars, etc) details you want (ratings, name, location, etc)

In [79]:
## FourSquare Functions:

# Function to run a query for the currently chosen station; limit of 30 set because stations
# are all fairly close to one another, so this reduces both overlap and API request traffic
def run_fsq_query(query, category, lat, long):
    latlong = str(lat) + ',' + str(long)
    params = {
        "query": query,
        "categories": category,
        "ll": latlong,
        "radius": 1000,
        "sort": 'DISTANCE',
        "limit": 30
    }
    response = requests.get(url, params=params, headers=headers)
    result = response.json()
    temp_json = result["results"]
    return temp_json

# A function to join query results together
def joint_query_fsq(query, category):
    # Beginning JSON object 
    query_list = []
    for s, r in stations_df.iterrows():
        lat = r['latitude']
        long = r['longitude']
        temp_list = run_fsq_query(query, category, lat, long)
        query_list = query_list + temp_list
    return query_list

# A function to get total number of POIs near bike stations, for later analysis
def total_fsq_pois(query, category):
    poi_counts = pd.DataFrame({'poi_counts': pd.Series(dtype=int)})
    for s, r in stations_df.iterrows():
        lat = r['latitude']
        long = r['longitude']
        temp_list = run_fsq_query(query, category, lat, long)
        poi_counts.loc[len(poi_counts)] = len(temp_list)
    return poi_counts

#### Fetch Data

In [16]:
# A simple "park" query turned up parking lots, stores with "park" in their names, etc.;
# A category param is to filter further, even if some parking lots still make it through due to
# categorization issues in FourSquare's database
park_query = joint_query_fsq("park", 16032)

# Same, Bars:
bar_query = joint_query_fsq("bar", 13003)

# Same, Restaurants; the query search term is left blank to minimize risk of exclusion:
food_query = joint_query_fsq("", 13065)

# Same, Grocery Stores:
grocer_query = joint_query_fsq("", 17069)

# Same, Trails:
trail_query = joint_query_fsq("trail", 16004)

In [101]:
# For getting poi counts:
park_pois = total_fsq_pois("park", 16032)

# Same, Bars:
bar_pois = total_fsq_pois("bar", 13003)

# Same, Restaurants; the query search term is left blank to minimize risk of exclusion:
food_pois = total_fsq_pois("", 13065)

# Same, Grocery Stores:
grocer_pois = total_fsq_pois("", 17069)

# Same, Trails:
trail_pois = total_fsq_pois("trail", 16004)

In [119]:
# Rename poi sheets, join, and export:
park_pois = park_pois.rename(columns={
    'poi_counts':'park_poi_counts'})
bar_pois = bar_pois.rename(columns={
     'poi_counts':'bar_poi_counts'})
food_pois = food_pois.rename(columns={
     'poi_counts':'restaurant_poi_counts'})
grocer_pois = grocer_pois.rename(columns={
     'poi_counts':'grocer_poi_counts'})
trail_pois = trail_pois.rename(columns={
     'poi_counts':'trail_poi_counts'})

poi_frames = [park_pois, bar_pois, food_pois, grocer_pois, trail_pois]
joint_poi_df = pd.concat(poi_frames, axis=1)
joint_poi_df.head()

# Create .csv and show final DataFrame:
joint_poi_df.to_csv('../data/fsq_poi_counts.csv', index=False)

In [22]:
# Convert Data & Add Consistent Category ID, allowing me to drop categories column later
park_df = pd.json_normalize(park_query)
park_df["fsq_am_cat"] = '0.0'
trail_df = pd.json_normalize(trail_query)
trail_df["fsq_am_cat"] = '0.1'
bar_df = pd.json_normalize(bar_query)
bar_df["fsq_am_cat"] = '1.0'
food_df = pd.json_normalize(food_query)
food_df["fsq_am_cat"] = '1.1'
grocer_df = pd.json_normalize(grocer_query)
grocer_df["fsq_am_cat"] = '2.0'

Put your parsed results into a DataFrame

In [72]:
# I started by putting everything in different dataframes, so I'm sort of doing the steps 
# backwards for the same end result and doing some cleaning after is amalgamation.

df_frames = [food_df, bar_df, park_df, grocer_df, trail_df]
fsq_df = pd.concat(df_frames)
fsq_df.head()

Unnamed: 0,fsq_id,categories,chains,distance,link,name,timezone,geocodes.main.latitude,geocodes.main.longitude,geocodes.roof.latitude,...,location.region,geocodes.drop_off.latitude,geocodes.drop_off.longitude,related_places.parent.fsq_id,related_places.parent.name,related_places.children,geocodes.front_door.latitude,geocodes.front_door.longitude,location.po_box,fsq_am_cat
0,5796abdc498eb9b6b1d59458,"[{'id': 13055, 'name': 'Fried Chicken Joint', ...",[],19,/v3/places/5796abdc498eb9b6b1d59458,Coop Wicked Chicken Hamilton,America/Toronto,43.259271,-79.877373,43.259271,...,ON,,,,,,,,,1.1
1,5436c40b498eaa25254aea14,"[{'id': 13097, 'name': 'Caribbean Restaurant',...",[],78,/v3/places/5436c40b498eaa25254aea14,Island Carribbean Takeout & Catering,America/Toronto,43.258578,-79.876931,43.258578,...,ON,,,,,,,,,1.1
2,4baf6568f964a520b3fc3be3,"[{'id': 13049, 'name': 'Diner', 'icon': {'pref...",[],88,/v3/places/4baf6568f964a520b3fc3be3,Sunrise Restaurant,America/Toronto,43.258848,-79.876327,43.258848,...,ON,,,,,,,,,1.1
3,4ba3cb84f964a5202e6038e3,"[{'id': 13003, 'name': 'Bar', 'icon': {'prefix...",[],90,/v3/places/4ba3cb84f964a5202e6038e3,Gown & Gavel,America/Toronto,43.258338,-79.877864,43.258338,...,ON,,,,,,,,,1.1
4,0d4da555da574fd56a6c5729,"[{'id': 13016, 'name': 'Lounge', 'icon': {'pre...",[],96,/v3/places/0d4da555da574fd56a6c5729,Sizzle Steak House & Lounge,,43.258263,-79.877388,43.258263,...,ON,43.258349,-79.877595,,,,,,,1.1


In [73]:
# Doing some cleaning, and I want to drop the following:
#  chains, link, location.formatted_address, timezone, roof and drop_off coordinates, 
#  related_places data, cross_street, and address_extended; they do not provide much insight
fsq_df.drop(['chains', 
             'link', 
             'location.formatted_address', 
             'timezone', 
             'geocodes.roof.latitude', 
             'geocodes.roof.longitude', 
             'geocodes.drop_off.latitude', 
             'geocodes.drop_off.longitude',
             'geocodes.front_door.latitude',
             'geocodes.front_door.longitude',
             'related_places.parent.fsq_id',
             'related_places.parent.name',
             'related_places.children',
             'location.cross_street',
             'location.address_extended',
             'location.po_box',
             'categories',
             'distance'
            ], axis=1, inplace=True)

# Sidenote: I considered removing some location data, like country/province indicitators, 
# postal codes, etc., but figured they could be useful to keep around.

# I did do some more cleaning though:
fsq_df['name'] = fsq_df['name'].apply(str.lower)

# Create .csv and show finl DataFrame:
fsq_df.to_csv('../data/fsq_data.csv', index=False)
fsq_df.head()

Unnamed: 0,fsq_id,name,geocodes.main.latitude,geocodes.main.longitude,location.address,location.country,location.locality,location.postcode,location.region,fsq_am_cat
0,5796abdc498eb9b6b1d59458,coop wicked chicken hamilton,43.259271,-79.877373,274 King St W,CA,Hamilton,L8P 1J6,ON,1.1
1,5436c40b498eaa25254aea14,island carribbean takeout & catering,43.258578,-79.876931,228 King St W,CA,Hamilton,,ON,1.1
2,4baf6568f964a520b3fc3be3,sunrise restaurant,43.258848,-79.876327,242 King St W,CA,Hamilton,L8P 1A9,ON,1.1
3,4ba3cb84f964a5202e6038e3,gown & gavel,43.258338,-79.877864,24 Hess St S,CA,Hamilton,L8P 3M8,ON,1.1
4,0d4da555da574fd56a6c5729,sizzle steak house & lounge,43.258263,-79.877388,25 Hess St S,CA,Hamilton,L8P 3M7,ON,1.1


# Yelp

Send a request to Yelp with a small radius (1000m) for all the bike stations in your city of choice. 

In [3]:
# Access Setup
api_key = os.environ["YELP_API_KEY"]

# query; location commented out because I'm doing all of my queries based on coordinates,
# and keeping it in could limit some from neighbouring municipalities/counties
url = "https://api.yelp.com/v3/businesses/search?"

# Headers, API KEY get
headers = {'Authorization': 'Bearer {}'.format(api_key)}


def run_yelp_query(category, lat, long):
    params = {'latitude': lat,
              'longitude': long,
              'categories': category,
              'radius': 1000,
              'limit': 50}

    response = requests.get(url=url, headers=headers, params=params)
    result = response.json()
    temp_json = result["businesses"]
    return temp_json

def joint_query_yelp(category):
    # Beginning JSON object 
    query_list = []
    for s, r in stations_df.iterrows():
        lat = r['latitude']
        long = r['longitude']
        temp_list = run_yelp_query(category, lat, long)
        query_list = query_list + temp_list
    return query_list

def add_yelp_pois(category):
    poi_counts = pd.DataFrame({'yelp_counts': pd.Series(dtype=int)})
    for s, r in stations_df.iterrows():
        lat = r['latitude']
        long = r['longitude']
        temp_list = run_yelp_query(category, lat, long)
        poi_counts.loc[len(poi_counts)] = len(temp_list)
    return poi_counts

In [9]:
# Run full query
yelp_query = joint_query_yelp(["bars", "restaurants", "parks", "grocery", "bicyclepaths"])

# Note: I initially had a limit of 50 for this search to keep 

In [4]:
# Get total Yelp pois for each station
yelp_pois = add_yelp_pois(["bars", "restaurants", "parks", "grocery", "bicyclepaths"])

In [5]:
# Create .csv and show final DataFrame:
yelp_pois.to_csv('../data/yelp_poi_totals.csv', index=False)

yelp_pois.head()

Unnamed: 0,yelp_counts
0,50
1,50
2,50
3,50
4,50


Parse through the response to get the POI (such as restaurants, bars, etc) details you want (ratings, name, location, etc)

In [70]:
yelp_df = pd.json_normalize(yelp_query)

# Cleaning again
yelp_df.drop(['alias',
              'image_url',
              'is_closed',
              'url',
              'review_count',
              'location.address2',
              'location.address3',
              'location.display_address',
              'display_phone',
              'transactions',
              'price',
              'phone',
              'distance'
            ], axis=1, inplace=True)

# Names to lower case:
yelp_df['name'] = yelp_df['name'].apply(str.lower)

Put your parsed results into a DataFrame

In [71]:
# Create .csv and show final DataFrame:
yelp_df.to_csv('../data/yelp_data.csv', index=False)
yelp_df.head()

Unnamed: 0,id,name,categories,rating,coordinates.latitude,coordinates.longitude,location.address1,location.city,location.zip_code,location.country,location.state
0,vqyK2q3zJ74TIT1-7Bf3Tg,la luna,"[{'alias': 'mideastern', 'title': 'Middle East...",4.0,43.259422,-79.878488,306 King Street W,Hamilton,L8P 1B1,CA,ON
1,752Fv2jKafftvoS3Twkqyg,hambrgr,"[{'alias': 'burgers', 'title': 'Burgers'}, {'a...",4.5,43.25721,-79.8669,49 King William Street,Hamilton,L8R 1A2,CA,ON
2,Q4oLgsU62VPR28pBm0vCXw,earth to table : bread bar,"[{'alias': 'pizza', 'title': 'Pizza'}, {'alias...",4.0,43.25284,-79.88702,258 Locke Street S,Hamilton,L8P 4B9,CA,ON
3,bHecMQ85o3ayw1t9hRA90g,the french,"[{'alias': 'bistros', 'title': 'Bistros'}, {'a...",4.0,43.25741,-79.86748,37 King William Street,Hamilton,L8R 1A1,CA,ON
4,9oIPWjU3DvtSdORv6I2toQ,the ship,"[{'alias': 'seafood', 'title': 'Seafood'}, {'a...",4.0,43.25215,-79.87,23 Augusta Street,Hamilton,L8N 1P6,CA,ON


In [None]:
# Create .csv and show final DataFrame:
yelp_df.to_csv('../data/yelp_poi_counts.csv', index=False)

### Note: Extra cleaning takes place in the joining_data notebook.

# Comparing Results

Which API provided you with more complete data? Provide an explanation. 

Overall, I would say that Yelp provided me with a more complete dataset; I got ratings, phone numbers, things like address unit numbers or rating counts, and even just more results in general; the same POI types that I used for FourSquare turned up more resutls with Yelp's API. That said, the available categories are more limited than FourSquare, and even if I dropped a lot of the data in both sets due to them being irrelevant or else incomplete I like that FourSquare has fields for things like "front door coordinates" and what local places are related to one another.

Get the top 10 restaurants according to their rating

In [60]:
yelp_10 = yelp_df.sort_values(['rating'],ascending=False).groupby(["rating"]).head(10)
print(yelp_10.iloc[:10])

                          id                          name  \
4735  oGZGV89ZnJR0ex2Wsjn9Bw      cadillac jax bar & grill   
4168  3TTizB54szCx7njZo8palw                    fisticuffs   
5007  6SFt2-N1L7kKhQj0g-Zvcw    tall tree sandwich company   
1491  Wb3Rs6B5y6Z-5Pdi4-j6tQ       chung chun rice hot dog   
3035  kaoPyoYyyaUJhreSDQfycg           the hearty hooligan   
4203  VD1YD4oALbOUWUspR4_3fQ  mancala monk board game cafe   
1486  8By5FIyw81Ld_LQBuCI-HA                        mr sub   
4205  YkFCtf82ISv_mvE6uPUrhg                osten beerhall   
730   VD1YD4oALbOUWUspR4_3fQ  mancala monk board game cafe   
1483  VuzHMIIxvFxJHcu2mmggxA     porcelain hotpot & lounge   

                                             categories  rating  \
4735  [{'alias': 'bars', 'title': 'Bars'}, {'alias':...     5.0   
4168  [{'alias': 'chinese', 'title': 'Chinese'}, {'a...     5.0   
5007  [{'alias': 'sandwiches', 'title': 'Sandwiches'...     5.0   
1491  [{'alias': 'korean', 'title': 'Korean'}, {'