# Yelp Foursquare EDA

In [28]:
import requests
import json
import pandas as pd
import os

#### Importing CityBike dataframe

In [29]:
df_bike = pd.read_csv(r'../data/bike_df.csv')

In [30]:
df_bike.head(1)

Unnamed: 0,station_id,name,lat,lon,num_of_bikes
0,7000,Fort York Blvd / Capreol Ct,43.639832,-79.395954,6


In [31]:
df_bike.shape

(691, 5)

#### Cleaning CityBikes

In [32]:
#fixing column headers

df_bike.columns = ['Station Id', 'Name', 'Latitude', 'Longitude', 'Number of Bikes']

In [33]:
df_bike.head(1)

Unnamed: 0,Station Id,Name,Latitude,Longitude,Number of Bikes
0,7000,Fort York Blvd / Capreol Ct,43.639832,-79.395954,6


# Foursquare

#### Send a request to Foursquare with a small radius (1000m) for all the bike stations in your city of choice. 

In [34]:
FOURSQUARE_KEY = os.environ.get('FS_AK')

In [35]:
#function for retrieving data from Foursquare API

def get_venues_fs(latitude, longitude, radius, api_key, categories, fields, limit):
    
    base_url = 'https://api.foursquare.com/v3/places/search'

    params = {
        'll': f"{latitude},{longitude}",
        'radius': radius,
        'categoryId': categories,
        'fields': fields,
        'limit': limit
    }

    headers = {'accept': 'application/json',
               'Authorization': api_key              
              }

    response = requests.get(base_url, params = params, headers=headers)

    return response.json()

I ended up making multiple requests from the Foursquare API as I was playing with the settings in order to get the data that I wanted. 

#### FS ALL CATEGORIES - Did not use this data for the project.

In [None]:
"""
#Retrieving Foursquare data through API with all the chosen category codes.

venues_all_cats_2 = []

# Iterate through each row in the CityBikes DataFrame for latitude and longitude
for index, row in df_bike.iterrows():
    latitude = row['lat']
    longitude = row['lon']
    
# Make a request to Foursquare API using the latitude and longitude values from CityBikes
    venues = get_venues_fs(latitude, longitude, radius=1000, api_key=FOURSQUARE_KEY, categories='13003,10003,10069,10018,10020,10024,10032,12080,13042,16032', fields='fsq_id,name,distance,categories,stats,rating,popularity,price,tastes', limit=50)
    venues_all_cats_2.append(venues)
"""

In [36]:
#json load - with all desired categories

with open('../data/fs_results_all_categories.json', 'r') as json_file:
    venues_all_cats_2 = json.load(json_file)

In [37]:
venues_all_cats_2[0]['results'][:1]

[{'fsq_id': '59d19c050c9f3155d662a0c8',
  'categories': [{'id': 13034,
    'name': 'Café',
    'icon': {'prefix': 'https://ss3.4sqi.net/img/categories_v2/food/cafe_',
     'suffix': '.png'}}],
  'distance': 40,
  'name': 'CAFE Dispensary',
  'popularity': 0.827288364889599,
  'price': 1,
  'rating': 7.8,
  'stats': {'total_photos': 2, 'total_ratings': 8, 'total_tips': 0}}]

#### FS SOME CATEGORIES, NO BARS - Did not use this data for the project.

In [None]:
"""
#retrieving Foursquare data from API, omitting bar/restaurant codes

fs_venues_no_bars2 = []

# Iterate through each row in the 'data' DataFrame
for index, row in df_bike.iterrows():
    latitude = row['lat']
    longitude = row['lon']
    
# Make a request to Foursquare API using the latitude and longitude values from CityBikes
    venues = get_venues_fs(latitude, longitude, radius=1000, api_key=FOURSQUARE_KEY, categories='10003,10069,10024,12080,16032', fields='fsq_id,name,distance,categories,stats,rating,popularity,price,tastes', limit=50)
    fs_venues_no_bars2.append(venues)
"""

In [38]:
#json load - with all above categories, excluding bars and restaurants

with open('../data/fs_results_no_bars2.json', 'r') as json_file:
    fs_venues_no_bars = json.load(json_file)

#### FS NO CATEGORIES - This is the data that will be used for the project

In [None]:
"""
#Retrieving Foursquare data using no category codes.

fs_venues_no_cats = []

# Iterate through each row in the 'data' DataFrame
for index, row in df_bike.iterrows():
    latitude = row['lat']
    longitude = row['lon']
    
# Make a request to Foursquare API using the latitude and longitude values from City Bikes
    venues = get_venues_fs(latitude, longitude, radius=1000, api_key=FOURSQUARE_KEY, categories=None, fields='fsq_id,name,distance,categories,stats,rating,popularity,price,tastes', limit=50)
    fs_venues_no_cats.append(venues)
"""

In [42]:
#json load

with open('../data/fs_results_no_cats.json', 'r') as json_file:
    fs_venues_no_cats = json.load(json_file)

### Parse through the response to get the POI (such as restaurants, bars, etc) details you want (ratings, name, location, etc)

In [43]:
#The data that will be included in the dataframe
stn_ind = 0
fs_id = fs_venues_no_cats[0]['results'][0]['fsq_id']
print(fs_id)
ven_name = fs_venues_no_cats[0]['results'][0]['name']
print(ven_name)
ven_pop = fs_venues_no_cats[0]['results'][0]['popularity']
print(ven_pop)
ven_rating = fs_venues_no_cats[0]['results'][0]['rating']
print(ven_rating)
ven_price = fs_venues_no_cats[0]['results'][0]['price']
print(ven_price)
ven_dist_from_stn = fs_venues_no_cats[0]['results'][0]['distance']
print(ven_dist_from_stn)
cat_name = fs_venues_no_cats[0]['results'][0]['categories'][0]['name']
print(cat_name)
cat_id = fs_venues_no_cats[0]['results'][0]['categories'][0]['id']
print(cat_id)
cat_id = fs_venues_no_cats[0]['results'][0]['categories'][0]['id']
print(cat_id)
t_photos = fs_venues_no_cats[0]['results'][0]['stats']['total_photos']
print(t_photos)
t_ratings = fs_venues_no_cats[0]['results'][0]['stats']['total_ratings']
print(t_ratings)
t_tips = fs_venues_no_cats[0]['results'][0]['stats']['total_tips']
print(t_tips)
long = fs_venues_no_cats[0]['context']['geo_bounds']['circle']['center']['longitude']
print(long)

59d19c050c9f3155d662a0c8
CAFE Dispensary
0.827288364889599
7.8
1
40
Café
13034
13034
2
8
0
-79.395954



### Put your parsed results into a DataFrame


Data frames were made from the alternate FS API requests that will not be used, in order to compare them to see which set was preferred.
I did not add the additional coding to make those dataframes, but will include the csv for interest sake.

In [53]:
#csv load - FS results with all desired category codes. Did not use.
fs_data_all_categories = pd.read_csv(r'../data/fs_data_all_categories.csv')

In [57]:
#csv load - FS results with all category codes above, no bars or restaurants. Did not use.
fs_no_bar_2 = pd.read_csv(r'../data/fs_no_bar_2.csv')

All data that was derived from a particular bike station was given a station index number temporarily to facilitate in joining the data frames to the CityBike dataframe later.

In [75]:
#Created a foursquare data frame from the data with no category codes. USING.

data = []
for stn_ind, venue_data in enumerate(fs_venues_no_cats):
    results = venue_data['results']
    latitude = fs_venues_no_cats[0]['context']['geo_bounds']['circle']['center']['latitude']
    longitude = fs_venues_no_cats[0]['context']['geo_bounds']['circle']['center']['longitude']
    radius = venue_data['context']['geo_bounds']['circle']['radius']

    for result in results:
        fs_id = result.get('fsq_id', None)
        ven_name = result.get('name', None)
        ven_pop = result.get('popularity', None)
        ven_rating = result.get('rating', None)
        ven_price = result.get('price', None)
        ven_dist_from_stn = result.get('distance', None)

        cat_name = None
        cat_id = None
        if result.get('categories'):
            cat_name = result['categories'][0].get('name', None)
            cat_id = result['categories'][0].get('id', None)

        stats = result.get('stats', {})
        t_ratings = stats.get('total_ratings', None)

        data.append({
            'stn_ind': stn_ind,
            'fs_id': fs_id,
            'ven_name': ven_name,
            'ven_pop': ven_pop,
            'ven_rating': ven_rating,
            'ven_price': ven_price,
            'ven_dist_from_stn': ven_dist_from_stn,
            'latitude': latitude,
            'longitude': longitude,
            'radius': radius, 
            'cat_name': cat_name,
            'cat_id': cat_id,
            't_ratings': t_ratings
             
        })

fs_data = pd.DataFrame(data)

In [81]:
#csv load - FS results with no category codes. Used.
fs_data = pd.read_csv(r'../data/fs_data.csv')

In [80]:
fs_data.head(1)

Unnamed: 0,stn_ind,fs_id,ven_name,ven_pop,ven_rating,ven_price,ven_dist_from_stn,latitude,longitude,radius,cat_name,cat_id,t_ratings
0,0,59d19c050c9f3155d662a0c8,CAFE Dispensary,0.827288,7.8,1.0,40,43.639832,-79.395954,1000,Café,13034.0,8.0


#### Cleaning Foursquare

In [83]:
#fixing column names

fs_data.columns = ['Station Index','id', 'Name', 'Popularity', 'Rating', 'Price 1-4', 'Distance (m)','Latitude', 'Longitude', 'Radius', 'Category', 'Category Id', 'Total Ratings']

Id, category id and radius are unnecessary. Popularity would probably have been a good one to investigate however I was not able to find a comparible category from yelp. May remove Latitude and Longitude later if they are not needed.

In [84]:
#Removing columns. 

columns_to_remove = ['id', 'Category Id','Radius', 'Popularity']
fs_data.drop(columns=columns_to_remove, inplace=True)

In [85]:
#removing venues outside of the 1000m radius

fs_data = fs_data[fs_data['Distance (m)'] <= 1000]

In [86]:
print(fs_data['Distance (m)'].max())
print(fs_data['Distance (m)'].min())

1000
2


In [88]:
fs_data.head(1)

Unnamed: 0,Station Index,Name,Rating,Price 1-4,Distance (m),Latitude,Longitude,Category,Total Ratings
0,0,CAFE Dispensary,7.8,1.0,40,43.639832,-79.395954,Café,8.0


# Yelp

#### Send a request to Yelp with a small radius (1000m) for all the bike stations in your city of choice. 

In [90]:
YELP_KEY = os.environ.get('YELP_AK')

In [91]:
#function to retrieve data from YELP API

def get_venues_y(latitude, longitude, radius, api_key, categories, limit):
    
    base_url = 'https://api.yelp.com/v3/businesses/search?sort_by=best_match'

    params = {
        'latitude': latitude,
        'longitude': longitude,
        'radius': radius,
        'categories': categories,
        'limit': limit
    }

    headers = {'accept': 'application/json',
               'Authorization': f'Bearer {api_key}'              
              }

    response = requests.get(base_url, params = params, headers=headers)

    return response.json()

In [None]:
"""
#Calling function to retrieve data from YELP. 

#Start and Stop indexes added as I was not able to retrieve all data in one day, due to API limits.
y_venues8 = []
start_index=600
stop_index=700

# Iterate through each row in the CityBikes DataFrame
for index, row in df_bike.iterrows():
    if index < start_index:
        continue 
    if index >= stop_index:
        break
    latitude = row['lat']
    longitude = row['lon']
    
# Make a request to YELP API using the latitude and longitude values from df_bike
    venues = get_venues_y(latitude, longitude, radius=1000, api_key=YELP_KEY, categories='arcades,cinema,landmarks,restaurants,libraries,galleries,gardens,spas', limit=50)
    y_venues8.append(venues)
"""

I ended up making many data requests and merging them all into 1 data frame. With the API limitations I was unable to get it all in one go and it felt safer to do it in chunks. So I apologize for all the unnecessary steps.

In [92]:
#retrieving saved json files from YELP data
with open('../data/yelp json/y_venues_1.json', 'r') as json_file:
    y_ven_1 = json.load(json_file)

In [93]:
with open('../data/yelp json/y_venues_2.json', 'r') as json_file:
    y_ven_2 = json.load(json_file)

In [94]:
with open('../data/yelp json/y_venues_3.json', 'r') as json_file:
    y_ven_3 = json.load(json_file)

In [95]:
with open('../data/yelp json/y_venues_4.json', 'r') as json_file:
    y_ven_4 = json.load(json_file)

In [96]:
with open('../data/yelp json/y_venues_5.json', 'r') as json_file:
    y_ven_5 = json.load(json_file)

In [97]:
with open('../data/yelp json/y_venues_6.json', 'r') as json_file:
    y_ven_6 = json.load(json_file)

In [98]:
with open('../data/yelp json/y_venues_7.json', 'r') as json_file:
    y_ven_7 = json.load(json_file)

In [115]:
with open('../data/yelp json/y_venues_8.json', 'r') as json_file:
    y_ven_8 = json.load(json_file)

#### Parse through the response to get the POI (such as restaurants, bars, etc) details you want (ratings, name, location, etc)

In [99]:
y_ven_1[0]['businesses'][0].keys()

dict_keys(['id', 'alias', 'name', 'image_url', 'is_closed', 'url', 'review_count', 'categories', 'rating', 'coordinates', 'transactions', 'price', 'location', 'phone', 'display_phone', 'distance'])

In [100]:
#locating values for dataframe

y_id = y_ven_1[0]['businesses'][0]['id']
print(y_id)
y_name = y_ven_1[0]['businesses'][0]['name']
print(y_name)
y_review_ct = y_ven_1[0]['businesses'][0]['review_count']
print(y_review_ct)
y_rating = y_ven_1[0]['businesses'][0]['rating']
print(y_rating)
y_price = y_ven_1[0]['businesses'][0]['price']
print(y_price)
y_distance = y_ven_1[0]['businesses'][0]['distance']
print(y_distance)

r_BrIgzYcwo1NAuG9dLbpg
Pai Northern Thai Kitchen
3474
4.5
$$
1067.7901982636108


### Put your parsed results into a DataFrame

All data that was derived from a particular bike station was given a station index number temporarily to facilitate in joining the data frames to the CityBike dataframe later.

In [107]:
#creating df out of yelp json files.

yelp_1 = []

for stn_ind, venue_data in enumerate(y_ven_1):
    results = venue_data['businesses']
    latitude = venue_data['region']['center']['latitude']
    longitude = venue_data['region']['center']['longitude']

    for result in results:
        y_id = result.get('id', None)
        y_name = result.get('name', None)
        y_review_ct = result.get('review_count', None)
        y_rating = result.get('rating', None)
        y_price = result.get('price', None)
        y_distance = result.get('distance', None)

        y_cat = None
        if result.get('categories'):
            y_cat = result['categories'][0].get('title', None)

        yelp_1.append({
            'stn_ind': stn_ind,
            'y_id': y_id,
            'y_name': y_name,
            'y_distance': y_distance,
            'latitude': latitude,
            'longitude': longitude,
            'y_review_ct': y_review_ct,
            'y_rating': y_rating,
            'y_price': y_price,
            'y_cat': y_cat,
        })

yelp_1_df = pd.DataFrame(yelp_1)

Each of my 8 yelp json files were made into dataframes and merged together. In an attempt to keep this notebook a little less cluttered I have not included all the redundent steps. Please load csv for the joined dataframe (pre-cleaning)

In [None]:
"""
#concatinating dataframes to make yelp dataframe

yelp_data= pd.concat([yelp_1_df,yelp_2_df,yelp_3_df,yelp_4_df,yelp_5_df,yelp_6_df,yelp_7_df,yelp_8_df], axis=0)
"""

In [120]:
#csv load - Yelp dataframe after initial join
yelp_data = pd.read_csv(r'../data/yelp_data.csv')

In [123]:
yelp_data.head(1)

Unnamed: 0,stn_ind,y_id,y_name,y_distance,latitude,longitude,y_review_ct,y_rating,y_price,y_cat
0,0,r_BrIgzYcwo1NAuG9dLbpg,Pai Northern Thai Kitchen,1067.790198,43.639832,-79.395954,3474,4.5,$$,Thai


In [124]:
print(yelp_data.shape)

(31439, 10)


#### Cleaning YELP

In [125]:
#adding a numerical indicator of price, to replace symbol indicator

yelp_data['price_count'] = yelp_data['y_price'].str.len()

In [126]:
#renaming columns

yelp_data.columns=['Station Index','id', 'Name','Distance (m)','Latitude', 'Longitude', 'Number of Reviews', 'Rating', 'Price', 'Category', 'Price 1-4']

In [127]:
#remove columns

columns_to_remove = ['Price', 'id']
yelp_data.drop(columns=columns_to_remove, inplace=True)

In [128]:
yelp_data.describe()

Unnamed: 0,Station Index,Distance (m),Latitude,Longitude,Number of Reviews,Rating,Price 1-4
count,31439.0,31439.0,31439.0,31439.0,31439.0,31439.0,23189.0
mean,329.276917,723.2014,43.665065,-79.394112,166.63154,3.892602,2.109793
std,197.460555,2569.371,0.025668,0.051093,282.953438,0.635852,0.690458
min,0.0,1.023678e-09,43.588077,-79.544491,1.0,1.0,1.0
25%,157.5,338.0375,43.648725,-79.423098,23.0,3.5,2.0
50%,323.0,629.8914,43.66056,-79.393388,76.0,4.0,2.0
75%,495.0,905.7067,43.675278,-79.3707,200.0,4.5,2.0
max,688.0,122331.6,43.784242,-79.13012,3475.0,5.0,4.0


In [130]:
#removing venues outside of the 1000m radius

yelp_data = yelp_data[yelp_data['Distance (m)'] <= 1000]

In [131]:
yelp_data['Distance (m)'].max()

999.9841762668566

In [132]:
#rounding distance

rounded_yelp_data= yelp_data.round({'Distance (m)': 1})
yelp_data = rounded_yelp_data

In [133]:
yelp_data.head(1)

Unnamed: 0,Station Index,Name,Distance (m),Latitude,Longitude,Number of Reviews,Rating,Category,Price 1-4
1,0,Gusto 101,673.3,43.639832,-79.395954,1151,4.0,Italian,2.0


# Comparing Results

Exploring data

In [134]:
fs_data['Name'].value_counts()

Tim Hortons                       323
Starbucks                         271
Subway                            168
Shoppers Drug Mart                143
Toronto Public Library            125
                                 ... 
Smirnoff Canada Crate               1
Batl Axe Throwing | Port Lands      1
Cherry Beach Sound Ltd              1
Keating Channel Pub & Grill         1
Cathedral Bluffs Park               1
Name: Name, Length: 3223, dtype: int64

In [135]:
yelp_data['Name'].value_counts()

KINTON RAMEN                       120
Banh Mi Boys                        96
Kibo Sushi House                    95
The Keg Steakhouse + Bar            95
Pizza Pizza                         91
                                  ... 
The Queen’s End Cafe & Emporium      1
Little Vietnam                       1
ImPerfect Fresh Eats                 1
Season Six                           1
Neon Tiger                           1
Name: Name, Length: 2943, dtype: int64

In [136]:
#number of unique categories

fs_unique_categories = fs_data['Category'].unique()
print('FS', len(fs_unique_categories))
y_unique_categories = yelp_data['Category'].unique()
print('yelp', len(y_unique_categories))

FS 394
yelp 178


In [137]:
#number of unique venue names

fs_unique_venues = fs_data['Name'].unique()
print('FS', len(fs_unique_venues))
y_unique_venues = yelp_data['Name'].unique()
print('yelp', len(y_unique_venues))

FS 3223
yelp 2943


#### **Which API provided you with more complete data? Provide an explanation.**

Both of the APIs were very similar in set up and difficulty. They both had a very large amount of settings that allowed for meticulous filtering of the data. However, the efficacy of these filters might not have been uniform. After trying 3 variations of calls to the Foursquare API, it was clear that I recieved the same breakdown of information each time, making the category selection not very useful.

While Yelp's category selection and initial data presentation appeared insightful, closer examination revealed its unsuitability for my dataset. Yelp's category titles contained detailed descriptions like "Italian" or unique labels such as "Comfort Food." However, these specifics complicated the straightforward searches within my DataFrame that I was hoping for. For instance, attempting to apply a generic search term like 'Restaurants' proved more challenging.

The number of unique venues was higher in the Foursquare dataset compared to Yelp. It was surprising that after setting a 1000m search radius, both platforms provided locations well beyond that range. Foursquare's 10-point rating scale offered greater detail and was preferable to Yelp's 5-point scale. This can be important if detailed rating distinctions are essential.

While both platforms provided a similar range of accessible information, it's worth noting that some of Yelp's data was exclusively accessible through a 'premium access tier.' Additionally, both platforms had limitations regarding the amount of data accessible within a specific time frame via the API. Due to these constraints, providing a completely accurate assessment of which API offers more comprehensive data is challenging. However, based on the available data, my preference leans towards Foursquare. Foursquare not only provided more unique data but also exhibited a preferred category structure, despite the apparent limited utility of the category codes in API requests.



### Get the top 10 restaurants according to their rating

### Foursquare

I did this part separately between Foursquare and yelp to see if they had any in common.

In [138]:
#retrieving all the data labelled as restaurants and removing duplicate restaurants.

fs_restaurant = fs_data[fs_data['Category'].fillna('').str.contains('Restaurant')]
fs_unique_rest= fs_restaurant.groupby('Name').head(1)
fs_unique_rest.head(1)

Unnamed: 0,Station Index,Name,Rating,Price 1-4,Distance (m),Latitude,Longitude,Category,Total Ratings
1,0,Hunters Landing,7.3,3.0,69,43.639832,-79.395954,Restaurant,184.0


Data was sorted by rating and limited to the top 10. The 11th restaurant has a rating of 9.1, so no need to investigate further if different restaurants should be in the top 10. Data will be sorted further by the total ratings to give a higher weight to venues with more reviews.

In [139]:
#sorted by rating and limited to the top 10. 

fs_top10_restaurants = fs_unique_rest.sort_values(by=['Rating', 'Total Ratings'], ascending=[False, False]).head(10)
fs_top10_restaurants

Unnamed: 0,Station Index,Name,Rating,Price 1-4,Distance (m),Latitude,Longitude,Category,Total Ratings
270,5,Byblos,9.3,2.0,413,43.639832,-79.395954,Mediterranean Restaurant,338.0
19,0,SOMA chocolatemaker,9.3,2.0,611,43.639832,-79.395954,Peruvian Restaurant,310.0
7179,154,Honest Weight,9.3,3.0,279,43.639832,-79.395954,Seafood Restaurant,49.0
1581,31,Cumbrae's,9.3,,893,43.639832,-79.395954,Restaurant,41.0
452,9,St. Lawrence Market Outdoor Vendors,9.2,1.0,184,43.639832,-79.395954,Restaurant,1527.0
273,5,Pai,9.2,2.0,434,43.639832,-79.395954,Thai Restaurant,935.0
17,0,Rodney's Oyster House,9.2,3.0,573,43.639832,-79.395954,Seafood Restaurant,261.0
1538,31,Union Restaurant,9.2,2.0,209,43.639832,-79.395954,French Restaurant,249.0
113,2,Rasa,9.2,,626,43.639832,-79.395954,Restaurant,117.0
5864,124,Pukka,9.2,3.0,779,43.639832,-79.395954,Indian Restaurant,65.0


### Yelp

In [152]:
#results if you try to pull up 'Restaurant'. Only 6 unique venues.

y_restaurant = yelp_data[yelp_data['Category'].fillna('').str.contains('Restaurant')]
y_restaurant['Name'].unique()

array(['Ice Queen Restaurant', 'Wing Wing Restaurant',
       'Chappa Corner Cafe & Restaurant',
       'Schulich Executive Dining Room', 'Chase Wine & Grill',
       'Uyghur Foods'], dtype=object)

In [154]:
#eliminating categories that I personally don't think are restaurants making the accuracy of these results less. 

cat_to_exclude = ['Cafes Coffee & Tea','Bakeries','Libraries','Day Spas','Desserts','Lounges','Food Trucks Breweries','Art Galleries','Hair Salons','Arcades','Ice Cream & Frozen Yogurt','Nail Salons','Caterers','Hair Removal','Food Delivery Services','Grocery','Landmarks & Historical Buildings','Skin Care','Museums','Butcher','Cheese Shops','Venues & Event Spaces','Juice Bars & Smoothies','Framing','Event Photography','Music Venues','Convenience Stores','Meat Shops','Home Decor','Shopping','Bowling','Arts & Crafts','Botanical Gardens','Virtual Reality Centers','Beer', 'Wine & Spirits','Donuts','Bubble Tea','Candle Stores','Art Supplies','Street Vendors','Nurseries & Gardening','Pool Halls','Massage','Art Schools','Waxing','Farmers Market','Massage Schools','Coffee Roasteries','Men\'s Clothing','Hookah Bars','Art Classes','Health Markets','Go Karts','Parks','Cooking Classes','Tennis','Food Stands International Grocery']

restaurants_yelp = yelp_data[~yelp_data['Category'].str.contains('|'.join(cat_to_exclude))]
restaurants_yelp.head(1)

Unnamed: 0,Station Index,Name,Distance (m),Latitude,Longitude,Number of Reviews,Rating,Category,Price 1-4
1,0,Gusto 101,673.3,43.639832,-79.395954,1151,4.0,Italian,2.0


In [158]:
#finding all unique restaurants

unique_restaurants = restaurants_yelp.groupby('Name').head(1)
unique_restaurants.head(2)

Unnamed: 0,Station Index,Name,Distance (m),Latitude,Longitude,Number of Reviews,Rating,Category,Price 1-4
1,0,Gusto 101,673.3,43.639832,-79.395954,1151,4.0,Italian,2.0
2,0,Hunters Landing,60.9,43.639832,-79.395954,236,3.5,Comfort Food,2.0


In [160]:
#top 10 restaurants by rating and number of reviews

yelp_top10_restaurants = unique_restaurants.sort_values(by=['Rating', 'Number of Reviews'], ascending = [False,False]).head(10)
yelp_top10_restaurants

Unnamed: 0,Station Index,Name,Distance (m),Latitude,Longitude,Number of Reviews,Rating,Category,Price 1-4
23007,480,New Orleans Seafood & Steakhouse,737.2,43.68347,-79.510894,172,5.0,Seafood,3.0
22397,467,Zeal Burgers,230.2,43.698841,-79.519472,137,5.0,Burgers,2.0
5732,115,Grandma Loves You,280.4,43.675492,-79.388858,75,5.0,Sandwiches,2.0
94,1,Gurume Sushi,447.3,43.664964,-79.38355,52,5.0,Sushi Bars,
4148,83,Papyrus,817.3,43.680223,-79.344062,50,5.0,Egyptian,2.0
1839,36,Mallo,325.9,43.664467,-79.414783,44,5.0,Coffee & Tea,1.0
367,7,Hawker,403.6,43.658148,-79.398167,38,5.0,Vegan,
28872,623,Plan B,933.7,43.650169,-79.496335,34,5.0,South African,
866,17,Haidilao Hot Pot,259.9,43.653264,-79.382458,33,5.0,Chinese,
6188,124,Casamiento,550.8,43.675278,-79.423889,28,5.0,Salvadoran,


There were more than 10 restaurant options with a 5 start rating, 179 to be precise. So data was also sorted by the number of reviews.

Saving csv files

In [178]:
fs_data.to_csv('../data/fs_data2.csv', index=False)

In [184]:
yelp_data.to_csv('../data/yelp_data2.csv', index=False)

In [191]:
df_bike.to_csv('../data/bike_df2.csv', index=False)