# Capstone Week4 Project
Due to a new job you have to move from NYC to Toronto. You live in Fordham and are very happy about it.
The below code calculates the most similar neighbourhood in Toronto according to venue types and counts.

In [2]:
# !pip install fuzzywuzzy

In [3]:
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
import folium
from geopy.geocoders import Nominatim
import requests
import json
from pandas.io.json import json_normalize
import matplotlib.cm as cm
import matplotlib.colors as colors
from IPython.core.display import HTML
import math
from fuzzywuzzy import fuzz



In [4]:
# Get the list of names of each neighborhoods in Toronto and NYC
# Use these geocodable names in the "near" field in the search endpoint to get venues in each neighbourhoods
# Calculate a distance measure of each Toronto to NYC neighbourhood

In [5]:
# Constants
Toronto_lat = 43.6534817
Toronto_lon = -79.3839347

# Foursquare Credentials
CLIENT_ID = '' # your Foursquare ID
CLIENT_SECRET = '' # your Foursquare Secret
VERSION = '20180928' # Foursquare API version

## Get neighbourhood names of Toronto and NYC

In [37]:
# Toronto
toronto_nhs = pd.read_html('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M')[0]
toronto_nhs.drop(['Postal Code', 'Borough'], axis = 1, inplace = True) # Don't need postal codes or borough

indeces = toronto_nhs[toronto_nhs['Neighborhood'] == 'Not assigned'].index # Find Neighborhood = Not Assigned
toronto_nhs.drop(indeces, inplace=True) # Drop Neighborhood = Not Assigned
toronto_nhs.reset_index(inplace = True)

for count, rr in enumerate(toronto_nhs['Neighborhood']):
    splitted = rr.split(',')
    appendthis = pd.DataFrame()
    if(len(splitted) > 1):
        for app in np.arange(1, len(splitted)):
            appendthis['Neighborhood'] = [splitted[app]]
            toronto_nhs = toronto_nhs.append(appendthis, ignore_index = True)

        toronto_nhs.at[count, 'Neighborhood'] = str.strip(splitted[0])

toronto_nhs.drop_duplicates() # Drop duplicates
toronto_nhs.drop(['index'], axis = 1, inplace = True)

print('There are ' + str(toronto_nhs.shape[0]) + ' Toronto neighbourhoods in this dataframe.')
print('This is the maximum number of neighborhoods as the ones with not enough venues will be deleted.')
toronto_nhs.head()

There are 217 Toronto neighbourhoods in this dataframe.
This is the maximum number of neighborhoods as the ones with not enough venues will be deleted.


Unnamed: 0,Neighborhood
0,Parkwoods
1,Victoria Village
2,Regent Park
3,Lawrence Manor
4,Queen's Park


In [38]:
# NYC
nyc_nhs = pd.read_html('https://en.wikipedia.org/wiki/Neighborhoods_in_New_York_City')[0]
nyc_nhs.drop(['Community Board(CB)', 'Areakm2', 'Pop.Census2010', 'Pop./km2'], axis = 1, inplace = True) # Need only neighbourhood names
nyc_nhs['Neighborhoods'] = nyc_nhs['Neighborhoods'].astype(str)

for count, rr in enumerate(nyc_nhs['Neighborhoods']):
    splitted = rr.split(',')
    appendthis = pd.DataFrame()
    if(len(splitted) > 1):
        for app in np.arange(1, len(splitted)):
            appendthis['Neighborhoods'] = [splitted[app]]
            nyc_nhs = nyc_nhs.append(appendthis, ignore_index = True)

        nyc_nhs.at[count, 'Neighborhoods'] = str.strip(splitted[0])
        
nyc_nhs.drop_duplicates() # Drop duplicates      

print('There are ' + str(nyc_nhs.shape[0]) + ' NYC neighbourhoods in this dataframe.')
print('As the origin, any one of these neighborhoods could have been chosen. However, I chose Fordham.')
nyc_nhs.head()

There are 331 NYC neighbourhoods in this dataframe.
As the origin, any one of these neighborhoods could have been chosen. However, I chose Fordham.


Unnamed: 0,Neighborhoods
0,Melrose
1,Hunts Point
2,Claremont
3,Concourse
4,Fordham


## Use the venues/categories endpoint to get the main (undetailed) venue categories such as Arts&Entertainment, Food...
These are listed in the Foursquare documentation.

In [8]:
# Get primary categories defined in Foursquare to assign each venue to it.
url = 'https://api.foursquare.com/v2/venues/categories?&client_id={}&client_secret={}&v={}'.format(
    CLIENT_ID, 
    CLIENT_SECRET, 
    VERSION)

results = requests.get(url).json()["response"]#['groups'][0]#['items']
#print(results["response"]['totalResults'])
out = pd.json_normalize(results)
out2 = pd.DataFrame.from_dict(out.iloc[0][0])
out3 = pd.DataFrame.from_dict(out[['categories']].iloc[0][0])
parent_categories = out3[['categories']]#.iloc[0]

expanded_categories_list = []
for count, cat in enumerate(np.arange(0, len(parent_categories))): # There are 10 main categories!
    category = pd.DataFrame.from_dict(parent_categories.iloc[cat][0])#.iloc[0]
    #print(cat, category, out2['name'][cat])
#     single = pd.DataFrame.from_dict(category[0])
#     single

    for single in category['name']:
        expanded_categories_list.append((single, out2['name'][cat]))

expanded_categories = pd.DataFrame(expanded_categories_list)
expanded_categories.rename(columns={0:'Category', 1:'Main Category'}, inplace=True)
expanded_categories.head(5)

Unnamed: 0,Category,Main Category
0,Amphitheater,Arts & Entertainment
1,Aquarium,Arts & Entertainment
2,Arcade,Arts & Entertainment
3,Art Gallery,Arts & Entertainment
4,Bowling Alley,Arts & Entertainment


## Get venue names, locations and non-main categories in each neighbourhood

In [9]:
# The below asks Foursquare to give some venues for each neighbourhood name given.
def getNearbyVenues(names, Concat_cityname, radius=500, limit = 50):
    
    venues_list=[]
    for name in names: # For each neighbourhood        
        name_search = name + ' , ' + Concat_cityname # Concat city name for accurate geocoding
        
        geolocator = Nominatim(user_agent='foursquare_agent')
        location = geolocator.geocode(name_search)
        #print(location)
        if(not(location is None)):
            lat = location.latitude
            lng = location.longitude

            # create the API request URL
            url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
                CLIENT_ID, CLIENT_SECRET, VERSION, lat, lng, radius, limit)

            # make the GET request
            temp = requests.get(url).json()["response"]
          
            #print(temp['totalResults'])
            if (('totalResults' in temp) and (temp['totalResults'] > 0)):
                results = temp['groups'][0]['items']

                # return only relevant information for each nearby venue
                venues_list.append([(name, lat, lng,
                    v['venue']['id'],
                    v['venue']['name'], 
                    v['venue']['location']['lat'], 
                    v['venue']['location']['lng'],  
                    v['venue']['categories'][0]['name']) for v in results])

            nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
            nearby_venues.columns = ['Neighborhood', 
                          'Neighborhood Latitude', 
                          'Neighborhood Longitude',
                          'Venue id',
                          'Venue', 
                          'Venue Latitude', 
                          'Venue Longitude', 
                          'Venue Category']

    return(nearby_venues)

In [10]:
toronto_venues = getNearbyVenues(names=toronto_nhs['Neighborhood'], Concat_cityname = 'Toronto')
nyc_venues = getNearbyVenues(names=nyc_nhs['Neighborhoods'], Concat_cityname = 'New York City')

### Assign main categories depending on the venue category

In [40]:
# Toronto
toronto_venues_exp = toronto_venues.merge(expanded_categories, how = 'left', left_on='Venue Category', right_on = 'Category')

# the merge function above does not always find the main category because Foursquare changes the names.
# For instance, when "Bus Stop" becomes "Bus Line" the main category (Travel&Transport) cannot be found.

# The below algorithm fixes this by comparing the most probable strings.
fuzzes = np.zeros(expanded_categories.shape[0])
indx_nans = np.asarray(toronto_venues_exp[['Main Category']].isna())

count_out = 0
for cat in toronto_venues_exp['Venue Category']:
    #print(count_out)
    count_in = 0
    if(indx_nans[count_out]): # if it is a Nan value
        for cat_fsq in expanded_categories['Category']: # go through each category and log the fuzz ratios
            fuzzes[count_in] = fuzz.ratio(cat, cat_fsq)
            count_in += 1
        # Find the max fuzzy value and assign the category
        max_index = np.argmax(fuzzes)
        toronto_venues_exp.at[count_out, 'Main Category'] = expanded_categories.at[max_index, 'Main Category']   
    count_out += 1

toronto_venues_exp.dropna(axis = 0, inplace = True) # incase there are any Nans left (I don't think so)
toronto_venues_exp.reset_index(inplace = True)
toronto_venues_exp.head(5)

Unnamed: 0,index,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue id,Venue,Venue Latitude,Venue Longitude,Venue Category,Category,Main Category
0,0,Parkwoods,43.7588,-79.320197,4b8991cbf964a520814232e3,Allwyn's Bakery,43.75984,-79.324719,Caribbean Restaurant,Caribbean Restaurant,Food
1,2,Parkwoods,43.7588,-79.320197,4c422e48e26920a1a4ad5fe7,Shoppers Drug Mart,43.760857,-79.324961,Pharmacy,Pharmacy,Shop & Service
2,3,Parkwoods,43.7588,-79.320197,4c361e9118e72d7fca4714f5,Petro-Canada,43.75795,-79.315187,Gas Station,Gas Station,Shop & Service
3,4,Parkwoods,43.7588,-79.320197,4b5f7253f964a520d7ba29e3,Pizza Pizza,43.760231,-79.325666,Pizza Place,Pizza Place,Food
4,5,Parkwoods,43.7588,-79.320197,4d6d1a2ccf7e41bd75bb8285,TD Canada Trust,43.757569,-79.314976,Bank,Bank,Shop & Service


In [41]:
# NYC
nyc_venues_exp = nyc_venues.merge(expanded_categories, how = 'left', left_on='Venue Category', right_on = 'Category')

# the merge function above does not always find the main category because Foursquare changes the names.
# For instance, when "Bus Stop" becomes "Bus Line" the main category (Travel&Transport) cannot be found.

# The below algorithm fixes this by comparing the most probable strings.
fuzzes = np.zeros(expanded_categories.shape[0])
indx_nans = np.asarray(nyc_venues_exp[['Main Category']].isna())

count_out = 0
for cat in nyc_venues_exp['Venue Category']:
    #print(count_out)
    count_in = 0
    if(indx_nans[count_out]): # if it is a Nan value
        for cat_fsq in expanded_categories['Category']: # go through each category and log the fuzz ratios
            fuzzes[count_in] = fuzz.ratio(cat, cat_fsq)
            count_in += 1
        # Find the max fuzzy value and assign the category
        max_index = np.argmax(fuzzes)
        nyc_venues_exp.at[count_out, 'Main Category'] = expanded_categories.at[max_index, 'Main Category']   
    count_out += 1

nyc_venues_exp.dropna(axis = 0, inplace = True) # incase there are any Nans left (I don't think so)
nyc_venues_exp.reset_index(inplace = True)
nyc_venues_exp.head(5)

Unnamed: 0,index,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue id,Venue,Venue Latitude,Venue Longitude,Venue Category,Category,Main Category
0,0,Melrose,40.82567,-73.915242,5956be26123a195de6701c2b,Porto Salvo,40.823887,-73.91291,Italian Restaurant,Italian Restaurant,Food
1,1,Melrose,40.82567,-73.915242,55f96d71498e452a8e98af29,Starbucks,40.825556,-73.918865,Coffee Shop,Coffee Shop,Food
2,2,Melrose,40.82567,-73.915242,58d3f7f79435a92450b97b0d,Chipotle Mexican Grill,40.82589,-73.919534,Mexican Restaurant,Mexican Restaurant,Food
3,3,Melrose,40.82567,-73.915242,4cf46317cc61a35dcf5d259e,Concourse Village,40.823697,-73.919607,Shopping Mall,Shopping Mall,Shop & Service
4,4,Melrose,40.82567,-73.915242,4fa534cee4b0fed4819dc7d4,Perry Coffee Shop.,40.823181,-73.910928,Diner,Diner,Food


In [36]:
# Choose a neighbourhood, generate an array of similarity values:
# MSE = sum(difference in # of main-categories) / categories

Select_NYC_nh = 'Fordham'
my_nh = nyc_venues_exp[nyc_venues_exp['Neighborhood']  == Select_NYC_nh]
my_categories = my_nh.groupby(by = ['Main Category'], axis = 0).count()
my_categories = my_categories[['Neighborhood']]

# Go through each Toronto neighbourhood and calclate an error value that indicates differences in the types and numbers
# of venues according to thei main categories

err = np.zeros(pd.unique(toronto_venues_exp['Neighborhood']).shape[0])
count = 0
for tor_nh in pd.unique(toronto_venues_exp['Neighborhood']):
    subject_nh = toronto_venues_exp[toronto_venues_exp['Neighborhood']  == tor_nh]
    
    subject_categories = subject_nh.groupby(by = ['Main Category'], axis = 0).count()
    subject_categories = subject_categories[['Neighborhood']]
    
    # Comparing with the main categories ONLY in my neighbourhood
    # I am NOT penalizing for extra types of venue types that the new neighbourhood may have.
    for rr in np.arange(0, my_categories.shape[0]):
        name_cat = my_categories.index[rr]
        #print(name_cat)
        if(name_cat in subject_categories.index):
            #print('Here1:' + str(my_categories.at[name_cat, 'Neighborhood']), str(subject_categories.at[name_cat, 'Neighborhood']))
            err_val = math.pow(subject_categories.at[name_cat, 'Neighborhood'] - my_categories.at[name_cat, 'Neighborhood'], 2)
        else: # if the categorie lacks altogether
            #print('Here2:' + str(my_categories.at[name_cat, 'Neighborhood']))
            err_val = math.pow((my_categories.at[name_cat, 'Neighborhood']), 3) # penalize with high power

        err[count] = err[count] + math.pow((err_val/my_categories.shape[0]), 0.5)
    
    count += 1 
    
err = err.round(decimals = 2)

toronto_neighbourhoods = pd.DataFrame()
toronto_neighbourhoods['Neighborhood in Toronto'] = toronto_venues_exp['Neighborhood'].unique()
toronto_neighbourhoods['Error Values'] = err


# The best neighbourhood has lowest error value
best_nh_name = toronto_neighbourhoods.at[np.argmin(err), 'Neighborhood in Toronto']
NOT_best_nh_name = toronto_neighbourhoods.at[np.argmax(err), 'Neighborhood in Toronto']
print('The highest match for '+ Select_NYC_nh + ' in NYC is ' + best_nh_name + ' in Toronto. Error: ' + str(np.amin(err)))
print('A not so good match for '+ Select_NYC_nh + ' in NYC is ' + NOT_best_nh_name + ' in Toronto. Error: ' + str(np.amax(err)))
toronto_neighbourhoods.head(5)

The highest match for Fordham in NYC is Central Bay Street in Toronto. Error: 2.31
A not so good match for Fordham in NYC is Rouge Hill in Toronto. Error: 95.11


Unnamed: 0,Neighborhood in Toronto,Error Values
0,Parkwoods,18.01
1,Victoria Village,53.86
2,Regent Park,15.01
3,Lawrence Manor,61.45
4,Queen's Park,16.74


In [42]:
# See the distribution of error values.
toronto_neighbourhoods.describe()

Unnamed: 0,Error Values
count,195.0
mean,27.443179
std,24.816243
min,2.31
25%,11.08
50%,18.01
75%,38.135
max,95.11


## Checking if the error metrics reflect into observable venue similarities

In [34]:
print('Venues in ' + best_nh_name + '(Low error)')
best_nh = toronto_venues_exp[toronto_venues_exp['Neighborhood']  == best_nh_name]
new_values = best_nh.groupby(by = ['Main Category'], axis = 0).count()
new_values = new_values[['Neighborhood']]
new_values.rename(columns={'Neighborhood':'Venue Count'}, inplace = True)
new_values = new_values.astype('int32')

print('Venues in NYC neighbourhood:' + Select_NYC_nh)
old_values = my_nh.groupby(by = ['Main Category'], axis = 0).count()
old_values = old_values[['Neighborhood']]
old_values.rename(columns={'Neighborhood':'Venue Count'}, inplace = True)
old_values = old_values.astype('int32')

print('Venues in ' + NOT_best_nh_name + '(High error)')
NOT_best_nh = toronto_venues_exp[toronto_venues_exp['Neighborhood']  == NOT_best_nh_name]
random_values = NOT_best_nh.groupby(by = ['Main Category'], axis = 0).count()
random_values = random_values[['Neighborhood']]
random_values.rename(columns={'Neighborhood':'Venue Count'}, inplace = True)
random_values = random_values.astype('int32')

results_table = new_values.merge(old_values, how = 'outer', left_on='Main Category', right_on = 'Main Category')
results_table = results_table.merge(random_values, how = 'outer', left_on='Main Category', right_on = 'Main Category')
results_table = results_table.astype(int, errors = 'ignore')
results_table.rename(columns={'Venue Count_x':best_nh_name, 'Venue Count_y':Select_NYC_nh, 'Venue Count':NOT_best_nh_name}, inplace = True)
results_table.fillna(value = 0, axis = 0, inplace = True)
results_table

Venues in Central Bay Street(Low error)
Venues in NYC neighbourhood:Fordham
Venues in Rouge Hill(High error)


Unnamed: 0_level_0,Central Bay Street,Fordham,Rouge Hill
Main Category,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Arts & Entertainment,2,0.0,0.0
Food,20,20.0,0.0
Nightlife Spot,1,0.0,0.0
Outdoors & Recreation,3,3.0,0.0
Shop & Service,13,17.0,0.0
Travel & Transport,1,0.0,1.0


## Below is seen a map of Central Bay Street the best candidate neighborhood

In [44]:
# Show the neighbourhood you most likely will feel like at home!
new_nh = toronto_venues_exp[toronto_venues_exp['Neighborhood']  == 'Central Bay Street']

map_clusters = folium.Map(location = [float(new_nh.iloc[0][2]), float(new_nh.iloc[0][3])], zoom_start = 15)

for lat, lon, poi in zip(new_nh['Venue Latitude'], new_nh['Venue Longitude'], new_nh['Venue']):
    label = folium.Popup(str(poi) , parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=10,
        popup=label,
        fill=True,
        fill_opacity=1).add_to(map_clusters)

HTML(map_clusters._repr_html_())

## Below is seen a map of Fordham the neighborhood of origin

In [25]:
new_nh = nyc_venues_exp[nyc_venues_exp['Neighborhood']  == 'Fordham']

map_clusters = folium.Map(location = [float(new_nh.iloc[0][2]), float(new_nh.iloc[0][3])], zoom_start = 15)

for lat, lon, poi in zip(new_nh['Venue Latitude'], new_nh['Venue Longitude'], new_nh['Venue']):
    label = folium.Popup(str(poi) , parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=10,
        popup=label,
        fill=True,
        fill_opacity=1).add_to(map_clusters)

HTML(map_clusters._repr_html_())