# Segmenting and Clustering Neighborhoods in Toronto

## Download all the dependencies

In [1]:
import pandas as pd
import numpy as np

!pip install geocoder
import geocoder

!pip install folium==0.5
import folium

!pip install geopy
from geopy.geocoders import Nominatim

import json

import requests
from pandas.io.json import json_normalize

import matplotlib.cm as cm
import matplotlib.colors as colors

from sklearn.cluster import KMeans



## PART 1

#### 1. Scrape Wikipedia page and create dataframe

In [2]:
# Scrape Wikipedia page
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
result = pd.read_html(url)
tor_neighborhoods = result[0]

# Change column names
column_names = ['PostalCode', 'Borough', 'Neighborhood']
tor_neighborhoods.columns = column_names

#### Remove row with "Not assigned" borough

In [3]:
# Ignore cells with a borough that is 'Not assigned'
tor_neighborhoods.drop(tor_neighborhoods[ tor_neighborhoods['Borough'] == 'Not assigned' ].index , inplace=True)
tor_neighborhoods.reset_index(inplace=True, drop=True)

#### Rename "Not assigned" neighborhoods with borough name

In [4]:
# If a cell has a borough but a 'Not assigned' neighborhood, then the neighborhood will be the same as the borough.
tor_neighborhoods.Neighborhood.replace('Not assigned', tor_neighborhoods.Borough, inplace=True)

#### Each postal code is unique. We don't need to combine rows with the neighborhoods separated with a comma. Proof:

In [5]:
tor_neighborhoods['PostalCode'].nunique() == tor_neighborhoods.shape[0]

True

#### Result

In [6]:
tor_neighborhoods.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


In [7]:
tor_neighborhoods.shape

(103, 3)

## PART 2

#### Get latitude/longitude for each postal code using geocoder

In [8]:
postal_codes = [x for x in tor_neighborhoods['PostalCode']]
latitudes = []
longitudes = []

for i in range(len(postal_codes)):
    lat_lng_coords = None
    
    # loop until we get the coordinates
    while(lat_lng_coords is None):
        g = geocoder.arcgis('{}, Toronto, Ontario'.format(postal_codes[i]))
        lat_lng_coords = g.latlng

    latitudes.append(lat_lng_coords[0])
    longitudes.append(lat_lng_coords[1])
    
temp = pd.DataFrame({'PostalCode':postal_codes, 
                     'Latitude':latitudes,
                     'Longitude':longitudes})

#### Merge the two dataframes

In [9]:
tor_neighborhoods = tor_neighborhoods.merge(temp, on='PostalCode')

#### Result

In [10]:
tor_neighborhoods.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.75245,-79.32991
1,M4A,North York,Victoria Village,43.73057,-79.31306
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65512,-79.36264
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.72327,-79.45042
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.66253,-79.39188


In [11]:
tor_neighborhoods.shape

(103, 5)

## PART 3

In [12]:
# @hidden_cell
CLIENT_ID = '4LFUYDKVYTE22WK2OEQGF4KTMFOKW1WD3DYT1UPNLHCV5UQQ'
CLIENT_SECRET = 'KIOFP0HTGRHKKJGI55SY3TE0P3TKSQVPDOS1SVICDO5REKK1'
VERSION = '20180605'
LIMIT = 100

#### Use geopy library to get the latitude and longitude values of Toronto

In [13]:
address = 'Toronto, Canada'
geolocator = Nominatim(user_agent="to_explorer")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Toronto are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Toronto are 43.6534817, -79.3839347.


#### Simplify the dataframe. Let's keep only neighborhoods with a borough that contains the word 'Toronto'

In [14]:
tor_reduced = tor_neighborhoods[ tor_neighborhoods['Borough'].str.contains("Toronto") ]
tor_reduced = tor_reduced[tor_reduced['Neighborhood'] != "Roselawn"]
tor_reduced.shape

(38, 5)

#### Create a map of Toronto with remaining neighborhoods superimposed on top

In [17]:
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=11)

# Add markers to map
for lat, lng, borough, neighborhood in zip(tor_reduced['Latitude'], tor_reduced['Longitude'], tor_reduced['Borough'], tor_reduced['Neighborhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_toronto)  
    
map_toronto

#### Function to get nearby venues

In [18]:
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

#### Create a new dataframe called tor_venues

In [19]:
tor_venues = getNearbyVenues(names=tor_reduced['Neighborhood'],
                             latitudes=tor_reduced['Latitude'],
                             longitudes=tor_reduced['Longitude'])

#### Let's find out how many unique categories can be curated from all the returned venues

In [20]:
print('There are {} uniques categories.'.format(len(tor_venues['Venue Category'].unique())))

There are 228 uniques categories.


#### One hot encoding

In [21]:
tor_onehot = pd.get_dummies(tor_venues[['Venue Category']], prefix="", prefix_sep="")

# Add neighborhood column back to dataframe
tor_onehot['Neighborhood'] = tor_venues['Neighborhood']

# Move neighborhood column to the first column
fixed_columns = [tor_onehot.columns[-1]] + list(tor_onehot.columns[:-1])
tor_onehot = tor_onehot[fixed_columns]

tor_onehot.head()

Unnamed: 0,Yoga Studio,Accessories Store,Afghan Restaurant,American Restaurant,Antique Shop,Aquarium,Art Gallery,Art Museum,Arts & Crafts Store,Asian Restaurant,...,Toy / Game Store,Trail,Train Station,Vegetarian / Vegan Restaurant,Veterinarian,Video Game Store,Vietnamese Restaurant,Wine Bar,Wine Shop,Wings Joint
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


#### Group rows by neighborhood and by taking the mean of the frequency of occurrence of each category

In [22]:
tor_grouped = tor_onehot.groupby('Neighborhood').mean().reset_index()

#### Function to sort the venues in descending order

In [23]:
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

#### Dataframe with the top 10 venues for each neighborhood

In [27]:
num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighborhood']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighborhood'] = tor_grouped['Neighborhood']

for ind in np.arange(tor_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(tor_grouped.iloc[ind, :], num_top_venues)

neighborhoods_venues_sorted.head()

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Berczy Park,Coffee Shop,Farmers Market,Seafood Restaurant,Restaurant,Breakfast Spot,Cheese Shop,Bakery,Cocktail Bar,Beer Bar,Yoga Studio
1,"Brockton, Parkdale Village, Exhibition Place",Coffee Shop,Café,Bar,Restaurant,Gift Shop,Nightclub,Sandwich Place,Italian Restaurant,Breakfast Spot,Supermarket
2,"Business reply mail Processing Centre, South C...",Coffee Shop,Hotel,Restaurant,Café,Italian Restaurant,Asian Restaurant,Bar,Vegetarian / Vegan Restaurant,Pub,Steakhouse
3,"CN Tower, King and Spadina, Railway Lands, Har...",Italian Restaurant,Coffee Shop,Café,Park,Bar,French Restaurant,Bakery,Gym / Fitness Center,Restaurant,Lounge
4,Central Bay Street,Coffee Shop,Clothing Store,Cosmetics Shop,Sandwich Place,Bubble Tea Shop,Restaurant,Café,Plaza,Sushi Restaurant,Hotel


#### Run k-means to cluster the neighborhood into 5 clusters.

In [28]:
kclusters = 5

tor_grouped_clustering = tor_grouped.drop('Neighborhood', 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(tor_grouped_clustering)

#### Include clustering labels in dataframe

In [29]:
neighborhoods_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

tor_merged = tor_reduced
tor_merged = tor_merged.join(neighborhoods_venues_sorted.set_index('Neighborhood'), on='Neighborhood')

#### Visualize the resulting clusters

In [30]:
# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(tor_merged['Latitude'], tor_merged['Longitude'], tor_merged['Neighborhood'], tor_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[int(cluster)-1],
        fill=True,
        fill_color=rainbow[int(cluster)-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters

Cluster 1: They share "Coffee shop", "Breakfast Sport" or food stores

In [31]:
tor_merged.loc[tor_merged['Cluster Labels'] == 0, tor_merged.columns[[1] + list(range(5, tor_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
2,Downtown Toronto,0,Coffee Shop,Breakfast Spot,Yoga Studio,Thai Restaurant,Italian Restaurant,Food Truck,Event Space,Electronics Store,Distribution Center,Pub
4,Downtown Toronto,0,Coffee Shop,Sandwich Place,Park,Theater,Café,Falafel Restaurant,Fried Chicken Joint,Bank,Burrito Place,Italian Restaurant
9,Downtown Toronto,0,Coffee Shop,Clothing Store,Café,Cosmetics Shop,Japanese Restaurant,Tanning Salon,Bubble Tea Shop,Ramen Restaurant,Hotel,Chinese Restaurant
15,Downtown Toronto,0,Coffee Shop,Cocktail Bar,Cosmetics Shop,Clothing Store,Gastropub,Restaurant,Café,Hotel,Seafood Restaurant,Lingerie Store
19,East Toronto,0,Health Food Store,Pub,Trail,Donut Shop,Fast Food Restaurant,Farmers Market,Farm,Falafel Restaurant,Event Space,Ethiopian Restaurant
20,Downtown Toronto,0,Coffee Shop,Farmers Market,Seafood Restaurant,Restaurant,Breakfast Spot,Cheese Shop,Bakery,Cocktail Bar,Beer Bar,Yoga Studio
24,Downtown Toronto,0,Coffee Shop,Clothing Store,Cosmetics Shop,Sandwich Place,Bubble Tea Shop,Restaurant,Café,Plaza,Sushi Restaurant,Hotel
25,Downtown Toronto,0,Café,Grocery Store,Coffee Shop,Playground,Candy Store,Italian Restaurant,Baby Store,Athletics & Sports,Wings Joint,Ethiopian Restaurant
30,Downtown Toronto,0,Coffee Shop,Café,Hotel,Gym,Restaurant,Asian Restaurant,Salad Place,Steakhouse,American Restaurant,Japanese Restaurant
31,West Toronto,0,Park,Grocery Store,Athletics & Sports,Furniture / Home Store,Brazilian Restaurant,Café,Bar,Bank,Bakery,Middle Eastern Restaurant


Cluster 2: only one neighborhood, we cannot give a "characteristic" to this cluster

In [32]:
tor_merged.loc[tor_merged['Cluster Labels'] == 1, tor_merged.columns[[1] + list(range(5, tor_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
61,Central Toronto,1,Bus Line,Swim School,Wings Joint,Eastern European Restaurant,Fish & Chips Shop,Fast Food Restaurant,Farmers Market,Farm,Falafel Restaurant,Event Space


Cluster 3: They share "Park", "Farm", "Farmers Market", "Falafel restaurant", "Event Space", "Fast food restaurant"

In [33]:
tor_merged.loc[tor_merged['Cluster Labels'] == 2, tor_merged.columns[[1] + list(range(5, tor_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
68,Central Toronto,2,Park,French Restaurant,Wings Joint,Dumpling Restaurant,Fish & Chips Shop,Fast Food Restaurant,Farmers Market,Farm,Falafel Restaurant,Event Space
73,Central Toronto,2,Playground,Gym Pool,Park,Donut Shop,Fast Food Restaurant,Farmers Market,Farm,Falafel Restaurant,Event Space,Ethiopian Restaurant


Cluster 4: only one neighborhood, we cannot give a "characteristic" to this cluster

In [34]:
tor_merged.loc[tor_merged['Cluster Labels'] == 3, tor_merged.columns[[1] + list(range(5, tor_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
69,West Toronto,3,Convenience Store,Sandwich Place,Park,Residential Building (Apartment / Condo),Fast Food Restaurant,Farmers Market,Farm,Falafel Restaurant,Event Space,Ethiopian Restaurant


Cluster 5: only one neighborhood, we cannot give a "characteristic" to this cluster

In [35]:
tor_merged.loc[tor_merged['Cluster Labels'] == 4, tor_merged.columns[[1] + list(range(5, tor_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
83,Central Toronto,4,Playground,Gym,Trail,Wings Joint,Donut Shop,Fast Food Restaurant,Farmers Market,Farm,Falafel Restaurant,Event Space


The clusters with only one item don't share all the same attributes of the other clusters