# Segmenting and Clustering Neighbourhoods in Toronto, Canada

#### First we import the required libraries

In [1]:
import numpy as np # library to handle data in a vectorized manner

import pandas as pd # library for data analsysis
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

#!pip install --user geocoder
import geocoder

import requests # library to handle requests
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

#!pip install --user sklearn
from sklearn.cluster import KMeans

#!pip install --user folium
import folium # map rendering library

print('Libraries imported.')

Libraries imported.


### Downloading the data from Wikipedia

In [2]:
url = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"

#### Use pandas read_html function to convert first table in url to dataframe

In [3]:
postal_code_data = pd.read_html(url)[0]
postal_code_data.head()

Unnamed: 0,Postal Code,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"


In [4]:
postal_code_data.shape

(180, 3)

In [5]:
postal_code_data = postal_code_data[postal_code_data['Borough']!='Not assigned']
postal_code_data.reset_index(drop = True,inplace = True)

In [6]:
postal_code_data.head()

Unnamed: 0,Postal Code,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


In [7]:
postal_code_data.shape

(103, 3)

Define function to get the latitude and longitude of a postal code in Toronto

In [8]:
def get_latlon(postal_code):

    lat_lng_coords = None

    while(lat_lng_coords is None):

        g = geocoder.arcgis('{}, Toronto, Ontario'.format(postal_code))

        lat_lng_coords = g.latlng

    return lat_lng_coords

Get coordinates for Toronto

In [9]:
address = 'Toronto, ON'

location = geocoder.arcgis(address)
latitude = location.latlng[0]
longitude = location.latlng[1]
print('The geograpical coordinate of Toronto are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Toronto are 43.648690000000045, -79.38543999999996.


Create new dataframe with the postal code information and the latitude and longitude coordinates

In [10]:
toronto_df = pd.DataFrame(columns=["Postal_Code", "Borough", "Neighbourhood", "Latitude", "Longitude"])

for i in postal_code_data.index:
    postal_code = postal_code_data["Postal Code"][i]
    borough = postal_code_data["Borough"][i]
    neighborhood = postal_code_data["Neighbourhood"][i]
    
    latitude = get_latlon(postal_code)[0]
    longitude = get_latlon(postal_code)[1]
    
    toronto_df = toronto_df.append({"Postal_Code": postal_code,
                                      "Borough": borough,
                                      "Neighbourhood": neighborhood,
                                      "Latitude": latitude,
                                      "Longitude": longitude}, ignore_index=True)
toronto_df.head()

Unnamed: 0,Postal_Code,Borough,Neighbourhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.75245,-79.32991
1,M4A,North York,Victoria Village,43.73057,-79.31306
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65512,-79.36264
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.72327,-79.45042
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.66253,-79.39188


In [11]:
toronto_df.shape

(103, 5)

In [12]:
print('The dataframe has {} boroughs and {} neighborhoods.'.format(
        len(toronto_df['Borough'].unique()),
        len(toronto_df['Neighbourhood'].unique())
    )
)

The dataframe has 10 boroughs and 99 neighborhoods.


Comparing the result from the 'shape' method to the unique neighbourhood values, it can be seen that some neighbourhoods have multiple postcodes.


To account for this, the latitude and longitude of these neighbourhoods will be calculated as the centrepoint of the unique postcode coordinates.

In [13]:
#Create dataframe with the Neighbourhoods that have multiple
multi_postcode_df = toronto_df[toronto_df.groupby(by="Neighbourhood")['Postal_Code'].transform('count') > 1]
multi_postcode_df.reset_index(drop = True,inplace = True)
multi_postcode_df

Unnamed: 0,Postal_Code,Borough,Neighbourhood,Latitude,Longitude
0,M3B,North York,Don Mills,43.74923,-79.36186
1,M3C,North York,Don Mills,43.72168,-79.34352
2,M3K,North York,Downsview,43.73384,-79.46828
3,M3L,North York,Downsview,43.72071,-79.51701
4,M3M,North York,Downsview,43.73224,-79.50178
5,M3N,North York,Downsview,43.75478,-79.51959


Calculate average latitude and longitude of Neighbourhoods

In [14]:
avg_latlon_df = multi_postcode_df.groupby(["Borough","Neighbourhood"]).mean()
avg_latlon_df.reset_index(inplace = True)
avg_latlon_df

Unnamed: 0,Borough,Neighbourhood,Latitude,Longitude
0,North York,Don Mills,43.735455,-79.35269
1,North York,Downsview,43.735393,-79.501665


Remove Postal Code Column and update toronto_df

In [15]:
#Drop 'Postal_Code' Column
toronto_df.drop(['Postal_Code'],axis=1,inplace=True)

#Drop the Neighourhoods from toronto_df column that have multiple postcodes
toronto_df = toronto_df[~toronto_df["Neighbourhood"].isin(avg_latlon_df["Neighbourhood"])].reset_index(drop= True)

#Append the new rows with updated latitude and longitude values
toronto_df = toronto_df.append(avg_latlon_df).reset_index(drop=True)

Create map of Toronto using latitude and longitude values

In [16]:
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=10)

# add markers to map
for lat, lng, borough, neighborhood in zip(toronto_df["Latitude"], toronto_df["Longitude"], toronto_df["Borough"], toronto_df["Neighbourhood"]):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_toronto)  
    
map_toronto

Explore the neighbouroods using the Foursquare API

In [17]:
print('The dataframe has {} boroughs and {} neighborhoods.'.format(
        len(toronto_df['Borough'].unique()),
        len(toronto_df['Neighbourhood'].unique())
    )
)

The dataframe has 10 boroughs and 99 neighborhoods.


#### Use Foursquare API to extract data about the nearby venues to each neighbourhood

My Foursquare Credentials

In [18]:
CLIENT_ID = 'N4AALMF3Z0PH1UI5SCEI5DPB41E1UNSNJNQRKFQYY0IGPIOI' # My Foursquare ID
CLIENT_SECRET = 'CPT3KW5K3UXAF42TKM3JH5L542WWJMP41UFJO3DBSERMSQYR' # My Foursquare Secret
VERSION = '20180605' # Foursquare API version
LIMIT = 100 # A default Foursquare API limit value
radius = 500

print('My credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET:' + CLIENT_SECRET)

My credentails:
CLIENT_ID: N4AALMF3Z0PH1UI5SCEI5DPB41E1UNSNJNQRKFQYY0IGPIOI
CLIENT_SECRET:CPT3KW5K3UXAF42TKM3JH5L542WWJMP41UFJO3DBSERMSQYR


Test the Foursquare API using the first neghbourhood

In [19]:
neighborhood_latitude = toronto_df.loc[0, 'Latitude'] # neighborhood latitude value
neighborhood_longitude = toronto_df.loc[0, 'Longitude'] # neighborhood longitude value

neighborhood_name = toronto_df.loc[0, 'Neighbourhood'] # neighborhood name

print('Latitude and longitude values of {} are {}, {}.'.format(neighborhood_name, 
                                                               neighborhood_latitude, 
                                                               neighborhood_longitude))

Latitude and longitude values of Parkwoods are 43.75245000000007, -79.32990999999998.


#### Create url

In [20]:
url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
    CLIENT_ID, 
    CLIENT_SECRET, 
    VERSION, 
    neighborhood_latitude, 
    neighborhood_longitude, 
    radius, 
    LIMIT)
url # display URL

'https://api.foursquare.com/v2/venues/explore?&client_id=N4AALMF3Z0PH1UI5SCEI5DPB41E1UNSNJNQRKFQYY0IGPIOI&client_secret=CPT3KW5K3UXAF42TKM3JH5L542WWJMP41UFJO3DBSERMSQYR&v=20180605&ll=43.75245000000007,-79.32990999999998&radius=500&limit=100'

Send GET Request and Review Results

In [21]:
results = requests.get(url).json()

Create function to extract the category form the venue

In [22]:
# function that extracts the category of the venue
def get_category_type(row):
    try:
        categories_list = row['categories']
    except:
        categories_list = row['venue.categories']
        
    if len(categories_list) == 0:
        return None
    else:
        return categories_list[0]['name']

In [23]:
venues = results['response']['groups'][0]['items']
    
nearby_venues = pd.json_normalize(venues) # flatten JSON

# filter columns
filtered_columns = ['venue.name', 'venue.categories', 'venue.location.lat', 'venue.location.lng']
nearby_venues =nearby_venues.loc[:, filtered_columns]

# filter the category for each row
nearby_venues['venue.categories'] = nearby_venues.apply(get_category_type, axis=1)

# clean columns
nearby_venues.columns = [col.split(".")[-1] for col in nearby_venues.columns]

nearby_venues

Unnamed: 0,name,categories,lat,lng
0,Brookbanks Park,Park,43.751976,-79.33214
1,Variety Store,Food & Drink Shop,43.751974,-79.333114


In [24]:
print('{} venues were returned by Foursquare.'.format(nearby_venues.shape[0]))

2 venues were returned by Foursquare.


Test was successful. Now create a function to extract the data for all neighbourhoods in a dataframe

In [25]:
def getNearbyVenues(neighbourhoods, latitudes, longitudes, radius=500):
    
    venues_list=[]
    for nbhood, lat, lng in zip(neighbourhoods, latitudes, longitudes):
        print(nbhood)
    
        #Establish url
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
        CLIENT_ID, 
        CLIENT_SECRET, 
        VERSION, 
        lat, 
        lng, 
        radius, 
        LIMIT)
    
        # make the GET request
        results = requests.get(url).json()['response']['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            nbhood, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighbourhood', 
                  'Neighbourhood Latitude', 
                  'Neighbourhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)    

Run the function for all the Toronto neigbourhoods using the toronto_df dataframe

In [26]:
toronto_venues = getNearbyVenues(neighbourhoods=toronto_df['Neighbourhood'],
                                   latitudes=toronto_df['Latitude'],
                                   longitudes=toronto_df['Longitude']
                                  )

Parkwoods
Victoria Village
Regent Park, Harbourfront
Lawrence Manor, Lawrence Heights
Queen's Park, Ontario Provincial Government
Islington Avenue, Humber Valley Village
Malvern, Rouge
Parkview Hill, Woodbine Gardens
Garden District, Ryerson
Glencairn
West Deane Park, Princess Gardens, Martin Grove, Islington, Cloverdale
Rouge Hill, Port Union, Highland Creek
Woodbine Heights
St. James Town
Humewood-Cedarvale
Eringate, Bloordale Gardens, Old Burnhamthorpe, Markland Wood
Guildwood, Morningside, West Hill
The Beaches
Berczy Park
Caledonia-Fairbanks
Woburn
Leaside
Central Bay Street
Christie
Cedarbrae
Hillcrest Village
Bathurst Manor, Wilson Heights, Downsview North
Thorncliffe Park
Richmond, Adelaide, King


KeyError: 'groups'

In [None]:
print(toronto_venues.shape)
toronto_venues.head()

In [None]:
toronto_venues.groupby('Neighbourhood').count()

In [None]:
print('There are {} uniques categories.'.format(len(toronto_venues['Venue Category'].unique())))

### Analyse each Neighbourhood

In [None]:
# one hot encoding
toronto_onehot = pd.get_dummies(toronto_venues[['Venue Category']],prefix="",prefix_sep="")

# add neighbourhood column back to dataframe
toronto_onehot['Neighbourhood'] = toronto_venues['Neighbourhood'] 

# move neighbourhood column to the first column
fixed_columns = [toronto_onehot.columns[-1]] + list(toronto_onehot.columns[:-1])
toronto_onehot = toronto_onehot[fixed_columns]

toronto_onehot.head()

Merge the Neighbourhoods and average the frequency of each occuring group

In [None]:
toronto_grouped = toronto_onehot.groupby('Neighbourhood').mean().reset_index()
toronto_grouped.head()

In [None]:
toronto_grouped.shape

Print each nieghbourhood along with the top 5 most common venues

In [None]:
num_top_venues = 5

for hood in toronto_grouped['Neighbourhood']:
    print("----"+hood+"----")
    temp = toronto_grouped[toronto_grouped['Neighbourhood'] == hood].T.reset_index()
    temp.columns = ['venue','freq']
    temp = temp.iloc[1:]
    temp['freq'] = temp['freq'].astype(float)
    temp = temp.round({'freq': 2})
    print(temp.sort_values('freq', ascending=False).reset_index(drop=True).head(num_top_venues))
    print('\n')

### Create pandas dataframe with the top 10 most frequently occuring venues for each neighbourhood

Function to sort venues in descending order

In [None]:
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

Create the dataframe

In [None]:
num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighbourhood']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
neighbourhoods_venues_sorted = pd.DataFrame(columns=columns)
neighbourhoods_venues_sorted['Neighbourhood'] = toronto_grouped['Neighbourhood']

for ind in np.arange(toronto_grouped.shape[0]):
    neighbourhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(toronto_grouped.iloc[ind, :], num_top_venues)

neighbourhoods_venues_sorted.head()

## Cluster Neighbourhoods

We run the neighbourhood_venue data through the K-Means Machine Learning algorithm

We will group the data into 5 clusters

In [None]:
# set number of clusters
kclusters = 5

toronto_grouped_clustering = toronto_grouped.drop('Neighbourhood', 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(toronto_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10] 

Create a new dataframe that will include the neighbourhood information, the top 10 venue categories and the cluster label

In [None]:
# add clustering labels
neighbourhoods_venues_sorted.insert(0, 'Cluster Label', kmeans.labels_)

toronto_merged = toronto_df

# merge manhattan_grouped with manhattan_data to add latitude/longitude for each neighborhood
toronto_merged = toronto_merged.join(neighbourhoods_venues_sorted.set_index('Neighbourhood'), on='Neighbourhood')

#Drop neighbourhoods that did not get assigned a label
toronto_merged.dropna(subset=['Cluster Label'],inplace=True)
toronto_merged.reset_index(drop=True,inplace=True)

toronto_merged.head() # check the last columns!

Visualize the data by plotting the neighbourhoods on the toronto map, with colour-coding indicating the label category

In [None]:
toronto_merged

In [None]:
# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(toronto_merged['Latitude'], toronto_merged['Longitude'], toronto_merged['Neighbourhood'], toronto_merged['Cluster Label']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[int(cluster)-1],
        fill=True,
        fill_color=rainbow[int(cluster)-1],
        fill_opacity=0.8).add_to(map_clusters)
       
map_clusters

## Examine the Clusters

We will now observe the neighbourhoods that fall into each of the 5 clusters

#### Cluster 1

In [None]:
toronto_merged.loc[toronto_merged['Cluster Label'] == 0, toronto_merged.columns[[1] + list(range(5, toronto_merged.shape[1]))]].reset_index(drop=True)

#### Cluster 2

In [None]:
toronto_merged.loc[toronto_merged['Cluster Label'] == 1, toronto_merged.columns[[1] + list(range(5, toronto_merged.shape[1]))]].reset_index(drop=True)

#### Cluster 3

In [None]:
toronto_merged.loc[toronto_merged['Cluster Label'] == 2, toronto_merged.columns[[1] + list(range(5, toronto_merged.shape[1]))]].reset_index(drop=True)

#### Cluster 4

In [None]:
toronto_merged.loc[toronto_merged['Cluster Label'] == 3, toronto_merged.columns[[1] + list(range(5, toronto_merged.shape[1]))]].reset_index(drop=True)

#### Cluster 5

In [None]:
toronto_merged.loc[toronto_merged['Cluster Label'] == 4, toronto_merged.columns[[1] + list(range(5, toronto_merged.shape[1]))]].reset_index(drop=True)