# Segmenting and Clustering Neighbourhoods in Toronto, Canada

#### First we import the required libraries

In [1]:
import numpy as np # library to handle data in a vectorized manner

import pandas as pd # library for data analsysis
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

#!conda install -c conda-forge geocoder
import geocoder

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
#!conda install -c conda-forge scikit-learn 
#!conda install numpy scipy joblib scikit-learn --force-reinstall
#from sklearn.cluster import KMeans


print('Libraries imported.')

Libraries imported.


In [2]:
#!conda install -c conda-forge scikit-learn 
#!conda install numpy scipy joblib scikit-learn --force-reinstall
from sklearn.cluster import KMeans

In [3]:
#!conda install -c conda-forge folium=0.5.0 --yes 
import folium # map rendering library


### Downloading the data from Wikipedia

In [4]:
url = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"

#### Use pandas read_html function to convert first table in url to dataframe

In [5]:
postal_code_data = pd.read_html(url)[0]
postal_code_data.head()

Unnamed: 0,Postal Code,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"


In [6]:
postal_code_data.shape

(180, 3)

In [7]:
postal_code_data = postal_code_data[postal_code_data['Borough']!='Not assigned']
postal_code_data.reset_index(drop = True,inplace = True)

In [8]:
postal_code_data.head()

Unnamed: 0,Postal Code,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


In [9]:
postal_code_data.shape

(103, 3)

Define function to get the latitude and longitude of a postal code in Toronto

In [10]:
def get_latlon(postal_code):

    lat_lng_coords = None

    while(lat_lng_coords is None):

        g = geocoder.arcgis('{}, Toronto, Ontario'.format(postal_code))

        lat_lng_coords = g.latlng

    return lat_lng_coords

Get coordinates for Toronto

In [11]:
address = 'Toronto, ON'

location = geocoder.arcgis(address)
latitude = location.latlng[0]
longitude = location.latlng[1]
print('The geograpical coordinate of Toronto are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Toronto are 43.648690000000045, -79.38543999999996.


Create new dataframe with the postal code information and the latitude and longitude coordinates

In [12]:
lat_long_df = pd.DataFrame(columns=["Postal_Code", "Borough", "Neighbourhood", "Latitude", "Longitude"])

for i in postal_code_data.index:
    postal_code = postal_code_data["Postal Code"][i]
    borough = postal_code_data["Borough"][i]
    neighborhood = postal_code_data["Neighbourhood"][i]
    
    latitude = get_latlon(postal_code)[0]
    longitude = get_latlon(postal_code)[1]
    
    lat_long_df = lat_long_df.append({"Postal_Code": postal_code,
                                      "Borough": borough,
                                      "Neighbourhood": neighborhood,
                                      "Latitude": latitude,
                                      "Longitude": longitude}, ignore_index=True)
lat_long_df.head()

Unnamed: 0,Postal_Code,Borough,Neighbourhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.75245,-79.32991
1,M4A,North York,Victoria Village,43.73057,-79.31306
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65512,-79.36264
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.72327,-79.45042
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.66253,-79.39188


In [13]:
lat_long_df1 = pd.DataFrame(columns=["Postal_Code", "Latitude", "Longitude"])

for i in postal_code_data.index:
    postal_code = postal_code_data["Postal Code"][i]
    
    latitude = get_latlon(postal_code)[0]
    longitude = get_latlon(postal_code)[1]
    
    lat_long_df1 = lat_long_df1.append({"Postal_Code": postal_code,
                                      "Latitude": latitude,
                                      "Longitude": longitude}, ignore_index=True)
    
lat_long_df1.head()

Unnamed: 0,Postal_Code,Latitude,Longitude
0,M3A,43.75245,-79.32991
1,M4A,43.73057,-79.31306
2,M5A,43.65512,-79.36264
3,M6A,43.72327,-79.45042
4,M7A,43.66253,-79.39188


In [14]:
lat_long_df2 = lat_long_df1

postal_code_data2 = postal_code_data
postal_code_data2.head()

Unnamed: 0,Postal Code,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


In [15]:
lat_long_df2.head()

Unnamed: 0,Postal_Code,Latitude,Longitude
0,M3A,43.75245,-79.32991
1,M4A,43.73057,-79.31306
2,M5A,43.65512,-79.36264
3,M6A,43.72327,-79.45042
4,M7A,43.66253,-79.39188


In [16]:
toronto_df = pd.merge(postal_code_data2, lat_long_df2, how="inner", left_on="Postal Code", right_on="Postal_Code")
toronto_df.head()

Unnamed: 0,Postal Code,Borough,Neighbourhood,Postal_Code,Latitude,Longitude
0,M3A,North York,Parkwoods,M3A,43.75245,-79.32991
1,M4A,North York,Victoria Village,M4A,43.73057,-79.31306
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",M5A,43.65512,-79.36264
3,M6A,North York,"Lawrence Manor, Lawrence Heights",M6A,43.72327,-79.45042
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",M7A,43.66253,-79.39188


In [17]:
toronto_df.drop(['Postal_Code'],axis=1,inplace=True)

In [18]:
toronto_df.head()

Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.75245,-79.32991
1,M4A,North York,Victoria Village,43.73057,-79.31306
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65512,-79.36264
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.72327,-79.45042
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.66253,-79.39188


In [19]:
toronto_df.shape

(103, 5)

In [20]:
print('The dataframe has {} boroughs and {} neighborhoods.'.format(
        len(toronto_df['Borough'].unique()),
        len(toronto_df['Neighbourhood'].unique())
    )
)

The dataframe has 10 boroughs and 99 neighborhoods.


Comparing the result from the 'shape' method to the unique neighbourhood values, it can be seen that some neighbourhoods have multiple postcodes.


To account for this, the latitude and longitude of these neighbourhoods will be calculated as the centrepoint of the unique postcode coordinates.

In [21]:
#Create dataframe with the Neighbourhoods that have multiple
multi_postcode_df = toronto_df[toronto_df.groupby(by="Neighbourhood")['Postal Code'].transform('count') > 1]
multi_postcode_df.reset_index(drop = True,inplace = True)
multi_postcode_df

Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude
0,M3B,North York,Don Mills,43.74923,-79.36186
1,M3C,North York,Don Mills,43.72168,-79.34352
2,M3K,North York,Downsview,43.73384,-79.46828
3,M3L,North York,Downsview,43.72071,-79.51701
4,M3M,North York,Downsview,43.73224,-79.50178
5,M3N,North York,Downsview,43.75478,-79.51959


Calculate average latitude and longitude of Neighbourhoods

In [22]:
avg_latlon_df = multi_postcode_df.groupby(["Borough","Neighbourhood"]).mean()
avg_latlon_df.reset_index(inplace = True)
avg_latlon_df

Unnamed: 0,Borough,Neighbourhood,Latitude,Longitude
0,North York,Don Mills,43.735455,-79.35269
1,North York,Downsview,43.735393,-79.501665


Remove Postal Code Column and update toronto_df

In [23]:
#Drop 'Postal Code' Column
toronto_df.drop(['Postal Code'],axis=1,inplace=True)

#Drop the Neighourhoods from toronto_df column that have multiple postcodes
toronto_df = toronto_df[~toronto_df["Neighbourhood"].isin(avg_latlon_df["Neighbourhood"])].reset_index(drop= True)

#Append the new rows with updated latitude and longitude values
toronto_df.append(avg_latlon_df).reset_index(drop= True,inplace=True)

Create map of Toronto using latitude and longitude values

In [25]:
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=10)

# add markers to map
for lat, lng, borough, neighborhood in zip(toronto_df["Latitude"], toronto_df["Longitude"], toronto_df["Borough"], toronto_df["Neighbourhood"]):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_toronto)  
    
map_toronto

Explore the neighbouroods using the Foursquare API

In [26]:
print('The dataframe has {} boroughs and {} neighborhoods.'.format(
        len(toronto_df['Borough'].unique()),
        lat_long_df.shape[0]
    )
)

The dataframe has 10 boroughs and 103 neighborhoods.


#### Foursquare Credentials and Version

In [27]:
CLIENT_ID = 'N4AALMF3Z0PH1UI5SCEI5DPB41E1UNSNJNQRKFQYY0IGPIOI' # your Foursquare ID
CLIENT_SECRET = 'VINR5MK4KWHJGM01ZHHCEQJDHZWVT11O4THWNNO12WYTXOA0' # your Foursquare Secret
ACCESS_TOKEN = 'HFMNQTKGJMQFD4Q03RSLDUTP5MGDMNE3C0R5AROJRQCOBP03' # your FourSquare Access Token
VERSION = '20180605' # Foursquare API version
LIMIT = 100 # A default Foursquare API limit value

print('Your credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET:' + CLIENT_SECRET)

Your credentails:
CLIENT_ID: N4AALMF3Z0PH1UI5SCEI5DPB41E1UNSNJNQRKFQYY0IGPIOI
CLIENT_SECRET:VINR5MK4KWHJGM01ZHHCEQJDHZWVT11O4THWNNO12WYTXOA0
