# Objective
The objective of this notebook is to explore and cluster the neighbourhoods of Toronto, Canada, in terms of the different venues local to each neighbourhood.  
This will be done by using Foursquare data to determine which venues are located near each neighbourhood. The neighbourhoods will then be clustered by their venue profiles using the k-means algorithm.

## Set Up
Import libraries for this notebook.

In [1]:
import numpy as np
import pandas as pd
import json
from pandas.io.json import json_normalize
import requests
from sklearn.cluster import KMeans
from bs4 import BeautifulSoup
from geopy.geocoders import Nominatim

import matplotlib.cm as cm
import matplotlib.colors as colors

# install package for plotting Folium maps
!conda install -c conda-forge folium=0.5.0 --yes
import folium

print('Libraries imported.')


Solving environment: done

# All requested packages already installed.

Libraries imported.


## Data Collection
Scrape neighbourhood data from the wikipedia page https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M

In [2]:
# Scrape wikipedia page
wiki_url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'

# fail fast on a hung connection or an HTTP error instead of silently
# parsing an error page further down the notebook
wiki_response = requests.get(wiki_url, timeout=30)
wiki_response.raise_for_status()
wiki_html = wiki_response.text

page = BeautifulSoup(wiki_html, 'html.parser')


In [3]:
# get all data from page: one list of stripped <td> texts per <tr> element
# (a row that contains no <td> cells yields an empty list)
cell_rows = [
    [cell.text.strip() for cell in table_row.find_all('td')]
    for table_row in page.find_all('tr')
]

# keep rows 1-287 and the first three columns; these positions are hard-coded
# for the layout of the page at the time it was scraped
toronto = pd.DataFrame(cell_rows).iloc[1:288, :3]
toronto.columns = ['Postcode', 'Borough', 'Neighborhood']
toronto = toronto.reset_index(drop=True)
toronto.head()

Unnamed: 0,Postcode,Borough,Neighborhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront


## Data Cleaning
Steps to clean up the Toronto data: removing 'Not assigned' boroughs, combining multiple neighborhoods within a single postcode, and assigning borough names to unassigned neighborhoods.

In [4]:
# keep only the rows whose Borough has been assigned
borough_assigned = toronto['Borough'] != 'Not assigned'
toronto = toronto[borough_assigned].reset_index(drop=True)

# where the Neighborhood is Not assigned, fall back to the Borough name
hood_missing = toronto['Neighborhood'] == 'Not assigned'
toronto.loc[hood_missing, 'Neighborhood'] = toronto.loc[hood_missing, 'Borough']

# combine Neighborhoods that share a postcode into one comma-separated string
toronto_grouped = toronto.groupby('Postcode').agg({'Neighborhood': ', '.join})

# re-attach the Borough of each postcode; the left merge yields one row per
# original (postcode, borough) row, so duplicates are removed afterwards
postcode_boroughs = toronto.loc[:, ['Postcode', 'Borough']]
toronto_merge = (
    pd.merge(left=toronto_grouped, right=postcode_boroughs, how='left', on='Postcode')
    .drop_duplicates()
    .reset_index(drop=True)
)

print('Number of rows of data =', len(toronto_merge))
toronto_merge.head()

Number of rows of data = 103


Unnamed: 0,Postcode,Neighborhood,Borough
0,M1B,"Rouge, Malvern",Scarborough
1,M1C,"Highland Creek, Rouge Hill, Port Union",Scarborough
2,M1E,"Guildwood, Morningside, West Hill",Scarborough
3,M1G,Woburn,Scarborough
4,M1H,Cedarbrae,Scarborough


## Get Latitude and Longitude
The latitude and longitude of each postcode are required in order to obtain venue information from Foursquare.

In [5]:
# use Nominatim to obtain lat and long for each post code
#lat = []
#long = []
#Nhoods = toronto_merge['Neighborhood']

#geolocator = Nominatim(user_agent="toronto_explorer")

#for i, hood in enumerate(Nhoods):
    #post_code = toronto_merge.Postcode[i]
    #address = '{}, Toronto, Ontario'.format(post_code)
    
    #location = None
    
    #while (location is None):
        #location = geolocator.geocode(address)
    
    #latitude = location.latitude
    #longitude = location.longitude
    
    #lat.append(latitude)
    #long.append(longitude)


Geocoding with Nominatim was not successful, so the analysis instead uses the supplied csv of geospatial coordinates.

In [6]:
# The code was removed by Watson Studio for sharing.

In [7]:
# preview the coordinate table and report its size
# (`coords` is not created in this cell; it is expected to exist already,
# presumably loaded from the csv by the preceding cell)
print(coords.head(5))
print('Shape =', coords.shape)

  Postal Code   Latitude  Longitude
0         M1B  43.806686 -79.194353
1         M1C  43.784535 -79.160497
2         M1E  43.763573 -79.188711
3         M1G  43.770992 -79.216917
4         M1H  43.773136 -79.239476
Shape = (103, 3)


Add the latitude and longitude data to the Neighborhoods dataframe.

In [8]:
# attach latitude/longitude to every postcode, then discard the duplicate
# key column that comes from the coordinates table
toronto_coords = (
    toronto_merge
    .merge(coords, how='left', left_on='Postcode', right_on='Postal Code')
    .drop(columns='Postal Code')
    .reset_index(drop=True)
)
print('Dataframe shape:', toronto_coords.shape)
toronto_coords.head()

Dataframe shape: (103, 5)


Unnamed: 0,Postcode,Neighborhood,Borough,Latitude,Longitude
0,M1B,"Rouge, Malvern",Scarborough,43.806686,-79.194353
1,M1C,"Highland Creek, Rouge Hill, Port Union",Scarborough,43.784535,-79.160497
2,M1E,"Guildwood, Morningside, West Hill",Scarborough,43.763573,-79.188711
3,M1G,Woburn,Scarborough,43.770992,-79.216917
4,M1H,Cedarbrae,Scarborough,43.773136,-79.239476


In [9]:
# remove outer boroughs for simplicity
drop_boroughs = ['Scarborough', 'North York', 'Etobicoke']

in_dropped_borough = toronto_coords['Borough'].isin(drop_boroughs)
toronto_coords = toronto_coords.loc[~in_dropped_borough].reset_index(drop=True)

print('Dataframe shape:', toronto_coords.shape)
toronto_coords.head()

Dataframe shape: (51, 5)


Unnamed: 0,Postcode,Neighborhood,Borough,Latitude,Longitude
0,M4B,"Woodbine Gardens, Parkview Hill",East York,43.706397,-79.309937
1,M4C,Woodbine Heights,East York,43.695344,-79.318389
2,M4E,The Beaches,East Toronto,43.676357,-79.293031
3,M4G,Leaside,East York,43.70906,-79.363452
4,M4H,Thorncliffe Park,East York,43.705369,-79.349372


## Get Foursquare Venue Data
For each postcode, obtain venue data from Foursquare by exploring latitude and longitude coordinates.

In [10]:
# The code was removed by Watson Studio for sharing.

In [11]:
# define Foursquare version and search radius and results limit
# (all three are substituted into the Foursquare request URL by getNearbyVenues)
VERSION = '20180605'  # sent as the `v` query parameter
RADIUS = 500  # default search radius around each postcode; presumably metres -- confirm against the Foursquare docs
LIMIT = 100  # sent as the `limit` query parameter (results limit per request)

In [12]:
# define function to obtain venue information for all post codes
def getNearbyVenues(postcodes, latitudes, longitudes, radius=RADIUS, limit=LIMIT):
    """Query the Foursquare `venues/explore` endpoint once per postcode.

    Parameters
    ----------
    postcodes, latitudes, longitudes : iterables of equal length
        Iterated in parallel; each (lat, lng) pair is sent as the `ll`
        parameter and the postcode is copied onto every venue row returned
        for that location.
    radius : optional
        Value sent as the `radius` query parameter (defaults to RADIUS).
    limit : optional
        Value sent as the `limit` query parameter (defaults to LIMIT).

    Returns
    -------
    pandas.DataFrame
        One row per venue with the columns 'Postcode', 'Postcode Latitude',
        'Postcode Longitude', 'Venue', 'Venue Latitude', 'Venue Longitude'
        and 'Venue Category'. The frame is empty, but still has these
        columns, when no venues are returned at all.

    Raises
    ------
    requests.HTTPError
        If Foursquare answers with an HTTP error status.
    """
    column_names = ['Postcode',
                    'Postcode Latitude',
                    'Postcode Longitude',
                    'Venue',
                    'Venue Latitude',
                    'Venue Longitude',
                    'Venue Category']
    explore_url = 'https://api.foursquare.com/v2/venues/explore'

    venue_rows = []
    for code, lat, lng in zip(postcodes, latitudes, longitudes):
        # let requests build and encode the query string
        query = {
            'client_id': CLIENT_ID,
            'client_secret': CLIENT_SECRET,
            'v': VERSION,
            'll': '{},{}'.format(lat, lng),
            'radius': radius,
            'limit': limit,
        }

        # make the GET request; surface HTTP failures immediately rather than
        # as a KeyError on the missing payload
        response = requests.get(explore_url, params=query, timeout=30)
        response.raise_for_status()

        groups = response.json().get('response', {}).get('groups', [])
        items = groups[0]['items'] if groups else []

        # keep only the relevant information for each nearby venue
        for item in items:
            venue = item['venue']
            categories = venue.get('categories') or []
            # a venue without any category gets None instead of raising IndexError
            category = categories[0]['name'] if categories else None
            venue_rows.append((
                code,
                lat,
                lng,
                venue['name'],
                venue['location']['lat'],
                venue['location']['lng'],
                category))

    # passing the columns to the constructor keeps the schema even with zero rows
    nearby_venues = pd.DataFrame(venue_rows, columns=column_names)

    return nearby_venues

In [14]:
# fetch the nearby venues of every postcode that is still in toronto_coords
toronto_venues = getNearbyVenues(
    toronto_coords['Postcode'],
    toronto_coords['Latitude'],
    toronto_coords['Longitude'],
)

In [15]:
# report the size of the venue table and show its first rows
venues_shape = toronto_venues.shape
print('Dataframe shape:', venues_shape)
toronto_venues.head(5)

Dataframe shape: (1794, 7)


Unnamed: 0,Postcode,Postcode Latitude,Postcode Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,M4B,43.706397,-79.309937,Jawny Bakers,43.705783,-79.312913,Gastropub
1,M4B,43.706397,-79.309937,East York Gymnastics,43.710654,-79.309279,Gym / Fitness Center
2,M4B,43.706397,-79.309937,Shoppers Drug Mart,43.705933,-79.312825,Pharmacy
3,M4B,43.706397,-79.309937,TD Canada Trust,43.70574,-79.31227,Bank
4,M4B,43.706397,-79.309937,Pizza Pizza,43.705159,-79.31313,Pizza Place


In [16]:
# count the distinct venue categories that were returned
n_categories = toronto_venues['Venue Category'].unique().size
print('There are {} unique categories.'.format(n_categories))

There are 234 unique categories.


## Analyse Venue Data
Determine frequency of venue category by postcode.

In [17]:
# one hot encoding of venue category (one indicator column per category,
# named exactly after the category)
toronto_onehot = pd.get_dummies(toronto_venues[['Venue Category']], prefix="", prefix_sep="")

# add postcode column back to dataframe
toronto_onehot['Postcode'] = toronto_venues['Postcode']

# rotate the column order so that the last column comes first
last_column_first = [*toronto_onehot.columns[-1:], *toronto_onehot.columns[:-1]]
toronto_onehot = toronto_onehot[last_column_first]

toronto_onehot.head()

Unnamed: 0,Postcode,Afghan Restaurant,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,Antique Shop,...,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Video Store,Vietnamese Restaurant,Warehouse Store,Wine Bar,Wings Joint,Women's Store,Yoga Studio
0,M4B,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,M4B,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,M4B,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,M4B,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,M4B,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [18]:
# per postcode, the mean of each indicator column is the share of that
# postcode's venues falling in the category
toronto_venue_grouped = toronto_onehot.groupby('Postcode', as_index=False).mean()
toronto_venue_grouped.head()

Unnamed: 0,Postcode,Afghan Restaurant,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,Antique Shop,...,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Video Store,Vietnamese Restaurant,Warehouse Store,Wine Bar,Wings Joint,Women's Store,Yoga Studio
0,M4B,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,M4C,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.111111,0.0,0.0,0.0,0.0,0.0,0.0
2,M4E,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,M4G,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,M4H,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0625,0.0,0.0,0.0,0.0625


Display common venues by postcode.

In [19]:
# define function to return common venues
def return_most_common_venues(row, num_top_venues):
    """Return the labels of the highest-valued entries of `row`.

    The first element of `row` is skipped (callers in this notebook pass rows
    whose first element is the postcode); the remaining entries are sorted in
    descending order of value and the index labels of the first
    `num_top_venues` of them are returned as an array.
    """
    ranked = row.iloc[1:].sort_values(ascending=False)
    return ranked.index.values[:num_top_venues]

In [20]:
# create dataframe with common venues by postcode
num_top_venues = 8

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues: 1st, 2nd, 3rd, then Nth
# (explicit bounds check instead of a bare `except:` used as control flow)
columns = ['Postcode']
for ind in range(num_top_venues):
    suffix = indicators[ind] if ind < len(indicators) else 'th'
    columns.append('{}{} Most Common Venue'.format(ind + 1, suffix))

# rank the categories of every postcode row, then build the dataframe in one
# step rather than assigning into it row by row
top_venues = [
    return_most_common_venues(venue_row, num_top_venues)
    for _, venue_row in toronto_venue_grouped.iterrows()
]
neighborhoods_venues_sorted = pd.DataFrame(
    top_venues,
    columns=columns[1:],
    index=toronto_venue_grouped.index,
)
neighborhoods_venues_sorted.insert(0, 'Postcode', toronto_venue_grouped['Postcode'])

neighborhoods_venues_sorted.head()

Unnamed: 0,Postcode,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue
0,M4B,Fast Food Restaurant,Pizza Place,Bus Line,Intersection,Gym / Fitness Center,Pet Store,Gastropub,Pharmacy
1,M4C,Curling Ice,Pharmacy,Spa,Beer Store,Asian Restaurant,Cosmetics Shop,Skating Rink,Video Store
2,M4E,Park,Trail,Pub,Neighborhood,Health Food Store,Dog Run,Discount Store,Diner
3,M4G,Coffee Shop,Sporting Goods Shop,Sushi Restaurant,Furniture / Home Store,Burger Joint,Electronics Store,Restaurant,Shopping Mall
4,M4H,Indian Restaurant,Yoga Studio,Supermarket,Bank,Burger Joint,Coffee Shop,Fast Food Restaurant,Gas Station


## Postcode Cluster Analysis
Create clusters of similar postcodes by venue frequency.

In [21]:
# set number of clusters
kclusters = 8

# feature matrix: the venue-category frequencies only, without the postcode label
# (`columns=` keyword: passing the axis positionally is rejected by pandas 2.x)
toronto_grouped_clustering = toronto_venue_grouped.drop(columns='Postcode')

# run k-means clustering; a fixed random_state makes the cluster labels
# reproducible when the notebook is re-run
k_means = KMeans(init="k-means++", n_clusters=kclusters, n_init=12, random_state=0).fit(toronto_grouped_clustering)

# check cluster labels generated for each row in the dataframe
k_means.labels_[0:10]

array([0, 0, 2, 0, 0, 4, 0, 0, 0, 6], dtype=int32)

In [22]:
# add clustering labels to postcode data
# (labels left over from a previous run of this cell are dropped first, because
# DataFrame.insert raises ValueError when the column already exists)
if 'Cluster Labels' in neighborhoods_venues_sorted.columns:
    neighborhoods_venues_sorted = neighborhoods_venues_sorted.drop(columns='Cluster Labels')
neighborhoods_venues_sorted.insert(0, 'Cluster Labels', k_means.labels_)

# merge to add the remaining toronto_coords columns (neighborhood, borough,
# latitude/longitude) for each postcode
toronto_merged = pd.merge(left=neighborhoods_venues_sorted, right=toronto_coords, how='left', on='Postcode')

toronto_merged.head()

Unnamed: 0,Cluster Labels,Postcode,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,Neighborhood,Borough,Latitude,Longitude
0,0,M4B,Fast Food Restaurant,Pizza Place,Bus Line,Intersection,Gym / Fitness Center,Pet Store,Gastropub,Pharmacy,"Woodbine Gardens, Parkview Hill",East York,43.706397,-79.309937
1,0,M4C,Curling Ice,Pharmacy,Spa,Beer Store,Asian Restaurant,Cosmetics Shop,Skating Rink,Video Store,Woodbine Heights,East York,43.695344,-79.318389
2,2,M4E,Park,Trail,Pub,Neighborhood,Health Food Store,Dog Run,Discount Store,Diner,The Beaches,East Toronto,43.676357,-79.293031
3,0,M4G,Coffee Shop,Sporting Goods Shop,Sushi Restaurant,Furniture / Home Store,Burger Joint,Electronics Store,Restaurant,Shopping Mall,Leaside,East York,43.70906,-79.363452
4,0,M4H,Indian Restaurant,Yoga Studio,Supermarket,Bank,Burger Joint,Coffee Shop,Fast Food Restaurant,Gas Station,Thorncliffe Park,East York,43.705369,-79.349372


## Visualise Clusters
Visualise the resulting clusters on a map.

In [None]:
# centre map on Toronto
latitude = 43.6532
longitude = -79.3832

# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=12)

# set color scheme for the clusters: one evenly spaced rainbow colour per label
colors_array = cm.rainbow(np.linspace(0, 1, kclusters))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add one marker per postcode to the map
for lat, lon, poi, cluster in zip(toronto_merged['Latitude'], toronto_merged['Longitude'], toronto_merged['Postcode'], toronto_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color="gray",
        fill=True,
        # labels run from 0 to kclusters-1, so they index the palette directly
        # (the previous `cluster-1` wrapped label 0 around to the last colour)
        fill_color=rainbow[int(cluster)],
        fill_opacity=0.7).add_to(map_clusters)

map_clusters