In [1]:
import pandas as pd

### Read data from the wiki page

In [2]:
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
df_tables = pd.read_html(url)

- There are multiple tables in the html. The first table is the list of postal codes. 
- Remove the records with Borough as 'Not assigned'
- Set Neighbourhood with the value of Borough when Neighbourhood is 'Not assigned'

In [3]:
df_toronto = df_tables[0]
df_toronto = df_toronto[df_toronto.Borough!='Not assigned']
df_toronto.loc[df_toronto.Neighbourhood == 'Not assigned', 'Neighbourhood'] = df_toronto[df_toronto.Neighbourhood == 'Not assigned']['Borough']
df_toronto.head(5)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until


Unnamed: 0,Postcode,Borough,Neighbourhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M5A,Downtown Toronto,Regent Park
6,M6A,North York,Lawrence Heights


- Concat the Neighbourhood values with the same Postcode

In [4]:
df_toronto_agg = df_toronto.groupby('Postcode')['Borough', 'Neighbourhood'].agg(lambda x: max(x) if x.name == 'Borough' else ','.join(x)).reset_index()
df_toronto_agg.rename(columns={'Postcode': 'PostalCode'}, inplace=True)
df_toronto_agg.head(5)

Unnamed: 0,PostalCode,Borough,Neighbourhood
0,M1B,Scarborough,"Rouge,Malvern"
1,M1C,Scarborough,"Highland Creek,Rouge Hill,Port Union"
2,M1E,Scarborough,"Guildwood,Morningside,West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


- Show the shape of the result

In [5]:
df_toronto_agg.shape

(103, 3)

- Load geocode from csv

In [6]:
df_geocoders = pd.read_csv('Geospatial_Coordinates.csv')
df_geocoders.rename(columns={'Postal Code': 'PostalCode'}, inplace=True)
df_geocoders.head(5)

Unnamed: 0,PostalCode,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


- Join 2 tables on Postal Code

In [7]:
df_postal_codes_geo = pd.merge(df_toronto_agg, df_geocoders, on='PostalCode', how='inner')
df_postal_codes_geo.head(5)

Unnamed: 0,PostalCode,Borough,Neighbourhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge,Malvern",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek,Rouge Hill,Port Union",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood,Morningside,West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476


In [8]:
df_postal_codes_geo.shape

(103, 5)

- Only select neighbourhood nearby Toronto

In [9]:
df_clustering = df_postal_codes_geo[df_postal_codes_geo.Borough.str.contains('Toronto')].reset_index()
df_clustering

Unnamed: 0,index,PostalCode,Borough,Neighbourhood,Latitude,Longitude
0,37,M4E,East Toronto,The Beaches,43.676357,-79.293031
1,41,M4K,East Toronto,"The Danforth West,Riverdale",43.679557,-79.352188
2,42,M4L,East Toronto,"The Beaches West,India Bazaar",43.668999,-79.315572
3,43,M4M,East Toronto,Studio District,43.659526,-79.340923
4,44,M4N,Central Toronto,Lawrence Park,43.72802,-79.38879
5,45,M4P,Central Toronto,Davisville North,43.712751,-79.390197
6,46,M4R,Central Toronto,North Toronto West,43.715383,-79.405678
7,47,M4S,Central Toronto,Davisville,43.704324,-79.38879
8,48,M4T,Central Toronto,"Moore Park,Summerhill East",43.689574,-79.38316
9,49,M4V,Central Toronto,"Deer Park,Forest Hill SE,Rathnelly,South Hill,...",43.686412,-79.400049


- Cluster neighbourhoods
- Choose 4 as the number of clusters. So we can see if clusters match categories.
- Generate labels by making predication with the model

In [10]:
from sklearn.cluster import KMeans
kmeans = KMeans(n_clusters=4, random_state=0)
kmeans.fit(df_clustering[['Latitude', 'Longitude']])

KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=300,
    n_clusters=4, n_init=10, n_jobs=None, precompute_distances='auto',
    random_state=0, tol=0.0001, verbose=0)

In [11]:
df_clustering['Cluster Labels'] = kmeans.predict(df_clustering[['Latitude', 'Longitude']])
df_clustering

Unnamed: 0,index,PostalCode,Borough,Neighbourhood,Latitude,Longitude,Cluster Labels
0,37,M4E,East Toronto,The Beaches,43.676357,-79.293031,0
1,41,M4K,East Toronto,"The Danforth West,Riverdale",43.679557,-79.352188,0
2,42,M4L,East Toronto,"The Beaches West,India Bazaar",43.668999,-79.315572,0
3,43,M4M,East Toronto,Studio District,43.659526,-79.340923,0
4,44,M4N,Central Toronto,Lawrence Park,43.72802,-79.38879,2
5,45,M4P,Central Toronto,Davisville North,43.712751,-79.390197,2
6,46,M4R,Central Toronto,North Toronto West,43.715383,-79.405678,2
7,47,M4S,Central Toronto,Davisville,43.704324,-79.38879,2
8,48,M4T,Central Toronto,"Moore Park,Summerhill East",43.689574,-79.38316,2
9,49,M4V,Central Toronto,"Deer Park,Forest Hill SE,Rathnelly,South Hill,...",43.686412,-79.400049,2


- Create a map with one of the neighbourhood as the center
- Mark the clustered neighbourhoods with 4 different colors

In [12]:
import folium
import numpy as np
import matplotlib.cm as cm
import matplotlib.colors as colors

In [13]:
map_clusters = folium.Map(location=[43.686412, -79.400049], zoom_start=12)
rainbow = ['#7e1e9c', '#15b01a', '#0343df', '#e50000']

markers_colors = []
for lat, lon, poi, cluster in zip(df_clustering['Latitude'], df_clustering['Longitude'], df_clustering['Neighbourhood'], df_clustering['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters