In [99]:
!pip install folium

Collecting folium
[?25l  Downloading https://files.pythonhosted.org/packages/fd/a0/ccb3094026649cda4acd55bf2c3822bb8c277eb11446d13d384e5be35257/folium-0.10.1-py2.py3-none-any.whl (91kB)
[K     |████████████████████████████████| 92kB 7.7MB/s eta 0:00:011
[?25hCollecting branca>=0.3.0 (from folium)
  Downloading https://files.pythonhosted.org/packages/81/6d/31c83485189a2521a75b4130f1fee5364f772a0375f81afff619004e5237/branca-0.4.0-py3-none-any.whl
Installing collected packages: branca, folium
Successfully installed branca-0.4.0 folium-0.10.1


In [103]:
!pip install  geopy



In [41]:
import pandas as pd
import numpy as np
from pandas.io.html import read_html

In [63]:
# Extract the postal-code table from Wikipedia.
# A permanent link to a specific revision is used so that later formatting
# changes on the page cannot break the parsing.
page = 'https://en.wikipedia.org/w/index.php?title=List_of_postal_codes_of_Canada:_M&oldid=945633050'
# read_html returns a list with every table matching attrs; the class name was
# found by inspecting the table on the wiki page.
wiki_tables = pd.read_html(page, attrs={'class': 'wikitable'})
table = wiki_tables[0]  # the first matching table is the postcode/borough/neighbourhood one
table.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront


In [66]:
# Processing the data: keep only the rows that have a borough assigned,
# then renumber the index from 0.
has_borough = table['Borough'].ne("Not assigned")
table = table.loc[has_borough].reset_index(drop=True)
table.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Harbourfront
3,M6A,North York,Lawrence Heights
4,M6A,North York,Lawrence Manor


In [72]:
# Processing the data: where the neighbourhood is "Not assigned", fall back to
# the borough of the same row.
# Series.replace documents scalar/list/dict replacement values, not a Series,
# so a boolean mask is used to substitute the borough row by row instead.
unnamed = table['Neighbourhood'] == 'Not assigned'
table['Neighbourhood'] = table['Neighbourhood'].mask(unnamed, table['Borough'])
table.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Harbourfront
3,M6A,North York,Lawrence Heights
4,M6A,North York,Lawrence Manor


In [80]:
# Processing the data: collapse to one row per (Postcode, Borough) pair, with the
# neighbourhoods of that pair concatenated into one comma-separated string.
by_postcode = table.groupby(['Postcode', 'Borough'])['Neighbourhood']
table = by_postcode.agg(','.join).reset_index()
table.head(20)

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1B,Scarborough,"Rouge,Malvern"
1,M1C,Scarborough,"Highland Creek,Rouge Hill,Port Union"
2,M1E,Scarborough,"Guildwood,Morningside,West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae
5,M1J,Scarborough,Scarborough Village
6,M1K,Scarborough,"East Birchmount Park,Ionview,Kennedy Park"
7,M1L,Scarborough,"Clairlea,Golden Mile,Oakridge"
8,M1M,Scarborough,"Cliffcrest,Cliffside,Scarborough Village West"
9,M1N,Scarborough,"Birch Cliff,Cliffside West"


In [79]:
# Sanity check: show the (rows, columns) dimensions of the grouped table.
table.shape

(103, 3)

In [84]:
#getting the geographical coordinates from the csv file
! wget -q -O 'geodata.csv' 'http://cocl.us/Geospatial_data'
geodata=pd.read_csv('geodata.csv')
geodata.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [87]:
# Give the key column the same name as in the neighbourhood table so the two
# frames can be joined on it.
geodata = geodata.rename(columns={'Postal Code': 'Postcode'})
geodata.head()

Unnamed: 0,Postcode,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [89]:
# Merge the geographical coordinates into the neighbourhood table.
# how='inner' is spelled out: a postcode missing from either frame is dropped.
# validate='one_to_one' makes pandas raise a MergeError if a postcode repeats on
# either side, instead of silently duplicating rows in the result.
dataframe = pd.merge(table, geodata, on='Postcode', how='inner', validate='one_to_one')
dataframe.head(10)  # preview only, rather than dumping the whole frame

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge,Malvern",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek,Rouge Hill,Port Union",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood,Morningside,West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476
5,M1J,Scarborough,Scarborough Village,43.744734,-79.239476
6,M1K,Scarborough,"East Birchmount Park,Ionview,Kennedy Park",43.727929,-79.262029
7,M1L,Scarborough,"Clairlea,Golden Mile,Oakridge",43.711112,-79.284577
8,M1M,Scarborough,"Cliffcrest,Cliffside,Scarborough Village West",43.716316,-79.239476
9,M1N,Scarborough,"Birch Cliff,Cliffside West",43.692657,-79.264848


In [96]:
# Clustering
from sklearn.cluster import KMeans

# Considering the postal codes go from M1 to M9, the number of clusters is assumed to be 9.
# The coordinate columns are selected explicitly (rather than dropping the text
# columns) so that re-running this cell after 'Cluster Labels' has been inserted
# into `dataframe` does not feed the previous labels back in as a feature.
cluster_data = dataframe[['Latitude', 'Longitude']]
kmeans = KMeans(n_clusters=9, random_state=0).fit(cluster_data)
kmeans.labels_[0:10]

array([8, 8, 8, 8, 1, 1, 1, 5, 1, 5], dtype=int32)

In [97]:
# Add the clustering information to the dataframe as its first column.
# DataFrame.insert raises ValueError when the column already exists, so any
# label column left behind by an earlier run of this cell is dropped first.
dataframe = dataframe.drop('Cluster Labels', axis=1, errors='ignore')
dataframe.insert(0, 'Cluster Labels', kmeans.labels_)
dataframe.head()

Unnamed: 0,Cluster Labels,Postcode,Borough,Neighbourhood,Latitude,Longitude
0,8,M1B,Scarborough,"Rouge,Malvern",43.806686,-79.194353
1,8,M1C,Scarborough,"Highland Creek,Rouge Hill,Port Union",43.784535,-79.160497
2,8,M1E,Scarborough,"Guildwood,Morningside,West Hill",43.763573,-79.188711
3,8,M1G,Scarborough,Woburn,43.770992,-79.216917
4,1,M1H,Scarborough,Cedarbrae,43.773136,-79.239476


In [109]:
# Visualizing the data: one circle marker per postcode, coloured by cluster.
import folium
import matplotlib.cm as cm
import matplotlib.colors as colors

# Create the map, centred on [43.7, -79.3] (chosen from the coordinate data).
map_clusters = folium.Map(location=[43.7, -79.3], zoom_start=11)

# Set the colour scheme: one evenly spaced rainbow colour per cluster label.
# Labels are 0-based, so the number of colours needed is the largest label + 1.
n_clusters = int(dataframe['Cluster Labels'].max()) + 1
rainbow = [colors.rgb2hex(rgba) for rgba in cm.rainbow(np.linspace(0, 1, n_clusters))]

# Add markers to the map.
for lat, lon, poi, cluster in zip(dataframe['Latitude'], dataframe['Longitude'], dataframe['Postcode'], dataframe['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    # Index by the label itself: cluster - 1 would send label 0 to rainbow[-1]
    # and only work through negative-index wrap-around.
    marker_color = rainbow[cluster]
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=marker_color,
        fill=True,
        fill_color=marker_color,
        fill_opacity=0.7).add_to(map_clusters)

map_clusters