In [11]:
!pip install geocoder

Collecting geocoder
[?25l  Downloading https://files.pythonhosted.org/packages/4f/6b/13166c909ad2f2d76b929a4227c952630ebaf0d729f6317eb09cbceccbab/geocoder-1.38.1-py2.py3-none-any.whl (98kB)
[K     |████████████████████████████████| 102kB 12.4MB/s ta 0:00:01
Collecting ratelim (from geocoder)
  Downloading https://files.pythonhosted.org/packages/f2/98/7e6d147fd16a10a5f821db6e25f192265d6ecca3d82957a4fdd592cad49c/ratelim-0.1.6-py2.py3-none-any.whl
Installing collected packages: ratelim, geocoder
Successfully installed geocoder-1.38.1 ratelim-0.1.6


In [12]:
!pip install folium

Collecting folium
[?25l  Downloading https://files.pythonhosted.org/packages/fd/a0/ccb3094026649cda4acd55bf2c3822bb8c277eb11446d13d384e5be35257/folium-0.10.1-py2.py3-none-any.whl (91kB)
[K     |████████████████████████████████| 92kB 7.4MB/s eta 0:00:011
[?25hCollecting branca>=0.3.0 (from folium)
  Downloading https://files.pythonhosted.org/packages/81/6d/31c83485189a2521a75b4130f1fee5364f772a0375f81afff619004e5237/branca-0.4.0-py3-none-any.whl
Installing collected packages: branca, folium
Successfully installed branca-0.4.0 folium-0.10.1


In [13]:
import pandas as pd
import numpy as np
import requests
from bs4 import BeautifulSoup
from tabulate import tabulate
import geocoder
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values
from sklearn.cluster import KMeans
import folium
import matplotlib.cm as cm
import matplotlib.colors as colors

Get the html of the url in Wikipedia to get the table

In [14]:
res = requests.get("https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M")
res

<Response [200]>

Use ***Beautiful Soup*** to scrap the content

In [15]:
soup = BeautifulSoup(res.content,'lxml')
soup

<!DOCTYPE html>
<html class="client-nojs" dir="ltr" lang="en">
<head>
<meta charset="utf-8"/>
<title>List of postal codes of Canada: M - Wikipedia</title>
<script>document.documentElement.className="client-js";RLCONF={"wgBreakFrames":!1,"wgSeparatorTransformTable":["",""],"wgDigitTransformTable":["",""],"wgDefaultDateFormat":"dmy","wgMonthNames":["","January","February","March","April","May","June","July","August","September","October","November","December"],"wgMonthNamesShort":["","Jan","Feb","Mar","Apr","May","Jun","Jul","Aug","Sep","Oct","Nov","Dec"],"wgRequestId":"Xk2A8ApAMFEAAJuXmEwAAAAF","wgCSPNonce":!1,"wgCanonicalNamespace":"","wgCanonicalSpecialPageName":!1,"wgNamespaceNumber":0,"wgPageName":"List_of_postal_codes_of_Canada:_M","wgTitle":"List of postal codes of Canada: M","wgCurRevisionId":935851093,"wgRevisionId":935851093,"wgArticleId":539066,"wgIsArticle":!0,"wgIsRedirect":!1,"wgAction":"view","wgUserName":null,"wgUserGroups":["*"],"wgCategories":["Communications in Ontario

Get the first table from the content which is the one with the postal codes

In [16]:
table = soup.find_all('table')[0] 
table

<table class="wikitable sortable">
<tbody><tr>
<th>Postcode</th>
<th>Borough</th>
<th>Neighbourhood
</th></tr>
<tr>
<td>M1A</td>
<td>Not assigned</td>
<td>Not assigned
</td></tr>
<tr>
<td>M2A</td>
<td>Not assigned</td>
<td>Not assigned
</td></tr>
<tr>
<td>M3A</td>
<td><a href="/wiki/North_York" title="North York">North York</a></td>
<td><a href="/wiki/Parkwoods" title="Parkwoods">Parkwoods</a>
</td></tr>
<tr>
<td>M4A</td>
<td><a href="/wiki/North_York" title="North York">North York</a></td>
<td><a href="/wiki/Victoria_Village" title="Victoria Village">Victoria Village</a>
</td></tr>
<tr>
<td>M5A</td>
<td><a href="/wiki/Downtown_Toronto" title="Downtown Toronto">Downtown Toronto</a></td>
<td><a href="/wiki/Regent_Park" title="Regent Park">Harbourfront</a>
</td></tr>
<tr>
<td>M6A</td>
<td><a href="/wiki/North_York" title="North York">North York</a></td>
<td><a href="/wiki/Lawrence_Heights" title="Lawrence Heights">Lawrence Heights</a>
</td></tr>
<tr>
<td>M6A</td>
<td><a href="/wiki/North

Finally, transform the table to a pandas DataFrame

In [17]:
df_pccanada = pd.read_html(str(table))[0]
df_pccanada.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront


In [18]:
# Delete all the rows that has Not assigned in Borough
df_canada = df_pccanada[ df_pccanada["Borough"] != "Not assigned"]
df_canada[ df_canada["Neighbourhood"] == "Not assigned" ]

Unnamed: 0,Postcode,Borough,Neighbourhood
9,M9A,Queen's Park,Not assigned


In [19]:
# Assign the Borough to the Neighbourhood if Neighbourhood has Not assigned
def assign_neighbourhood( actual_register ):
    if (actual_register["Neighbourhood"] == "Not assigned"):
        return actual_register["Borough"]
    else:
        return actual_register["Neighbourhood"]

df_canada["Neighbourhood"] = df_canada[ ["Borough", "Neighbourhood"] ].apply(assign_neighbourhood, axis=1)

#df_canada.loc[df_canada['Neighbourhood'] == "Not assigned", 'Neighbourhood'] = df_canada['Borough']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [20]:
# Check if what was done is correctly
df_canada[ df_canada["Neighbourhood"] == "Queen's Park" ]

Unnamed: 0,Postcode,Borough,Neighbourhood
7,M7A,Downtown Toronto,Queen's Park
9,M9A,Queen's Park,Queen's Park


In [21]:
df_canada.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M6A,North York,Lawrence Heights
6,M6A,North York,Lawrence Manor


In [22]:
# Resset the index
df_canada = df_canada.reset_index(drop=True)

In [23]:
df_canada.columns

Index(['Postcode', 'Borough', 'Neighbourhood'], dtype='object')

In [24]:
df_canada.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Harbourfront
3,M6A,North York,Lawrence Heights
4,M6A,North York,Lawrence Manor


## Latutides and longitudes
Use *geocoder* to get the latitude and longitudes of the Postcodes

In [25]:
# Retrieve the latitudes and longitudes of the postal codes 
df_pc_latlon = pd.read_csv("https://ibm.box.com/shared/static/9afzr83pps4pwf2smjjcf1y5mvgb18rr.csv")
df_pc_latlon.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [26]:
# Set the index to bo Postal Code for referencing it later
df_pc_latlon.set_index('Postal Code', inplace=True)
df_pc_latlon.head()

Unnamed: 0_level_0,Latitude,Longitude
Postal Code,Unnamed: 1_level_1,Unnamed: 2_level_1
M1B,43.806686,-79.194353
M1C,43.784535,-79.160497
M1E,43.763573,-79.188711
M1G,43.770992,-79.216917
M1H,43.773136,-79.239476


In [27]:
df_pc_latlon.loc["M1B"]
# Obtain Latitude / Longitude
#df_pc_latlon.loc["M1B"].Latitude

Latitude     43.806686
Longitude   -79.194353
Name: M1B, dtype: float64

In [28]:
# Assign Latitude and Longitude to the df_canada
def assign_latitude( postal_code ):
    
    latitude = df_pc_latlon.loc[postal_code].Latitude
    return latitude

def assign_longitude( postal_code ):
    
    longitude = df_pc_latlon.loc[postal_code].Longitude
    return longitude


df_canada["Latitude"] = df_canada[ "Postcode" ].apply(assign_latitude)
df_canada["Longitude"] = df_canada[ "Postcode" ].apply(assign_longitude)

In [29]:
df_canada.head()

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,Harbourfront,43.65426,-79.360636
3,M6A,North York,Lawrence Heights,43.718518,-79.464763
4,M6A,North York,Lawrence Manor,43.718518,-79.464763


In [30]:
df_canada [ ["Borough", "Neighbourhood"] ]
print(df_canada["Borough"].value_counts())

Etobicoke           44
North York          38
Downtown Toronto    37
Scarborough         37
Central Toronto     17
West Toronto        13
York                 9
East Toronto         7
East York            6
Queen's Park         1
Mississauga          1
Name: Borough, dtype: int64


# Clustering by...

Decided to cluster the data by deciding how many unique vowels does the name of the Neibourhoods of the Boroghs have

In [31]:
def count_unique_vowels( actual_borough ):
    numvowels = 0
    if 'a' in actual_borough.lower():
        numvowels+=1
    if 'e' in actual_borough.lower():
        numvowels+=1
    if 'i' in actual_borough.lower():
        numvowels+=1
    if 'o' in actual_borough.lower():
        numvowels+=1
    if 'u' in actual_borough.lower():
        numvowels+=1
        
    return numvowels

def count_total_vowels( actual_borough ):
    total_vowels = [letter for letter in actual_borough if letter in 'aeiou']
    return len(total_vowels)

df_canada["Unique vowels"] = df_canada[ "Neighbourhood" ].apply(count_unique_vowels)
#df_canada["Total vowels"] = df_canada[ "Neighbourhood" ].apply(count_total_vowels)

In [32]:
df_canada.head()

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude,Unique vowels
0,M3A,North York,Parkwoods,43.753259,-79.329656,2
1,M4A,North York,Victoria Village,43.725882,-79.315572,4
2,M5A,Downtown Toronto,Harbourfront,43.65426,-79.360636,3
3,M6A,North York,Lawrence Heights,43.718518,-79.464763,3
4,M6A,North York,Lawrence Manor,43.718518,-79.464763,3


In [33]:
print(df_canada["Unique vowels"].value_counts())
#print(df_canada["Total vowels"].value_counts())

3    94
2    58
4    41
5    10
1     7
Name: Unique vowels, dtype: int64


#### Make one hot encoding
Fo this part, we'll get the dummies of the column ***Total vowels*** to cluster later

In [34]:
# Get the dummies of the number of unique vowels of the neighbourhoods
df_one_hot_total_vowels = pd.get_dummies(df_canada['Unique vowels'], prefix="Vowels_Neigh_", prefix_sep="")

In [35]:
df_one_hot_total_vowels.head()

Unnamed: 0,Vowels_Neigh_1,Vowels_Neigh_2,Vowels_Neigh_3,Vowels_Neigh_4,Vowels_Neigh_5
0,0,1,0,0,0
1,0,0,0,1,0
2,0,0,1,0,0
3,0,0,1,0,0
4,0,0,1,0,0


The meaning of the next table is: Number of vowels of the neighbourhood and append to the right its Borough

In [36]:
# Include Borough so we can join it later with df_canada
df_one_hot_total_vowels["Borough"] = df_canada["Borough"]

df_one_hot_total_vowels.head()

Unnamed: 0,Vowels_Neigh_1,Vowels_Neigh_2,Vowels_Neigh_3,Vowels_Neigh_4,Vowels_Neigh_5,Borough
0,0,1,0,0,0,North York
1,0,0,0,1,0,North York
2,0,0,1,0,0,Downtown Toronto
3,0,0,1,0,0,North York
4,0,0,1,0,0,North York


In [37]:
# Arrange the columns so that the Brough is the firts onw
fixed_columns = [df_one_hot_total_vowels.columns[-1]] + list(df_one_hot_total_vowels.columns[:-1])
df_one_hot_total_vowels = df_one_hot_total_vowels[fixed_columns]
print(df_one_hot_total_vowels.shape)
df_one_hot_total_vowels.head()

(210, 6)


Unnamed: 0,Borough,Vowels_Neigh_1,Vowels_Neigh_2,Vowels_Neigh_3,Vowels_Neigh_4,Vowels_Neigh_5
0,North York,0,1,0,0,0
1,North York,0,0,0,1,0
2,Downtown Toronto,0,0,1,0,0
3,North York,0,0,1,0,0
4,North York,0,0,1,0,0


In [38]:
# Group the Boroughs and make the mean of number of unique vowels of their neighbourhood 
boroughs_grouped = df_one_hot_total_vowels.groupby('Borough').mean().reset_index()
boroughs_grouped

Unnamed: 0,Borough,Vowels_Neigh_1,Vowels_Neigh_2,Vowels_Neigh_3,Vowels_Neigh_4,Vowels_Neigh_5
0,Central Toronto,0.0,0.352941,0.529412,0.117647,0.0
1,Downtown Toronto,0.027027,0.297297,0.459459,0.216216,0.0
2,East Toronto,0.0,0.428571,0.428571,0.0,0.142857
3,East York,0.0,0.0,0.666667,0.333333,0.0
4,Etobicoke,0.022727,0.227273,0.5,0.181818,0.068182
5,Mississauga,0.0,0.0,0.0,1.0,0.0
6,North York,0.052632,0.210526,0.526316,0.184211,0.026316
7,Queen's Park,0.0,0.0,1.0,0.0,0.0
8,Scarborough,0.027027,0.324324,0.378378,0.189189,0.081081
9,West Toronto,0.153846,0.307692,0.230769,0.153846,0.153846


In [39]:
# Print the top 2 number of vowels freq for each Borough
top_num_vowels = 2

for borough in boroughs_grouped['Borough']:
    print("----"+borough+"----")
    temp = boroughs_grouped[boroughs_grouped['Borough'] == borough].T.reset_index()
    temp.columns = ['num vocals of neighbourood','freq']
    # Grab from the second register
    temp = temp.iloc[1:]
    temp['freq'] = temp['freq'].astype(float)
    temp = temp.round({'freq': 2})
    print(temp.sort_values('freq', ascending=False).reset_index(drop=True).head(top_num_vowels))
    print('\n')

----Central Toronto----
  num vocals of neighbourood  freq
0             Vowels_Neigh_3  0.53
1             Vowels_Neigh_2  0.35


----Downtown Toronto----
  num vocals of neighbourood  freq
0             Vowels_Neigh_3  0.46
1             Vowels_Neigh_2  0.30


----East Toronto----
  num vocals of neighbourood  freq
0             Vowels_Neigh_2  0.43
1             Vowels_Neigh_3  0.43


----East York----
  num vocals of neighbourood  freq
0             Vowels_Neigh_3  0.67
1             Vowels_Neigh_4  0.33


----Etobicoke----
  num vocals of neighbourood  freq
0             Vowels_Neigh_3  0.50
1             Vowels_Neigh_2  0.23


----Mississauga----
  num vocals of neighbourood  freq
0             Vowels_Neigh_4   1.0
1             Vowels_Neigh_1   0.0


----North York----
  num vocals of neighbourood  freq
0             Vowels_Neigh_3  0.53
1             Vowels_Neigh_2  0.21


----Queen's Park----
  num vocals of neighbourood  freq
0             Vowels_Neigh_3   1.0
1             V

## Conclusions
As we can see, ***Mississauga*** and ***Queen's Park*** have a freq of 1.0 meaning their neighbouhood all have the same number of unique vowels

In [40]:
# Decide how many clusters
kclusters = 4

# Make a DF with the frequencies and without the column "Borough"
boroughs_grouped_clustering = boroughs_grouped.drop('Borough', 1)
boroughs_grouped_clustering

Unnamed: 0,Vowels_Neigh_1,Vowels_Neigh_2,Vowels_Neigh_3,Vowels_Neigh_4,Vowels_Neigh_5
0,0.0,0.352941,0.529412,0.117647,0.0
1,0.027027,0.297297,0.459459,0.216216,0.0
2,0.0,0.428571,0.428571,0.0,0.142857
3,0.0,0.0,0.666667,0.333333,0.0
4,0.022727,0.227273,0.5,0.181818,0.068182
5,0.0,0.0,0.0,1.0,0.0
6,0.052632,0.210526,0.526316,0.184211,0.026316
7,0.0,0.0,1.0,0.0,0.0
8,0.027027,0.324324,0.378378,0.189189,0.081081
9,0.153846,0.307692,0.230769,0.153846,0.153846


In [41]:
# Build the model with the DF of the frequencies
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(boroughs_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_

array([1, 1, 1, 0, 1, 2, 1, 0, 1, 1, 3], dtype=int32)

In [42]:
# add clustering labels to the DF
boroughs_grouped_clustering.insert(0, 'Cluster Labels', kmeans.labels_)
# add the Borough column
boroughs_grouped_clustering["Borough"] = boroughs_grouped["Borough"]
boroughs_grouped_clustering

Unnamed: 0,Cluster Labels,Vowels_Neigh_1,Vowels_Neigh_2,Vowels_Neigh_3,Vowels_Neigh_4,Vowels_Neigh_5,Borough
0,1,0.0,0.352941,0.529412,0.117647,0.0,Central Toronto
1,1,0.027027,0.297297,0.459459,0.216216,0.0,Downtown Toronto
2,1,0.0,0.428571,0.428571,0.0,0.142857,East Toronto
3,0,0.0,0.0,0.666667,0.333333,0.0,East York
4,1,0.022727,0.227273,0.5,0.181818,0.068182,Etobicoke
5,2,0.0,0.0,0.0,1.0,0.0,Mississauga
6,1,0.052632,0.210526,0.526316,0.184211,0.026316,North York
7,0,0.0,0.0,1.0,0.0,0.0,Queen's Park
8,1,0.027027,0.324324,0.378378,0.189189,0.081081,Scarborough
9,1,0.153846,0.307692,0.230769,0.153846,0.153846,West Toronto


### Join the two Data Frames

In [43]:
print(df_canada.shape)
df_canada.head()

(210, 6)


Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude,Unique vowels
0,M3A,North York,Parkwoods,43.753259,-79.329656,2
1,M4A,North York,Victoria Village,43.725882,-79.315572,4
2,M5A,Downtown Toronto,Harbourfront,43.65426,-79.360636,3
3,M6A,North York,Lawrence Heights,43.718518,-79.464763,3
4,M6A,North York,Lawrence Manor,43.718518,-79.464763,3


In [44]:
canada_merged = df_canada

# merge toronto_grouped with toronto_data to add latitude/longitude for each neighborhood
canada_merged = canada_merged.join(boroughs_grouped_clustering.set_index('Borough'), on='Borough')

print(canada_merged.shape)
canada_merged.head()

(210, 12)


Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude,Unique vowels,Cluster Labels,Vowels_Neigh_1,Vowels_Neigh_2,Vowels_Neigh_3,Vowels_Neigh_4,Vowels_Neigh_5
0,M3A,North York,Parkwoods,43.753259,-79.329656,2,1,0.052632,0.210526,0.526316,0.184211,0.026316
1,M4A,North York,Victoria Village,43.725882,-79.315572,4,1,0.052632,0.210526,0.526316,0.184211,0.026316
2,M5A,Downtown Toronto,Harbourfront,43.65426,-79.360636,3,1,0.027027,0.297297,0.459459,0.216216,0.0
3,M6A,North York,Lawrence Heights,43.718518,-79.464763,3,1,0.052632,0.210526,0.526316,0.184211,0.026316
4,M6A,North York,Lawrence Manor,43.718518,-79.464763,3,1,0.052632,0.210526,0.526316,0.184211,0.026316


# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(manhattan_merged['Latitude'], manhattan_merged['Longitude'], manhattan_merged['Neighborhood'], manhattan_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters

In [45]:
# Latitude and longitude of toronto
latitude, longitude = "43.7001114", "-79.4162979"

In [47]:
# Create the map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=10)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

markers_colors = []
for lat, lon, neigh, borough, cluster in zip(canada_merged['Latitude'], canada_merged['Longitude'], canada_merged['Neighbourhood'], canada_merged['Borough'], canada_merged['Cluster Labels']):
    # The Popup will have the Neighbourhood, Borough and Cluster
    label = folium.Popup('N:' + str(neigh) + '\nB:' + str(borough) + '\nC:' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)

map_clusters