In [1]:
import pandas as pd
import numpy as np

In [2]:
# Source page for the scrape: Wikipedia's "List of postal codes of Canada: M".
URL = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'

# Get Wiki page

In [13]:
import requests

# Download the page HTML.
# - timeout: without one, requests waits indefinitely on a stalled connection.
# - raise_for_status(): turns an HTTP error (404, 5xx, ...) into an exception here,
#   instead of letting an error page flow silently into the table parser.
response = requests.get(URL, timeout=30)
response.raise_for_status()
result = response.text

# Parse the HTML with BeautifulSoup

In [14]:
from bs4 import BeautifulSoup

# Build a searchable document tree from the downloaded HTML using the lxml parser.
soup = BeautifulSoup(markup=result, features='lxml')

# Parse the table

In [123]:
# Column accumulators: one entry per distinct postal code, in first-seen order.
Postcode = []
Borough = []
Neighborhood = []

# Maps a postal code to its position in the three lists above, so a repeated code
# is located with a dict lookup instead of a linear Postcode.index() scan wrapped
# in a catch-all except.
postcode_position = {}

# Walk the rows of the first 'wikitable', skipping the first <tr> (header row).
for items in soup.find('table', class_='wikitable').find_all('tr')[1:]:
    # Collect table entries as plain text. .strip() removes surrounding whitespace
    # (including the trailing newline) from every cell, rather than assuming the
    # newline is exactly the last character of the third cell only.
    data = [cell.text.strip() for cell in items.find_all(['th', 'td'])]
    if len(data) < 3:
        # Row without the three expected cells: nothing to extract.
        continue
    code, borough, neighborhood = data[:3]

    # If no info presented for 'Borough' -> skip
    if borough == 'Not assigned':
        continue

    # Set neighborhood value to borough value if neighborhood value is not defined.
    # Done before the duplicate check so the literal 'Not assigned' can never be
    # appended to an existing row either.
    if neighborhood == 'Not assigned':
        neighborhood = borough

    # Process entries with the same PostalCode: fold the neighborhood into the
    # row that already exists for that code.
    idx = postcode_position.get(code, -1)
    if idx != -1:
        Neighborhood[idx] += ', ' + neighborhood
        continue

    postcode_position[code] = len(Postcode)
    Postcode.append(code)
    Borough.append(borough)
    Neighborhood.append(neighborhood)

In [124]:
# Assemble the table column by column. A mapping names the columns at construction
# time and avoids building a row-major frame only to transpose it; it also raises
# immediately if the three lists ever end up with different lengths.
df = pd.DataFrame({
    'PostalCode': Postcode,
    'Borough': Borough,
    'Neighborhood': Neighborhood,
})

In [130]:
# Label the three columns (postal code, borough, neighborhood) in positional order.
df.columns = pd.Index(['PostalCode', 'Borough', 'Neighborhood'])

In [131]:
# Preview the first five rows of the scraped table.
df.head(n=5)

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Harbourfront
3,M6A,North York,"Lawrence Heights, Lawrence Manor"
4,M7A,Downtown Toronto,Queen's Park


In [132]:
# Dimensions of the scraped table as a (rows, columns) tuple.
df.shape

(103, 3)

# Read the location file and merge it with the initial dataframe

In [156]:
# Load the location table from the CSV file in the working directory.
df_loc = pd.read_csv(filepath_or_buffer='location.csv')

In [162]:
# Drop the two 'Unnamed' columns (presumably index columns left over from earlier
# CSV round-trips -- confirm against the file) and rename the key column so it
# matches the 'PostalCode' column of `df`.
unnamed_columns = ['Unnamed: 0', 'Unnamed: 0.1']
df_loc_cleaned = (
    df_loc
    .drop(columns=unnamed_columns)
    .rename(columns={'Postal Code': 'PostalCode'})
)

In [163]:
# Preview the first five rows of the cleaned location table.
df_loc_cleaned.head(n=5)

Unnamed: 0,PostalCode,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [165]:
# Attach coordinates to each postal code; the inner join keeps only codes that
# appear in both tables.
df_merged = df.merge(df_loc_cleaned, on='PostalCode', how='inner')

In [166]:
# Preview the first five rows of the merged table.
df_merged.head(n=5)

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,Harbourfront,43.65426,-79.360636
3,M6A,North York,"Lawrence Heights, Lawrence Manor",43.718518,-79.464763
4,M7A,Downtown Toronto,Queen's Park,43.662301,-79.389494


# Cluster boroughs

In [193]:
# One indicator column per borough; the empty prefix and separator keep the
# column names equal to the borough names.
borough_indicators = pd.get_dummies(df_merged['Borough'], prefix='', prefix_sep='')
borough_indicators['Neighborhood'] = df_merged['Neighborhood']

# Rotate the last column (the neighborhood just added) to the front.
*leading_columns, last_column = borough_indicators.columns
fixed_columns = [last_column, *leading_columns]
df_merged_onehot = borough_indicators[fixed_columns]

In [194]:
from sklearn.cluster import KMeans

In [195]:
# set number of clusters
kclusters = 5

df_grouped_clustering = df_merged_onehot.drop('Neighborhood', 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(df_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10] 

array([3, 3, 1, 3, 1, 0, 4, 3, 0, 1], dtype=int32)

In [196]:
# add clustering labels
df_merged_onehot.insert(0, 'Cluster Labels', kmeans.labels_)

In [200]:
# Re-attach postal code, borough and coordinates to each clustered row.
# df_merged_onehot was built row-for-row from df_merged and shares its index, so an
# index join pairs every row with exactly its own source row. Merging on
# 'Neighborhood' does not: neighborhood names are not guaranteed unique (for
# example the borough-name fallback used when a neighborhood is unassigned), and
# every repeated name is expanded into a cross product of rows with mismatched
# cluster labels and coordinates.
df_merged_onehot = df_merged_onehot.join(df_merged.drop(columns='Neighborhood'))

In [201]:
# Preview the first five rows of the clustered table.
df_merged_onehot.head(n=5)

Unnamed: 0,Cluster Labels,Neighborhood,Central Toronto,Downtown Toronto,East Toronto,East York,Etobicoke,Mississauga,North York,Queen's Park,Scarborough,West Toronto,York,PostalCode,Borough,Latitude,Longitude
0,3,Parkwoods,0,0,0,0,0,0,1,0,0,0,0,M3A,North York,43.753259,-79.329656
1,3,Victoria Village,0,0,0,0,0,0,1,0,0,0,0,M4A,North York,43.725882,-79.315572
2,1,Harbourfront,0,1,0,0,0,0,0,0,0,0,0,M5A,Downtown Toronto,43.65426,-79.360636
3,3,"Lawrence Heights, Lawrence Manor",0,0,0,0,0,0,1,0,0,0,0,M6A,North York,43.718518,-79.464763
4,1,Queen's Park,0,1,0,0,0,0,0,0,0,0,0,M7A,Downtown Toronto,43.662301,-79.389494


In [204]:
!pip install folium

Collecting folium
[?25l  Downloading https://files.pythonhosted.org/packages/fd/a0/ccb3094026649cda4acd55bf2c3822bb8c277eb11446d13d384e5be35257/folium-0.10.1-py2.py3-none-any.whl (91kB)
[K     |████████████████████████████████| 92kB 6.0MB/s eta 0:00:011
Collecting branca>=0.3.0 (from folium)
  Downloading https://files.pythonhosted.org/packages/63/36/1c93318e9653f4e414a2e0c3b98fc898b4970e939afeedeee6075dd3b703/branca-0.3.1-py3-none-any.whl
Installing collected packages: branca, folium
Successfully installed branca-0.3.1 folium-0.10.1


In [207]:
import folium
import matplotlib.cm as cm
import matplotlib.colors as colors

In [212]:
# Base map; the start location is a fixed (latitude, longitude) pair close to the
# coordinates seen in the merged table.
map_clusters = folium.Map(location=[43.70, -79.40], zoom_start=11)

# set color scheme for the clusters: one evenly spaced rainbow color per cluster id
colors_array = cm.rainbow(np.linspace(0, 1, kclusters))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
marker_rows = zip(
    df_merged_onehot['Latitude'],
    df_merged_onehot['Longitude'],
    df_merged_onehot['Neighborhood'],
    df_merged_onehot['Cluster Labels'],
)
for lat, lon, poi, cluster in marker_rows:
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    # Labels are 0-based, so index the palette directly. The earlier `cluster-1`
    # mapped cluster 0 to rainbow[-1] and only worked through negative-index
    # wraparound.
    marker_color = rainbow[cluster]
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=marker_color,
        fill=True,
        fill_color=marker_color,
        fill_opacity=0.7).add_to(map_clusters)

map_clusters