# Part 1 - Areas
Load data from web
 

In [1]:
from bs4 import BeautifulSoup
import requests

# Wikipedia page that holds the table of Canadian postal codes starting with "M".
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'

# A timeout keeps the cell from hanging indefinitely, and raise_for_status()
# stops an HTTP error page from being parsed as if it were the real article.
response = requests.get(url, timeout=30)
response.raise_for_status()
html = response.text

# Name the parser explicitly: without it BeautifulSoup picks whichever parser
# happens to be installed (and warns), so results could differ between machines.
soup = BeautifulSoup(html, 'html.parser')

# Take the first table carrying the "wikitable sortable" class.
tables = soup.find_all('table', {'class':'wikitable sortable'})
if not tables:
    raise ValueError('No "wikitable sortable" table found at ' + url)
table = tables[0]

Parse the table into a pandas DataFrame

In [2]:
import pandas as pd

# Build one [postcode, borough, neighborhood] record per table row.
items = []

for tr in table.find_all('tr'):
    cells = tr.find_all('td')

    # Rows that do not have exactly three <td> cells are skipped.
    if len(cells) == 3:
        items.append([cell.get_text() for cell in cells])

print('item count=',len(items))

df = pd.DataFrame(items, columns=['PostalCode','Borough','Neighborhood'])

print(df.head())
print(df.shape)

item count= 288
  PostalCode           Borough        Neighborhood
0        M1A      Not assigned      Not assigned\n
1        M2A      Not assigned      Not assigned\n
2        M3A        North York         Parkwoods\n
3        M4A        North York  Victoria Village\n
4        M5A  Downtown Toronto      Harbourfront\n
(288, 3)


* Clean columns
* Drop not assigned boroughs
* Replace not assigned neighborhoods with borough names
 

In [3]:
# Keep only the rows whose borough is assigned. Filtering into an explicit copy
# (rather than dropping in place) preserves the original row labels and makes
# the assignments below operate on an unambiguous, independent frame.
df = df[df['Borough'] != 'Not assigned'].copy()

# Remove trailing whitespace (such as a trailing newline) from neighborhood names.
df['Neighborhood'] = df['Neighborhood'].str.rstrip()

# A neighborhood that is 'Not assigned' takes the name of its borough.
# This must be a single .loc assignment: the chained form
# df['Neighborhood'][mask] = ... raises SettingWithCopyWarning and silently
# does nothing when pandas Copy-on-Write is enabled.
unnamed = df['Neighborhood'] == 'Not assigned'
df.loc[unnamed, 'Neighborhood'] = df.loc[unnamed, 'Borough']

print(df.head())
print(df.shape)


  PostalCode           Borough      Neighborhood
2        M3A        North York         Parkwoods
3        M4A        North York  Victoria Village
4        M5A  Downtown Toronto      Harbourfront
5        M5A  Downtown Toronto       Regent Park
6        M6A        North York  Lawrence Heights
(211, 3)


Merge neighborhoods that share the same borough and postal code into one row


In [4]:
# Collapse every (PostalCode, Borough) pair into a single row whose remaining
# column holds the group's values joined with ', '.
by_area = df.groupby(['PostalCode','Borough'])
d2 = by_area.agg(', '.join).reset_index()

# Order the rows by borough name.
d2 = d2.sort_values(by=['Borough'])

print(d2.head())
print(d2.shape)

   PostalCode          Borough                         Neighborhood
47        M4S  Central Toronto                           Davisville
63        M5N  Central Toronto                             Roselawn
46        M4R  Central Toronto                   North Toronto West
64        M5P  Central Toronto  Forest Hill North, Forest Hill West
65        M5R  Central Toronto  The Annex, North Midtown, Yorkville
(103, 3)


---
# Part 2 - Geo Coordinates
Load latitude/longitude coordinates 


In [5]:
# Read the coordinates CSV directly from its URL.
coords_url = 'https://cocl.us/Geospatial_data'
dfCoords = pd.read_csv(coords_url)

print(dfCoords.head())
print(dfCoords.shape)


  Postal Code   Latitude  Longitude
0         M1B  43.806686 -79.194353
1         M1C  43.784535 -79.160497
2         M1E  43.763573 -79.188711
3         M1G  43.770992 -79.216917
4         M1H  43.773136 -79.239476
(103, 3)


Merge with existing dataframe


In [6]:
# Rename the coordinate columns (positionally) so that the key column is
# called 'PostalCode', matching the key column of d2.
coord_columns = ['PostalCode','Latitude','Longitude']
dfCoords.columns = coord_columns

# Join the two frames on the shared 'PostalCode' column.
dMerged = d2.merge(dfCoords, on='PostalCode')

print(dMerged.head())
print(dMerged.shape)


  PostalCode          Borough                         Neighborhood   Latitude  \
0        M4S  Central Toronto                           Davisville  43.704324   
1        M5N  Central Toronto                             Roselawn  43.711695   
2        M4R  Central Toronto                   North Toronto West  43.715383   
3        M5P  Central Toronto  Forest Hill North, Forest Hill West  43.696948   
4        M5R  Central Toronto  The Annex, North Midtown, Yorkville  43.672710   

   Longitude  
0 -79.388790  
1 -79.416936  
2 -79.405678  
3 -79.411307  
4 -79.405678  
(103, 5)


---
# Part 3 - Clustering
Select only the boroughs that contain 'Toronto' in their name

Also mark each borough with its index (a numeric representation of the borough)


In [8]:
# Keep the rows whose borough name contains 'Toronto'.
is_toronto = dMerged['Borough'].str.contains('Toronto')
dfToronto = dMerged[is_toronto].copy()

# Tag every row with the ordinal number of its borough group.
dfToronto['Borough_index'] = dfToronto.groupby('Borough').ngroup()

print(dfToronto.head())
print(dfToronto.shape)


  PostalCode          Borough                         Neighborhood   Latitude  \
0        M4S  Central Toronto                           Davisville  43.704324   
1        M5N  Central Toronto                             Roselawn  43.711695   
2        M4R  Central Toronto                   North Toronto West  43.715383   
3        M5P  Central Toronto  Forest Hill North, Forest Hill West  43.696948   
4        M5R  Central Toronto  The Annex, North Midtown, Yorkville  43.672710   

   Longitude  Borough_index  
0 -79.388790              0  
1 -79.416936              0  
2 -79.405678              0  
3 -79.411307              0  
4 -79.405678              0  
(38, 6)


Let's show the existing boroughs on a map with folium


In [21]:
import folium
import numpy as np
import matplotlib.cm as cm
import matplotlib.colors as colors

# Centre the map on the mean coordinate of the selected rows.
latitude = dfToronto['Latitude'].mean()
longitude = dfToronto['Longitude'].mean()
print(latitude,longitude)

map_toronto = folium.Map(location=[latitude, longitude], zoom_start=12)

uniqueBoroughs = dfToronto['Borough'].unique()
boroughCount = len(uniqueBoroughs)
print('Unique boroughs count=', boroughCount)

# Marker palette; entry i is used for borough index i.
rainbow = ['#0f0f0f', '#0000ff', '#00ff00', '#ff0000']

feature_group = folium.FeatureGroup(name='Some icons')
for lat, lng, neighborhood, borough, boroughIndex in zip(dfToronto['Latitude'],
                                                         dfToronto['Longitude'],
                                                         dfToronto['Neighborhood'],
                                                         dfToronto['Borough'],
                                                         dfToronto['Borough_index']):
    # Borough_index starts at 0, so it indexes the palette directly. The former
    # `boroughIndex-1` sent group 0 to rainbow[-1] (working only through
    # negative-index wrap-around). The modulo keeps the lookup valid even if
    # there are ever more boroughs than colours.
    marker_color = rainbow[boroughIndex % len(rainbow)]
    popup = folium.Popup('{}<br><br>Borough={}'.format(neighborhood, borough))
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=popup,
        color=marker_color,
        fill=True,
        fill_color=marker_color,
        fill_opacity=0.7,
        parse_html=False).add_to(feature_group)

feature_group.add_to(map_toronto)
folium.LayerControl().add_to(map_toronto)
map_toronto

43.66726218421053 -79.38988323421052
Unique boroughs count= 4


Now let's try to cluster the areas based on latitude and longitude instead of using borough and postal code


In [22]:
from sklearn.cluster import KMeans

# Cluster on the coordinates only.
X = dfToronto[['Latitude','Longitude']]
print(X.shape)

# Number of clusters to fit; random_state is fixed so the labels are reproducible.
k = 4
kmeans = KMeans(n_clusters=k, random_state=0).fit(X)

# Store the cluster label of every row next to its borough information.
dfToronto['Cluster_index'] = kmeans.labels_

print(dfToronto.head())

map_toronto = folium.Map(location=[latitude, longitude], zoom_start=12)

# Marker palette; entry i is used for cluster label i.
rainbow = ['#0f0f0f', '#0000ff', '#00ff00', '#ff0000']

feature_group = folium.FeatureGroup(name='Some icons')
for lat, lng, neighborhood, borough, index in zip(dfToronto['Latitude'],
                                                  dfToronto['Longitude'],
                                                  dfToronto['Neighborhood'],
                                                  dfToronto['Borough'],
                                                  dfToronto['Cluster_index']):
    # KMeans labels run from 0 to k-1, so they index the palette directly. The
    # former `index-1` sent label 0 to rainbow[-1] (working only through
    # negative-index wrap-around). The modulo keeps the lookup valid if k is
    # ever raised above the palette size.
    marker_color = rainbow[index % len(rainbow)]
    popup = folium.Popup('{}<br><br>Cluster={}<br>Borough={}'.format(neighborhood, index, borough))
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=popup,
        color=marker_color,
        fill=True,
        fill_color=marker_color,
        fill_opacity=0.7,
        parse_html=False).add_to(feature_group)

feature_group.add_to(map_toronto)
folium.LayerControl().add_to(map_toronto)
map_toronto

(38, 2)
  PostalCode          Borough                         Neighborhood   Latitude  \
0        M4S  Central Toronto                           Davisville  43.704324   
1        M5N  Central Toronto                             Roselawn  43.711695   
2        M4R  Central Toronto                   North Toronto West  43.715383   
3        M5P  Central Toronto  Forest Hill North, Forest Hill West  43.696948   
4        M5R  Central Toronto  The Annex, North Midtown, Yorkville  43.672710   

   Longitude  Borough_index  Cluster_index  
0 -79.388790              0              0  
1 -79.416936              0              0  
2 -79.405678              0              0  
3 -79.411307              0              0  
4 -79.405678              0              1  


Conclusion

As you can see, the clusters computed from the coordinates are almost identical to the real boroughs.