# Segmenting and Clustering Neighbourhoods in Toronto
## The web scraping, cleaning and clustering are included in the same notebook



Installation of the required libraries

In [None]:

!pip install beautifulsoup4
!pip install lxml
!conda install -c conda-forge folium=0.5.0 --yes

#import libraries
from bs4 import BeautifulSoup #for parsing
import pandas as pd #for data analysis
import numpy as np #data handeling in vectorized manner
import matplotlib.cm as cm #colormaps
import matplotlib.colors as colors
import random #random number generator library
import requests #to handle requests
import folium #for plotting

from pandas.io.json import json_normalize
from geopy.geocoders import Nominatim
from IPython.display import Image 
from IPython.core.display import HTML 
from IPython.display import display_html
from sklearn.cluster import KMeans #for kmeans clustering



Web scraping the Postal Codes of Canada

In [55]:
# Download the Wikipedia page listing the "M" postal codes.
# The timeout keeps the cell from hanging indefinitely, and raise_for_status()
# surfaces an HTTP error here rather than as a confusing parse failure below.
response = requests.get('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M', timeout=30)
response.raise_for_status()
link = response.text  # raw HTML of the page
Soup = BeautifulSoup(link, 'html.parser')
print(Soup.title)
Table_contents = []
Table = Soup.find('table')  # first <table> element on the page (a bs4 Tag, or None if absent)
if Table is None:
    raise ValueError('No <table> element found on the postal code page')

for row in Table.findAll('td'):
    # Cells without the expected <p>/<span> children are skipped instead of
    # raising AttributeError on None.
    if row.p is None or row.span is None:
        continue
    span_text = row.span.text
    if span_text == 'Not assigned':
        continue

    # span_text is expected to look like "Borough(Neighbourhood / Neighbourhood)".
    parts = span_text.split('(')
    cell = {}
    cell['PostalCode'] = row.p.text[:3]
    cell['Borough'] = parts[0]
    if len(parts) > 1:
        # Drop the closing parenthesis and turn the " /" separators into commas.
        cell['Neighborhood'] = parts[1].strip(')').replace(' /', ',').replace(')', ' ').strip(' ')
    else:
        # No parenthesised neighbourhood list: reuse the borough text so the row
        # is kept rather than failing with IndexError.
        cell['Neighborhood'] = parts[0]
    Table_contents.append(cell)

df = pd.DataFrame(Table_contents)
# A few cells run the borough name together with extra address text; map those
# fused strings to readable borough names.
df['Borough'] = df['Borough'].replace({
    'Downtown TorontoStn A PO Boxes25 The Esplanade': 'Downtown Toronto Stn A',
    'East TorontoBusiness reply mail Processing Centre969 Eastern': 'East Toronto Business',
    'EtobicokeNorthwest': 'Etobicoke Northwest',
    'East YorkEast Toronto': 'East York/East Toronto',
    'MississaugaCanada Post Gateway Processing Centre': 'Mississauga',
})
df.head()

<title>List of postal codes of Canada: M - Wikipedia</title>


Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Queen's Park,Ontario Provincial Government


Importing the longitude and latitude data

In [56]:
# Read the latitude/longitude table straight from the hosted CSV file.
GEOSPATIAL_CSV_URL = 'https://cocl.us/Geospatial_data'
latlon = pd.read_csv(GEOSPATIAL_CSV_URL)
latlon.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


Merging the latitudes and longitudes with the neighborhoods and boroughs of Canada

In [60]:
# Attach coordinates to each postal code: align the key column name with the
# coordinate table, then join the two frames on it.
df = df.rename(columns={'PostalCode': 'Postal Code'})
df1 = latlon.merge(df, on='Postal Code')
df1

Unnamed: 0,Postal Code,Latitude,Longitude,Borough,Neighborhood
0,M1B,43.806686,-79.194353,Scarborough,"Malvern, Rouge"
1,M1C,43.784535,-79.160497,Scarborough,"Rouge Hill, Port Union, Highland Creek"
2,M1E,43.763573,-79.188711,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,43.770992,-79.216917,Scarborough,Woburn
4,M1H,43.773136,-79.239476,Scarborough,Cedarbrae
...,...,...,...,...,...
98,M9N,43.706876,-79.518188,York,Weston
99,M9P,43.696319,-79.532242,Etobicoke,Westmount
100,M9R,43.688905,-79.554724,Etobicoke,"Kingsview Village, St. Phillips, Martin Grove ..."
101,M9V,43.739416,-79.588437,Etobicoke,"South Steeles, Silverstone, Humbergate, Jamest..."


In [81]:
# Keep only the rows whose borough name contains the literal text "Toronto".
is_toronto_borough = df1['Borough'].str.contains('Toronto', regex=False)
df2 = df1.loc[is_toronto_borough]
df2.head()

Unnamed: 0,Postal Code,Latitude,Longitude,Borough,Neighborhood
37,M4E,43.676357,-79.293031,East Toronto,The Beaches
40,M4J,43.685347,-79.338106,East York/East Toronto,The Danforth East
41,M4K,43.679557,-79.352188,East Toronto,"The Danforth West, Riverdale"
42,M4L,43.668999,-79.315572,East Toronto,"India Bazaar, The Beaches West"
43,M4M,43.659526,-79.340923,East Toronto,Studio District


In [92]:
#Visualizing the map of Toronto Neighborhoods

Toronto = folium.Map(location=[43.651070,-79.347015],zoom_start=10)

# One blue circle per neighbourhood row, with a "<neighbourhood>, <borough>" popup.
for lat, lng, borough, neighbourhood in zip(df2['Latitude'], df2['Longitude'], df2['Borough'], df2['Neighborhood']):
    label = '{}, {}'.format(neighbourhood, borough)
    label = folium.Popup(label, parse_html=True)
    # Markers go on the map created above. The original added them to an
    # undefined `map_toronto`, so a fresh kernel raised NameError.
    # `parse_html` is a Popup option, so it is no longer passed to CircleMarker.
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#2954cc',
        fill_opacity=1.0).add_to(Toronto)
Toronto

In [93]:
#KMeans Clustering
k = 4

# Make the cell re-runnable: labels from a previous run are removed first, so
# they are neither used as a clustering feature nor cause insert() to fail on
# a duplicate column.
if 'Cluster Labels' in df2.columns:
    df2 = df2.drop(columns='Cluster Labels')

# Cluster on the remaining columns only (the text columns are dropped).
# `columns=` replaces the positional axis argument, which pandas 2.x rejects.
Cluster = df2.drop(columns=['Postal Code', 'Borough', 'Neighborhood'])

# A fixed random_state makes the cluster labels reproducible across runs.
KMC = KMeans(init='k-means++', n_clusters=k, n_init=12, random_state=0)
KMC.fit(Cluster)

# Put the label of each row in the first column of the frame.
df2.insert(0, 'Cluster Labels', KMC.labels_)
df2

Unnamed: 0,Cluster Labels,Postal Code,Latitude,Longitude,Borough,Neighborhood
37,2,M4E,43.676357,-79.293031,East Toronto,The Beaches
40,2,M4J,43.685347,-79.338106,East York/East Toronto,The Danforth East
41,2,M4K,43.679557,-79.352188,East Toronto,"The Danforth West, Riverdale"
42,2,M4L,43.668999,-79.315572,East Toronto,"India Bazaar, The Beaches West"
43,2,M4M,43.659526,-79.340923,East Toronto,Studio District
44,0,M4N,43.72802,-79.38879,Central Toronto,Lawrence Park
45,0,M4P,43.712751,-79.390197,Central Toronto,Davisville North
46,0,M4R,43.715383,-79.405678,Central Toronto,North Toronto West
47,0,M4S,43.704324,-79.38879,Central Toronto,Davisville
48,0,M4T,43.689574,-79.38316,Central Toronto,"Moore Park, Summerhill East"


In [98]:
# create map
MapToronto = folium.Map(location=[43.651070,-79.347015],zoom_start=10)

# set color scheme for the clusters: k evenly spaced colours from the rainbow colormap
colors_array = cm.rainbow(np.linspace(0, 1, k))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
for lat, lon, neighbourhood, cluster in zip(df2['Latitude'], df2['Longitude'], df2['Neighborhood'], df2['Cluster Labels']):
    # Popup shows which neighbourhood the marker is and the cluster it fell into.
    label = folium.Popup(str(neighbourhood) + ' Cluster ' + str(cluster), parse_html=True)
    # The 0-based label indexes the palette directly. The original used
    # `cluster-1`, which reached the last colour for label 0 only through
    # negative-index wraparound.
    folium.CircleMarker(
        [lat, lon],
        radius=4,
        popup=label,
        color=rainbow[cluster],
        fill=True,
        fill_color=rainbow[cluster],
        fill_opacity=1).add_to(MapToronto)

MapToronto
