# Peer-graded Assignment: Segmenting and Clustering Neighborhoods in Toronto

### Part 1 :

Step 1 : Import libraries and build a BeautifulSoup instance to parse the Wikipedia HTML source code.

In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

# Wikipedia page listing the Canadian postal codes that begin with "M".
URL = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"

# A timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() stops the notebook on an HTTP error instead of letting the
# following cells parse an error page.
page = requests.get(URL, timeout=30)
page.raise_for_status()

# Parse the raw response bytes with the parser bundled with the standard library.
soup = BeautifulSoup(page.content, "html.parser")

Step 2 : Process the table cells that have an assigned borough, extracting PostalCode, Borough and Neighborhood into a dataframe df.

In [2]:
# Build one record per table cell that has a borough assigned.
table_contents = []
table = soup.find('table')
if table is None:
    # Fail with a clear message instead of an AttributeError on None below.
    raise ValueError("No <table> element found in the downloaded page")

for row in table.find_all('td'):
    # Cells without the expected <p>/<span> children cannot be parsed; skip them
    # rather than crashing on a None attribute access.
    if row.p is None or row.span is None:
        continue

    span_text = row.span.text
    if span_text == 'Not assigned':
        continue

    cell = {}
    # The postal code is the first three characters of the cell's <p> text.
    cell['PostalCode'] = row.p.text[:3]
    # The span text has the form "Borough(Neighborhood / Neighborhood ...)".
    cell['Borough'] = span_text.split('(')[0]
    # Take the text inside the first parentheses and turn the " /" separators
    # into commas.
    cell['Neighborhood'] = (
        span_text.split('(')[1]
        .strip(')')
        .replace(' /', ',')
        .replace(')', ' ')
        .strip(' ')
    )
    table_contents.append(cell)

df = pd.DataFrame(table_contents)

# A few cells carry extra text fused onto the borough name; map those raw
# strings to readable borough labels.
df['Borough'] = df['Borough'].replace({
    'Downtown TorontoStn A PO Boxes25 The Esplanade': 'Downtown Toronto Stn A',
    'East TorontoBusiness reply mail Processing Centre969 Eastern': 'East Toronto Business',
    'EtobicokeNorthwest': 'Etobicoke Northwest',
    'East YorkEast Toronto': 'East York/East Toronto',
    'MississaugaCanada Post Gateway Processing Centre': 'Mississauga',
})

df

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Queen's Park,Ontario Provincial Government
...,...,...,...
98,M8X,Etobicoke,"The Kingsway, Montgomery Road, Old Mill North"
99,M4Y,Downtown Toronto,Church and Wellesley
100,M7Y,East Toronto Business,Enclave of M4L
101,M8Y,Etobicoke,"Old Mill South, King's Mill Park, Sunnylea, Hu..."


In [3]:
# Report how many postal-code rows were scraped into df.
row_count = len(df)
print(f"The number of rows of df is: {row_count}")

The number of rows of df is: 103


### Part 2 :

Step 1: Import Geospatial_Coordinates.csv as dataframe geo.

In [4]:
# Load the coordinates file from the working directory; the merge step below
# joins on its "Postal Code" column and keeps its Latitude / Longitude columns.
geo = pd.read_csv(filepath_or_buffer="Geospatial_Coordinates.csv")

Step 2 : Use the merge method to combine the df and geo dataframes on the postal code, then drop the duplicate postal code column.

In [5]:
# Join the scraped table with the coordinates on postal code (pandas' default
# join type), then discard the redundant "Postal Code" key from the right side.
Toronto_neighborhood = (
    df.merge(geo, left_on="PostalCode", right_on="Postal Code")
      .drop(columns="Postal Code")
)
Toronto_neighborhood.head(10)

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
4,M7A,Queen's Park,Ontario Provincial Government,43.662301,-79.389494
5,M9A,Etobicoke,Islington Avenue,43.667856,-79.532242
6,M1B,Scarborough,"Malvern, Rouge",43.806686,-79.194353
7,M3B,North York,Don Mills North,43.745906,-79.352188
8,M4B,East York,"Parkview Hill, Woodbine Gardens",43.706397,-79.309937
9,M5B,Downtown Toronto,"Garden District, Ryerson",43.657162,-79.378937


In [13]:
# Select the neighborhoods whose borough name includes "Toronto".
# .copy() makes Toronto_borough an independent frame, so columns added to it
# later do not act on a slice of Toronto_neighborhood (SettingWithCopyWarning).
Toronto_borough = Toronto_neighborhood[
    Toronto_neighborhood['Borough'].str.contains("Toronto")
].copy()

# One-hot encode the borough. dtype=int keeps 0/1 indicator columns on
# pandas >= 2.0, where get_dummies returns booleans by default.
Toronto_neighborhood_onehot = pd.get_dummies(
    Toronto_borough[['Borough']], prefix="", prefix_sep="", dtype=int
)

# Add the neighborhood column, taken from the filtered frame so both sides
# have exactly the same rows (no reliance on index alignment with the full table).
Toronto_neighborhood_onehot['Neighborhood'] = Toronto_borough['Neighborhood']

# Move the neighborhood column to the first position
fixed_column = [Toronto_neighborhood_onehot.columns[-1]] + list(Toronto_neighborhood_onehot.columns[:-1])
Toronto_neighborhood_onehot = Toronto_neighborhood_onehot[fixed_column]

print("The shape of Toronto_neighborhood_onehot is:", Toronto_neighborhood_onehot.shape)

Toronto_neighborhood_onehot.head()

The shape of Toronto_neighborhood_onehot is: (39, 8)


Unnamed: 0,Neighborhood,Central Toronto,Downtown Toronto,Downtown Toronto Stn A,East Toronto,East Toronto Business,East York/East Toronto,West Toronto
2,"Regent Park, Harbourfront",0,1,0,0,0,0,0
9,"Garden District, Ryerson",0,1,0,0,0,0,0
15,St. James Town,0,1,0,0,0,0,0
19,The Beaches,0,0,0,1,0,0,0
20,Berczy Park,0,1,0,0,0,0,0


In [16]:
# Set number of clusters
from sklearn.cluster import KMeans

# NOTE(review): 7 matches the number of one-hot borough columns shown above, so
# each cluster ends up being a single borough - confirm this is intended.
k_clusters = 7

# Keep only the numeric indicator columns. The columns= keyword replaces the
# positional axis argument, which pandas 2.0 no longer accepts.
Toronto_clustering = Toronto_neighborhood_onehot.drop(columns='Neighborhood')

# Run K-means clustering. n_init is spelled out because its default differs
# between scikit-learn releases; a fixed value plus random_state keeps the
# labels reproducible and avoids the FutureWarning on intermediate versions.
K_means = KMeans(n_clusters=k_clusters, random_state=1, n_init=10).fit(Toronto_clustering)

# Check the cluster labels generated for the first rows of the dataframe
K_means.labels_[0:10]

array([0, 0, 0, 3, 0, 0, 0, 0, 1, 4])

In [23]:
# Add clustering labels

# Rebuild the frame before inserting:
#  - dropping any existing 'Cluster_labels' column makes the cell safe to re-run
#    (insert() raises if the column already exists);
#  - reset_index(drop=True) renumbers the rows from 0 without creating a
#    temporary 'index' column, and returns a new frame so the insert below
#    does not act on a slice of Toronto_neighborhood.
Toronto_borough = (
    Toronto_borough
    .drop(columns='Cluster_labels', errors='ignore')
    .reset_index(drop=True)
)

# labels_ is positional: one label per row, in the row order used for fitting.
Toronto_borough.insert(0, 'Cluster_labels', K_means.labels_)

# Show the labelled rows.
Toronto_borough.head(10)

Unnamed: 0,Cluster_labels,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,0,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
1,0,M5B,Downtown Toronto,"Garden District, Ryerson",43.657162,-79.378937
2,0,M5C,Downtown Toronto,St. James Town,43.651494,-79.375418
3,3,M4E,East Toronto,The Beaches,43.676357,-79.293031
4,0,M5E,Downtown Toronto,Berczy Park,43.644771,-79.373306
5,0,M5G,Downtown Toronto,Central Bay Street,43.657952,-79.387383
6,0,M6G,Downtown Toronto,Christie,43.669542,-79.422564
7,0,M5H,Downtown Toronto,"Richmond, Adelaide, King",43.650571,-79.384568
8,1,M6H,West Toronto,"Dufferin, Dovercourt Village",43.669005,-79.442259
9,4,M4J,East York/East Toronto,The Danforth East,43.685347,-79.338106


In [37]:
#!pip install folium
#!pip install geopy

from geopy.geocoders import Nominatim
import folium
import matplotlib.cm as cm
import matplotlib.colors as colors
import numpy as np

# Derive the location of Toronto
address = 'Toronto, Ontario'

geolocator = Nominatim(user_agent="ny_explorer")
location = geolocator.geocode(address)
if location is None:
    # geocode() returns None when the service finds no match; stop with a clear
    # message instead of failing on the attribute access below.
    raise RuntimeError("Could not geocode address: {!r}".format(address))
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Toronto are {}, {}.'.format(latitude, longitude))

# Create the map centred on the geocoded coordinates
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# Set the color scheme: one evenly spaced rainbow color per cluster
colors_array = cm.rainbow(np.linspace(0, 1, k_clusters))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# Add one circle marker per neighborhood, colored by its cluster label
for lat, lon, poi, cluster in zip(Toronto_borough['Latitude'], Toronto_borough['Longitude'], Toronto_borough['Neighborhood'], Toronto_borough['Cluster_labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    # Labels are 0-based, so index the palette directly; the previous
    # `cluster-1` sent cluster 0 to the last color via negative indexing.
    marker_color = rainbow[int(cluster)]
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=marker_color,
        fill=True,
        fill_color=marker_color,
        fill_opacity=0.7).add_to(map_clusters)

map_clusters

The geograpical coordinate of Toronto are 43.6534817, -79.3839347.


Cluster 0 :

In [43]:
# Borough and neighborhood of every row labelled as cluster 0.
Toronto_borough.loc[Toronto_borough['Cluster_labels'].eq(0), ['Borough', 'Neighborhood']]

Unnamed: 0,Borough,Neighborhood
2,Downtown Toronto,"Regent Park, Harbourfront"
9,Downtown Toronto,"Garden District, Ryerson"
15,Downtown Toronto,St. James Town
20,Downtown Toronto,Berczy Park
24,Downtown Toronto,Central Bay Street
25,Downtown Toronto,Christie
30,Downtown Toronto,"Richmond, Adelaide, King"
36,Downtown Toronto,"Harbourfront East, Union Station, Toronto Islands"
42,Downtown Toronto,"Toronto Dominion Centre, Design Exchange"
48,Downtown Toronto,"Commerce Court, Victoria Hotel"


Cluster 1 :

In [44]:
# Borough and neighborhood of every row labelled as cluster 1.
Toronto_borough.loc[Toronto_borough['Cluster_labels'].eq(1), ['Borough', 'Neighborhood']]

Unnamed: 0,Borough,Neighborhood
31,West Toronto,"Dufferin, Dovercourt Village"
37,West Toronto,"Little Portugal, Trinity"
43,West Toronto,"Brockton, Parkdale Village, Exhibition Place"
69,West Toronto,"High Park, The Junction South"
75,West Toronto,"Parkdale, Roncesvalles"
81,West Toronto,"Runnymede, Swansea"


Cluster 2:

In [45]:
# Borough and neighborhood of every row labelled as cluster 2.
Toronto_borough.loc[Toronto_borough['Cluster_labels'].eq(2), ['Borough', 'Neighborhood']]

Unnamed: 0,Borough,Neighborhood
61,Central Toronto,Lawrence Park
62,Central Toronto,Roselawn
67,Central Toronto,Davisville North
68,Central Toronto,Forest Hill North & West
73,Central Toronto,North Toronto West
74,Central Toronto,"The Annex, North Midtown, Yorkville"
79,Central Toronto,Davisville
83,Central Toronto,"Moore Park, Summerhill East"
86,Central Toronto,"Summerhill West, Rathnelly, South Hill, Forest..."


Cluster 3:

In [46]:
# Borough and neighborhood of every row labelled as cluster 3.
Toronto_borough.loc[Toronto_borough['Cluster_labels'].eq(3), ['Borough', 'Neighborhood']]

Unnamed: 0,Borough,Neighborhood
19,East Toronto,The Beaches
41,East Toronto,"The Danforth West, Riverdale"
47,East Toronto,"India Bazaar, The Beaches West"
54,East Toronto,Studio District


Cluster 4:

In [47]:
# Borough and neighborhood of every row labelled as cluster 4.
Toronto_borough.loc[Toronto_borough['Cluster_labels'].eq(4), ['Borough', 'Neighborhood']]

Unnamed: 0,Borough,Neighborhood
35,East York/East Toronto,The Danforth East


Cluster 5:

In [48]:
# Borough and neighborhood of every row labelled as cluster 5.
Toronto_borough.loc[Toronto_borough['Cluster_labels'].eq(5), ['Borough', 'Neighborhood']]

Unnamed: 0,Borough,Neighborhood
100,East Toronto Business,Enclave of M4L


Cluster 6:

In [49]:
# Borough and neighborhood of every row labelled as cluster 6.
Toronto_borough.loc[Toronto_borough['Cluster_labels'].eq(6), ['Borough', 'Neighborhood']]

Unnamed: 0,Borough,Neighborhood
92,Downtown Toronto Stn A,Enclave of M5E
