# Segmenting and Clustering Toronto Neighborhoods

In this notebook I scrape the table of Toronto postal codes from https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M using the Requests and BeautifulSoup libraries. After some data processing with pandas, I use KMeans to cluster the neighborhoods by latitude and longitude, and finally plot the neighborhood clusters on a map with Folium.

In [25]:
from io import StringIO

import numpy as np
import pandas as pd
import requests
!pip install geocoder
import geocoder

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

from sklearn.cluster import KMeans

!pip install folium
import folium
!pip install beautifulsoup4
from bs4 import BeautifulSoup
from IPython.display import display_html



In [26]:
# Scrape the Wikipedia page that lists the Canadian postal codes starting with "M".
URL = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'

# A timeout keeps the cell from hanging indefinitely on a stalled connection.
r = requests.get(URL, timeout=30)
# Fail loudly on an HTTP error instead of parsing an error page as if it were the article.
r.raise_for_status()

# Parse the raw response bytes with the lxml parser.
soup = BeautifulSoup(r.content, 'lxml')

In [27]:
# Convert the first <table> element of the scraped page into a pandas DataFrame.
if soup.table is None:
    # Without this guard str(None) would be handed to the HTML parser below.
    raise ValueError('No <table> element found in the scraped page')
table = str(soup.table)

# read_html returns a list of DataFrames, one per table found; the markup holds a single table.
# The markup is wrapped in StringIO because passing literal HTML strings to read_html is deprecated.
df = pd.read_html(StringIO(table))[0]

print(df.head())
print(df.shape)

  Postal Code           Borough              Neighbourhood
0         M1A      Not assigned               Not assigned
1         M2A      Not assigned               Not assigned
2         M3A        North York                  Parkwoods
3         M4A        North York           Victoria Village
4         M5A  Downtown Toronto  Regent Park, Harbourfront
(180, 3)


In [28]:
# Keep only the postal codes whose borough name contains the literal text "Toronto".
# na=False treats a missing borough as a non-match, so the mask stays purely boolean
# and the row selection cannot fail on NaN entries.
df_tr = df[df['Borough'].str.contains('Toronto', regex=False, na=False)]
print(df_tr.head())

   Postal Code           Borough                                Neighbourhood
4          M5A  Downtown Toronto                    Regent Park, Harbourfront
6          M7A  Downtown Toronto  Queen's Park, Ontario Provincial Government
13         M5B  Downtown Toronto                     Garden District, Ryerson
22         M5C  Downtown Toronto                               St. James Town
30         M4E      East Toronto                                  The Beaches


In [29]:
# Load the CSV of postal codes with their latitudes and longitudes into a dataframe
geospatial_csv = 'https://cocl.us/Geospatial_data'
lat_lng = pd.read_csv(geospatial_csv)

lat_lng.head(5)

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [30]:
# Attach the latitude and longitude of each postal code to the Toronto dataframe.
# The join key is named explicitly so the merge never silently depends on whichever
# column names the two frames happen to share. An inner join keeps only postal codes
# present in both frames, so no row ends up without coordinates.
output_columns = ['Postal Code', 'Borough', 'Neighbourhood', 'Latitude', 'Longitude']
df_tr2 = pd.merge(df_tr, lat_lng, on='Postal Code', how='inner')[output_columns]
df_tr2.head()

Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude
0,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
1,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494
2,M5B,Downtown Toronto,"Garden District, Ryerson",43.657162,-79.378937
3,M5C,Downtown Toronto,St. James Town,43.651494,-79.375418
4,M4E,East Toronto,The Beaches,43.676357,-79.293031


In [31]:
# Cluster the Toronto postal codes by location with K-means and store each row's
# numerical cluster label in the dataframe.
k = 4

# Select the coordinate columns by name. The previous drop([...], 1) call passed the
# axis positionally, which pandas 2.0 no longer accepts, and selecting by name also
# keeps an existing 'Cluster_No' column out of the features when the cell is re-run.
toronto_clustering = df_tr2[['Latitude', 'Longitude']]

# n_init is set explicitly (10 was the default before scikit-learn 1.4) so the result
# does not depend on the installed scikit-learn version; random_state fixes the seed.
km = KMeans(n_clusters=k, n_init=10, random_state=0).fit(toronto_clustering)

# Remove any label column left by an earlier run so that re-running this cell does not
# fail with "cannot insert Cluster_No, already exists".
df_tr2 = df_tr2.drop(columns='Cluster_No', errors='ignore')
df_tr2.insert(0, 'Cluster_No', km.labels_)

In [32]:
# Preview the first five rows, now including the cluster label column
df_tr2.head(5)

Unnamed: 0,Cluster_No,Postal Code,Borough,Neighbourhood,Latitude,Longitude
0,3,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
1,3,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494
2,3,M5B,Downtown Toronto,"Garden District, Ryerson",43.657162,-79.378937
3,3,M5C,Downtown Toronto,St. James Town,43.651494,-79.375418
4,0,M4E,East Toronto,The Beaches,43.676357,-79.293031


In [33]:

# Define the map, opened at a hard-coded centre coordinate and zoom level.
Tr_neighborhood_Clusters = folium.Map(location=[43.651070, -79.347015], zoom_start=12)

# Create the colour scheme: one hex colour per cluster, evenly spaced along the rainbow colormap.
colors_array = cm.rainbow(np.linspace(0, 1, k))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# Add one circle marker per postal code, coloured by its cluster.
for lat, lng, neighbourhood, cluster in zip(df_tr2['Latitude'], df_tr2['Longitude'], df_tr2['Neighbourhood'], df_tr2['Cluster_No']):
    # The popup names the neighbourhood(s) and the cluster they were assigned to.
    label = folium.Popup(str(neighbourhood) + ' Cluster ' + str(cluster), parse_html=True)
    # Cluster labels are 0-based, so they index the colour list directly
    # (cluster - 1 only worked for cluster 0 through negative-index wraparound).
    cluster_color = rainbow[cluster]
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color=cluster_color,
        fill=True,
        fill_color=cluster_color,
        fill_opacity=0.7).add_to(Tr_neighborhood_Clusters)

# Display the map as the cell output.
Tr_neighborhood_Clusters