# Segmenting and Clustering Neighborhoods in Toronto

### Creating a dataframe by webscraping postcode, borough and neighbourhood information of Toronto from Wikipedia

In [12]:
#Importing the libraries

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [13]:
#Importing BeautifulSoup in order to extract data from Wikipedia

from urllib.request import urlopen
from bs4 import BeautifulSoup

url = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"

html=urlopen(url)  

soup=BeautifulSoup(html,'lxml')
type(soup)

bs4.BeautifulSoup

In [14]:
#Retrieving rows of the dataset from the Toronto postal codes Wikipedia page

rows=soup.find_all('tr')
print(rows[:5])

[<tr>
<th>Postal Code
</th>
<th>Borough
</th>
<th>Neighbourhood
</th></tr>, <tr>
<td>M1A
</td>
<td>Not assigned
</td>
<td>Not assigned
</td></tr>, <tr>
<td>M2A
</td>
<td>Not assigned
</td>
<td>Not assigned
</td></tr>, <tr>
<td>M3A
</td>
<td>North York
</td>
<td>Parkwoods
</td></tr>, <tr>
<td>M4A
</td>
<td>North York
</td>
<td>Victoria Village
</td></tr>]


In [15]:
#retrieved table cells of the data from the Wikipedia page

for row in rows:
    
    row_td=soup.find_all('td')
    
print(row_td)
type(row_td)

[<td>M1A
</td>, <td>Not assigned
</td>, <td>Not assigned
</td>, <td>M2A
</td>, <td>Not assigned
</td>, <td>Not assigned
</td>, <td>M3A
</td>, <td>North York
</td>, <td>Parkwoods
</td>, <td>M4A
</td>, <td>North York
</td>, <td>Victoria Village
</td>, <td>M5A
</td>, <td>Downtown Toronto
</td>, <td>Regent Park, Harbourfront
</td>, <td>M6A
</td>, <td>North York
</td>, <td>Lawrence Manor, Lawrence Heights
</td>, <td>M7A
</td>, <td>Downtown Toronto
</td>, <td>Queen's Park, Ontario Provincial Government
</td>, <td>M8A
</td>, <td>Not assigned
</td>, <td>Not assigned
</td>, <td>M9A
</td>, <td>Etobicoke
</td>, <td>Islington Avenue, Humber Valley Village
</td>, <td>M1B
</td>, <td>Scarborough
</td>, <td>Malvern, Rouge
</td>, <td>M2B
</td>, <td>Not assigned
</td>, <td>Not assigned
</td>, <td>M3B
</td>, <td>North York
</td>, <td>Don Mills
</td>, <td>M4B
</td>, <td>East York
</td>, <td>Parkview Hill, Woodbine Gardens
</td>, <td>M5B
</td>, <td>Downtown Toronto
</td>, <td>Garden District, Ryerson
</td>

bs4.element.ResultSet

In [16]:
#Clean text by removing HTML tags

str_cells=str(row_td)
cleantext=BeautifulSoup(str_cells,'lxml').get_text()
print(cleantext)

[M1A
, Not assigned
, Not assigned
, M2A
, Not assigned
, Not assigned
, M3A
, North York
, Parkwoods
, M4A
, North York
, Victoria Village
, M5A
, Downtown Toronto
, Regent Park, Harbourfront
, M6A
, North York
, Lawrence Manor, Lawrence Heights
, M7A
, Downtown Toronto
, Queen's Park, Ontario Provincial Government
, M8A
, Not assigned
, Not assigned
, M9A
, Etobicoke
, Islington Avenue, Humber Valley Village
, M1B
, Scarborough
, Malvern, Rouge
, M2B
, Not assigned
, Not assigned
, M3B
, North York
, Don Mills
, M4B
, East York
, Parkview Hill, Woodbine Gardens
, M5B
, Downtown Toronto
, Garden District, Ryerson
, M6B
, North York
, Glencairn
, M7B
, Not assigned
, Not assigned
, M8B
, Not assigned
, Not assigned
, M9B
, Etobicoke
, West Deane Park, Princess Gardens, Martin Grove, Islington, Cloverdale
, M1C
, Scarborough
, Rouge Hill, Port Union, Highland Creek
, M2C
, Not assigned
, Not assigned
, M3C
, North York
, Don Mills
, M4C
, East York
, Woodbine Heights
, M5C
, Downtown To

In [17]:
#Seperated the postal codes, borough and neighbourhood in to a list

import re

list_rows=[]

for row in rows:
    cells=row.find_all('td')
    str_cells= str(cells)
    clean=re.compile('<.*?>')
    clean2=(re.sub(clean,'',str_cells))
    list_rows.append(clean2)
    print(clean2)
    type(clean2)

[]
[M1A
, Not assigned
, Not assigned
]
[M2A
, Not assigned
, Not assigned
]
[M3A
, North York
, Parkwoods
]
[M4A
, North York
, Victoria Village
]
[M5A
, Downtown Toronto
, Regent Park, Harbourfront
]
[M6A
, North York
, Lawrence Manor, Lawrence Heights
]
[M7A
, Downtown Toronto
, Queen's Park, Ontario Provincial Government
]
[M8A
, Not assigned
, Not assigned
]
[M9A
, Etobicoke
, Islington Avenue, Humber Valley Village
]
[M1B
, Scarborough
, Malvern, Rouge
]
[M2B
, Not assigned
, Not assigned
]
[M3B
, North York
, Don Mills
]
[M4B
, East York
, Parkview Hill, Woodbine Gardens
]
[M5B
, Downtown Toronto
, Garden District, Ryerson
]
[M6B
, North York
, Glencairn
]
[M7B
, Not assigned
, Not assigned
]
[M8B
, Not assigned
, Not assigned
]
[M9B
, Etobicoke
, West Deane Park, Princess Gardens, Martin Grove, Islington, Cloverdale
]
[M1C
, Scarborough
, Rouge Hill, Port Union, Highland Creek
]
[M2C
, Not assigned
, Not assigned
]
[M3C
, North York
, Don Mills
]
[M4C
, East York
, Woodbine Hei

In [18]:
#converted the list into a DataFrame using Pandas


df=pd.DataFrame(list_rows)
df.head(10)

Unnamed: 0,0
0,[]
1,"[M1A\n, Not assigned\n, Not assigned\n]"
2,"[M2A\n, Not assigned\n, Not assigned\n]"
3,"[M3A\n, North York\n, Parkwoods\n]"
4,"[M4A\n, North York\n, Victoria Village\n]"
5,"[M5A\n, Downtown Toronto\n, Regent Park, Harbo..."
6,"[M6A\n, North York\n, Lawrence Manor, Lawrence..."
7,"[M7A\n, Downtown Toronto\n, Queen's Park, Onta..."
8,"[M8A\n, Not assigned\n, Not assigned\n]"
9,"[M9A\n, Etobicoke\n, Islington Avenue, Humber ..."


In [19]:
#Split the Dataframe into columns from '/n'

df1=df[0].str.split('\n,',expand=True)
df1.head(10)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,21,22,23,24,25,26,27,28,29,30
0,[],,,,,,,,,,...,,,,,,,,,,
1,[M1A,Not assigned,Not assigned\n],,,,,,,,...,,,,,,,,,,
2,[M2A,Not assigned,Not assigned\n],,,,,,,,...,,,,,,,,,,
3,[M3A,North York,Parkwoods\n],,,,,,,,...,,,,,,,,,,
4,[M4A,North York,Victoria Village\n],,,,,,,,...,,,,,,,,,,
5,[M5A,Downtown Toronto,"Regent Park, Harbourfront\n]",,,,,,,,...,,,,,,,,,,
6,[M6A,North York,"Lawrence Manor, Lawrence Heights\n]",,,,,,,,...,,,,,,,,,,
7,[M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government\n]",,,,,,,,...,,,,,,,,,,
8,[M8A,Not assigned,Not assigned\n],,,,,,,,...,,,,,,,,,,
9,[M9A,Etobicoke,"Islington Avenue, Humber Valley Village\n]",,,,,,,,...,,,,,,,,,,


In [20]:
#Striped '[' from each row of the postcode column

df1[0]=df1[0].str.strip('[')
df1.head(10)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,21,22,23,24,25,26,27,28,29,30
0,],,,,,,,,,,...,,,,,,,,,,
1,M1A,Not assigned,Not assigned\n],,,,,,,,...,,,,,,,,,,
2,M2A,Not assigned,Not assigned\n],,,,,,,,...,,,,,,,,,,
3,M3A,North York,Parkwoods\n],,,,,,,,...,,,,,,,,,,
4,M4A,North York,Victoria Village\n],,,,,,,,...,,,,,,,,,,
5,M5A,Downtown Toronto,"Regent Park, Harbourfront\n]",,,,,,,,...,,,,,,,,,,
6,M6A,North York,"Lawrence Manor, Lawrence Heights\n]",,,,,,,,...,,,,,,,,,,
7,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government\n]",,,,,,,,...,,,,,,,,,,
8,M8A,Not assigned,Not assigned\n],,,,,,,,...,,,,,,,,,,
9,M9A,Etobicoke,"Islington Avenue, Humber Valley Village\n]",,,,,,,,...,,,,,,,,,,


In [21]:
#Retrieved the column headers from the Toronto postal codes Wikipedia page 

col_labels=soup.find_all('th')

In [22]:
#Created a list of the headers from the Wikipedia page

all_header=[]

col_str=str(col_labels)
cleantext2=BeautifulSoup(col_str,'lxml').get_text()
all_header.append(cleantext2)
print(all_header)

['[Postal Code\n, Borough\n, Neighbourhood\n, Canadian postal codes\n]']


In [23]:
#converted the headers into a Dataframe

df2=pd.DataFrame(all_header)
df2.head()

Unnamed: 0,0
0,"[Postal Code\n, Borough\n, Neighbourhood\n, Ca..."


In [24]:
#Split the dataframe into columns from the '\n'

df3=df2[0].str.split('\n,',expand=True)
df3.head()

Unnamed: 0,0,1,2,3
0,[Postal Code,Borough,Neighbourhood,Canadian postal codes\n]


In [25]:
#Removed the '[' from column 0

df3[0]=df3[0].str.strip('[')
df3.head()

Unnamed: 0,0,1,2,3
0,Postal Code,Borough,Neighbourhood,Canadian postal codes\n]


In [26]:
#Merged df3 to df1 to create df4 

frames=[df3,df1]

df4=pd.concat(frames)
df4.head(10)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,21,22,23,24,25,26,27,28,29,30
0,Postal Code,Borough,Neighbourhood,Canadian postal codes\n],,,,,,,...,,,,,,,,,,
0,],,,,,,,,,,...,,,,,,,,,,
1,M1A,Not assigned,Not assigned\n],,,,,,,,...,,,,,,,,,,
2,M2A,Not assigned,Not assigned\n],,,,,,,,...,,,,,,,,,,
3,M3A,North York,Parkwoods\n],,,,,,,,...,,,,,,,,,,
4,M4A,North York,Victoria Village\n],,,,,,,,...,,,,,,,,,,
5,M5A,Downtown Toronto,"Regent Park, Harbourfront\n]",,,,,,,,...,,,,,,,,,,
6,M6A,North York,"Lawrence Manor, Lawrence Heights\n]",,,,,,,,...,,,,,,,,,,
7,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government\n]",,,,,,,,...,,,,,,,,,,
8,M8A,Not assigned,Not assigned\n],,,,,,,,...,,,,,,,,,,


In [27]:
#Removed columns 3 to 30

cols = df4.columns.tolist()
to_remove = cols[3:31]

df5=df4.loc[:, ~df4.columns.isin(to_remove)]
df5.head(10)

Unnamed: 0,0,1,2
0,Postal Code,Borough,Neighbourhood
0,],,
1,M1A,Not assigned,Not assigned\n]
2,M2A,Not assigned,Not assigned\n]
3,M3A,North York,Parkwoods\n]
4,M4A,North York,Victoria Village\n]
5,M5A,Downtown Toronto,"Regent Park, Harbourfront\n]"
6,M6A,North York,"Lawrence Manor, Lawrence Heights\n]"
7,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government\n]"
8,M8A,Not assigned,Not assigned\n]


In [28]:
#Renamed columns to Postal code, Borough and Neighbourhood

df6=df5.rename(columns=df5.iloc[0])
df6.head(10)


Unnamed: 0,Postal Code,Borough,Neighbourhood
0,Postal Code,Borough,Neighbourhood
0,],,
1,M1A,Not assigned,Not assigned\n]
2,M2A,Not assigned,Not assigned\n]
3,M3A,North York,Parkwoods\n]
4,M4A,North York,Victoria Village\n]
5,M5A,Downtown Toronto,"Regent Park, Harbourfront\n]"
6,M6A,North York,"Lawrence Manor, Lawrence Heights\n]"
7,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government\n]"
8,M8A,Not assigned,Not assigned\n]


In [29]:
df6

Unnamed: 0,Postal Code,Borough,Neighbourhood
0,Postal Code,Borough,Neighbourhood
0,],,
1,M1A,Not assigned,Not assigned\n]
2,M2A,Not assigned,Not assigned\n]
3,M3A,North York,Parkwoods\n]
...,...,...,...
180,M9Z,Not assigned,Not assigned\n]
181,,\n],
182,\n\n\nNL\n\nNS\n\nPE\n\nNB\n\nQC\n\nON\n\nMB\n...,NL,NS
183,NL,NS,PE


In [30]:
df6['Postal Code']=df6['Postal Code'].str.strip('NL')

In [31]:
df6.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 186 entries, 0 to 184
Data columns (total 3 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   Postal Code     186 non-null    object
 1    Borough        185 non-null    object
 2    Neighbourhood  184 non-null    object
dtypes: object(3)
memory usage: 5.8+ KB


In [32]:
#Removed the last 4 rows of the dataframe as there was no useful information

df7=df6[:-4]

In [33]:
df7

Unnamed: 0,Postal Code,Borough,Neighbourhood
0,Postal Code,Borough,Neighbourhood
0,],,
1,M1A,Not assigned,Not assigned\n]
2,M2A,Not assigned,Not assigned\n]
3,M3A,North York,Parkwoods\n]
...,...,...,...
176,M5Z,Not assigned,Not assigned\n]
177,M6Z,Not assigned,Not assigned\n]
178,M7Z,Not assigned,Not assigned\n]
179,M8Z,Etobicoke,"Mimico NW, The Queensway West, South of Bloor..."


In [34]:
df7.columns

Index(['Postal Code', ' Borough', ' Neighbourhood'], dtype='object')

In [35]:
#Renamed Dataframe columns

df8=df7.rename(columns={' Borough':'Borough',' Neighbourhood':'Neighbourhood'})

In [36]:
df8.head()

Unnamed: 0,Postal Code,Borough,Neighbourhood
0,Postal Code,Borough,Neighbourhood
0,],,
1,M1A,Not assigned,Not assigned\n]
2,M2A,Not assigned,Not assigned\n]
3,M3A,North York,Parkwoods\n]


In [37]:
#Dropped the first row of the dataframe

df8.drop(df8.index[0], inplace=True)

In [38]:
df8

Unnamed: 0,Postal Code,Borough,Neighbourhood
1,M1A,Not assigned,Not assigned\n]
2,M2A,Not assigned,Not assigned\n]
3,M3A,North York,Parkwoods\n]
4,M4A,North York,Victoria Village\n]
5,M5A,Downtown Toronto,"Regent Park, Harbourfront\n]"
...,...,...,...
176,M5Z,Not assigned,Not assigned\n]
177,M6Z,Not assigned,Not assigned\n]
178,M7Z,Not assigned,Not assigned\n]
179,M8Z,Etobicoke,"Mimico NW, The Queensway West, South of Bloor..."


In [39]:
#Deleted all rows where the borough was not assigned

df9=df8[~df8.Borough.str.contains("Not assigned")]

In [40]:
df9

Unnamed: 0,Postal Code,Borough,Neighbourhood
3,M3A,North York,Parkwoods\n]
4,M4A,North York,Victoria Village\n]
5,M5A,Downtown Toronto,"Regent Park, Harbourfront\n]"
6,M6A,North York,"Lawrence Manor, Lawrence Heights\n]"
7,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government\n]"
...,...,...,...
161,M8X,Etobicoke,"The Kingsway, Montgomery Road, Old Mill North\n]"
166,M4Y,Downtown Toronto,Church and Wellesley\n]
169,M7Y,East Toronto,"Business reply mail Processing Centre, South ..."
170,M8Y,Etobicoke,"Old Mill South, King's Mill Park, Sunnylea, H..."


In [41]:
#Removed '\n]' from the neighbourhood column

df10 = df9.assign(result=df9['Neighbourhood'].str.replace('\n]', ''))

df10.drop(columns='Neighbourhood',axis=0,inplace=True)

df10

  app.launch_new_instance()


Unnamed: 0,Postal Code,Borough,result
3,M3A,North York,Parkwoods
4,M4A,North York,Victoria Village
5,M5A,Downtown Toronto,"Regent Park, Harbourfront"
6,M6A,North York,"Lawrence Manor, Lawrence Heights"
7,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
...,...,...,...
161,M8X,Etobicoke,"The Kingsway, Montgomery Road, Old Mill North"
166,M4Y,Downtown Toronto,Church and Wellesley
169,M7Y,East Toronto,"Business reply mail Processing Centre, South ..."
170,M8Y,Etobicoke,"Old Mill South, King's Mill Park, Sunnylea, H..."


In [42]:
#Renamed the results table to Neighbourhood

df10.rename(columns={'result':'Neighbourhood'},inplace=True)

In [43]:
#Reset the index

df10.reset_index(drop=True,inplace=True)

In [44]:
df10

Unnamed: 0,Postal Code,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
...,...,...,...
98,M8X,Etobicoke,"The Kingsway, Montgomery Road, Old Mill North"
99,M4Y,Downtown Toronto,Church and Wellesley
100,M7Y,East Toronto,"Business reply mail Processing Centre, South ..."
101,M8Y,Etobicoke,"Old Mill South, King's Mill Park, Sunnylea, H..."


In [45]:
df10.shape

(103, 3)

# Part 2, Adding Longitude and Latitude data onto the the existing Toronto postal code Dataframe

In [46]:
#Read the given CSV file

url="http://cocl.us/Geospatial_data"

coords=pd.read_csv(url)

In [47]:
#Displayed the coordinates of the dataframe and names it coords

coords

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476
...,...,...,...
98,M9N,43.706876,-79.518188
99,M9P,43.696319,-79.532242
100,M9R,43.688905,-79.554724
101,M9V,43.739416,-79.588437


In [48]:
#Renamed the postal code column of the coordinates dataframe to ensure it matched with the postal code column on df10

coords.rename(columns={'Postal Code':'Postal Code'}, inplace=True)


In [49]:
#Merged dataframe coords with df10

full=pd.merge(df10, coords, on='Postal Code', suffixes=("Latitude", "Longitude"))


In [50]:
full

Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.654260,-79.360636
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494
...,...,...,...,...,...
84,M8X,Etobicoke,"The Kingsway, Montgomery Road, Old Mill North",43.653654,-79.506944
85,M4Y,Downtown Toronto,Church and Wellesley,43.665860,-79.383160
86,M7Y,East Toronto,"Business reply mail Processing Centre, South ...",43.662744,-79.321558
87,M8Y,Etobicoke,"Old Mill South, King's Mill Park, Sunnylea, H...",43.636258,-79.498509


# Part 3, Clustering The Neighbourhoods into Boroughs of Toronto

In [53]:
#Install forge folium and import the folium library

!conda install -c conda-forge folium=0.5.0 --yes
import folium

!pip install geopy
from geopy.geocoders import Nominatim

Collecting package metadata (current_repodata.json): done
Solving environment: done

# All requested packages already installed.



In [54]:
#Credentials taken from Foursquare

CLIENT_ID = '02OC1N5XYN4SAIHGDFYU2BHZOVZO0KXTWKB02GGYK1XM04A1' # your Foursquare ID
CLIENT_SECRET = 'T2TUUH0D2TUSG23IG5YE5URJWMG144HRRIGK4QOC1HUFODRC' # your Foursquare Secret
ACCESS_TOKEN = 'SUHVDHFN15EKU3HDY3P5RG0DT5HC5IHXTM0KYBJVJY3VYRLK' # your FourSquare Access Token
VERSION = '20180604'
LIMIT = 30
print('Your credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET:' + CLIENT_SECRET)

Your credentails:
CLIENT_ID: 02OC1N5XYN4SAIHGDFYU2BHZOVZO0KXTWKB02GGYK1XM04A1
CLIENT_SECRET:T2TUUH0D2TUSG23IG5YE5URJWMG144HRRIGK4QOC1HUFODRC


In [55]:
#Collected coordinates of Toronto

address= 'Toronto, Ontario'

geolocator = Nominatim(user_agent="foursquare_agent")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Toronto are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Toronto are 43.6534817, -79.3839347.


In [58]:
#Created a map of Toronto with Neighbourhoods and Boroughs superimposed

map_toronto=folium.Map(location=[latitude,longitude], zoom_start=10)

for lat, lng, borough, neighborhood in zip(full['Latitude'], full['Longitude'], full['Borough'], full['Neighbourhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_toronto)
    
map_toronto