# Segmenting and Clustering Neighborhoods in Toronto

# Import necessary liberaries

In [1]:
import numpy as np # library to handle data in a vectorized manner

import pandas as pd # library for data analsysis
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

import json # library to handle JSON files

#!conda install -c conda-forge geopy --yes # uncomment this line if you haven't completed the Foursquare API lab
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

import requests # library to handle requests
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans

#install BeautifulSoup
#! conda install -c conda-forge beautifulsoup4 --yes
from bs4 import BeautifulSoup

#!conda install -c conda-forge folium=0.5.0 --yes # uncomment this line if you haven't completed the Foursquare API lab
import folium # map rendering library

print('Libraries imported.')

Libraries imported.


Get url data for Toronto Neighbourhood from Canada postcode dataset. The data can be accessed via wikipedia: https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M

In [2]:
Canada_Postcode = requests.get('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M').text
print("Data Downloaded!")

Data Downloaded!


In [3]:
soup = BeautifulSoup(Canada_Postcode, 'lxml')
print(soup.prettify())

<!DOCTYPE html>
<html class="client-nojs" dir="ltr" lang="en">
 <head>
  <meta charset="utf-8"/>
  <title>
   List of postal codes of Canada: M - Wikipedia
  </title>
  <script>
   document.documentElement.className = document.documentElement.className.replace( /(^|\s)client-nojs(\s|$)/, "$1client-js$2" );
  </script>
  <script>
   (window.RLQ=window.RLQ||[]).push(function(){mw.config.set({"wgCanonicalNamespace":"","wgCanonicalSpecialPageName":false,"wgNamespaceNumber":0,"wgPageName":"List_of_postal_codes_of_Canada:_M","wgTitle":"List of postal codes of Canada: M","wgCurRevisionId":890001695,"wgRevisionId":890001695,"wgArticleId":539066,"wgIsArticle":true,"wgIsRedirect":false,"wgAction":"view","wgUserName":null,"wgUserGroups":["*"],"wgCategories":["Communications in Ontario","Postal codes in Canada","Toronto","Ontario-related lists"],"wgBreakFrames":false,"wgPageContentLanguage":"en","wgPageContentModel":"wikitext","wgSeparatorTransformTable":["",""],"wgDigitTransformTable":["",""],"wg

# Read table headers, rows, and cell data

In [4]:
my_table = soup.findAll("table",{"class":"wikitable sortable"})
my_table

[<table class="wikitable sortable">
 <tbody><tr>
 <th>Postcode</th>
 <th>Borough</th>
 <th>Neighbourhood
 </th></tr>
 <tr>
 <td>M1A</td>
 <td>Not assigned</td>
 <td>Not assigned
 </td></tr>
 <tr>
 <td>M2A</td>
 <td>Not assigned</td>
 <td>Not assigned
 </td></tr>
 <tr>
 <td>M3A</td>
 <td><a href="/wiki/North_York" title="North York">North York</a></td>
 <td><a href="/wiki/Parkwoods" title="Parkwoods">Parkwoods</a>
 </td></tr>
 <tr>
 <td>M4A</td>
 <td><a href="/wiki/North_York" title="North York">North York</a></td>
 <td><a href="/wiki/Victoria_Village" title="Victoria Village">Victoria Village</a>
 </td></tr>
 <tr>
 <td>M5A</td>
 <td><a href="/wiki/Downtown_Toronto" title="Downtown Toronto">Downtown Toronto</a></td>
 <td><a href="/wiki/Harbourfront_(Toronto)" title="Harbourfront (Toronto)">Harbourfront</a>
 </td></tr>
 <tr>
 <td>M5A</td>
 <td><a href="/wiki/Downtown_Toronto" title="Downtown Toronto">Downtown Toronto</a></td>
 <td><a href="/wiki/Regent_Park" title="Regent Park">Regent Pa

In [5]:
rows = my_table[0].findAll('tr')
len(rows)

289

In [6]:
headers = [th.text for th in rows[0].findAll('th')]
print(headers)

['Postcode', 'Borough', 'Neighbourhood\n']


In [7]:
title = soup.title.text
print(title)

List of postal codes of Canada: M - Wikipedia


In [23]:
FullTex = soup.find('div', class_='mw-parser-output').text
print(FullTex)

This is a list of postal codes in Canada where the first letter is M. Postal codes beginning with M are located within the city of Toronto in the province of Ontario. Only the first three characters are listed, corresponding to the Forward Sortation Area.
Canada Post provides a free postal code look-up tool on its website,[1] via its applications for such smartphones as the iPhone and BlackBerry,[2]  and sells hard-copy directories and CD-ROMs. Many vendors also sell validation tools, which allow customers to properly match addresses and postal codes. Hard-copy directories can also be consulted in all post offices, and some libraries.

Toronto - FSAs[edit]
Note: There are no rural FSAs in Toronto, hence no postal codes start with M0.



Postcode
Borough
Neighbourhood


M1A
Not assigned
Not assigned


M2A
Not assigned
Not assigned


M3A
North York
Parkwoods


M4A
North York
Victoria Village


M5A
Downtown Toronto
Harbourfront


M5A
Downtown Toronto
Regent Park


M6A
North York
Lawrence 

# Create a CSV file that contains all rows and columns of the table

In [8]:
for table in my_table:
    ths = table.findAll('th')
    headings = [th.text.strip() for th in ths]
    if headings[:3] == ['Postcode', 'Borough', 'Neighbourhood']:
        break
ths

[<th>Postcode</th>, <th>Borough</th>, <th>Neighbourhood
 </th>]

In [9]:
with open('Toronto_Postcode.csv', 'w') as fo:
    for tr in table.find_all('tr'):
        tds = tr.find_all('td')
        if not tds:
            continue
        Postcode, Borough, Neighbourhood = [td.text.strip() for td in tds[:3]]
       
        print(', '.join([Postcode, Borough, Neighbourhood]), file=fo)
tds

[<td>M9Z</td>, <td>Not assigned</td>, <td>Not assigned
 </td>]

In [10]:
df = pd.read_csv("Toronto_Postcode.csv", header = None)
df.columns = ["Postcode", "Borough", "Neighbourhood"]
df

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M5A,Downtown Toronto,Regent Park
6,M6A,North York,Lawrence Heights
7,M6A,North York,Lawrence Manor
8,M7A,Queen's Park,Not assigned
9,M8A,Not assigned,Not assigned


# Group row cells with same postcode and write all related neighbourhood using comma in one cell

In [11]:
grp_by = df.groupby(('Postcode','Borough'))
df1 = grp_by['Neighbourhood'].unique()
df1 = df1.reset_index()
df1

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1A,Not assigned,[ Not assigned]
1,M1B,Scarborough,"[ Rouge, Malvern]"
2,M1C,Scarborough,"[ Highland Creek, Rouge Hill, Port Union]"
3,M1E,Scarborough,"[ Guildwood, Morningside, West Hill]"
4,M1G,Scarborough,[ Woburn]
5,M1H,Scarborough,[ Cedarbrae]
6,M1J,Scarborough,[ Scarborough Village]
7,M1K,Scarborough,"[ East Birchmount Park, Ionview, Kennedy Park]"
8,M1L,Scarborough,"[ Clairlea, Golden Mile, Oakridge]"
9,M1M,Scarborough,"[ Cliffcrest, Cliffside, Scarborough Village..."


# Remove all rows that have Borough "Not Assigned" and check number of rows in the new dataframe

In [12]:
data = df1[~df1.Borough.str.contains("Not assigned")]
data

Unnamed: 0,Postcode,Borough,Neighbourhood
1,M1B,Scarborough,"[ Rouge, Malvern]"
2,M1C,Scarborough,"[ Highland Creek, Rouge Hill, Port Union]"
3,M1E,Scarborough,"[ Guildwood, Morningside, West Hill]"
4,M1G,Scarborough,[ Woburn]
5,M1H,Scarborough,[ Cedarbrae]
6,M1J,Scarborough,[ Scarborough Village]
7,M1K,Scarborough,"[ East Birchmount Park, Ionview, Kennedy Park]"
8,M1L,Scarborough,"[ Clairlea, Golden Mile, Oakridge]"
9,M1M,Scarborough,"[ Cliffcrest, Cliffside, Scarborough Village..."
10,M1N,Scarborough,"[ Birch Cliff, Cliffside West]"


In [13]:
data.iloc[85,2] = 'Queen\'s Park'
data = data.reset_index(drop=True)
data

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1B,Scarborough,"[ Rouge, Malvern]"
1,M1C,Scarborough,"[ Highland Creek, Rouge Hill, Port Union]"
2,M1E,Scarborough,"[ Guildwood, Morningside, West Hill]"
3,M1G,Scarborough,[ Woburn]
4,M1H,Scarborough,[ Cedarbrae]
5,M1J,Scarborough,[ Scarborough Village]
6,M1K,Scarborough,"[ East Birchmount Park, Ionview, Kennedy Park]"
7,M1L,Scarborough,"[ Clairlea, Golden Mile, Oakridge]"
8,M1M,Scarborough,"[ Cliffcrest, Cliffside, Scarborough Village..."
9,M1N,Scarborough,"[ Birch Cliff, Cliffside West]"


In [14]:
data.shape

(103, 3)

In [15]:
data.to_csv('Toronto_Neighbourhood', sep='\t', index=False)

# Work with geocode data to locate latitude and longitude of a given postcode of Canada

In [24]:
#!conda install -c conda-forge geocoder --yes

Solving environment: done


  current version: 4.4.10
  latest version: 4.6.12

Please update conda by running

    $ conda update -n base conda



## Package Plan ##

  environment location: /anaconda3

  added / updated specs: 
    - geocoder


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    geocoder-1.38.1            |             py_0          52 KB  conda-forge
    orderedset-2.0.1           |           py36_0          74 KB  conda-forge
    ratelim-0.1.6              |           py36_0           5 KB  conda-forge
    ------------------------------------------------------------
                                           Total:         132 KB

The following NEW packages will be INSTALLED:

    geocoder:   1.38.1-py_0  conda-forge
    orderedset: 2.0.1-py36_0 conda-forge
    ratelim:    0.1.6-py36_0 conda-forge


Downloading and Extracting Packages
geocoder 1.38.1: ###################

# Add gographical data for each postcode

To avoid comlication remove the first column from the csv 

In [16]:
data1 = pd.read_csv("Geospatial_Coordinates.csv", header=None)
data1.columns = ["Latitude", "Longitude"]
data1

Unnamed: 0,Latitude,Longitude
0,43.806686,-79.194353
1,43.784535,-79.160497
2,43.763573,-79.188711
3,43.770992,-79.216917
4,43.773136,-79.239476
5,43.744734,-79.239476
6,43.727929,-79.262029
7,43.711112,-79.284577
8,43.716316,-79.239476
9,43.692657,-79.264848


# Join dataframe of Borough and Neighbourhood with respective geographical location based on Postcode

In [17]:
Toronto_data = pd.concat([data, data1], axis=1)
Toronto_data

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
0,M1B,Scarborough,"[ Rouge, Malvern]",43.806686,-79.194353
1,M1C,Scarborough,"[ Highland Creek, Rouge Hill, Port Union]",43.784535,-79.160497
2,M1E,Scarborough,"[ Guildwood, Morningside, West Hill]",43.763573,-79.188711
3,M1G,Scarborough,[ Woburn],43.770992,-79.216917
4,M1H,Scarborough,[ Cedarbrae],43.773136,-79.239476
5,M1J,Scarborough,[ Scarborough Village],43.744734,-79.239476
6,M1K,Scarborough,"[ East Birchmount Park, Ionview, Kennedy Park]",43.727929,-79.262029
7,M1L,Scarborough,"[ Clairlea, Golden Mile, Oakridge]",43.711112,-79.284577
8,M1M,Scarborough,"[ Cliffcrest, Cliffside, Scarborough Village...",43.716316,-79.239476
9,M1N,Scarborough,"[ Birch Cliff, Cliffside West]",43.692657,-79.264848


In [18]:
Toronto_data.to_csv('Toronto_Neighbourhood_geo', sep='\t', index=False)

# Explore and cluster the neighborhoods in Toronto

Let's see number of Borough and Neighbourhood Toronto have

In [35]:
data1 = df[~df.Borough.str.contains("Not assigned")]
data1 = data1.reset_index(drop=True)
data1

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Harbourfront
3,M5A,Downtown Toronto,Regent Park
4,M6A,North York,Lawrence Heights
5,M6A,North York,Lawrence Manor
6,M7A,Queen's Park,Not assigned
7,M9A,Etobicoke,Islington Avenue
8,M1B,Scarborough,Rouge
9,M1B,Scarborough,Malvern


In [36]:
data1.iloc[6,2] = 'Queen\'s Park'
data1 = neighbourhoods.reset_index(drop=True)
data1

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Harbourfront
3,M5A,Downtown Toronto,Regent Park
4,M6A,North York,Lawrence Heights
5,M6A,North York,Lawrence Manor
6,M7A,Queen's Park,Not assigned
7,M9A,Etobicoke,Islington Avenue
8,M1B,Scarborough,Rouge
9,M1B,Scarborough,Malvern


In [37]:
print('The dataframe has {} boroughs and {} neighborhoods.'.format(
        len(data1['Borough'].unique()),
        data1.shape[0]
    )
)

The dataframe has 11 boroughs and 211 neighborhoods.


# Find latitude and longitude of Toronto using geopy liberary

In [38]:
address = 'Toronto'

geolocator = Nominatim(user_agent="ny_explorer")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Toronto are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Toronto are 43.653963, -79.387207.


In [40]:
# create map of Toronto using latitude and longitude values
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=10)

# add markers to map
for lat, lng, postcode, borough, neighborhood in zip(Toronto_data['Latitude'], Toronto_data['Longitude'], Toronto_data['Postcode'], Toronto_data['Borough'], Toronto_data['Neighbourhood']):
    label = '{}, {}'.format(postcode, neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_toronto)  
    
map_toronto

# Work on Borough that contain a word "Toronto"

First let's find Borough's that contain a word "Toronto" as follows;

In [46]:
Toronto_area = Toronto_data[Toronto_data['Borough'].str.contains('Toronto', regex=False, case=False, na=False)]
Toronto_area

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
37,M4E,East Toronto,[ The Beaches],43.676357,-79.293031
41,M4K,East Toronto,"[ The Danforth West, Riverdale]",43.679557,-79.352188
42,M4L,East Toronto,"[ The Beaches West, India Bazaar]",43.668999,-79.315572
43,M4M,East Toronto,[ Studio District],43.659526,-79.340923
44,M4N,Central Toronto,[ Lawrence Park],43.72802,-79.38879
45,M4P,Central Toronto,[ Davisville North],43.712751,-79.390197
46,M4R,Central Toronto,[ North Toronto West],43.715383,-79.405678
47,M4S,Central Toronto,[ Davisville],43.704324,-79.38879
48,M4T,Central Toronto,"[ Moore Park, Summerhill East]",43.689574,-79.38316
49,M4V,Central Toronto,"[ Deer Park, Forest Hill SE, Rathnelly, Sou...",43.686412,-79.400049


In [47]:
# create map of Toronto area using latitude and longitude values
map_toronto_area = folium.Map(location=[latitude, longitude], zoom_start=10)

# add markers to map
for lat, lng, postcode, borough, neighborhood in zip(Toronto_area['Latitude'], Toronto_area['Longitude'], Toronto_area['Postcode'], Toronto_area['Borough'], Toronto_area['Neighbourhood']):
    label = '{}, {}'.format(postcode, neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_toronto_area)  
    
map_toronto_area

# Thank you for reading my script

# P.S. some of the scripts are drived from different open source websites and I would like to thank whoever generated them initially!