# Segmenting and Clustering Neighborhoods in Toronto

First, let's import the required libraries:

In [1]:
import numpy as np # library to handle data in a vectorized manner

import pandas as pd # library for data analsysis
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

import json # library to handle JSON files

import requests # library to handle requests
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe

from bs4 import BeautifulSoup #library for beautifulSoup

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans
import geocoder # import geocoder
import folium # map rendering library

print('Libraries imported.')

Libraries imported.


### Scraping of the Wikipedia page

In [2]:
url = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"

# Download the wiki page. The timeout keeps the cell from hanging forever on a
# stalled connection, and raise_for_status() stops the notebook on an HTTP
# error instead of silently parsing an error page.
response = requests.get(url, timeout=30)
response.raise_for_status()

#Get the content of the wiki page as a string
results = response.text

#Parse the HTML content with BeautifulSoup (lxml parser)
page = BeautifulSoup(results, "lxml")
print(page)

<!DOCTYPE html>
<html class="client-nojs" dir="ltr" lang="en">
<head>
<meta charset="utf-8"/>
<title>List of postal codes of Canada: M - Wikipedia</title>
<script>document.documentElement.className = document.documentElement.className.replace( /(^|\s)client-nojs(\s|$)/, "$1client-js$2" );</script>
<script>(window.RLQ=window.RLQ||[]).push(function(){mw.config.set({"wgCanonicalNamespace":"","wgCanonicalSpecialPageName":false,"wgNamespaceNumber":0,"wgPageName":"List_of_postal_codes_of_Canada:_M","wgTitle":"List of postal codes of Canada: M","wgCurRevisionId":890001695,"wgRevisionId":890001695,"wgArticleId":539066,"wgIsArticle":true,"wgIsRedirect":false,"wgAction":"view","wgUserName":null,"wgUserGroups":["*"],"wgCategories":["Communications in Ontario","Postal codes in Canada","Toronto","Ontario-related lists"],"wgBreakFrames":false,"wgPageContentLanguage":"en","wgPageContentModel":"wikitext","wgSeparatorTransformTable":["",""],"wgDigitTransformTable":["",""],"wgDefaultDateFormat":"dmy","w

In [3]:
# Pull the <title> element out of the parsed page as a quick look at what was parsed
title = page.title
print(title)

<title>List of postal codes of Canada: M - Wikipedia</title>


In [4]:
# Display the first <tbody> element of the parsed page (the postcode table rows)
page.tbody

<tbody><tr>
<th>Postcode</th>
<th>Borough</th>
<th>Neighbourhood
</th></tr>
<tr>
<td>M1A</td>
<td>Not assigned</td>
<td>Not assigned
</td></tr>
<tr>
<td>M2A</td>
<td>Not assigned</td>
<td>Not assigned
</td></tr>
<tr>
<td>M3A</td>
<td><a href="/wiki/North_York" title="North York">North York</a></td>
<td><a href="/wiki/Parkwoods" title="Parkwoods">Parkwoods</a>
</td></tr>
<tr>
<td>M4A</td>
<td><a href="/wiki/North_York" title="North York">North York</a></td>
<td><a href="/wiki/Victoria_Village" title="Victoria Village">Victoria Village</a>
</td></tr>
<tr>
<td>M5A</td>
<td><a href="/wiki/Downtown_Toronto" title="Downtown Toronto">Downtown Toronto</a></td>
<td><a href="/wiki/Harbourfront_(Toronto)" title="Harbourfront (Toronto)">Harbourfront</a>
</td></tr>
<tr>
<td>M5A</td>
<td><a href="/wiki/Downtown_Toronto" title="Downtown Toronto">Downtown Toronto</a></td>
<td><a href="/wiki/Regent_Park" title="Regent Park">Regent Park</a>
</td></tr>
<tr>
<td>M6A</td>
<td><a href="/wiki/North_York" tit

#notes perso

<!--
page.tbody.tr.contents
#['\n', <th>Postcode</th>, '\n', <th>Borough</th>, '\n', <th>Neighbourhood</th>]
page.tbody.tr.children
#<list_iterator at 0x25901d1d2b0>
page.tbody.tr.descendants
#<generator object Tag.descendants at 0x0000025901D57318>
-->

### Get the columns of the DataFrame

In [5]:
# The column names are the stripped text pieces of the table's first row
col = list(page.tbody.tr.stripped_strings)

# Rename the third column to the spelling used in the rest of the notebook
col[2] = 'Neighborhood'

# Empty DataFrame that carries only the column names for now
postal_code = pd.DataFrame(columns=col)
postal_code

Unnamed: 0,Postcode,Borough,Neighborhood


### Fill the Dataframe with the postcodes

In [6]:
tab = page.tbody

# Collect one record per table row and build the DataFrame once at the end.
# Growing a DataFrame with DataFrame.append inside a loop copies the whole
# frame on every iteration, and DataFrame.append was removed in pandas 2.0.
# Building the frame from scratch also makes this cell safe to re-run.
rows = []
for row in tab.find_all('tr'):
    # row.contents alternates '\n' strings and cells, so the three cells of a
    # row sit at positions 1, 3 and 5.
    borough = row.contents[3].string

    #We remove the empty postcode and the first line containing the column names
    if borough in ('Not assigned', 'Borough'):
        continue

    rows.append({'Postcode': row.contents[1].string,
                 'Borough': borough,
                 # only the first child of the cell (a link or plain text) is used
                 'Neighborhood': row.contents[5].contents[0].string})

postal_code = pd.DataFrame(rows, columns=['Postcode', 'Borough', 'Neighborhood'])

#Remove all '\n' in the dataFrame
postal_code = postal_code.replace('\n', '', regex=True)
postal_code.head(10)

Unnamed: 0,Postcode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Harbourfront
3,M5A,Downtown Toronto,Regent Park
4,M6A,North York,Lawrence Heights
5,M6A,North York,Lawrence Manor
6,M7A,Queen's Park,Not assigned
7,M9A,Etobicoke,Islington Avenue
8,M1B,Scarborough,Rouge
9,M1B,Scarborough,Malvern


In [7]:
# (rows, columns) of the scraped table, still one row per neighborhood
postal_code.shape

(211, 3)

In [8]:
# To reshape the dataframe, first keep a single row (the first occurrence)
# for every distinct Postcode/Borough pair
unique_postal_code = postal_code.drop_duplicates(subset=['Postcode', 'Borough'], keep='first')

In [9]:
# Show only the first rows: display.max_rows is set to None at the top of the
# notebook, so a bare expression would dump the entire frame
unique_postal_code.head(10)

Unnamed: 0,Postcode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Harbourfront
4,M6A,North York,Lawrence Heights
6,M7A,Queen's Park,Not assigned
7,M9A,Etobicoke,Islington Avenue
8,M1B,Scarborough,Rouge
10,M3B,North York,Don Mills North
11,M4B,East York,Woodbine Gardens
13,M5B,Downtown Toronto,Ryerson


### Combine the neighborhoods of each postcode into one row

In [10]:
# For every Postcode/Borough pair, join the names of all its neighborhoods into
# one comma-separated string. groupby(sort=False) keeps the pairs in order of
# first appearance, and dict.fromkeys drops repeated names while preserving
# their order.
# This replaces a nested loop that compared rows through ndarray.all()/.any()
# (on object arrays of strings these reduce to one of the elements instead of
# comparing the rows field by field, so the borough was never actually
# checked) and that grew the frame with DataFrame.append, which was removed in
# pandas 2.0.
combined_rows = (
    postal_code
    .groupby(['Postcode', 'Borough'], sort=False)['Neighborhood']
    .agg(lambda names: ', '.join(dict.fromkeys(names)))
    .reset_index()
)

#Append the new lines, each holding all Neighborhoods of one Postcode/Borough,
#after the existing rows
postal_code = pd.concat([postal_code, combined_rows], ignore_index=True)


In [11]:
# Look at the last rows of the frame, where the combined rows were appended
postal_code.tail()

Unnamed: 0,Postcode,Borough,Neighborhood
309,M8X,Etobicoke,"The Kingsway, Montgomery Road, Old Mill North"
310,M4Y,Downtown Toronto,Church and Wellesley
311,M7Y,East Toronto,Business Reply Mail Processing Centre 969 Eastern
312,M8Y,Etobicoke,"Humber Bay, King's Mill Park, Kingsway Park So..."
313,M8Z,Etobicoke,"Kingsway Park South West, Mimico NW, The Queen..."


In [12]:
# Of the rows sharing a Postcode/Borough pair, keep only the last one (the
# combined row appended at the end of the frame), then renumber the index
postal_code = (
    postal_code
    .drop_duplicates(subset=['Postcode', 'Borough'], keep='last')
    .reset_index(drop=True)
)

In [13]:
# Where the neighborhood is 'Not assigned', fall back to the borough name.
# A boolean mask with .loc replaces the former chained assignment
# (postal_code.iloc[4]['Neighborhood'] = ...): that form writes to an
# intermediate row object which pandas does not guarantee to be a view of the
# frame, and it hard-coded the position of the affected row.
not_assigned = postal_code['Neighborhood'] == 'Not assigned'
postal_code.loc[not_assigned, 'Neighborhood'] = postal_code.loc[not_assigned, 'Borough']

### Final Dataframe

In [14]:
# Show only the first rows: display.max_rows is set to None at the top of the
# notebook, so a bare expression would dump the entire frame
postal_code.head(10)

Unnamed: 0,Postcode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Harbourfront, Regent Park"
3,M6A,North York,"Lawrence Heights, Lawrence Manor"
4,M7A,Queen's Park,Queen's Park
5,M9A,Etobicoke,Islington Avenue
6,M1B,Scarborough,"Rouge, Malvern"
7,M3B,North York,Don Mills North
8,M4B,East York,"Woodbine Gardens, Parkview Hill"
9,M5B,Downtown Toronto,"Ryerson, Garden District"


In [15]:
# (rows, columns) of the reshaped table
postal_code.shape

(103, 3)

#Code for using geocoder but didn't work

# Upper bound on the lookups per postcode, so a geocoder that never answers
# cannot keep the loop spinning forever
MAX_GEOCODER_ATTEMPTS = 5

for index in postal_code.index:
    # Reset for every postcode: otherwise the first successful answer is
    # reused for all the following rows
    lat_lng_coords = None
    attempts = 0

    # retry until the coordinates are returned or the attempts run out
    while lat_lng_coords is None and attempts < MAX_GEOCODER_ATTEMPTS:
        g = geocoder.google('{}, Toronto, Ontario'.format(postal_code.loc[index, 'Postcode']))
        lat_lng_coords = g.latlng
        attempts += 1

    if lat_lng_coords is None:
        # no answer for this postcode: record the coordinates as missing
        latitude, longitude = np.nan, np.nan
    else:
        latitude, longitude = lat_lng_coords[0], lat_lng_coords[1]

    postal_code.loc[index, 'Latitude'] = latitude
    postal_code.loc[index, 'Longitude'] = longitude

postal_code.head()


### Import of the coordinates and add to the dataframe

In [16]:
# Read the postal code / latitude / longitude table from the CSV file
lat_lng_coords = pd.read_csv(filepath_or_buffer='Geospatial_Coordinates.csv')
lat_lng_coords.head(n=5)

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [17]:
# Index the coordinates by postal code so a row can be looked up by its code
lat_lng_coords = lat_lng_coords.set_index(keys='Postal Code')
lat_lng_coords.head(n=5)

Unnamed: 0_level_0,Latitude,Longitude
Postal Code,Unnamed: 1_level_1,Unnamed: 2_level_1
M1B,43.806686,-79.194353
M1C,43.784535,-79.160497
M1E,43.763573,-79.188711
M1G,43.770992,-79.216917
M1H,43.773136,-79.239476


In [18]:
# Look up the coordinates of every postcode in one vectorized pass instead of
# one .loc lookup and two cell writes per row. lat_lng_coords is indexed by
# postal code, so Series.map can use its columns as lookup tables.
latitudes = postal_code['Postcode'].map(lat_lng_coords['Latitude'])
longitudes = postal_code['Postcode'].map(lat_lng_coords['Longitude'])

# Fail loudly, as the row-by-row lookup did, when a postcode has no coordinates
unmatched = postal_code.loc[latitudes.isna() | longitudes.isna(), 'Postcode']
if not unmatched.empty:
    raise KeyError('No coordinates found for: {}'.format(sorted(unmatched.unique())))

postal_code['Latitude'] = latitudes
postal_code['Longitude'] = longitudes

postal_code.head()

Unnamed: 0,Postcode,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Harbourfront, Regent Park",43.65426,-79.360636
3,M6A,North York,"Lawrence Heights, Lawrence Manor",43.718518,-79.464763
4,M7A,Queen's Park,Queen's Park,43.662301,-79.389494


### Exploration of the neighborhoods in Toronto

In [19]:
# base map centred on Toronto
map_toronto = folium.Map(zoom_start=11, location=[43.718498, -79.351138])

# one blue circle marker per postcode, with a "postcode, neighborhood: borough" popup
marker_fields = ('Latitude', 'Longitude', 'Postcode', 'Borough', 'Neighborhood')
for lat, lng, postcode, borough, neighborhood in zip(*(postal_code[field] for field in marker_fields)):
    popup_text = '{}, {}: {}'.format(postcode, neighborhood, borough)
    marker = folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=folium.Popup(popup_text, parse_html=True),
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False,
    )
    marker.add_to(map_toronto)

map_toronto

### Clustering of the neighborhoods in Toronto

The goal of this section is to group the postcodes by borough and to show each borough on a map in its own color, giving a clearer view of the different boroughs of Toronto.

First, I create a DataFrame containing the distinct boroughs of Toronto.

In [20]:
# One row per distinct borough, in order of first appearance
borough_df = pd.DataFrame({'Borough': postal_code['Borough'].unique()})

# Take a real copy of the original dataframe: a plain assignment would only
# bind a second name to the same DataFrame, so any column added to
# postal_code_borough would also appear in postal_code
postal_code_borough = postal_code.copy()
borough_df

Unnamed: 0,Borough
0,North York
1,Downtown Toronto
2,Queen's Park
3,Etobicoke
4,Scarborough
5,East York
6,York
7,East Toronto
8,West Toronto
9,Central Toronto


Then, for every row of the initial DataFrame, I add a new column whose value is the index of the matching borough in the borough DataFrame.

In [21]:
# Map every borough name to the index of its row in borough_df and store that
# number in a new integer 'Borough label' column. The lookup is done in one
# vectorized pass, and assign() returns a new DataFrame, so the frame that was
# bound to postal_code_borough before this cell is not modified.
label_by_borough = dict(zip(borough_df['Borough'], borough_df.index))
postal_code_borough = postal_code_borough.assign(
    **{'Borough label': postal_code_borough['Borough'].map(label_by_borough).astype(int)}
)

In [22]:
# Show only the first rows: display.max_rows is set to None at the top of the
# notebook, so a bare expression would dump the entire frame
postal_code_borough.head(10)

Unnamed: 0,Postcode,Borough,Neighborhood,Latitude,Longitude,Borough label
0,M3A,North York,Parkwoods,43.753259,-79.329656,0
1,M4A,North York,Victoria Village,43.725882,-79.315572,0
2,M5A,Downtown Toronto,"Harbourfront, Regent Park",43.65426,-79.360636,1
3,M6A,North York,"Lawrence Heights, Lawrence Manor",43.718518,-79.464763,0
4,M7A,Queen's Park,Queen's Park,43.662301,-79.389494,2
5,M9A,Etobicoke,Islington Avenue,43.667856,-79.532242,3
6,M1B,Scarborough,"Rouge, Malvern",43.806686,-79.194353,4
7,M3B,North York,Don Mills North,43.745906,-79.352188,0
8,M4B,East York,"Woodbine Gardens, Parkview Hill",43.706397,-79.309937,5
9,M5B,Downtown Toronto,"Ryerson, Garden District",43.657162,-79.378937,1


### Result of the clustered map

In [23]:
# create map
map_borough = folium.Map(location=[43.718498, -79.351138], zoom_start=11)

# set color scheme: one evenly spaced color of the rainbow colormap per borough
nbBorough = len(postal_code_borough['Borough'].unique())
colors_array = cm.rainbow(np.linspace(0, 1, nbBorough))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
for lat, lon, postcode, borough, neighborhood, borLabel in zip(postal_code_borough['Latitude'], postal_code_borough['Longitude'], postal_code_borough['Postcode'], postal_code_borough['Borough'], postal_code_borough['Neighborhood'], postal_code_borough['Borough label']):
    label = '{}, {}: {}'.format(postcode, neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    # Borough labels start at 0, so they index the color list directly
    # (the former rainbow[borLabel-1] only worked for label 0 through
    # negative-index wraparound)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[borLabel],
        fill=True,
        fill_color=rainbow[borLabel],
        fill_opacity=0.7).add_to(map_borough)

map_borough