# Segmenting and Clustering Neighborhoods in Toronto, Canada

## Part1

### Import libraries

In [154]:
import pandas as pd
import numpy as np
import requests

! pip install bs4
! pip install lxml==4.6.2

from bs4 import BeautifulSoup

!conda install -c conda-forge geopy --yes
from geopy.geocoders import Nominatim

!conda install -c conda-forge folium==0.5.0 --yes
import folium 
import matplotlib.cm as cm
import matplotlib.colors as colors

Collecting package metadata (current_repodata.json): done
Solving environment: done

# All requested packages already installed.

Collecting package metadata (current_repodata.json): done
Solving environment: done

# All requested packages already installed.



### Scrape postal code data of Toronto from the Wikipedia website and convert it to dataframe

Since the needed data is in a table, we can easily find it and transform into needed dataframe

In [92]:
# Get Wikipedia html
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
response = requests.get(url)

In [135]:
#  Use BeautifulSoup to transform html data into a dataframe
html_data = response.text
soup = BeautifulSoup(html_data, 'html.parser')
table = soup.find('table')
# Get dataframe
toronto_df = pd.read_html(str(table))[0]
toronto_df.head(10)

Unnamed: 0,Postal Code,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
5,M6A,North York,"Lawrence Manor, Lawrence Heights"
6,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
7,M8A,Not assigned,Not assigned
8,M9A,Etobicoke,"Islington Avenue, Humber Valley Village"
9,M1B,Scarborough,"Malvern, Rouge"


### Prepare a dataframe with new column names, process and clean data

Rename columns and in order to only process cells with an assigned borough, ignore cells with 'Not assigned' value for Borough

In [136]:
#Rename columns
toronto_df.columns=['PostalCode', 'Borough','Neighborhood']
toronto_df.head(10)

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
5,M6A,North York,"Lawrence Manor, Lawrence Heights"
6,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
7,M8A,Not assigned,Not assigned
8,M9A,Etobicoke,"Islington Avenue, Humber Valley Village"
9,M1B,Scarborough,"Malvern, Rouge"


In [137]:
#Remove 'Not assignmed' values from Borough
toronto_df = toronto_df[toronto_df['Borough'].str.contains("Not assigned") == False].reset_index()
toronto_df.head(10)

Unnamed: 0,index,PostalCode,Borough,Neighborhood
0,2,M3A,North York,Parkwoods
1,3,M4A,North York,Victoria Village
2,4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,5,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,6,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
5,8,M9A,Etobicoke,"Islington Avenue, Humber Valley Village"
6,9,M1B,Scarborough,"Malvern, Rouge"
7,11,M3B,North York,Don Mills
8,12,M4B,East York,"Parkview Hill, Woodbine Gardens"
9,13,M5B,Downtown Toronto,"Garden District, Ryerson"


#### If a cell has a borough but the neighborhood is 'Not assigned', then the 'Neighborhood' value becomes the same as'Borough' value 

In [138]:
toronto_df.loc[toronto_df['Neighborhood'] =='Not assigned' , 'Neighborhood'] = toronto_df['Borough']
toronto_df.head(10)


Unnamed: 0,index,PostalCode,Borough,Neighborhood
0,2,M3A,North York,Parkwoods
1,3,M4A,North York,Victoria Village
2,4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,5,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,6,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
5,8,M9A,Etobicoke,"Islington Avenue, Humber Valley Village"
6,9,M1B,Scarborough,"Malvern, Rouge"
7,11,M3B,North York,Don Mills
8,12,M4B,East York,"Parkview Hill, Woodbine Gardens"
9,13,M5B,Downtown Toronto,"Garden District, Ryerson"


#### Group neighborhoods with the same postal code together

In [140]:
toronto = toronto_df.groupby(['PostalCode','Borough'])['Neighborhood'].apply(','.join).reset_index()
toronto.head(10)

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1B,Scarborough,"Malvern, Rouge"
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae
5,M1J,Scarborough,Scarborough Village
6,M1K,Scarborough,"Kennedy Park, Ionview, East Birchmount Park"
7,M1L,Scarborough,"Golden Mile, Clairlea, Oakridge"
8,M1M,Scarborough,"Cliffside, Cliffcrest, Scarborough Village West"
9,M1N,Scarborough,"Birch Cliff, Cliffside West"


In [141]:
# Find number of rows in dataframe
toronto.shape

(103, 3)

## Part 2

The provided csv file is used to get the latitude and the longitude coordinates of each neighborhood.

In [142]:
#Get csv and turn data into dataframe
url = 'https://cocl.us/Geospatial_data/Geospatial_Coordinates.csv' 
df_geo=pd.read_csv(url)
df_geo.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [143]:
#Rename column
df_geo.rename(columns={'Postal Code':'PostalCode'}, inplace=True)
df_geo.head()

Unnamed: 0,PostalCode,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [144]:
# Get number of rows in df_geo dataframe
df_geo.shape

(103, 3)

#### The two dataframes have the same number of rows.
#### We can merge the first dataframe including borough and neighborhood name with df_geo dataframe on the 'PostalCode column' to add latitude and longitude information.

In [145]:
toronto = pd.merge(toronto, df_geo, on = 'PostalCode')
toronto.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,"Malvern, Rouge",43.806686,-79.194353
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476


In [74]:
#Get number of rows in new combined dataframe
toronto.shape

(103, 4)

## Part 3

#### Only Boroughs that contain the word Toronto will be used to generate map and visualize the neighborhoods and clusters

In [146]:
# Get dataframe with only the boroughs that contain Toronto
toronto = toronto[toronto['Borough'].str.contains('Toronto')]
toronto = toronto.reset_index(drop=True)
toronto.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M4E,East Toronto,The Beaches,43.676357,-79.293031
1,M4K,East Toronto,"The Danforth West, Riverdale",43.679557,-79.352188
2,M4L,East Toronto,"India Bazaar, The Beaches West",43.668999,-79.315572
3,M4M,East Toronto,Studio District,43.659526,-79.340923
4,M4N,Central Toronto,Lawrence Park,43.72802,-79.38879


In [147]:
# Get the number of values in dataframe
toronto.shape

(40, 5)

#### The Borough data is used to create a new 'Label' and dataframe for clustering

In [148]:
# Get values in Borough
toronto['Borough'].value_counts()

Downtown Toronto    19
Central Toronto      9
West Toronto         6
East Toronto         5
Toronto/York         1
Name: Borough, dtype: int64

There are 5 boroughs and 40 different postal codes

#### New 'Label' column is created and Borough data is added as integer

In [151]:
toronto['Label'] = toronto['Borough'].replace(to_replace=['Downtown Toronto','Central Toronto','West Toronto','East Toronto','Toronto/York'], 
                                              value=[1,2,3,4,5], inplace=False)
toronto.head(5)

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude,Label
0,M4E,East Toronto,The Beaches,43.676357,-79.293031,4
1,M4K,East Toronto,"The Danforth West, Riverdale",43.679557,-79.352188,4
2,M4L,East Toronto,"India Bazaar, The Beaches West",43.668999,-79.315572,4
3,M4M,East Toronto,Studio District,43.659526,-79.340923,4
4,M4N,Central Toronto,Lawrence Park,43.72802,-79.38879,2


#### The geographical coordinates of Toronto are called for visualization

In [157]:
address = 'Toronto'
geolocator = Nominatim(user_agent = 'toronto_explorer')
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geographical coordinates of Toronto are {}, {}.'.format(latitude, longitude))

The geographical coordinates of Toronto are 43.6534817, -79.3839347.


#### Map of clusters is prepared

In [183]:
# Set cluster as Label number
kclusters = len(toronto.Label.unique())

#Create map using coordinates
toronto_map = folium.Map(location=[latitude, longitude], zoom_start=11)

#Set color for clusters(boroughs)
x = np.arange(kclusters)
ys = [i + x +(i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# Add markers to map
markers_colors = []
for lat, lon, cluster in zip(toronto['Latitude'], toronto['Longitude'], toronto['Label']):
    label = folium.Popup(str(toronto['Borough'])+ 'Cluster' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7,
        parse_html=False).add_to(toronto_map)  

#Display map with 5 clustering
toronto_map