# Segmenting and Clustering Neighborhoods in Toronto

### Part 1 - Loading the Data

In [108]:
from io import StringIO

import folium
import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup

We are going to use the Wikipedia page (URL below) that contains a table of postal codes and convert it into a pandas DataFrame in Python.

In [109]:
# Wikipedia page whose first table is read into the DataFrame.
url = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"

In [110]:
# Download the page; fail fast on a hung connection or an HTTP error instead
# of silently parsing an error page.
response = requests.get(url, timeout=30)
response.raise_for_status()
source = response.text

# Collect every <table> element on the page.
soup = BeautifulSoup(source, 'lxml')
table = soup.find_all('table')

# read_html expects a file-like object rather than a literal HTML string,
# so wrap the markup in StringIO. The first parsed table is the one we keep.
df = pd.read_html(StringIO(str(table)))[0]
df.columns = ['PostalCode', 'Borough', 'Neighborhood']

Drop any rows whose borough is `Not assigned`.

In [111]:
# Turn the 'Not assigned' placeholder into NaN and drop those rows.
# Built as a non-mutating chain: calling replace(..., inplace=True) on the
# df['Borough'] selection is chained assignment, which is not guaranteed to
# write back to df, and re-binding df keeps the cell safe to re-run.
df = (
    df
    .assign(Borough=df['Borough'].replace('Not assigned', np.nan))
    .dropna(subset=['Borough'])
)

Since the Wikipedia table already groups the neighborhoods by postal code, replace the forward slash (` /`) that separates them with a comma in the Neighborhood column.

In [112]:
# Regex replacement applied to the whole frame: every " /" becomes ",".
df = df.replace(to_replace=' /', value=',', regex=True)

In [123]:
# Preview the first five rows of the table.
df.head(n=5)

Unnamed: 0,PostalCode,Borough,Neighborhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
5,M6A,North York,"Lawrence Manor, Lawrence Heights"
6,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


In [115]:
# (rows, columns) of the current table.
df.shape

(103, 3)

### Part 2 - Adding Geospatial Data

We read the coordinates for each postal code from a CSV file, then merge them into the existing DataFrame by matching on postal code.

In [128]:
# Coordinates table; its key column is named 'Postal Code' (with a space).
df_coords = pd.read_csv('Geospatial_Coordinates.csv')

# Left join keeps every row of df; the duplicate key column brought in from
# the coordinates table is dropped right after the join.
df = (
    df
    .merge(df_coords, how='left', left_on='PostalCode', right_on='Postal Code')
    .drop(columns='Postal Code')
)
df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494


### Part 3 - Clustering on a Map

In [132]:
# Centre point used as the `location` of the folium maps below.
# (folium itself is imported in the import cell at the top of the notebook.)
latitude = 43.7
longitude = -79.3

In [135]:
# Base map centred on (latitude, longitude).
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=10)

# Add one circle marker per row, with "<neighborhood>, <borough>" as popup text.
for lat, lng, borough, neighborhood in zip(df['Latitude'], df['Longitude'], df['Borough'], df['Neighborhood']):
    label = '{}, {}'.format(neighborhood, borough)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        # parse_html is a Popup option, so it is passed only here and not
        # to CircleMarker.
        popup=folium.Popup(label, parse_html=True),
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
    ).add_to(map_toronto)

# Last expression of the cell: render the map.
map_toronto

In [137]:
# Keep only the rows whose borough name contains "Toronto" and renumber them
# from zero.
toronto_data = (
    df
    .loc[lambda frame: frame['Borough'].str.contains("Toronto")]
    .reset_index(drop=True)
)
toronto_data.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
1,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494
2,M5B,Downtown Toronto,"Garden District, Ryerson",43.657162,-79.378937
3,M5C,Downtown Toronto,St. James Town,43.651494,-79.375418
4,M4E,East Toronto,The Beaches,43.676357,-79.293031


In [139]:
# Create a map of just the rows in toronto_data, using their latitude and
# longitude values. Note that this re-binds map_toronto.
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=11)

# Add one circle marker per row, with the neighborhood name as popup text.
for lat, lng, neighborhood in zip(toronto_data['Latitude'], toronto_data['Longitude'], toronto_data['Neighborhood']):
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        # parse_html is a Popup option, so it is passed only here and not
        # to CircleMarker.
        popup=folium.Popup(neighborhood, parse_html=True),
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
    ).add_to(map_toronto)

# Last expression of the cell: render the map.
map_toronto