# Segmenting and Clustering Neighborhoods in Toronto

In [65]:
import pandas as pd
import numpy as np

###### Use pandas' `read_html` to read every table from the given Wikipedia page

In [129]:
# read_html parses the page and hands back a list with one DataFrame per HTML table.
toronto_raw = pd.read_html(
    "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"
)

###### Create a data frame from the first table on the page

In [130]:
# toronto_raw[0] is already a DataFrame, so re-wrapping it in pd.DataFrame()
# adds nothing. Take an explicit copy instead so that later changes to
# `toronto` can never leak back into the downloaded `toronto_raw` tables.
toronto = toronto_raw[0].copy()
toronto.head()

Unnamed: 0,Postal code,Borough,Neighborhood
0,M1A,Not assigned,
1,M2A,Not assigned,
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Regent Park / Harbourfront


In [131]:
# Dimensions of the raw table as (rows, columns).
toronto.shape

(180, 3)

In [132]:
# List the distinct values in the "Borough" column to spot placeholder labels.
toronto["Borough"].unique()

array(['Not assigned', 'North York', 'Downtown Toronto', 'Etobicoke',
       'Scarborough', 'East York', 'York', 'East Toronto', 'West Toronto',
       'Central Toronto', 'Mississauga'], dtype=object)

In [134]:
# Drop every postal code whose borough is the "Not assigned" placeholder.
toronto = toronto.loc[toronto["Borough"].ne("Not assigned")]
toronto.head(20)

Unnamed: 0,Postal code,Borough,Neighborhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Regent Park / Harbourfront
5,M6A,North York,Lawrence Manor / Lawrence Heights
6,M7A,Downtown Toronto,Queen's Park / Ontario Provincial Government
8,M9A,Etobicoke,Islington Avenue
9,M1B,Scarborough,Malvern / Rouge
11,M3B,North York,Don Mills
12,M4B,East York,Parkview Hill / Woodbine Gardens
13,M5B,Downtown Toronto,"Garden District, Ryerson"


###### Check that no row has an assigned borough but a "Not assigned" neighborhood

In [136]:
# Count rows whose neighborhood is effectively unassigned.
# The raw table preview shows blank Neighborhood cells for unassigned postal
# codes (presumably parsed as NaN - confirm), so a comparison against the
# literal "Not assigned" string alone could miss them. Count missing values too;
# a NaN left in the column would also break a later string join.
int((toronto["Neighborhood"].isna() | toronto["Neighborhood"].eq("Not assigned")).sum())

0

In [138]:
# (rows, columns) remaining after the "Not assigned" boroughs were filtered out.
toronto.shape

(103, 3)

###### To combine the "Neighborhood" values that share the same postal code, I use `.groupby()` and join the values with a comma, then overwrite the `toronto` dataframe with the result so that each postal code's neighborhoods are listed in a single row. Finally, I reset the index.

In [139]:
# Collapse the table to one row per (postal code, borough) pair, joining the
# neighborhood names of each group into a single comma-separated string.
# Written as one method chain: .agg(", ".join) replaces the lambda wrapper, and
# a chained .reset_index() turns the group keys back into ordinary columns
# without the extra pd.DataFrame(...) wrapper or an in-place mutation.
toronto = (
    toronto
    .groupby(["Postal code", "Borough"])["Neighborhood"]
    .agg(", ".join)
    .reset_index()
)
toronto.head()

Unnamed: 0,Postal code,Borough,Neighborhood
0,M1B,Scarborough,Malvern / Rouge
1,M1C,Scarborough,Rouge Hill / Port Union / Highland Creek
2,M1E,Scarborough,Guildwood / Morningside / West Hill
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


In [140]:
# (rows, columns) after grouping by postal code and borough.
toronto.shape

(103, 3)