# Segmenting and Clustering Neighborhoods in Toronto

1. Importing required libraries

In [70]:
import pandas as pd

2. Fetching the postal-code table from the Wikipedia page and loading it into a pandas DataFrame

In [71]:
# Wikipedia page that lists the Canadian postal codes starting with "M".
link = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"

# read_html returns one DataFrame per HTML table found on the page; keep the
# first one and use its first row as the column header.
post_codes = pd.read_html(io=link, header=0)[0]
post_codes

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
...,...,...,...
283,M8Z,Etobicoke,Mimico NW
284,M8Z,Etobicoke,The Queensway West
285,M8Z,Etobicoke,Royal York South West
286,M8Z,Etobicoke,South of Bloor


3. If a row has a borough but its neighbourhood is "Not assigned", the neighbourhood is set to the same value as the borough.

In [72]:
def fix_neighbourhood(record):
    """Fill in a missing neighbourhood with the row's borough.

    Parameters
    ----------
    record : pandas.Series
        One row of the postcode table; it must provide ``Borough`` and
        ``Neighbourhood`` entries.

    Returns
    -------
    pandas.Series
        ``record`` itself when nothing needs fixing; otherwise a copy of it
        whose ``Neighbourhood`` equals its ``Borough``.
    """
    has_borough = record['Borough'] != 'Not assigned'
    lacks_neighbourhood = record['Neighbourhood'] == 'Not assigned'
    if not (has_borough and lacks_neighbourhood):
        return record

    # Work on a copy: DataFrame.apply does not support functions that mutate
    # the row object they are handed.
    fixed = record.copy()
    fixed['Neighbourhood'] = fixed['Borough']
    return fixed

# DataFrame.apply returns a new frame and leaves post_codes untouched, so the
# result has to be assigned back; otherwise the later cells keep working on
# the unfixed data.
post_codes = post_codes.apply(fix_neighbourhood, axis=1)  # applied to each row
post_codes

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
...,...,...,...
283,M8Z,Etobicoke,Mimico NW
284,M8Z,Etobicoke,The Queensway West
285,M8Z,Etobicoke,Royal York South West
286,M8Z,Etobicoke,South of Bloor


4. Removing rows whose borough is "Not assigned"

In [73]:
# Keep only the rows whose Borough holds a real value.
post_codes = post_codes.loc[post_codes['Borough'] != 'Not assigned']
post_codes

Unnamed: 0,Postcode,Borough,Neighbourhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M5A,Downtown Toronto,Regent Park
6,M6A,North York,Lawrence Heights
...,...,...,...
282,M8Z,Etobicoke,Kingsway Park South West
283,M8Z,Etobicoke,Mimico NW
284,M8Z,Etobicoke,The Queensway West
285,M8Z,Etobicoke,Royal York South West


5. Combining the neighbourhoods that share the same postcode into one comma-separated entry

In [74]:
def combine_neighbourhood(df):
    """Collapse a group's neighbourhood names into one comma-separated string.

    Parameters
    ----------
    df : pandas.DataFrame
        Rows of a single group; must contain a ``Neighbourhood`` column.

    Returns
    -------
    pandas.Series
        A one-element Series labelled ``Neighbourhood`` that holds the joined
        names in their original row order.
    """
    names = df['Neighbourhood'].tolist()
    joined = ",".join(names)
    return pd.Series({'Neighbourhood': joined})

# One output row per (Postcode, Borough) pair, with that pair's neighbourhoods
# joined by combine_neighbourhood.
post_codes = (
    post_codes
    .groupby(['Postcode', 'Borough'])
    .apply(combine_neighbourhood)
)
post_codes

Unnamed: 0_level_0,Unnamed: 1_level_0,Neighbourhood
Postcode,Borough,Unnamed: 2_level_1
M1B,Scarborough,"Rouge,Malvern"
M1C,Scarborough,"Highland Creek,Rouge Hill,Port Union"
M1E,Scarborough,"Guildwood,Morningside,West Hill"
M1G,Scarborough,Woburn
M1H,Scarborough,Cedarbrae
...,...,...
M9N,York,Weston
M9P,Etobicoke,Westmount
M9R,Etobicoke,"Kingsview Village,Martin Grove Gardens,Richvie..."
M9V,Etobicoke,"Albion Gardens,Beaumond Heights,Humbergate,Jam..."


6. Reset the index

In [75]:
# Move the group keys out of the index and back into ordinary columns.
# The returned frame is rebound instead of using inplace=True, matching the
# other cells that reassign post_codes.
post_codes = post_codes.reset_index()
post_codes

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1B,Scarborough,"Rouge,Malvern"
1,M1C,Scarborough,"Highland Creek,Rouge Hill,Port Union"
2,M1E,Scarborough,"Guildwood,Morningside,West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae
...,...,...,...
98,M9N,York,Weston
99,M9P,Etobicoke,Westmount
100,M9R,Etobicoke,"Kingsview Village,Martin Grove Gardens,Richvie..."
101,M9V,Etobicoke,"Albion Gardens,Beaumond Heights,Humbergate,Jam..."


7. Displaying the shape (rows, columns) of the resulting DataFrame

In [76]:
# (number of rows, number of columns) of post_codes
post_codes.shape

(103, 3)