# Segmenting and Clustering Neighborhoods in Toronto
#### This is a notebook for the project "Segmenting and Clustering Neighborhoods in Toronto" for coursera course: Applied Data Science Capstone

In [1]:
import pandas as pd
import numpy as np
import requests
import lxml.html as lh

In [112]:
# the url of the wiki page
url = 'http://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'

# use pd.read_html to scrap the table in the html page, and use match='Borough' to return the table that has information Borough'
table = pd.read_html(url,match='Borough')

# the function return a list of DataFrame
#table

#### the first dataframe in the list is the one we want!

In [105]:
df = table[0]
df.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,Postcode,Borough,Neighbourhood
1,M1A,Not assigned,Not assigned
2,M2A,Not assigned,Not assigned
3,M3A,North York,Parkwoods
4,M4A,North York,Victoria Village


#### use the first row as the column name

In [106]:
df.columns= df.iloc[0,:]

# use reset_index(drop=True) to reset the index and drop the old index
df.drop(0).reset_index(drop=True).head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront


In [71]:
# count the number of unique postcode
print (df.Postcode.unique().shape)
# count the number of Borough
print (df.Borough.unique().shape)

(181,)
(13,)


#### create a new dataframe that excludes borough with 'Not assigned'

In [113]:
df_Tor = pd.DataFrame(columns = {'Postcode','Borough','Neighbourhood'})

for post,bor,nb in zip(df['Postcode'],df['Borough'],df['Neighbourhood']):
        
        if post!='Postcode':
            if bor!='Not assigned'and nb=='Not assigned':
                df_Tor = df_Tor.append({'Postcode': post, 'Borough': bor,
                                       'Neighbourhood': bor}, ignore_index=True)
            elif bor!='Not assigned'and nb!='Not assigned':
                df_Tor = df_Tor.append({'Postcode': post,'Borough': bor,
                                       'Neighbourhood': nb}, ignore_index=True)

#number of unique postcode after removing 'Not assigned' values
df_Tor['Postcode'].unique().shape

(103,)

#### combine Neighbourhoods sharing the same postcode


In [110]:
# combine neighbourhood sharing the same postcode
df_new = df_Tor.groupby(['Postcode','Borough'])['Neighbourhood'].apply(lambda x: ','.join(x)).reset_index()
df_new.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1B,Scarborough,"Rouge,Malvern"
1,M1C,Scarborough,"Highland Creek,Rouge Hill,Port Union"
2,M1E,Scarborough,"Guildwood,Morningside,West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


#### shape of the new dataframe after combing


In [111]:
df_new.shape

(103, 3)