# Segmenting and Clustering Neighborhoods in Toronto

In [2]:
from io import StringIO
from urllib.request import urlopen

import pandas as pd
from bs4 import BeautifulSoup

## 1. Data Scraping

In [57]:
# Scrape the Toronto postal-code table from Wikipedia into a pandas DataFrame.
toronto_url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'

# Use a context manager so the HTTP response is closed as soon as the page
# has been parsed, instead of leaking the open connection.
with urlopen(toronto_url) as page:
    soup = BeautifulSoup(page, 'html.parser')

# Take the first <table> on the page and fail with a clear message
# (rather than an opaque IndexError) if the page contains no table at all.
table = soup.find('table')
if table is None:
    raise ValueError('No <table> element found at {}'.format(toronto_url))

# Newer pandas releases deprecate passing literal HTML strings to read_html;
# wrapping the markup in StringIO works on both old and new versions.
df = pd.read_html(StringIO(str(table)), header = 0)[0]
print('Dataframe size: {} rows and {} columns'.format(df.shape[0], df.shape[1]))
df.head()

Dataframe size: 289 rows and 3 columns


Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront


## 2. Data Cleaning

In [69]:
# Drop all entries where Borough is not assigned. The explicit .copy() makes
# the filtered frame independent of the original, so the masked assignment
# below cannot raise SettingWithCopyWarning.
df = df[df['Borough'] != 'Not assigned'].copy()

# When Neighbourhood is not assigned, replace it with Borough
# (vectorized boolean mask instead of a row-by-row iterrows loop).
no_neighbourhood = df['Neighbourhood'] == 'Not assigned'
df.loc[no_neighbourhood, 'Neighbourhood'] = df.loc[no_neighbourhood, 'Borough']

# Combine all entries with the same postcode: keep the first Borough of each
# group and join the group's neighbourhoods, in their original row order,
# into one comma-separated string. A single groupby pass replaces the loop
# that re-filtered the whole frame once per postcode.
df_unique = df.groupby('Postcode', as_index = False).agg(
    {'Borough': 'first', 'Neighbourhood': ', '.join}
)
df = df_unique

## 3. Result

In [70]:
# Report the dimensions of the cleaned frame; df.shape is a (rows, columns)
# pair, so it can be unpacked straight into the two placeholders.
print('Dataframe size: {} rows and {} columns'.format(*df.shape))
# Leave the preview as the cell's last expression so it renders as a table.
df.head()

Dataframe size: 103 rows and 3 columns


Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae
