## Segmenting and Clustering Neighborhoods in Toronto

In [1]:
from io import StringIO

import pandas as pd
import requests
from bs4 import BeautifulSoup

### Scrape the Wikipedia page and convert the table to a DataFrame

In [6]:
# Download the Wikipedia list of "M" postal codes and parse its table into a DataFrame.
wiki_link = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"
# A timeout keeps Run All from hanging indefinitely if the server never answers.
res = requests.get(wiki_link, timeout=30)
# Fail loudly on an HTTP error instead of parsing an error page as if it were the data.
res.raise_for_status()
soup = BeautifulSoup(res.content, 'lxml')
# NOTE(review): assumes the first <table> on the page is the postal-code list --
# confirm if the page layout changes.
table = soup.find_all('table')[0]
# pandas >= 2.1 deprecates passing literal HTML to read_html, so hand it a file-like object.
_df = pd.read_html(StringIO(str(table)))[0]

### Bringing the data frame to the required form
1. Make a copy of the original dataset.
2. Rename the column Postcode to PostalCode.
3. Ignore cells with a borough that is Not assigned.
4. If a cell has a borough but a Not assigned neighborhood, then the neighborhood will be the same as the borough.
5. Rows with the same PostalCode will be combined into one row, with the neighborhoods separated by commas.

In [3]:
# Build the tidy frame from the raw scrape in a single chain. `_df` itself is never
# modified (rename returns a new frame), so this cell can be re-run safely.
df = (
    _df
    .rename(columns={"Postcode": "PostalCode"})
    # Drop rows whose borough is the placeholder 'Not assigned'.
    .loc[lambda frame: frame["Borough"] != "Not assigned"]
    # Where only the neighbourhood is the placeholder, fall back to the borough name.
    # Using assign/mask avoids writing into a filtered slice (SettingWithCopyWarning).
    .assign(
        Neighbourhood=lambda frame: frame["Neighbourhood"].mask(
            frame["Neighbourhood"] == "Not assigned", frame["Borough"]
        )
    )
    .groupby("PostalCode", as_index=False)
    # dict.fromkeys removes duplicates while keeping first-seen order, so the joined
    # text is the same on every run; set() ordering varies with string hash randomisation.
    .agg(lambda values: ", ".join(dict.fromkeys(values)))
)

In [7]:
# Preview the first five rows of the grouped frame.
df.head(n=5)

Unnamed: 0,PostalCode,Borough,Neighbourhood
0,M1B,Scarborough,"Malvern, Rouge"
1,M1C,Scarborough,"Port Union, Rouge Hill, Highland Creek"
2,M1E,Scarborough,"West Hill, Guildwood, Morningside"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


In [4]:
# Check the invariants the cleaning steps are meant to establish before reporting the size.
assert df["PostalCode"].is_unique, "expected exactly one row per postal code"
assert (df["Borough"] != "Not assigned").all(), "unassigned boroughs should have been dropped"
df.shape

(103, 3)

In [5]:
# Write the frame to disk. index=True is the pandas default, spelled out here because
# it means the row index is written as an extra, unnamed first column.
df.to_csv('neighborhoods.csv', index=True)