In [1]:
import pandas as pd
import numpy as np
import urllib.request
from bs4 import BeautifulSoup

### Crawl the Wikipedia page to get the postal code table

In [2]:
# Download the Wikipedia page listing Canadian postal codes that start with "M".
# The context manager closes the HTTP response as soon as the body has been read.
with urllib.request.urlopen('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M') as response:
    website_url = response.read()  # raw HTML bytes of the page (despite the name)
soup = BeautifulSoup(website_url, 'html.parser')
# Locate the first table whose class attribute is "wikitable sortable".
toronto = soup.find('table', {'class': 'wikitable sortable'})
# find() returns None when nothing matches; fail here with a clear message
# rather than with an AttributeError when the table is used afterwards.
if toronto is None:
    raise ValueError('Could not find the "wikitable sortable" table on the page')

### Extract the data from the HTML table and convert it to a DataFrame

In [11]:
# Column names come from the <th> cells of the table's first row.
# strip() removes the trailing newline plus any other surrounding whitespace.
headings = [th.get_text().strip() for th in toronto.find("tr").find_all("th")]

# Collect one record per remaining row of the table.
datasets = []
for row in toronto.find_all("tr")[1:]:
    cells = [td.get_text().strip() for td in row.find_all("td")]
    # Rows without any <td> cell would otherwise become all-None records.
    if cells:
        datasets.append(cells)

# Passing the column names at construction also handles an empty table,
# yielding an empty frame with the expected columns.
toronto_df = pd.DataFrame(datasets, columns=headings)

### Preprocessing the data

In [12]:
# Drop the rows whose Borough is "Not assigned". The explicit .copy() makes the
# result an independent frame, so the assignment below does not trigger pandas'
# SettingWithCopyWarning.
toronto_df = toronto_df.loc[toronto_df['Borough'] != 'Not assigned'].copy()
# Where the Neighbourhood is "Not assigned", fall back to the row's Borough.
unassigned = toronto_df['Neighbourhood'] == 'Not assigned'
toronto_df.loc[unassigned, 'Neighbourhood'] = toronto_df.loc[unassigned, 'Borough']

In [13]:
# Work on a copy so that toronto_df itself is left untouched by the next steps.
toronto_copy = toronto_df.copy()
# Collapse all rows sharing a Postcode into a single row whose Neighbourhood is
# the comma-separated list of that postcode's neighbourhoods.
# A plain callable is passed to agg(): the dict-renaming form
# ({'Neighbourhood': func}) on a single selected column was removed in
# pandas 1.0 and raises SpecificationError there.
combined_neighbourhood = (
    toronto_copy.groupby('Postcode')['Neighbourhood']
    .agg(','.join)
    .reset_index()  # turn the Postcode index back into a regular column
)


In [14]:
# Remove the per-row Neighbourhood column from the working copy.
# errors='ignore' keeps this cell re-runnable: a second execution is a no-op
# instead of raising KeyError as `del` would.
toronto_copy = toronto_copy.drop(columns=['Neighbourhood'], errors='ignore')

In [15]:
# Keep the first row seen for each Postcode from the working copy, then attach
# its columns to the combined neighbourhood list with a left join on Postcode.
first_row_per_postcode = toronto_copy.drop_duplicates(subset='Postcode')
new_toronto = pd.merge(
    combined_neighbourhood,
    first_row_per_postcode,
    on='Postcode',
    how='left',
)
new_toronto.head()

Unnamed: 0,Postcode,Neighbourhood,Borough
0,M1B,"Rouge,Malvern",Scarborough
1,M1C,"Highland Creek,Rouge Hill,Port Union",Scarborough
2,M1E,"Guildwood,Morningside,West Hill",Scarborough
3,M1G,Woburn,Scarborough
4,M1H,Cedarbrae,Scarborough


In [16]:
# Display the (rows, columns) dimensions of the merged frame.
new_toronto.shape

(103, 3)