In [9]:
# main documentation page: http://beautiful-soup-4.readthedocs.io/en/latest/
# video tutorial on using the BeautifulSoup package: https://www.youtube.com/watch?v=ng2o98k983k
from bs4 import BeautifulSoup 
import requests 
import pandas as pd

### Read Data


In [None]:
# Read data: download the Wikipedia page and keep its first <table> element.

url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'

# A timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() turns an HTTP error into an exception instead of letting
# an error page be parsed as if it were the real article.
response = requests.get(url, timeout=30)
response.raise_for_status()
source = response.text
soup = BeautifulSoup(source,'lxml')

# soup.find() returns None when nothing matches; fail here with a clear
# message rather than with an AttributeError on the next line.
table = soup.find('table')
if table is None:
    raise ValueError('No <table> element found at ' + url)
table.prettify()

### Create Dataframe

In [149]:
# Parse the HTML table row by row and build the DataFrame.
# The first three <td> cells of each row become Postcode, Borough and
# Neighbourhood; any further cells in a row are ignored.
postcode = []
borough = []
neighbourhood = []

for tr in table.find_all('tr'):
    cells = tr.find_all('td')
    # Rows with fewer than three <td> cells (e.g. a header row built from
    # <th>) are skipped so the three lists always stay the same length;
    # otherwise the DataFrame construction below would fail.
    if len(cells) < 3:
        continue
    # Strip surrounding whitespace so later exact comparisons such as
    # Borough != 'Not assigned' are not defeated by a trailing newline.
    postcode.append(cells[0].text.strip())
    borough.append(cells[1].text.strip())
    neighbourhood.append(cells[2].text.replace('\n',''))

# Named `columns` rather than `dict` so the builtin dict() stays usable in
# every later cell of the notebook.
columns = {'Postcode': postcode, 'Borough': borough, 'Neighbourhood': neighbourhood}

df = pd.DataFrame(columns)
df.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront


### Remove Borough = 'Not assigned'

In [150]:
# Keep only the rows whose Borough is something other than 'Not assigned'
df = df.loc[df['Borough'].ne('Not assigned')]
df.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M5A,Downtown Toronto,Regent Park
6,M6A,North York,Lawrence Heights


### Generate Groupby DataFrame

In [210]:
# Collapse the frame to one row per Postcode. Each group's Borough and
# Neighbourhood values are collected into a set, so duplicates disappear and
# the order of the members inside each set is not guaranteed.
by_postcode = df.groupby(['Postcode'])
boroughLst = by_postcode['Borough'].apply(set)
neighbourhoodLst = by_postcode['Neighbourhood'].apply(set)

# Both Series are indexed by Postcode, so placing them side by side lines the
# rows up; reset_index() then turns Postcode back into an ordinary column.
result = pd.concat([boroughLst, neighbourhoodLst], axis=1).reset_index()
result.head()


Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1B,{Scarborough},"{Rouge, Malvern}"
1,M1C,{Scarborough},"{Port Union, Rouge Hill, Highland Creek}"
2,M1E,{Scarborough},"{West Hill, Guildwood, Morningside}"
3,M1G,{Scarborough},{Woburn}
4,M1H,{Scarborough},{Cedarbrae}


In [218]:
# Spot-check a single postcode: show the grouped row for 'M5A'
result.loc[result['Postcode'].eq('M5A')]

Unnamed: 0,Postcode,Borough,Neighbourhood
53,M5A,{Downtown Toronto},"{Regent Park, Harbourfront}"


### Print Shape Result

In [212]:
# (rows, columns) of the grouped frame; it holds one row per distinct Postcode
result.shape

(103, 3)