# Import required libraries

In [98]:
# import all required libraries
from bs4 import BeautifulSoup
import requests
import pandas as pd

Scrape website

In [99]:
# Download the Wikipedia page that lists the Canadian postal codes starting with "M".
# A timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() stops the notebook on an HTTP error instead of silently
# parsing an error page.
response = requests.get(
    'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M',
    timeout=30,
)
response.raise_for_status()
source = response.text

# create soup - parse the downloaded HTML with the lxml parser
soup = BeautifulSoup(source, 'lxml')

In [100]:
def parse_postal_cell(cell):
    """Turn one <td> element into a postal-code record.

    The cell is presumably laid out as
    ``<p><b>CODE</b><span>Borough<br/>(Name / Name)</span></p>`` -- verify
    against the live page if the scrape starts returning odd rows.

    Parameters
    ----------
    cell : bs4 Tag
        A table cell taken from the parsed page.

    Returns
    -------
    dict or None
        A dict with the keys 'PostalCode', 'Borough' and 'Neighbourhood', or
        None when the cell has no p.span element, has an empty span, or is
        marked 'Not assigned'.
    """
    # only process relevant cells with a p.span element
    span = cell.p.span if cell.p else None
    if span is None:
        return None

    # skip Not assigned boroughs
    italic = cell.find("i")
    if italic and italic.text == 'Not assigned':
        return None

    # get postal code
    bold = cell.find("b")
    postal_code = bold.text if bold else ''

    # Work on a copy: .contents is the live child list of the span, so popping
    # from it would corrupt the parsed tree and break re-running this cell.
    parts = list(span.contents)
    if not parts:
        return None

    # The borough is the first child: either bare text or a tag (e.g. a link).
    # Text nodes are str subclasses; convert so the dataframe holds plain str.
    first = parts[0]
    borough = str(first) if isinstance(first, str) else first.get_text()

    # Everything after the borough and the <br/> that follows it names the
    # neighbourhoods; keep text nodes and the text of any tag except <br/>.
    fragments = []
    for node in parts[2:]:
        if isinstance(node, str):
            fragments.append(str(node))
        elif node.name != 'br':
            fragments.append(node.get_text())

    # replace spurious characters
    neigh_str = "".join(fragments).replace('(', '').replace(')', '')

    # split into individual neighbourhood names, dropping every empty entry
    names = [name.strip() for name in neigh_str.split(' / ')]
    neighborhoods = ", ".join(name for name in names if name)

    return {'PostalCode': postal_code, 'Borough': borough, 'Neighbourhood': neighborhoods}


# Parse every td / cell element in soup and collect the resulting records.
# The dataframe is built once from the list: DataFrame.append was removed in
# pandas 2.0 and growing a frame row by row is quadratic.
records = []
for neighbourhood in soup.find_all('td'):
    record = parse_postal_cell(neighbourhood)
    if record is not None:
        records.append(record)

df = pd.DataFrame(records, columns=['PostalCode', 'Borough', 'Neighbourhood'])


In [101]:
# inspect the first rows of the dataframe; a bare last expression lets the
# notebook render the frame with its rich display instead of plain text
df.head()

  PostalCode           Borough                     Neighbourhood
0        M3A        North York                         Parkwoods
1        M4A        North York                  Victoria Village
2        M5A  Downtown Toronto         Regent Park, Harbourfront
3        M6A        North York  Lawrence Manor, Lawrence Heights
4        M7A      Queen's Park     Ontario Provincial Government


In [102]:
# show the dataframe dimensions as (number of rows, number of columns)
(len(df.index), len(df.columns))

(103, 3)