In [None]:
#I referred to the work of other Github users, Crismag & Aliousidib for assistance in creating the following.

Get the necessary libraries and Beautiful Soup packages

In [20]:
import requests 
import lxml.html as lh
import bs4 as bs
import urllib.request
import numpy as np 
import pandas as pd 

Retrieve url holding the needed data

In [21]:
url   = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"

The following two functions can be used to get data from the table in the url

In [22]:

def scrape_table_bs4(cname,cols):
    
    page  = urllib.request.urlopen(url).read()
    
    soup  = bs.BeautifulSoup(page,'lxml')
    
    table = soup.find("table",class_=cname)
    
    header = [head.findAll(text=True)[0].strip() for head in table.find_all("th")]
    
    data   = [[td.findAll(text=True)[0].strip() for td in tr.find_all("td")]
              for tr in table.find_all("tr")]
    
    data    = [row for row in data if len(row) == cols]
    
    raw_df = pd.DataFrame(data,columns=header)
    
    return raw_df


def scrape_table_lxml(XPATH,cols):
    
    page = requests.get(url)
    
    doc = lh.fromstring(page.content)
    
    table_content = doc.xpath(XPATH)
    
    for table in table_content:
        headers = [th.text_content().strip() for th in table.xpath('//th')]
        headers = headers[0:3]
        data    = [[td.text_content().strip() for td in tr.xpath('td')] 
                   for tr in table.xpath('//tbody/tr')]
        data    = [row for row in data if len(row) == cols]
        raw_df = pd.DataFrame(data,columns=headers)
        
        return raw_df

Test to ensure that the beautiful soup scraper is working properly

In [28]:
raw_TorontoPostalCodes = scrape_table_bs4("wikitable",3)

print("TPS")
print(raw_TorontoPostalCodes.info(verbose=True))

TPS
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 180 entries, 0 to 179
Data columns (total 3 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   Postal Code    180 non-null    object
 1   Borough        180 non-null    object
 2   Neighbourhood  180 non-null    object
dtypes: object(3)
memory usage: 4.3+ KB
None


The following lines will edit the scraped table so that empty values will be accounted for.

In [29]:
TorontoPostalCodes=raw_TorontoPostalCodes[~raw_TorontoPostalCodes['Borough'].isin(['Not assigned'])]


TorontoPostalCodes=TorontoPostalCodes.sort_values(by=['Postcode','Borough','Neighbourhood'], ascending=[1,1,1]).reset_index(drop=True)

TorontoPostalCodes.loc[TorontoPostalCodes['Neighbourhood'] == 'Not assigned', ['Neighbourhood']] = TorontoPostalCodes['Borough']
check_unassigned_post_state_sample = TorontoPostalCodes.loc[TorontoPostalCodes['Borough'] == 'Queen\'s Park']

TorontoPostalCodes = TorontoPostalCodes.groupby(['Postcode','Borough'])['Neighbourhood'].apply(', '.join).reset_index()

KeyError: 'Postcode'

Show the resultant table

In [30]:
TorontoPostalCodes


Unnamed: 0,Postal Code,Borough,Neighbourhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
5,M6A,North York,"Lawrence Manor, Lawrence Heights"
6,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
...,...,...,...
160,M8X,Etobicoke,"The Kingsway, Montgomery Road, Old Mill North"
165,M4Y,Downtown Toronto,Church and Wellesley
168,M7Y,East Toronto,"Business reply mail Processing Centre, South C..."
169,M8Y,Etobicoke,"Old Mill South, King's Mill Park, Sunnylea, Hu..."


Here, we describe the shape of the resultant table so that the unseen data is accounted for.

In [26]:
TorontoPostalCodes.shape


(103, 3)