In [1]:
# Import web scraping library
# !pip install beautifulsoup4
from bs4 import BeautifulSoup
import requests
import pandas as pd

Let's use the `requests` library to retrieve the webpage and the BeautifulSoup package to scrape the data from the table on the Wikipedia page.

In [2]:
# Webpage url
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'

# Download the page. The timeout keeps the cell from hanging forever, and
# raise_for_status() stops us from silently parsing an HTTP error page.
response = requests.get(url, timeout=30)
response.raise_for_status()

# Parse the HTML
page = BeautifulSoup(response.content, 'html.parser')

# The first <tbody> on the page is the one this notebook reads
table = page.find_all('tbody')
if not table:
    raise ValueError('No <tbody> element found at ' + url)

# Keep only the rows (<tr>) of that table
table = table[0].find_all('tr')

# Create empty lists (re-created on every run so re-executing the cell
# never appends duplicates)
PostalCode = []
Borough = []
Neighborhood = []

# Iterate through the table rows to retrieve data from the <td> tags.
# Selecting the <td> cells explicitly (instead of indexing into .contents)
# does not depend on the whitespace text nodes between tags, and rows
# without data cells - such as the <th> header row - are skipped.
for row in table:
    cells = row.find_all('td')
    if len(cells) < 3:
        continue
    # strip() removes all surrounding whitespace, not just newlines, so the
    # postal code is clean when later used as a join key
    PostalCode.append(cells[0].get_text().strip())
    Borough.append(cells[1].get_text().strip())
    # Neighbourhoods are separated by ' /' on the page; store them comma-separated
    Neighborhood.append(cells[-1].get_text().strip().replace(' /', ','))


Let's create a DataFrame from the lists.

In [3]:
# Assemble the three scraped lists into a single table, one column per list
df = pd.DataFrame(
    dict(PostalCode=PostalCode, Borough=Borough, Neighborhood=Neighborhood)
)

# Show (rows, columns) of the new frame
df.shape


(180, 3)

In [4]:
# Select the rows whose Borough is the placeholder 'Not assigned'
not_assigned = df.loc[df['Borough'].eq('Not assigned')]

# Drop those rows and renumber the remaining ones from 0
df = df.drop(index=not_assigned.index).reset_index(drop=True)
df.head()


Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


In [5]:
# Display the (rows, columns) shape of df
df.shape

(103, 3)

Now, let's get the latitude and longitude for each postal code.

In [28]:
# Read the CSV containing lat and lng for each postal code.
# Passing the path straight to pandas lets it open and close the file itself,
# instead of decoding through a manually opened, locale-dependent text handle.
geo_csv = pd.read_csv('Coursera_Capstone/Geospatial_Coordinates.csv')

# Rename the postal code column so both tables share the same key name.
# rename() returns a new frame, so nothing is mutated in place.
geo_csv = geo_csv.rename(columns={'Postal Code': 'PostalCode'})

# Merge the data frames on PostalCode. The join is an inner join (the pandas
# default, spelled out here): rows of df without a match in geo_csv are dropped.
df = df.merge(geo_csv, on='PostalCode', how='inner')

df.head()