In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np

In [2]:
# Wikipedia page listing the Canadian postal codes that start with "M"
url = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"
# Bound the wait with a timeout and raise on an HTTP error status, so a
# failed download stops here instead of an error page being parsed silently.
response = requests.get(url, timeout=30)
response.raise_for_status()
req = response.text
# Name the parser explicitly: 'html' only names a markup type, which lets
# Beautiful Soup pick whichever HTML parser happens to be installed and can
# give different trees on different machines.
soup = BeautifulSoup(req, 'html.parser')

## Let's read the contents of the web page and look for the cells

In [3]:
# Peek at one table cell (index 6) for reference: print its plain text,
# then leave the tag itself as the cell's value so the raw HTML is shown too
sample_cell = soup.find_all('td')[6]
print(sample_cell.text)
sample_cell


M7AQueen's Park(Ontario Provincial Government)




<td style="width:11%; vertical-align:top;">
<p><b>M7A</b><br/><span style="font-size:85%;"><a href="/wiki/Queen%27s_Park_(Toronto)" title="Queen's Park (Toronto)">Queen's Park</a><br/>(Ontario Provincial Government)</span>
</p>
</td>

In [4]:
def _parse_postal_cell(cell):
    """Convert one <td> of the postal-code table into a record dict.

    The postal code is read from the cell's first <b> tag and the
    "Borough(Neighborhood / Neighborhood ...)" text from its first <span>.

    Returns a dict with the keys 'PostalCode', 'Borough' and 'Neighborhood',
    or None when the cell has no <b> or <span> tag or is marked
    'Not assigned'.
    """
    # Cells that are not postal-code entries lack the <b>/<span> pair.
    if cell.b is None or cell.span is None:
        return None
    details = cell.span.text
    if details == 'Not assigned':
        return None

    # The borough is everything before the first '('.
    borough, paren, _ = details.partition('(')
    if not paren:
        # No parenthesised list at all: keep the full text as the borough
        # (slicing with find() == -1 used to drop its last character) and
        # reuse it as the neighborhood.
        return {'PostalCode': cell.b.text, 'Borough': details, 'Neighborhood': details}

    # Every ')'-terminated segment can carry one "(...)" group; inside a group
    # the neighborhoods are separated by ' / '.  Each segment is cut at its own
    # '(' so that second and later groups are extracted correctly.
    groups = [
        segment.split('(', 1)[1].replace(' / ', ',')
        for segment in details.split(')')
        if '(' in segment
    ]
    return {
        'PostalCode': cell.b.text,
        'Borough': borough,
        # Empty groups are skipped so the result has no stray commas.
        'Neighborhood': ','.join(group for group in groups if group),
    }


# Collect one record per usable table cell.  Unusable cells are skipped by
# explicit checks in the helper rather than by a bare `except`, which used to
# hide a TypeError raised by `len(area.split('(')>1)` and silently dropped
# every parenthesised group after the first.
comm = [
    record
    for record in map(_parse_postal_cell, soup.find_all('td'))
    if record is not None
]

In [5]:
# Turn the list of per-cell records into a dataframe (one row per record,
# one column per dictionary key)
df_scrape = pd.DataFrame(data=comm)
df_scrape

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park,Harbourfront"
3,M6A,North York,"Lawrence Manor,Lawrence Heights"
4,M7A,Queen's Park,Ontario Provincial Government
...,...,...,...
98,M8X,Etobicoke,"The Kingsway,Montgomery Road,Old Mill North"
99,M4Y,Downtown Toronto,Church and Wellesley
100,M7Y,East TorontoBusiness reply mail Processing Cen...,Enclave of M4L
101,M8Y,Etobicoke,"Old Mill South,King's Mill Park,Sunnylea,Humbe..."


In [6]:
# Drop the postal codes that are only postal addresses rather than areas
ban_list = ['M7Y','M5W','M7R','M7A']
is_postal_only = df_scrape['PostalCode'].isin(ban_list)
df_scrape = df_scrape.loc[~is_postal_only]
df_scrape.shape

(99, 3)

In [7]:
import geocoder  # geocoding package (not used within this cell)

# Load the postal-code coordinates CSV and key it by postal code so it can be
# aligned with the scraped table later on
coords_csv = "https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/IBMDeveloperSkillsNetwork-DS0701EN-SkillsNetwork/labs_v1/Geospatial_Coordinates.csv"
df_PScodes = pd.read_csv(coords_csv).set_index('Postal Code')
df_PScodes

Unnamed: 0_level_0,Latitude,Longitude
Postal Code,Unnamed: 1_level_1,Unnamed: 2_level_1
M1B,43.806686,-79.194353
M1C,43.784535,-79.160497
M1E,43.763573,-79.188711
M1G,43.770992,-79.216917
M1H,43.773136,-79.239476
...,...,...
M9N,43.706876,-79.518188
M9P,43.696319,-79.532242
M9R,43.688905,-79.554724
M9V,43.739416,-79.588437


If both dataframes are indexed by postal code, pandas aligns them on that index when we assign the new columns, so each row automatically receives the latitude and longitude that belong to its postal code.

In [8]:
# Index the scraped table by postal code; assigning a column taken from
# df_PScodes then matches the values to rows by index label, not by position
df_scrape = df_scrape.set_index('PostalCode')
for coord in ('Latitude', 'Longitude'):
    df_scrape[coord] = df_PScodes[coord]
df_scrape

Unnamed: 0_level_0,Borough,Neighborhood,Latitude,Longitude
PostalCode,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
M3A,North York,Parkwoods,43.753259,-79.329656
M4A,North York,Victoria Village,43.725882,-79.315572
M5A,Downtown Toronto,"Regent Park,Harbourfront",43.654260,-79.360636
M6A,North York,"Lawrence Manor,Lawrence Heights",43.718518,-79.464763
M9A,Etobicoke,Islington Avenue,43.667856,-79.532242
...,...,...,...,...
M5X,Downtown Toronto,"First Canadian Place,Underground city",43.648429,-79.382280
M8X,Etobicoke,"The Kingsway,Montgomery Road,Old Mill North",43.653654,-79.506944
M4Y,Downtown Toronto,Church and Wellesley,43.665860,-79.383160
M8Y,Etobicoke,"Old Mill South,King's Mill Park,Sunnylea,Humbe...",43.636258,-79.498509


In [9]:
# Move the postal code from the index back into a regular column.  The result
# is rebound instead of using inplace=True, so the frame is not mutated in place.
df_scrape = df_scrape.reset_index()

In [10]:
# Show the table: postal code, borough, neighborhoods and coordinates
df_scrape

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park,Harbourfront",43.654260,-79.360636
3,M6A,North York,"Lawrence Manor,Lawrence Heights",43.718518,-79.464763
4,M9A,Etobicoke,Islington Avenue,43.667856,-79.532242
...,...,...,...,...,...
94,M5X,Downtown Toronto,"First Canadian Place,Underground city",43.648429,-79.382280
95,M8X,Etobicoke,"The Kingsway,Montgomery Road,Old Mill North",43.653654,-79.506944
96,M4Y,Downtown Toronto,Church and Wellesley,43.665860,-79.383160
97,M8Y,Etobicoke,"Old Mill South,King's Mill Park,Sunnylea,Humbe...",43.636258,-79.498509
