In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np

In [2]:
# Source page: Wikipedia's table of postal codes beginning with "M"
url = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"
# Bound the wait with a timeout and raise on any 4xx/5xx status, so a failed
# download stops here instead of being parsed as if it were the real page
response = requests.get(url, timeout=30)
response.raise_for_status()
req = response.text
# Name the parser explicitly: the generic 'html' feature lets BeautifulSoup
# choose whichever HTML parser happens to be installed, so the parse tree
# could differ from one environment to the next
soup = BeautifulSoup(req, 'html.parser')

## Let's read the contents of the web page and look for the cells

In [3]:
# Reference example of one table cell: print its flattened text first,
# then display the underlying <td> element so the markup can be inspected
sample_cell = soup.find_all('td')[6]
print(sample_cell.text)
sample_cell


M7AQueen's Park(Ontario Provincial Government)




<td style="width:11%; vertical-align:top;">
<p><b>M7A</b><br/><span style="font-size:85%;"><a href="/wiki/Queen%27s_Park_(Toronto)" title="Queen's Park (Toronto)">Queen's Park</a><br/>(Ontario Provincial Government)</span>
</p>
</td>

In [4]:
def _split_cell_text(text: str):
    """Split a cell's descriptive text into a ``(borough, neighborhoods)`` pair.

    The text is expected to look like ``Borough(Area one / Area two)``,
    optionally followed by further parenthesised groups.

    Parameters
    ----------
    text : str
        Text of the cell's <span>, i.e. everything after the postal code.

    Returns
    -------
    tuple of (str, str)
        The borough (everything before the first '(') and a comma-separated
        string of the neighborhoods found inside every '(...)' group.
        When the text has no '(' at all, the whole text is returned for both.
    """
    borough, opening, _ = text.partition('(')
    if not opening:
        # No parenthesised part: keep the text intact rather than slicing with
        # the -1 that str.find returns, which would drop the last character
        return text, text

    groups = []
    for chunk in text.split(')'):
        # Locate '(' within this chunk itself; chunks without one (such as the
        # empty remainder after the final ')') carry no neighborhoods
        _, found, inner = chunk.partition('(')
        if found:
            # Neighborhoods inside a group are separated by ' / '
            groups.append(inner.replace(' / ', ','))
    return borough, ','.join(groups)


# First let's make a list that will contain all the information
comm = []
# Run through all the instances of a table cell
for instance in soup.find_all('td'):
    # The postal code is wrapped in the first bold tag <b></b> and the
    # description in the first <span>; cells lacking either hold no record
    code_tag = instance.b
    detail_tag = instance.span
    if code_tag is None or detail_tag is None:
        continue
    detail = detail_tag.text
    # Unassigned postal codes are skipped entirely
    if detail == 'Not assigned':
        continue
    borough, neighborhood = _split_cell_text(detail)
    comm.append({
        'PostalCode': code_tag.text,
        'Borough': borough,
        'Neighborhood': neighborhood,
    })
# The list of dictionaries with all the information
comm

[{'PostalCode': 'M3A', 'Borough': 'North York', 'Neighborhood': 'Parkwoods'},
 {'PostalCode': 'M4A',
  'Borough': 'North York',
  'Neighborhood': 'Victoria Village'},
 {'PostalCode': 'M5A',
  'Borough': 'Downtown Toronto',
  'Neighborhood': 'Regent Park,Harbourfront'},
 {'PostalCode': 'M6A',
  'Borough': 'North York',
  'Neighborhood': 'Lawrence Manor,Lawrence Heights'},
 {'PostalCode': 'M7A',
  'Borough': "Queen's Park",
  'Neighborhood': 'Ontario Provincial Government'},
 {'PostalCode': 'M9A',
  'Borough': 'Etobicoke',
  'Neighborhood': 'Islington Avenue'},
 {'PostalCode': 'M1B',
  'Borough': 'Scarborough',
  'Neighborhood': 'Malvern,Rouge'},
 {'PostalCode': 'M3B', 'Borough': 'North York', 'Neighborhood': 'Don Mills'},
 {'PostalCode': 'M4B',
  'Borough': 'East York',
  'Neighborhood': 'Parkview Hill,Woodbine Gardens'},
 {'PostalCode': 'M5B',
  'Borough': 'Downtown Toronto',
  'Neighborhood': 'Garden District, Ryerson'},
 {'PostalCode': 'M6B', 'Borough': 'North York', 'Neighborhood': 

In [5]:
# Turn the list of per-postal-code dictionaries into a DataFrame (one row per entry)
df_scrape = pd.DataFrame(data=comm)
df_scrape

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park,Harbourfront"
3,M6A,North York,"Lawrence Manor,Lawrence Heights"
4,M7A,Queen's Park,Ontario Provincial Government
...,...,...,...
98,M8X,Etobicoke,"The Kingsway,Montgomery Road,Old Mill North"
99,M4Y,Downtown Toronto,Church and Wellesley
100,M7Y,East TorontoBusiness reply mail Processing Cen...,Enclave of M4L
101,M8Y,Etobicoke,"Old Mill South,King's Mill Park,Sunnylea,Humbe..."


In [6]:
# Exclude the postal codes that are just postal addresses rather than areas
ban_list = ['M7Y', 'M5W', 'M7R', 'M7A']
is_banned = df_scrape['PostalCode'].isin(ban_list)
# Keep only the rows whose postal code is not on the ban list
df_scrape = df_scrape.loc[~is_banned]
df_scrape.shape

(99, 3)