In [1]:
import pandas as pd
import bs4
from bs4 import BeautifulSoup as soup
from urllib.request import urlopen

## Fetching the Wikipedia page and parsing its HTML into a BeautifulSoup object

In [35]:
wikiUrl = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'

# Download the raw page bytes. The context manager closes the HTTP response
# even if read() raises, which the manual open/read/close sequence did not.
with urlopen(wikiUrl) as wikiData:
    wikiHtml = wikiData.read()

# Parse the downloaded bytes with Python's built-in HTML parser.
page_soup = soup(wikiHtml, 'html.parser')

## Finding the table in the HTML based on the attribute 'class'

In [36]:
# Take the first element carrying the 'wikitable' class. limit=1 stops the
# search at the first match; indexing [0] still raises IndexError if none exists.
table_soup = page_soup.find_all(class_='wikitable', limit=1)[0]

## Parsing 'th' tags to find the table headers and store them in a list

In [78]:
# Collect the column names from the table's <th> cells.
# .strip() drops surrounding whitespace (including the trailing newline) without
# cutting off a real character when a cell's text does not end with a newline,
# which a blind [:-1] slice would do.
headers = [th.text.strip() for th in table_soup.find_all('th')]
print(headers)
headers[0] = 'PostalCode'        # rename the first column to match requirements
headers

['Postal code', 'Borough', 'Neighborhood']


['PostalCode', 'Borough', 'Neighborhood']

## Parsing the table content into a list of rows, cleaning the data in the process

In [85]:
# Build the list of cleaned rows: [postal code, borough, neighborhood].
content = []
for tr in table_soup.find_all('tr')[1:]:          # [1:] skips the first (header) row
    # .strip() removes the trailing newline without truncating text that has none.
    cells = [td.text.strip() for td in tr.find_all('td')]
    if len(cells) != 3:
        # Not a three-column data row (e.g. a row holding only <th> cells);
        # indexing it would raise IndexError, so skip it.
        continue
    postal_code, borough, neighborhood = cells
    if borough == 'Not assigned':
        continue                                  # keep only rows with an assigned borough
    neighborhood = neighborhood.replace(' / ', ', ')   # separate neighborhoods by ',' instead of '/'
    if neighborhood in ('', 'Not assigned'):
        neighborhood = borough                    # an unassigned neighborhood takes the borough's name
    content.append([postal_code, borough, neighborhood])
content[:5]                                       # show the first 5 rows to check the data

[['M3A', 'North York', 'Parkwoods'],
 ['M4A', 'North York', 'Victoria Village'],
 ['M5A', 'Downtown Toronto', 'Regent Park, Harbourfront'],
 ['M6A', 'North York', 'Lawrence Manor, Lawrence Heights'],
 ['M7A', 'Downtown Toronto', "Queen's Park, Ontario Provincial Government"]]

## Creating a pandas dataframe from our data, with 'headers' as the list of columns :

In [87]:
# Assemble the cleaned rows into a DataFrame whose columns are named by `headers`.
df = pd.DataFrame(data=content, columns=headers)
df.head()   # preview the first rows

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


In [86]:
df.shape   # (number of rows, number of columns) of the cleaned table

(103, 3)