In [1]:
from io import StringIO

import pandas as pd
import requests
from bs4 import BeautifulSoup

#### Set up the website scrape

In [2]:
# Wikipedia page to scrape (the "M" list of Canadian postal codes).
url = (
    "https://en.wikipedia.org/wiki/"
    "List_of_postal_codes_of_Canada:_M"
)

In [3]:
# Browser-like HTTP request headers: a desktop Chrome User-Agent string
# plus an X-Requested-With marker.
header = {
    "User-Agent": (
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 "
        "(KHTML, like Gecko) Chrome/50.0.2661.75 Safari/537.36"
    ),
    "X-Requested-With": "XMLHttpRequest",
}


In [4]:
# Make a GET request to fetch the raw HTML content.
# The request sends the `header` dict defined in the previous cell (it was
# previously built but never passed), uses a timeout so a stalled connection
# cannot hang the notebook, and fails loudly on HTTP errors instead of
# handing an error page to the parser.
response = requests.get(url, headers=header, timeout=30)
response.raise_for_status()
html_content = response.text

In [5]:
# Parse the downloaded page with the lxml parser.
soup = BeautifulSoup(markup=html_content, features="lxml")

#### Table -> Data Frame

In [6]:
# Locate the first <table class="wikitable"> element in the parsed page.
tbl = soup.find("table", attrs={"class": "wikitable"})
if tbl is None:
    # Without this guard, str(None) would be handed to read_html and fail
    # with a misleading error.
    raise ValueError("No table with class 'wikitable' was found in the page")

# read_html returns a list of frames; the single table passed in is element 0.
# The markup is wrapped in StringIO because passing a literal HTML string to
# read_html is deprecated in recent pandas releases.
data_frame = pd.read_html(StringIO(str(tbl)))[0]

In [7]:
# Display the scraped table as this cell's output.
data_frame

Unnamed: 0,Postal Code,Borough,Neighborhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
...,...,...,...
175,M5Z,Not assigned,Not assigned
176,M6Z,Not assigned,Not assigned
177,M7Z,Not assigned,Not assigned
178,M8Z,Etobicoke,"Mimico NW, The Queensway West, South of Bloor,..."


#### Drop rows whose Borough is "Not assigned"

In [8]:
# Flag the rows whose Borough is the placeholder "Not assigned", then drop
# them by index label; the original index labels of kept rows are preserved.
is_unassigned = data_frame["Borough"].eq("Not assigned")
df = data_frame.drop(index=data_frame[is_unassigned].index)

In [9]:
# (rows, columns) of the filtered frame.
df.shape

(103, 3)

#### Check if there are any duplicates in the Postal Code column

In [10]:
# Enforce the invariant instead of relying on a truncated display: the
# boolean Series below is abbreviated by pandas, so most rows are not shown.
assert df['Postal Code'].is_unique, "Duplicate values found in 'Postal Code'"

# Per-code flag: True where a postal code occurs more than once.
df['Postal Code'].value_counts() > 1

M6H    False
M4N    False
M9L    False
M1P    False
M3C    False
       ...  
M9B    False
M8Z    False
M4R    False
M4V    False
M4G    False
Name: Postal Code, Length: 103, dtype: bool

#### Check if there are any "Not assigned" in the Neighborhood column

In [11]:
# Row-wise flag: True where Neighborhood equals the placeholder "Not assigned".
df['Neighborhood'].eq('Not assigned')

2      False
3      False
4      False
5      False
6      False
       ...  
160    False
165    False
168    False
169    False
178    False
Name: Neighborhood, Length: 103, dtype: bool

In [12]:
# Report the (rows, columns) of the cleaned frame.
final_shape = df.shape
print('Final size of the data is ', final_shape)

Final size of the data is  (103, 3)
