In [1]:
from bs4 import BeautifulSoup 
import requests
import numpy as np 
import pandas as pd 
import random 

In [2]:
# Download the Wikipedia page listing the "M" postal codes.
# The timeout stops the cell from hanging indefinitely on a stalled connection,
# and raise_for_status() prevents an HTTP error page from being parsed as data.
response = requests.get(
    "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M",
    timeout=30,
)
response.raise_for_status()
source = response.text
soup = BeautifulSoup(source, 'lxml')

# soup.find returns the first <table> in the document; the code treats that
# table as the postal-code listing.
table = soup.find("table")
if table is None:
    raise ValueError("No <table> element found in the downloaded page")

# Fall back to the table itself when the parsed markup has no explicit <tbody>,
# instead of failing with AttributeError on None.
table_body = table.tbody if table.tbody is not None else table
table_rows = table_body.find_all("tr")

# Each kept entry is the list of raw cell texts of one table row.
# The text is deliberately left unmodified (trailing "\n" included) because the
# comparisons below, and the later clean-up cell, work on the raw values.
res = []
for tr in table_rows:
    td = tr.find_all("td")
    row = [cell.text for cell in td]

    # Rows without <td> cells (e.g. header rows) produce an empty list, and a
    # row with fewer than three cells cannot be indexed as row[1] / row[2];
    # skip both rather than raising IndexError.
    if len(row) < 3:
        continue

    # Only process the cells that have an assigned borough.
    if row[1] != "Not assigned\n":
        # When the third cell is marked "Not assigned", reuse the borough text.
        if "Not assigned\n" in row[2]:
            row[2] = row[1]
        res.append(row)

In [3]:
# Build the working frame from the scraped rows, one column per table cell.
column_names = ["PostalCode", "Borough", "Neighborhood"]
wiki_df = pd.DataFrame(data=res, columns=column_names)
wiki_df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A\n,North York\n,Parkwoods\n
1,M4A\n,North York\n,Victoria Village\n
2,M5A\n,Downtown Toronto\n,"Regent Park, Harbourfront\n"
3,M6A\n,North York\n,"Lawrence Manor, Lawrence Heights\n"
4,M7A\n,Downtown Toronto\n,"Queen's Park, Ontario Provincial Government\n"


In [4]:
# Print a summary of the frame: entry count, per-column non-null counts and dtypes.
wiki_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 103 entries, 0 to 102
Data columns (total 3 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   PostalCode    103 non-null    object
 1   Borough       103 non-null    object
 2   Neighborhood  103 non-null    object
dtypes: object(3)
memory usage: 2.5+ KB


In [5]:
# Drop the "\n" characters from every text column of the frame.
for column in ("Neighborhood", "PostalCode", "Borough"):
    wiki_df[column] = wiki_df[column].str.replace("\n", "")

wiki_df.head(5)

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


In [6]:
# Count the distinct values in the PostalCode column.
wiki_df["PostalCode"].nunique()

103

In [7]:
# Merge rows sharing a postal code and borough into one row whose
# Neighborhood value is the comma-separated list of the group's neighborhoods.
grouping_keys = ["PostalCode", "Borough"]
neighborhoods_by_code = wiki_df.groupby(grouping_keys)["Neighborhood"].apply(", ".join)
wiki_df = neighborhoods_by_code.reset_index()
wiki_df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1B,Scarborough,"Malvern, Rouge"
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


In [8]:
# Count how many Neighborhood entries are exactly the empty string.
empty_mask = wiki_df['Neighborhood'].eq('')
c = empty_mask.sum()
print('empty string in wiki_data Neighborhood column :', c)

empty string in wiki_data Neighborhood column : 0


In [9]:
# Display the frame's dimensions as a (rows, columns) tuple.
wiki_df.shape

(103, 3)