In [112]:
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
import requests


#### After importing the necessary packages, use BeautifulSoup to obtain the table

In [113]:
# Download the Wikipedia page listing Canadian postal codes starting with "M"
# and parse it so the table rows can be walked in a later cell.
wiki_url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'

# A timeout keeps Restart & Run All from hanging forever on a stalled connection.
wiki_response = requests.get(wiki_url, timeout=30)
# Fail fast on 4xx/5xx instead of silently parsing an error page.
wiki_response.raise_for_status()
wiki_source = wiki_response.text

soup = BeautifulSoup(wiki_source, 'lxml')

# `soup.tbody` is the first <tbody> element in the document.
# NOTE(review): assumes that first <tbody> is the postal-code table — confirm
# against the current page layout.
table = soup.tbody
if table is None:
    raise ValueError('No <tbody> element found at ' + wiki_url)

#### Initialize the DataFrame

In [114]:
# Column layout of the result frame; the positions match the three <td> cells
# of each table row (index 0, 1 and 2 in the helper functions).
collist = [
    'Postcode',
    'Borough',
    'Neighbourhood',
]

# Start from an empty frame; rows are added while iterating over the table.
df_og = pd.DataFrame(columns=collist)

#### Some Helper Functions

##### Only process the cells that have an assigned borough. Ignore cells with a borough that is Not assigned.

In [115]:
def na_borough(row):
    """Tell whether a table row has no borough assigned.

    Parameters
    ----------
    row : sequence
        Row cells; the borough is read from position 1.

    Returns
    -------
    The result of comparing the borough cell with 'Not assigned'.
    """
    borough = row[1]
    return borough == 'Not assigned'

##### More than one neighborhood can exist in one postal code area. Rows that share a postal code will be combined into one row, with the neighborhoods separated by a comma.

In [116]:
def combine_rows(row, df):
    """Add a table row to ``df``, merging neighbourhoods that share a postcode.

    Parameters
    ----------
    row : sequence
        Row cells in column order: postcode at position 0, neighbourhood at
        position 2. When the postcode is new, the whole row is stored, so it
        must have one value per column of ``df``.
    df : pandas.DataFrame
        Frame with at least the columns 'Postcode' and 'Neighbourhood'.

    Returns
    -------
    pandas.DataFrame
        * Postcode not yet present: a NEW frame with ``row`` appended and the
          index renumbered; ``df`` itself is left untouched.
        * Postcode already present: the SAME ``df`` object, modified in place,
          with ``', ' + neighbourhood`` appended to the 'Neighbourhood' value of
          the first row carrying that postcode.

    Callers should always rebind the result (``df = combine_rows(row, df)``).
    """
    postcode = row[0]
    neighbourhood = row[2]

    # Evaluate the lookup once and reuse it for both branches.
    same_postcode = df['Postcode'] == postcode

    if not same_postcode.any():
        # DataFrame.append was removed in pandas 2.0; pd.concat is the
        # supported way to add a row and works on older versions too.
        new_row = pd.DataFrame([row], columns=df.columns)
        return pd.concat([df, new_row], ignore_index=True)

    # Index label of the first existing row for this postcode.
    target = df.index[same_postcode][0]
    df.loc[target, 'Neighbourhood'] = (
        df.loc[target, 'Neighbourhood'] + ', ' + neighbourhood
    )
    return df

##### If a cell has a borough but a Not assigned neighborhood, then the neighborhood will be the same as the borough.

In [117]:
def na_neighbourhood(row):
    """Fill in a missing neighbourhood with the row's borough.

    Parameters
    ----------
    row : mutable sequence
        Row cells; borough at position 1, neighbourhood at position 2.

    Returns
    -------
    The same ``row`` object. When its neighbourhood cell equals
    'Not assigned', that cell is overwritten in place with the borough.
    """
    neighbourhood_missing = row[2] == 'Not assigned'
    if neighbourhood_missing:
        # Overwrites the caller's sequence; no copy is made.
        row[2] = row[1]
    return row

#### Go through the Data and form the DataFrame

In [118]:
# Walk every <tr> of the parsed table and feed the usable rows into df_og.
for tr in table.find_all('tr'):
    # Stripped text of each <td> cell in this row.
    cells = [td.text.strip() for td in tr.find_all('td')]

    # Skip rows that do not have exactly three <td> cells, and rows whose
    # borough is 'Not assigned'. The borough check only runs for 3-cell rows.
    if len(cells) != 3 or na_borough(cells):
        continue

    # Default a missing neighbourhood to the borough, then add or merge the row.
    df_og = combine_rows(na_neighbourhood(cells), df_og)

#### Print out the shape of the DataFrame

In [119]:
# Report (number of rows, number of columns) of the assembled frame.
print(df_og.shape)

(103, 3)
