### Transform data from Wikipedia into a pandas DataFrame

In [1]:
# setup
import numpy as np
import pandas as pd

In [2]:
# Source page(s) to scrape. The list gets its own name (`urls`) so the loop
# variable below no longer overwrites the list it is iterating over.
urls = ['https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M']

# pd.read_html returns a list of DataFrames, one per HTML table it finds on
# the page. Gather them all first and concatenate once: DataFrame.append was
# deprecated in pandas 1.4 and removed in 2.0, so pd.concat is used instead.
tables = []
for url in urls:
    tables.extend(pd.read_html(url))

# Stack every table into a single dataframe; columns missing from a given
# table are filled with NaN and the original row labels are kept.
data = pd.concat(tables)
data.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,11,12,13,14,15,16,17,Borough,Neighbourhood,Postcode
0,,,,,,,,,,,...,,,,,,,,Not assigned,Not assigned,M1A
1,,,,,,,,,,,...,,,,,,,,Not assigned,Not assigned,M2A
2,,,,,,,,,,,...,,,,,,,,North York,Parkwoods,M3A
3,,,,,,,,,,,...,,,,,,,,North York,Victoria Village,M4A
4,,,,,,,,,,,...,,,,,,,,Downtown Toronto,Harbourfront,M5A


In [3]:
# narrow the dataframe down to the three columns of interest:
# postcode, borough and neighbourhood
data = data.loc[:, ['Postcode', 'Borough', 'Neighbourhood']]
data.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront


In [4]:
# keep only the rows whose borough is something other than "Not assigned"
data = data.loc[data['Borough'].ne("Not assigned")]

In [5]:
# display the rows that remain after the "Not assigned" boroughs were removed
data

Unnamed: 0,Postcode,Borough,Neighbourhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M6A,North York,Lawrence Heights
6,M6A,North York,Lawrence Manor
...,...,...,...
1,,,
2,,,
3,,,
0,,,


In [7]:
# group the rows by postcode and borough, then merge the neighbourhood names
# of each group into one comma-separated string; reset_index turns the group
# keys back into regular columns
data = (
    data
    .groupby(['Postcode', 'Borough'])['Neighbourhood']
    .apply(lambda names: ', '.join(names))
    .reset_index()
)
data

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae
...,...,...,...
98,M9N,York,Weston
99,M9P,Etobicoke,Westmount
100,M9R,Etobicoke,"Kingsview Village, Martin Grove Gardens, Richv..."
101,M9V,Etobicoke,"Albion Gardens, Beaumond Heights, Humbergate, ..."


In [10]:
# Where the neighbourhood is 'Not assigned', fall back to the borough name.
# The assignment goes through data.loc on purpose: the rows yielded by
# iterrows() may be copies, so writing to them is not guaranteed to change
# the dataframe (pandas documents that such writes can have no effect).
unassigned = data['Neighbourhood'] == 'Not assigned'
data.loc[unassigned, 'Neighbourhood'] = data.loc[unassigned, 'Borough']

In [11]:
# report the dimensions of the dataframe as a (rows, columns) tuple
data.shape

(103, 3)