In [1]:
#For data retrieval and manipulation
import numpy as np
import pandas as pd 
from bs4 import BeautifulSoup
import requests

In [2]:
# Request and retrieve the data
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
# A timeout keeps the cell from hanging indefinitely on a stalled connection.
r = requests.get(url, timeout=30)
# Stop here on an HTTP error status instead of parsing an error page below.
r.raise_for_status()
# Name the parser explicitly: without it BeautifulSoup picks whichever parser happens
# to be installed (and warns), so the parse tree can differ between machines.
r_soup = BeautifulSoup(r.text, 'html.parser')
# First <tbody> in the document; find() returns None when there is no match.
r_table = r_soup.find('tbody')
if r_table is None:
    raise ValueError('No <tbody> element found at ' + url)
# Skip the first <tr> (the output below shows it is not a data row: data starts at M1A).
r_table_rows = r_table.find_all('tr')[1:]

In [3]:
# Column labels for the scraped table, in the order the cells are read from each row
headers = [
    'Postal Code',
    'Borough',
    'Neighbourhood',
]
# Start from an empty frame that carries only those column labels
toronto_df = pd.DataFrame(data=None, columns=headers)

In [4]:
# Put the information into the dataframe
# Collect the stripped text of every <td> cell, one list per table row, and build the
# frame in a single call: enlarging toronto_df with .loc[i] once per row copies the
# frame on every insert.
table_records = [
    [td.text.strip() for td in row.find_all('td')]
    for row in r_table_rows
]
# Row-wise .loc assignment rejected rows of the wrong width; keep that failure loud,
# because the DataFrame constructor would silently pad short rows with missing values.
for position, record in enumerate(table_records):
    if len(record) != len(headers):
        raise ValueError(
            'Row %d has %d cells, expected %d' % (position, len(record), len(headers))
        )
toronto_df = pd.DataFrame(table_records, columns=headers)

In [5]:
# Check: preview the first rows of the freshly filled table (head() shows five by default)
toronto_df.head()

Unnamed: 0,Postal Code,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"


In [7]:
# Remove rows whose Borough is 'Not assigned'
unassigned_mask = toronto_df['Borough'] == 'Not assigned'
# Index labels of the rows to discard
unassigned_labels = toronto_df.loc[unassigned_mask].index
toronto_df = toronto_df.drop(index=unassigned_labels)

In [8]:
# Check: rows whose Borough was 'Not assigned' should no longer appear
toronto_df.head()

Unnamed: 0,Postal Code,Borough,Neighbourhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
5,M6A,North York,"Lawrence Manor, Lawrence Heights"
6,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


In [10]:
# Where the Neighbourhood is 'Not assigned', fall back to the Borough name of the same row
no_neighbourhood = toronto_df['Neighbourhood'].eq('Not assigned')
toronto_df.loc[no_neighbourhood, 'Neighbourhood'] = toronto_df.loc[no_neighbourhood, 'Borough']

In [11]:
# Check: preview the first rows after the Neighbourhood fill-in
toronto_df.head()

Unnamed: 0,Postal Code,Borough,Neighbourhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
5,M6A,North York,"Lawrence Manor, Lawrence Heights"
6,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


In [12]:
# Combine the neighbourhoods that share a postal code (and borough) into a single row
neighbourhoods_by_code = toronto_df.groupby(['Postal Code', 'Borough'])['Neighbourhood']
# One list of neighbourhood names per group, with the group keys moved back into columns
toronto_df = neighbourhoods_by_code.apply(list).reset_index()
# Flatten each list into a single comma-separated string
toronto_df['Neighbourhood'] = toronto_df['Neighbourhood'].str.join(', ')

In [13]:
# Check: one row per postal code / borough pair, neighbourhoods joined with ', '
toronto_df.head()

Unnamed: 0,Postal Code,Borough,Neighbourhood
0,M1B,Scarborough,"Malvern, Rouge"
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


In [14]:
# Shape of the grouped frame as (rows, columns)
toronto_df.shape

(103, 3)

In [17]:
# Obtaining geo data: download the CSV, then attach its columns to each postal code
geodata = pd.read_csv('http://cocl.us/Geospatial_data')
# Left join keeps every row of toronto_df even when geodata has no matching postal code
toronto_df = pd.merge(toronto_df, geodata, how='left', on='Postal Code')

In [18]:
# Check: Latitude and Longitude columns should now sit beside each postal code
toronto_df.head()

Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude
0,M1B,Scarborough,"Malvern, Rouge",43.806686,-79.194353
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476


In [20]:
# Here is the entire dataframe (the rendered output elides the middle rows with '...')
toronto_df

Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude
0,M1B,Scarborough,"Malvern, Rouge",43.806686,-79.194353
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476
...,...,...,...,...,...
98,M9N,York,Weston,43.706876,-79.518188
99,M9P,Etobicoke,Westmount,43.696319,-79.532242
100,M9R,Etobicoke,"Kingsview Village, St. Phillips, Martin Grove ...",43.688905,-79.554724
101,M9V,Etobicoke,"South Steeles, Silverstone, Humbergate, Jamest...",43.739416,-79.588437
