In [1]:
from bs4 import BeautifulSoup as BSp
import lxml
#import html5lib alternative to lxml
import requests
import pandas as pd

### 1. Get the page text from Wikipedia for scraping

In [2]:
#from html file for offline work
#with open('Cantest.html') as file:
#    soup = BSp(file,'lxml')


#from URL
#NOTE: this fetches the live page, so its content may differ from the outputs stored in this notebook
URL = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
#a timeout keeps the cell from hanging forever on a stalled connection
response = requests.get(URL, timeout = 30)
#fail loudly on HTTP errors instead of parsing an error page
response.raise_for_status()
source = response.text
soup = BSp(source, 'lxml')
#print(soup.prettify())

### 2. Scraping
- find table
- extract all rows
- produce a clean list through splitting and slicing

In [3]:
#take the first <table> element of the parsed page
table = soup.find(name = 'table')
#flatten the table body into one newline-separated string
rows = table.tbody.get_text()
rows

"\nPostcode\nBorough\nNeighbourhood\n\n\nM1A\nNot assigned\nNot assigned\n\n\nM2A\nNot assigned\nNot assigned\n\n\nM3A\nNorth York\nParkwoods\n\n\nM4A\nNorth York\nVictoria Village\n\n\nM5A\nDowntown Toronto\nHarbourfront\n\n\nM5A\nDowntown Toronto\nRegent Park\n\n\nM6A\nNorth York\nLawrence Heights\n\n\nM6A\nNorth York\nLawrence Manor\n\n\nM7A\nQueen's Park\nNot assigned\n\n\nM8A\nNot assigned\nNot assigned\n\n\nM9A\nEtobicoke\nIslington Avenue\n\n\nM1B\nScarborough\nRouge\n\n\nM1B\nScarborough\nMalvern\n\n\nM2B\nNot assigned\nNot assigned\n\n\nM3B\nNorth York\nDon Mills North\n\n\nM4B\nEast York\nWoodbine Gardens\n\n\nM4B\nEast York\nParkview Hill\n\n\nM5B\nDowntown Toronto\nRyerson\n\n\nM5B\nDowntown Toronto\nGarden District\n\n\nM6B\nNorth York\nGlencairn\n\n\nM7B\nNot assigned\nNot assigned\n\n\nM8B\nNot assigned\nNot assigned\n\n\nM9B\nEtobicoke\nCloverdale\n\n\nM9B\nEtobicoke\nIslington\n\n\nM9B\nEtobicoke\nMartin Grove\n\n\nM9B\nEtobicoke\nPrincess Gardens\n\n\nM9B\nEtobicoke\n

In [4]:
# create list by splitting the string and keep only the non-empty entries
# (single pass instead of repeatedly searching for and removing '')
rowList = [entry for entry in rows.split('\n') if entry]

#remove header (the first three entries are the column titles)
rowList = rowList[3:]

# the next step reads the list in strides of three, so a length that is not a
# multiple of three means the columns would no longer line up
assert len(rowList) % 3 == 0, 'scraped table does not split into complete rows of three cells'

### 3. produce dataframe

In [5]:
#split the flat list into its three columns: every third entry, starting at offset 0, 1 and 2

postcode, Borough, Neighb = (rowList[offset::3] for offset in range(3))

#map column titles to their values and build the dataframe from that mapping
dfDict = {'Postcode':postcode,'Borough':Borough,'Neighborhood':Neighb}
df = pd.DataFrame(dfDict)
df.head()

Unnamed: 0,Postcode,Borough,Neighborhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront


In [6]:
#remove "Not assigned" Boroughs
#(copy so the assignment below works on an independent frame, not on a slice)
df = df[df['Borough'] != "Not assigned"].copy()

#replace "Not assigned" Neighborhood with Borough
#done per row and by column name before joining, so a postcode that lists
#several neighborhoods never keeps "Not assigned" inside its joined string
unnamed = df['Neighborhood'] == "Not assigned"
df.loc[unnamed, 'Neighborhood'] = df.loc[unnamed, 'Borough']

#join Neighborhoods by postcode into one comma-separated string per row
df = df.groupby(['Postcode', 'Borough'], as_index = False)['Neighborhood'].agg(', '.join)

#dfTest is kept only as an alias so any cell that still refers to it keeps working
dfTest = df
df.head()

Unnamed: 0,Postcode,Borough,Neighborhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


In [7]:
#number of distinct Borough values
boroughCount = len(df['Borough'].unique())
#number of rows in df; NOTE(review): the message calls these "neighborhoods" - confirm this is the intended figure
rowCount = df.shape[0]
print('The dataframe has {} boroughs and {} neighborhoods.'.format(boroughCount, rowCount))

The dataframe has 11 boroughs and 103 neighborhoods.


In [9]:
#read the coordinates table from the csv file in the working directory
dfCoords = pd.read_csv(filepath_or_buffer = 'Geospatial_Coordinates.csv')
#preview the first five rows
dfCoords.head(5)

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [10]:
#join dataframes on the postal code itself instead of on row position, so each
#row gets the coordinates of its own code even if the two frames differ in
#order or length; codes without a match in dfCoords get NaN coordinates
#validate raises if a postal code occurs more than once in dfCoords
dfNew = df.merge(
    dfCoords,
    how = 'left',
    left_on = 'Postcode',
    right_on = 'Postal Code',
    validate = 'many_to_one',
)
#drop additional Postal Codes column
dfFull = dfNew.drop(["Postal Code"], axis = 1)
dfFull.head()

Unnamed: 0,Postcode,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge, Malvern",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476
