In [18]:
import pandas as pd
import numpy as np
import urllib.request

from bs4 import BeautifulSoup

In [19]:
# Download the Wikipedia list of Canadian postal codes beginning with "M" and
# parse it with BeautifulSoup (lxml parser).
website_url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'

# The context manager closes the HTTP response as soon as parsing has read it,
# instead of leaving the connection open for the life of the kernel.
with urllib.request.urlopen(website_url) as webpage:
    soup = BeautifulSoup(webpage, 'lxml')

# Every <table> element on the page, kept for inspection.
tables = soup.find_all('table')

# The postal-code data is the table whose class attribute is 'wikitable sortable'.
pstl_table = soup.find('table', {'class': 'wikitable sortable'})

# soup.find returns None when nothing matches; fail here with a clear message
# rather than with an AttributeError in the cell that walks the table rows.
if pstl_table is None:
    raise ValueError(
        "No table with class 'wikitable sortable' found at {}".format(website_url)
    )

In [20]:
# Convert the HTML table into a pandas DataFrame, one row per <tr> that holds
# exactly three <td> cells (postal code, borough, neighbourhood).
postcodes = []
boroughs = []
neighbourhoods = []

# find_all / string= are the current Beautiful Soup spellings of the deprecated
# findAll / text= aliases; they select exactly the same nodes.
for row in pstl_table.find_all('tr'):
    cells = row.find_all('td')
    if len(cells) != 3:
        # Rows without three data cells carry no postal-code record.
        continue
    code_cell, borough_cell, neighbourhood_cell = cells
    # Only the first text node of each cell is taken, unmodified, so values
    # still carry their trailing newline at this point.
    postcodes.append(code_cell.find(string=True))
    boroughs.append(borough_cell.find(string=True))
    neighbourhoods.append(neighbourhood_cell.find(string=True))

df = pd.DataFrame({'Postal Code': postcodes, 'Borough': boroughs, 'Neighbourhood': neighbourhoods})
df

Unnamed: 0,Postal Code,Borough,Neighbourhood
0,M1A\n,Not assigned\n,Not assigned\n
1,M2A\n,Not assigned\n,Not assigned\n
2,M3A\n,North York\n,Parkwoods\n
3,M4A\n,North York\n,Victoria Village\n
4,M5A\n,Downtown Toronto\n,"Regent Park, Harbourfront\n"
...,...,...,...
175,M5Z\n,Not assigned\n,Not assigned\n
176,M6Z\n,Not assigned\n,Not assigned\n
177,M7Z\n,Not assigned\n,Not assigned\n
178,M8Z\n,Etobicoke\n,"Mimico NW, The Queensway West, South of Bloor,..."


In [21]:
# Tidy the scraped table in a single chain:
#   1. strip the newline characters left over from the HTML cells,
#   2. discard rows whose borough is 'Not assigned',
#   3. collapse rows sharing a postal code into one row, joining their
#      neighbourhoods with commas (this assumes a postal code never spans
#      more than one borough).
df = (
    df.replace('\\n', '', regex=True)
      .loc[lambda frame: frame['Borough'] != 'Not assigned']
      .reset_index(drop=True)
      .groupby(['Postal Code', 'Borough'])['Neighbourhood']
      .apply(','.join)
      .reset_index()
)
df

Unnamed: 0,Postal Code,Borough,Neighbourhood
0,M1B,Scarborough,"Malvern, Rouge"
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae
...,...,...,...
98,M9N,York,Weston
99,M9P,Etobicoke,Westmount
100,M9R,Etobicoke,"Kingsview Village, St. Phillips, Martin Grove ..."
101,M9V,Etobicoke,"South Steeles, Silverstone, Humbergate, Jamest..."


In [22]:
import geocoder

def get_lat_long(postal_code, max_attempts=10):
    """Look up the latitude and longitude of a Toronto postal code.

    Queries ``geocoder.google`` with ``'<postal_code>, Toronto, Ontario'`` and
    retries while the lookup yields no coordinates.

    Args:
        postal_code: Postal code to geocode, e.g. ``'M1B'``.
        max_attempts: Maximum number of lookups before giving up. Retrying
            without a limit hangs the notebook whenever the service never
            returns coordinates.

    Returns:
        A ``(latitude, longitude)`` tuple taken from the result's ``latlng``.

    Raises:
        RuntimeError: If no coordinates were obtained within ``max_attempts``
            lookups.
    """
    query = '{}, Toronto, Ontario'.format(postal_code)

    for _ in range(max_attempts):
        lat_lng_coords = geocoder.google(query).latlng
        # A falsy latlng (None or empty) means this attempt produced nothing.
        if lat_lng_coords:
            latitude = lat_lng_coords[0]
            longitude = lat_lng_coords[1]
            return latitude, longitude

    raise RuntimeError(
        'No coordinates returned for {!r} after {} attempts'.format(query, max_attempts)
    )

#df[['Latitude', 'Longitude']] = zip(*df['Postal Code'].apply(get_lat_long))
#df

The geocoder lookup isn't working: it hangs on the first attempt to fetch coordinates. So let's load the coordinates from the `Geospatial_Coordinates.csv` file instead:

In [23]:
import os
from pathlib import Path
# Load the postal-code coordinates from the CSV file located in the current
# working directory.
path = Path.cwd().joinpath('Geospatial_Coordinates.csv')

geo_df = pd.read_csv(path)
geo_df

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476
...,...,...,...
98,M9N,43.706876,-79.518188
99,M9P,43.696319,-79.532242
100,M9R,43.688905,-79.554724
101,M9V,43.739416,-79.588437


In [24]:
# Attach the coordinates to each postal code. The left join keeps every row of
# df, even one whose 'Postal Code' has no match in geo_df.
# Coordinate columns left by an earlier run of this cell are dropped first, so
# re-running it does not produce suffixed duplicates (Latitude_x, Latitude_y, ...).
geo_columns = [col for col in geo_df.columns if col != 'Postal Code']
df = pd.merge(
    df.drop(columns=geo_columns, errors='ignore'),
    geo_df,
    how='left',
    on='Postal Code',
)
df

Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude
0,M1B,Scarborough,"Malvern, Rouge",43.806686,-79.194353
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476
...,...,...,...,...,...
98,M9N,York,Weston,43.706876,-79.518188
99,M9P,Etobicoke,Westmount,43.696319,-79.532242
100,M9R,Etobicoke,"Kingsview Village, St. Phillips, Martin Grove ...",43.688905,-79.554724
101,M9V,Etobicoke,"South Steeles, Silverstone, Humbergate, Jamest...",43.739416,-79.588437


Let's take a look at those on a map...

In [25]:
import folium
from sklearn.cluster import KMeans


In [26]:
# To create the map we need the lat/long of Toronto. We could look it up with
# geopy, but averaging the coordinates we already have is close enough.
lat, lon = df.Latitude.mean(), df.Longitude.mean()
map_toronto = folium.Map(location=[lat, lon])

# Add one circle marker per postal code with a "<neighbourhood>, <borough>" popup.
# The loop variables are deliberately not called `lat`, so the map-centre
# latitude computed above is not overwritten by the last row's value.
for row_lat, row_lng, borough, neighborhood in zip(
    df['Latitude'], df['Longitude'], df['Borough'], df['Neighbourhood']
):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    # NOTE(review): parse_html looks like a Popup option rather than a
    # CircleMarker one; it is passed through unchanged here - confirm before removing.
    folium.CircleMarker(
        [row_lat, row_lng],
        radius=4,
        popup=label,
        color='red',
        fill=True,
        fill_color='red',
        fill_opacity=0.4,
        parse_html=False,
    ).add_to(map_toronto)


map_toronto

In [27]:
# We can't just group these by lat/long
# ...well, we can, but we shouldn't, because the groups would be pretty meaningless.

# One-hot encode the boroughs: one 'Borough_<name>' indicator column per borough.
borough_dummies = pd.get_dummies(df['Borough'], prefix='Borough')

# Indicator columns left by an earlier run of this cell are dropped before the
# join; otherwise DataFrame.join raises on the overlapping column names when the
# cell is re-run. The original 'Borough' column is intentionally kept.
df = df.drop(columns=borough_dummies.columns, errors='ignore').join(borough_dummies)
df

Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude,Borough_Central Toronto,Borough_Downtown Toronto,Borough_East Toronto,Borough_East York,Borough_Etobicoke,Borough_Mississauga,Borough_North York,Borough_Scarborough,Borough_West Toronto,Borough_York
0,M1B,Scarborough,"Malvern, Rouge",43.806686,-79.194353,0,0,0,0,0,0,0,1,0,0
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek",43.784535,-79.160497,0,0,0,0,0,0,0,1,0,0
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711,0,0,0,0,0,0,0,1,0,0
3,M1G,Scarborough,Woburn,43.770992,-79.216917,0,0,0,0,0,0,0,1,0,0
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476,0,0,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
98,M9N,York,Weston,43.706876,-79.518188,0,0,0,0,0,0,0,0,0,1
99,M9P,Etobicoke,Westmount,43.696319,-79.532242,0,0,0,0,1,0,0,0,0,0
100,M9R,Etobicoke,"Kingsview Village, St. Phillips, Martin Grove ...",43.688905,-79.554724,0,0,0,0,1,0,0,0,0,0
101,M9V,Etobicoke,"South Steeles, Silverstone, Humbergate, Jamest...",43.739416,-79.588437,0,0,0,0,1,0,0,0,0,0


In [28]:
#we kept the borough column so that we can join data to it, but we will need to drop it before running any ML algorithm on
#the data

