In [2]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
import matplotlib.cm as cm
import matplotlib.colors as colors

In [3]:
source = requests.get('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M').text
soup = BeautifulSoup(source ,'lxml')
table = soup.find('table',{'class':'wikitable sortable'})

In [4]:
header = table.tr.text
headers = header.split('\n')
headers = headers[1:len(headers)-1]
headers

['Postcode', 'Borough', 'Neighbourhood']

In [5]:
rows_list = []
rows = table.findAll('tr')
for row in rows:
    rows_list.append([cell.text.rstrip() for cell in row.findAll('td')])
rows_list = rows_list[1:]
rows_list[0:10]

[['M1A', 'Not assigned', 'Not assigned'],
 ['M2A', 'Not assigned', 'Not assigned'],
 ['M3A', 'North York', 'Parkwoods'],
 ['M4A', 'North York', 'Victoria Village'],
 ['M5A', 'Downtown Toronto', 'Harbourfront'],
 ['M5A', 'Downtown Toronto', 'Regent Park'],
 ['M6A', 'North York', 'Lawrence Heights'],
 ['M6A', 'North York', 'Lawrence Manor'],
 ['M7A', "Queen's Park", 'Not assigned'],
 ['M8A', 'Not assigned', 'Not assigned']]

In [7]:
import copy
row_list_copy = copy.deepcopy(rows_list)
for sample_row in row_list_copy:
    if sample_row[-1] == 'Not assigned':
        if sample_row[1] != 'Not assigned':
            sample_row[-1] = sample_row[1]
final_list = row_list_copy
row_list_copy[0:10]

[['M1A', 'Not assigned', 'Not assigned'],
 ['M2A', 'Not assigned', 'Not assigned'],
 ['M3A', 'North York', 'Parkwoods'],
 ['M4A', 'North York', 'Victoria Village'],
 ['M5A', 'Downtown Toronto', 'Harbourfront'],
 ['M5A', 'Downtown Toronto', 'Regent Park'],
 ['M6A', 'North York', 'Lawrence Heights'],
 ['M6A', 'North York', 'Lawrence Manor'],
 ['M7A', "Queen's Park", "Queen's Park"],
 ['M8A', 'Not assigned', 'Not assigned']]

In [8]:
rows_list[8]

['M7A', "Queen's Park", 'Not assigned']

In [9]:
df_temp = pd.DataFrame(final_list,columns = headers)
df_temp.head(10)

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M5A,Downtown Toronto,Regent Park
6,M6A,North York,Lawrence Heights
7,M6A,North York,Lawrence Manor
8,M7A,Queen's Park,Queen's Park
9,M8A,Not assigned,Not assigned


In [10]:
Borough_filter = df_temp.Borough == 'Not assigned'
Borough_filter[:10]

0     True
1     True
2    False
3    False
4    False
5    False
6    False
7    False
8    False
9     True
Name: Borough, dtype: bool

In [11]:
df = df_temp[~Borough_filter]
df.sort_values(by='Postcode').head(10)

Unnamed: 0,Postcode,Borough,Neighbourhood
11,M1B,Scarborough,Rouge
12,M1B,Scarborough,Malvern
29,M1C,Scarborough,Port Union
28,M1C,Scarborough,Rouge Hill
27,M1C,Scarborough,Highland Creek
42,M1E,Scarborough,Guildwood
43,M1E,Scarborough,Morningside
44,M1E,Scarborough,West Hill
53,M1G,Scarborough,Woburn
62,M1H,Scarborough,Cedarbrae


In [12]:
len(df.Postcode.unique())

103

In [13]:
df2 = df.groupby('Postcode').agg({'Borough':'first',
                               'Neighbourhood': ', '.join}).reset_index()

In [14]:
df2.sort_values(by='Postcode').head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


In [15]:
len(df2.Postcode.unique())

103

In [40]:
!conda install -c conda-forge geocoder --yes

Fetching package metadata .............
Solving package specifications: .

Package plan for installation in environment /opt/conda/envs/DSX-Python35:

The following NEW packages will be INSTALLED:

    geocoder:   1.38.1-py_0  conda-forge
    orderedset: 2.0-py35_0   conda-forge
    ratelim:    0.1.6-py35_0 conda-forge

orderedset-2.0 100% |################################| Time: 0:00:00  60.52 MB/s
ratelim-0.1.6- 100% |################################| Time: 0:00:00  11.89 MB/s
geocoder-1.38. 100% |################################| Time: 0:00:00  46.13 MB/s


In [41]:
import geocoder

In [46]:
df2.sort_values(by='Postcode').head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


In [51]:
path="http://cocl.us/Geospatial_data"
coordinate_data = pd.read_csv(path,header=None)
print("Done")


Done


In [52]:
type(coordinate_data)

pandas.core.frame.DataFrame

In [53]:
coordinate_data.head()

Unnamed: 0,0,1,2
0,Postal Code,Latitude,Longitude
1,M1B,43.8066863,-79.1943534
2,M1C,43.7845351,-79.1604971
3,M1E,43.7635726,-79.1887115
4,M1G,43.7709921,-79.2169174


In [54]:
coordinate_data.columns = ['Postcode', 'Latitude', 'Longitude']
coordinate_data.head()

Unnamed: 0,Postcode,Latitude,Longitude
0,Postal Code,Latitude,Longitude
1,M1B,43.8066863,-79.1943534
2,M1C,43.7845351,-79.1604971
3,M1E,43.7635726,-79.1887115
4,M1G,43.7709921,-79.2169174


In [55]:
len(coordinate_data.Postcode.unique())

104

In [56]:
coordinate_data.sort_values(by='Postcode').head()

Unnamed: 0,Postcode,Latitude,Longitude
1,M1B,43.8066863,-79.1943534
2,M1C,43.7845351,-79.1604971
3,M1E,43.7635726,-79.1887115
4,M1G,43.7709921,-79.2169174
5,M1H,43.773136,-79.2394761


In [57]:
merged_data = df2.merge(coordinate_data,on = 'Postcode')
merged_data2 = df.merge(coordinate_data,on = 'Postcode')

In [58]:
merged_data = merged_data.sort_values(by='Postcode')

In [59]:
merged_data.head(10)

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge, Malvern",43.8066863,-79.1943534
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union",43.7845351,-79.1604971
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.7635726,-79.1887115
3,M1G,Scarborough,Woburn,43.7709921,-79.2169174
4,M1H,Scarborough,Cedarbrae,43.773136,-79.2394761
5,M1J,Scarborough,Scarborough Village,43.7447342,-79.2394761
6,M1K,Scarborough,"East Birchmount Park, Ionview, Kennedy Park",43.7279292,-79.2620294
7,M1L,Scarborough,"Clairlea, Golden Mile, Oakridge",43.7111117,-79.2845772
8,M1M,Scarborough,"Cliffcrest, Cliffside, Scarborough Village West",43.716316,-79.2394761
9,M1N,Scarborough,"Birch Cliff, Cliffside West",43.692657,-79.2648481


In [60]:
merged_data.Borough.unique()

array(['Scarborough', 'North York', 'East York', 'East Toronto',
       'Central Toronto', 'Downtown Toronto', 'York', 'West Toronto',
       "Queen's Park", 'Mississauga', 'Etobicoke'], dtype=object)

In [61]:
merged_data2 = merged_data2.sort_values(by='Postcode')
merged_data2.head()

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
8,M1B,Scarborough,Rouge,43.8066863,-79.1943534
9,M1B,Scarborough,Malvern,43.8066863,-79.1943534
23,M1C,Scarborough,Port Union,43.7845351,-79.1604971
22,M1C,Scarborough,Rouge Hill,43.7845351,-79.1604971
21,M1C,Scarborough,Highland Creek,43.7845351,-79.1604971


In [62]:
filter_ = merged_data2.Borough.str.contains('Toronto')
toronto_df = merged_data2[filter_]
toronto_df.head()

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
36,M4E,East Toronto,The Beaches,43.6763574,-79.2930312
72,M4K,East Toronto,The Danforth West,43.6795571,-79.352188
73,M4K,East Toronto,Riverdale,43.6795571,-79.352188
85,M4L,East Toronto,The Beaches West,43.6689985,-79.3155716
86,M4L,East Toronto,India Bazaar,43.6689985,-79.3155716
