In [1]:
# Numerical arrays (used for the colour ramp of the cluster map)
import numpy as np 

import pandas as pd
# Lift pandas' display limits so a displayed DataFrame shows every column and row
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

# Not referenced in the cells visible here
import json

# Geocoding client; not referenced in the cells visible here
from geopy.geocoders import Nominatim

# HTTP client used to download the Wikipedia page
import requests
# NOTE(review): this is the legacy import location; newer pandas releases expose
# pandas.json_normalize instead - confirm this line still imports on the installed version
from pandas.io.json import json_normalize

# Colour map and RGB -> hex conversion for the cluster colours
import matplotlib.cm as cm
import matplotlib.colors as colors

# K-Means clustering of the coordinates
from sklearn.cluster import KMeans

# Interactive Leaflet maps
import folium

print('Libraries imported.')

Libraries imported.


In [2]:
pip install beautifulsoup4

Note: you may need to restart the kernel to use updated packages.


In [3]:
pip install lxml

Note: you may need to restart the kernel to use updated packages.


<h3>Accessing the Database using Pandas with BS4</h3>

In [4]:
from io import StringIO

from bs4 import BeautifulSoup

# Wikipedia page listing the postal codes that start with "M".
# (The original literal had a stray leading space in front of the scheme.)
WIKI_URL = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'

# Fail fast on a hung connection or an HTTP error instead of parsing an error page.
res = requests.get(WIKI_URL, timeout=30)
res.raise_for_status()

soup = BeautifulSoup(res.content, 'lxml')
# The first <table> on the page is the one that gets parsed.
table = soup.find_all('table')[0]

# read_html wants a path or file-like object; passing literal HTML text is deprecated
# in recent pandas, so the markup is wrapped in StringIO.
dfs = pd.read_html(StringIO(str(table)))
df = dfs[0]
df.head()

Unnamed: 0,Postal Code,Borough,Neighborhood
0,M1A,Not assigned,
1,M2A,Not assigned,
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"


<h3>Removing 'Not assigned' Borough</h3>

In [5]:
# Locate the rows whose Borough is the placeholder 'Not assigned' ...
unassigned_rows = df.index[df['Borough'] == 'Not assigned']

# ... remove them from df in place and renumber the remaining rows from 0.
df.drop(index=unassigned_rows, inplace=True)
df.reset_index(drop=True, inplace=True)
df.head()

Unnamed: 0,Postal Code,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


<h3>Checking if merging is needed</h3>

In [6]:
# Tally how many rows each postal code has; a count above 1 would mean
# several rows share one postal code.
postal_codes = df['Postal Code']
df1 = postal_codes.value_counts(sort=True).to_frame()
df1.head()

Unnamed: 0,Postal Code
M4G,1
M4M,1
M1L,1
M1W,1
M1K,1


In [None]:
#df.groupby(['Postal Code','Borough'], sort=False).agg(', '.join)
#df.reset_index(inplace=True)
# ^ Run the two lines above only if some Postal Code occurs more than once (a count > 1 in the previous cell).

<h3>Checking if there are any remaining NaN cells</h3>

In [7]:
# Count the missing (NaN) values in each column of df.
df.isna().sum()

Postal Code     0
Borough         0
Neighborhood    0
dtype: int64

In [77]:
#df['Neighborhood'] = np.where(df['Neighborhood'] == 'Not assigned',df['Borough'], df['Neighborhood'])
# ^ Run the line above only if the NaN check in the previous cell reports a sum > 0.

In [9]:
# (rows, columns) of df after the 'Not assigned' boroughs were removed.
df.shape

(103, 3)

<h3>Getting the coordinates</h3>

In [12]:
# Download the CSV of coordinates (one row per postal code) and preview it.
geo_csv_url = 'https://cocl.us/Geospatial_data'
cord = pd.read_csv(geo_csv_url)
cord.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [18]:
# Attach Latitude/Longitude to every row of df by joining on the shared
# 'Postal Code' column (pandas' default inner join).
df1 = df.merge(cord, on='Postal Code')
df1.head()

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude
0,M1B,North York,Parkwoods,43.806686,-79.194353
1,M1C,North York,Victoria Village,43.784535,-79.160497
2,M1E,Downtown Toronto,"Regent Park, Harbourfront",43.763573,-79.188711
3,M1G,North York,"Lawrence Manor, Lawrence Heights",43.770992,-79.216917
4,M1H,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.773136,-79.239476


In [19]:
# (rows, columns) of the merged frame.
df1.shape

(103, 5)

<h3>Clustering</h3>

Getting all rows that contain "Toronto" in the Borough column

In [21]:
# Keep the rows whose Borough contains the literal text 'Toronto'.
# .copy() makes dfT an independent frame: a later cell inserts a 'Cluster Labels'
# column into it, which on a bare slice of df1 raises SettingWithCopyWarning.
is_toronto = df1['Borough'].str.contains('Toronto', regex=False)
dfT = df1[is_toronto].copy()
dfT

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude
2,M1E,Downtown Toronto,"Regent Park, Harbourfront",43.763573,-79.188711
4,M1H,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.773136,-79.239476
9,M1N,Downtown Toronto,"Garden District, Ryerson",43.692657,-79.264848
15,M1W,Downtown Toronto,St. James Town,43.799525,-79.318389
19,M2K,East Toronto,The Beaches,43.786947,-79.385975
20,M2L,Downtown Toronto,Berczy Park,43.75749,-79.374714
24,M2R,Downtown Toronto,Central Bay Street,43.782736,-79.442259
25,M3A,Downtown Toronto,Christie,43.753259,-79.329656
30,M3K,Downtown Toronto,"Richmond, Adelaide, King",43.737473,-79.464763
31,M3L,West Toronto,"Dufferin, Dovercourt Village",43.739015,-79.506944


Creating a map with a marker for every row whose Borough contains "Toronto"

In [27]:
# Base map at the fixed centre coordinates used for both maps in this notebook.
map_toronto = folium.Map(location=[43.651070, -79.347015], zoom_start=10)

# One circle marker per row of dfT, with a "<neighbourhood>, <borough>" popup.
for lat, lng, borough, neighbourhood in zip(dfT['Latitude'], dfT['Longitude'], dfT['Borough'], dfT['Neighborhood']):
    label = '{}, {}'.format(neighbourhood, borough)
    # parse_html belongs to Popup (it escapes the label text); it is not a
    # CircleMarker option, so it is no longer passed to the marker as well.
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
    ).add_to(map_toronto)

map_toronto

Using K-Means clustering on our data

In [25]:
k = 4  # number of clusters

# Cluster on the coordinates only. Selecting the two feature columns by name replaces
# `drop([...], 1)`, whose positional axis argument is rejected by pandas 2.x, and keeps a
# previously inserted 'Cluster Labels' column out of the features when the cell is re-run.
toronto_clustering = dfT[['Latitude', 'Longitude']]

# random_state fixes the seed; n_init is pinned so the number of restarts does not
# depend on the installed scikit-learn version's default.
kmeans = KMeans(n_clusters=k, random_state=0, n_init=10).fit(toronto_clustering)

# Rebuild dfT as an independent frame without any labels from an earlier run, so the
# insert below neither fails with "already exists" nor writes into a slice of df1.
dfT = dfT.drop(columns='Cluster Labels', errors='ignore').copy()
dfT.insert(0, 'Cluster Labels', kmeans.labels_)

Creating a map with each marker coloured by its assigned cluster

In [26]:
# create map
map_clusters = folium.Map(location=[43.651070,-79.347015],zoom_start=10)

# set color scheme for the clusters
x = np.arange(k)
ys = [i + x + (i*x)**2 for i in range(k)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, neighbourhood, cluster in zip(dfT['Latitude'], dfT['Longitude'], dfT['Neighborhood'], dfT['Cluster Labels']):
    label = folium.Popup(' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters