In [3]:
# Install folium, pinned to version 0.5.0, from the conda-forge channel.
# --yes answers the confirmation prompt so the cell runs unattended.
!conda install -c conda-forge folium=0.5.0 --yes

Fetching package metadata .............
Solving package specifications: .

Package plan for installation in environment /opt/conda/envs/DSX-Python35:

The following NEW packages will be INSTALLED:

    altair:  2.2.2-py35_1 conda-forge
    branca:  0.3.1-py_0   conda-forge
    folium:  0.5.0-py_0   conda-forge
    vincent: 0.4.4-py_1   conda-forge

altair-2.2.2-p 100% |################################| Time: 0:00:00  10.37 MB/s
branca-0.3.1-p 100% |################################| Time: 0:00:00  37.49 MB/s
vincent-0.4.4- 100% |################################| Time: 0:00:00  37.79 MB/s
folium-0.5.0-p 100% |################################| Time: 0:00:00  50.82 MB/s


In [4]:
import pandas     as pd
import numpy      as np
from geopy.geocoders import Nominatim
import matplotlib.cm as cm
import matplotlib.colors as colors
import folium
from sklearn.cluster import KMeans

In [5]:
# Scrape the first HTML table on the Wikipedia page of Toronto ("M") postal codes.
data = pd.read_html('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M')[0]

# Give the positional columns proper names and turn the row labels into
# strings; the row labelled '0' holds the table's header text, not data.
data = (
    data
    .rename(index=str, columns={0: 'Postcode', 1: 'Borough', 2: 'Neighbourhood'})
    .drop(['0'])
)

# Keep only the rows that have an assigned borough. `.copy()` makes `data`
# an independent frame so the assignment below never targets a temporary view.
data = data[data['Borough'] != 'Not assigned'].copy()

# A row with a borough but no neighbourhood takes the borough name as its
# neighbourhood. `.loc` performs the write in one indexing step; the former
# chained form (data[col][mask] = ...) raises SettingWithCopyWarning and can
# silently leave `data` unchanged.
no_neighbourhood = data['Neighbourhood'] == 'Not assigned'
data.loc[no_neighbourhood, 'Neighbourhood'] = data.loc[no_neighbourhood, 'Borough']

In [6]:
# Renumber the rows 0..n-1 and discard the old row labels in a single step.
# This is optional housekeeping: the groupby(as_index=False) in the next cell
# also produces a fresh index.
data.reset_index(drop=True, inplace=True)

In [7]:
# Collapse all rows sharing a (Postcode, Borough) pair into one row whose
# Neighbourhood is the comma-separated list of the original names, in their
# original row order. Joining inside the aggregation avoids wrapping names in
# '%' markers and stripping them afterwards, which would mangle any name that
# itself contains a '%'.
data = (
    data
    .assign(Neighbourhood=data['Neighbourhood'].astype(str))
    .groupby(['Postcode', 'Borough'], as_index=False)['Neighbourhood']
    .agg(', '.join)
)

In [8]:
# Load the postcode -> latitude/longitude table and rename its key column so
# it matches the 'Postcode' column used by `data`. Row labels are converted
# to strings, as in the scraping cell.
geo_data = (
    pd.read_csv('http://cocl.us/Geospatial_data')
    .rename(index=str, columns={'Postal Code': 'Postcode'})
)

In [9]:
# Attach the coordinate columns to each row by looking its Postcode up in the
# geospatial table (left join: every row of `data` is kept).
geo_by_postcode = geo_data.set_index('Postcode')
data = data.join(geo_by_postcode, on='Postcode')

In [10]:
# Look up the coordinates of Toronto; they are used later to centre the map.
address = 'Toronto City, TO'
geolocator = Nominatim(user_agent="ny_explorer")
location = geolocator.geocode(address)

# geocode() returns None when the service finds no match; fail with a clear
# message instead of an AttributeError on `.latitude` below.
if location is None:
    raise ValueError('Could not geocode address: {!r}'.format(address))

latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Toronto City are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Toronto City are 43.7829772, -79.3870110894466.


I don't have a new idea for clustering Toronto's neighbourhoods, and I don't want to simply repeat what we did for New York City, so I decided to group them into 5 clusters by their latitude and longitude,
for possible future use by drone postmen.

In [11]:
# Number of geographic groups to form.
K_clusters = 5

# Cluster on the coordinates only. Selecting the two feature columns by name
# (rather than dropping the text columns) keeps this cell idempotent: on a
# re-run, the 'Cluster' column written below is not fed back in as a feature.
data_cluster = data[['Latitude', 'Longitude']]

# random_state is fixed so the labels are reproducible between runs.
kmeans = KMeans(n_clusters=K_clusters, random_state=0).fit(data_cluster)

# Store each row's cluster label (0 .. K_clusters-1) next to its coordinates.
data['Cluster'] = kmeans.labels_
data.head(10)

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude,Cluster
0,M1B,Scarborough,"Rouge, Malvern",43.806686,-79.194353,0
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union",43.784535,-79.160497,0
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711,0
3,M1G,Scarborough,Woburn,43.770992,-79.216917,0
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476,0
5,M1J,Scarborough,Scarborough Village,43.744734,-79.239476,0
6,M1K,Scarborough,"East Birchmount Park, Ionview, Kennedy Park",43.727929,-79.262029,0
7,M1L,Scarborough,"Clairlea, Golden Mile, Oakridge",43.711112,-79.284577,2
8,M1M,Scarborough,"Cliffcrest, Cliffside, Scarborough Village West",43.716316,-79.239476,0
9,M1N,Scarborough,"Birch Cliff, Cliffside West",43.692657,-79.264848,2


In [12]:
# Base map centred on the geocoded Toronto coordinates.
map_clusters = folium.Map(width=1000, height=500, location=[latitude, longitude], zoom_start=11)

# One hex colour per cluster label, sampled evenly from the rainbow colormap.
rainbow = [colors.rgb2hex(rgba) for rgba in cm.rainbow(np.linspace(0, 1, K_clusters))]

# Draw one circle per postcode, coloured by its cluster. Labels run from 0 to
# K_clusters-1, so they index `rainbow` directly; the previous `cluster-1`
# sent cluster 0 to the last colour through negative-index wraparound.
for lat, lon, poi, cluster in zip(data['Latitude'], data['Longitude'],
                                  data['Neighbourhood'], data['Cluster']):
    # parse_html=True makes folium escape the text instead of treating it as markup.
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    cluster_color = rainbow[cluster]
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=cluster_color,
        fill=True,
        fill_color=cluster_color,
        fill_opacity=0.7,
    ).add_to(map_clusters)

# Last expression of the cell: renders the interactive map.
map_clusters