In [1]:
import pandas as pd
import numpy as np

<B>Scrape neighbourhood data from Wiki to a Pandas DF:</B>

In [2]:
df = pd.read_html('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M')[0]

In [3]:
df

Unnamed: 0,Postal Code,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
...,...,...,...
175,M5Z,Not assigned,Not assigned
176,M6Z,Not assigned,Not assigned
177,M7Z,Not assigned,Not assigned
178,M8Z,Etobicoke,"Mimico NW, The Queensway West, South of Bloor,..."


<B>Drop rows where Borough value is Not assigned:</B>

In [4]:
df = df[df.Borough != 'Not assigned']

In [5]:
df.head()

Unnamed: 0,Postal Code,Borough,Neighbourhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
5,M6A,North York,"Lawrence Manor, Lawrence Heights"
6,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


In [6]:
pd.set_option('display.max_rows', 10)
df

Unnamed: 0,Postal Code,Borough,Neighbourhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
5,M6A,North York,"Lawrence Manor, Lawrence Heights"
6,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
...,...,...,...
160,M8X,Etobicoke,"The Kingsway, Montgomery Road, Old Mill North"
165,M4Y,Downtown Toronto,Church and Wellesley
168,M7Y,East Toronto,"Business reply mail Processing Centre, South C..."
169,M8Y,Etobicoke,"Old Mill South, King's Mill Park, Sunnylea, Hu..."


<B>If cell has a borough but a Not assigned neighbourhood, then update Neighbourhood value with Borough value from same row </B>

In [7]:
np.where(df['Neighbourhood'].eq("Not assigned"),df['Borough'],df['Neighbourhood'])

array(['Parkwoods', 'Victoria Village', 'Regent Park, Harbourfront',
       'Lawrence Manor, Lawrence Heights',
       "Queen's Park, Ontario Provincial Government",
       'Islington Avenue, Humber Valley Village', 'Malvern, Rouge',
       'Don Mills', 'Parkview Hill, Woodbine Gardens',
       'Garden District, Ryerson', 'Glencairn',
       'West Deane Park, Princess Gardens, Martin Grove, Islington, Cloverdale',
       'Rouge Hill, Port Union, Highland Creek', 'Don Mills',
       'Woodbine Heights', 'St. James Town', 'Humewood-Cedarvale',
       'Eringate, Bloordale Gardens, Old Burnhamthorpe, Markland Wood',
       'Guildwood, Morningside, West Hill', 'The Beaches', 'Berczy Park',
       'Caledonia-Fairbanks', 'Woburn', 'Leaside', 'Central Bay Street',
       'Christie', 'Cedarbrae', 'Hillcrest Village',
       'Bathurst Manor, Wilson Heights, Downsview North',
       'Thorncliffe Park', 'Richmond, Adelaide, King',
       'Dufferin, Dovercourt Village', 'Scarborough Village',
      

In [8]:
df.head()

Unnamed: 0,Postal Code,Borough,Neighbourhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
5,M6A,North York,"Lawrence Manor, Lawrence Heights"
6,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


<B>One Postal Code can exist only once per Borough</B>

In [9]:
df = df.groupby(['Postal Code','Borough'],  as_index=False).agg(lambda x: ','.join(x))

In [10]:
df.head()

Unnamed: 0,Postal Code,Borough,Neighbourhood
0,M1B,Scarborough,"Malvern, Rouge"
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


<B> Print the number of rows of dataframe </B>

In [11]:
df.shape

(103, 3)

<B>Download geographical coordinates of neighbourhoods:</B>

In [12]:
df_geo_coord = pd.read_csv("https://cocl.us/Geospatial_data")
df_geo_coord.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [13]:
!conda install -c conda-forge geopy --yes
from geopy.geocoders import Nominatim
import folium # map rendering library

# import k-means from clustering stage
from sklearn.cluster import KMeans

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

Collecting package metadata (current_repodata.json): ...working... done
Solving environment: ...working... done

# All requested packages already installed.



<B>Create df1 - which merges df with coordinate data from df_geo_coord</B>

In [46]:
df1 = pd.merge(df, df_geo_coord, left_on='Postal Code', right_on='Postal Code', how='left')
df1

Unnamed: 0,Postal Code,Borough,Neighbourhood,labels,Latitude,Longitude
0,M1B,Scarborough,"Malvern, Rouge",2,43.806686,-79.194353
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek",2,43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",2,43.763573,-79.188711
3,M1G,Scarborough,Woburn,2,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,2,43.773136,-79.239476
...,...,...,...,...,...,...
98,M9N,York,Weston,1,43.706876,-79.518188
99,M9P,Etobicoke,Westmount,1,43.696319,-79.532242
100,M9R,Etobicoke,"Kingsview Village, St. Phillips, Martin Grove ...",1,43.688905,-79.554724
101,M9V,Etobicoke,"South Steeles, Silverstone, Humbergate, Jamest...",1,43.739416,-79.588437


<B>Exploring Toronto, finding its cooridinates</B>

In [27]:
address = 'Toronto'

geolocator = Nominatim(user_agent="toronto_explorer")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Toronto are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Toronto are 43.6534817, -79.3839347.


<B>Create folium map of different neighbourhoods in Toronto</B>

In [16]:
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=10)

# add markers to map
for lat, lng, borough, neighbourhood in zip(df1['Latitude'], df1['Longitude'], df1['Borough'], df1['Neighbourhood']):
    label = '{}, {}'.format(neighbourhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_toronto)  
    
map_toronto

In [17]:
import requests # library to handle requests
import random # library for random number generation
# libraries for displaying images
from IPython.display import Image 
from IPython.core.display import HTML 
    
# tranforming json file into a pandas dataframe library
from pandas.io.json import json_normalize


! pip install folium==0.5.0
import folium # plotting library

print('Folium installed')
print('Libraries imported.')

Libraries imported.



<B>Setup credentials for Foursquare </B>

In [18]:
CLIENT_ID = 'USMK0V15OJVNLUG5G4PFXODPQ00U52YRTB1XGMRHRJNGWAUO' # your Foursquare ID
CLIENT_SECRET = '4MXCRDQPX0GFAQAUJHTZCL2PFGKRSYFVZ0H21Z15NO5HK2LP' # your Foursquare Secret
ACCESS_TOKEN = 'GM5EGSJM15HF50O2WI3GWN3XRBJE2U05VI3GKC3PWJMPYWNC' # your FourSquare Access Token
VERSION = '20180604'
LIMIT = 30
print('Your credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET:' + CLIENT_SECRET)

Your credentails:
CLIENT_ID: USMK0V15OJVNLUG5G4PFXODPQ00U52YRTB1XGMRHRJNGWAUO
CLIENT_SECRET:4MXCRDQPX0GFAQAUJHTZCL2PFGKRSYFVZ0H21Z15NO5HK2LP


<B>Do a search of Chinese foods</B>

In [52]:
search_query = 'Chinese'
radius = 500
print(search_query + ' .... OK!')

Chinese .... OK!


<B>Pass query to api and make call you FourSQuare</B>

In [21]:
url = 'https://api.foursquare.com/v2/venues/search?client_id={}&client_secret={}&ll={},{}&oauth_token={}&v={}&query={}&radius={}&limit={}'.format(CLIENT_ID, CLIENT_SECRET, latitude, longitude,ACCESS_TOKEN, VERSION, search_query, radius, LIMIT)
url

'https://api.foursquare.com/v2/venues/search?client_id=USMK0V15OJVNLUG5G4PFXODPQ00U52YRTB1XGMRHRJNGWAUO&client_secret=4MXCRDQPX0GFAQAUJHTZCL2PFGKRSYFVZ0H21Z15NO5HK2LP&ll=43.65053245,-79.38210488603187&oauth_token=GM5EGSJM15HF50O2WI3GWN3XRBJE2U05VI3GKC3PWJMPYWNC&v=20180604&query=Chinese&radius=500&limit=30'

In [22]:
results = requests.get(url).json()
results

{'meta': {'code': 200, 'requestId': '60292eee9671424fd3f462b2'},
 'notifications': [{'type': 'notificationTray', 'item': {'unreadCount': 0}}],
 'response': {'venues': [{'id': '5202d246498e9649c88150b7',
    'name': 'Toronto Chinese Academy',
    'location': {'address': '133 Richmond Street West',
     'crossStreet': 'York Street',
     'lat': 43.65014006138457,
     'lng': -79.38485690552969,
     'labeledLatLngs': [{'label': 'display',
       'lat': 43.65014006138457,
       'lng': -79.38485690552969}],
     'distance': 225,
     'postalCode': 'M5H 2L3',
     'cc': 'CA',
     'neighborhood': 'Financial District',
     'city': 'Toronto',
     'state': 'ON',
     'country': 'Canada',
     'formattedAddress': ['133 Richmond Street West (York Street)',
      'Toronto ON M5H 2L3']},
    'categories': [{'id': '4bf58dd8d48988d1ae941735',
      'name': 'University',
      'pluralName': 'Universities',
      'shortName': 'University',
      'icon': {'prefix': 'https://ss3.4sqi.net/img/categori

<B>Get relevant part of JSON and transform it into a pandas dataframe</B>

In [23]:
# assign relevant part of JSON to venues
venues = results['response']['venues']

# tranform venues into a dataframe
dataframe = json_normalize(venues)
dataframe.head()

  dataframe = json_normalize(venues)


Unnamed: 0,id,name,categories,referralId,hasPerk,location.address,location.crossStreet,location.lat,location.lng,location.labeledLatLngs,location.distance,location.postalCode,location.cc,location.neighborhood,location.city,location.state,location.country,location.formattedAddress
0,5202d246498e9649c88150b7,Toronto Chinese Academy,"[{'id': '4bf58dd8d48988d1ae941735', 'name': 'U...",v-1613311726,False,133 Richmond Street West,York Street,43.65014,-79.384857,"[{'label': 'display', 'lat': 43.65014006138457...",225,M5H 2L3,CA,Financial District,Toronto,ON,Canada,"[133 Richmond Street West (York Street), Toron..."
1,4f04779a02d5cce0cfc06151,Chinese Visa Application Service Center,"[{'id': '4bf58dd8d48988d126941735', 'name': 'G...",v-1613311726,False,"393 University Ave, Suite 1501",in University Centre,43.654028,-79.387365,"[{'label': 'display', 'lat': 43.65402839343005...",575,,CA,,Toronto,ON,Canada,"[393 University Ave, Suite 1501 (in University..."
2,52a7ae41498eed3af4d0a3fa,Yueh Tung Chinese Restaurant,"[{'id': '4bf58dd8d48988d145941735', 'name': 'C...",v-1613311726,False,126 Elizabeth St.,Dundas St.,43.655281,-79.385337,"[{'label': 'display', 'lat': 43.65528126342919...",589,M5G 1P5,CA,,Toronto,ON,Canada,"[126 Elizabeth St. (Dundas St.), Toronto ON M5..."
3,4b1be39df964a5209cfe23e3,Chic Chinoise Pan Asian Cuisine,"[{'id': '4bf58dd8d48988d142941735', 'name': 'A...",v-1613311726,False,214 King St. W,Simcoe,43.647074,-79.386176,"[{'label': 'display', 'lat': 43.64707408770653...",505,,CA,,Toronto,ON,Canada,"[214 King St. W (Simcoe), Toronto ON]"


<B>Now cluster the neighbourhoods</B>

In [44]:
from sklearn.neighbors import KNeighborsClassifier

<B>Drop string columns (Postal Code, Neighbourhood, Borough)</B>

In [47]:
Toronto_Clusters = df1.drop(['Postal Code','Borough','Neighbourhood'],1)
Toronto_Clusters

Unnamed: 0,labels,Latitude,Longitude
0,2,43.806686,-79.194353
1,2,43.784535,-79.160497
2,2,43.763573,-79.188711
3,2,43.770992,-79.216917
4,2,43.773136,-79.239476
...,...,...,...
98,1,43.706876,-79.518188
99,1,43.696319,-79.532242
100,1,43.688905,-79.554724
101,1,43.739416,-79.588437


<B>Using K means clustering and a cluster of 6, fit it on our data</B>

In [48]:
num_clusters = 6

k_means = KMeans(init="k-means++", n_clusters=num_clusters, n_init=12)
k_means.fit(Toronto_Clusters)
labels = k_means.labels_

print(labels)

[4 4 4 4 4 4 4 1 4 1 4 1 4 1 4 1 4 0 1 0 0 0 0 0 0 1 1 1 0 0 0 5 5 5 1 1 1
 1 1 1 1 1 1 1 0 0 0 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 0 0 3 3 3 3 3 3 3 0 0 3
 3 3 3 3 3 5 2 2 3 3 2 3 2 1 2 2 2 2 2 2 2 2 5 5 5 5 5 5 5]


In [49]:
df1["labels"] = labels
df1

Unnamed: 0,Postal Code,Borough,Neighbourhood,labels,Latitude,Longitude
0,M1B,Scarborough,"Malvern, Rouge",4,43.806686,-79.194353
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek",4,43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",4,43.763573,-79.188711
3,M1G,Scarborough,Woburn,4,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,4,43.773136,-79.239476
...,...,...,...,...,...,...
98,M9N,York,Weston,5,43.706876,-79.518188
99,M9P,Etobicoke,Westmount,5,43.696319,-79.532242
100,M9R,Etobicoke,"Kingsview Village, St. Phillips, Martin Grove ...",5,43.688905,-79.554724
101,M9V,Etobicoke,"South Steeles, Silverstone, Humbergate, Jamest...",5,43.739416,-79.588437


<B>Rearrange columns</B>

In [56]:
df2 = df1.reindex(columns=['Postal Code','Borough','Neighbourhood','Latitude','Longitude','labels'])
df2

Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude,labels
0,M1B,Scarborough,"Malvern, Rouge",43.806686,-79.194353,4
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek",43.784535,-79.160497,4
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711,4
3,M1G,Scarborough,Woburn,43.770992,-79.216917,4
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476,4
...,...,...,...,...,...,...
98,M9N,York,Weston,43.706876,-79.518188,5
99,M9P,Etobicoke,Westmount,43.696319,-79.532242,5
100,M9R,Etobicoke,"Kingsview Village, St. Phillips, Martin Grove ...",43.688905,-79.554724,5
101,M9V,Etobicoke,"South Steeles, Silverstone, Humbergate, Jamest...",43.739416,-79.588437,5


<B>Centroid of each cluster:</B>

In [57]:
df2.groupby('labels').mean()

Unnamed: 0_level_0,Latitude,Longitude
labels,Unnamed: 1_level_1,Unnamed: 2_level_1
0,43.748934,-79.418171
1,43.715658,-79.322769
2,43.645164,-79.524914
3,43.662402,-79.399272
4,43.774241,-79.23054
5,43.723839,-79.536324


<B>Map Clusters</B>

In [59]:
# create map
map_clusters = folium.Map(location=[43.651070,-79.347015],zoom_start=10)

# set color scheme for the clusters
x = np.arange(k)
ys = [i + x + (i*x)**2 for i in range(k)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, neighbourhood, cluster in zip(df2['Latitude'], df2['Longitude'], df2['Neighbourhood'], df2['labels']):
    label = folium.Popup(' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters