# This Notebook Maps Toronto Neighbourhoods

In [1]:
#Import all necessary libraries
!pip install beautifulsoup4
from bs4 import BeautifulSoup
import urllib.request
!pip install geopy
!pip install folium
import itertools
import numpy as np
import pandas as pd
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
import seaborn as sns
import json
import requests
from pandas.io.json import json_normalize
from geopy.geocoders import Nominatim
import matplotlib.cm as cm
import matplotlib.colors as colors
from sklearn.cluster import KMeans
import folium

print('Libraries imported.')

Libraries imported.


In [2]:
# extract the data from the wikipedia page that is wrtten in html
url = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"
page = urllib.request.urlopen(url)
soup = BeautifulSoup(page, 'html.parser')
# Find data for the table
table = soup.find('table', class_='wikitable sortable')

In [3]:
#find all rows, the [1:] is to skip the first row
rows = table.find_all('tr')[1:]

#set up lists for each of the columns needed
PostalCode=[]
Borough=[]
Neighbourhood=[]

#Iterate through rows to find td and then navigate them to pick up data needed for columns
#replace \n was inserted because I found that neighbourhoods ended with \n that was not needed
for row in rows:
    rows2=row.find_all('td')
    PostalCode.append(rows2[0].get_text())
    Borough.append(rows2[1].get_text())
    Neighbourhood.append(rows2[2].get_text().replace('\n',''))

#create a data frame from lists created
df = pd.DataFrame()
df['PostalCode']=PostalCode
df['Borough']=Borough
df['Neighbourhood']=Neighbourhood

In [25]:
#Drop rows where Borough is Not assigned and change Neighbourhood to Borough if not assigned
df.drop(df[df.Borough == 'Not assigned'].index, inplace=True)
df.loc[df['Neighbourhood'] == 'Not assigned', 'Neighbourhood']=df.loc[df['Neighbourhood']=='Not assigned','Borough']
df

Unnamed: 0,PostalCode,Borough,Neighbourhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M6A,North York,Lawrence Heights
6,M6A,North York,Lawrence Manor
7,M7A,Queen's Park,Queen's Park
9,M9A,Queen's Park,Queen's Park
10,M1B,Scarborough,Rouge
11,M1B,Scarborough,Malvern
13,M3B,North York,Don Mills North


In [26]:
#Combine rows that have the same PostalCode and Borough
df2=df.groupby(['PostalCode','Borough'])['Neighbourhood'].apply(list)
df2=df2.sample(frac=1).reset_index()
df2['Neighbourhood']=df2['Neighbourhood'].str.join(', ')
df2

Unnamed: 0,PostalCode,Borough,Neighbourhood
0,M3K,North York,"CFB Toronto, Downsview East"
1,M6K,West Toronto,"Brockton, Exhibition Place, Parkdale Village"
2,M4G,East York,Leaside
3,M3B,North York,Don Mills North
4,M5T,Downtown Toronto,"Chinatown, Grange Park, Kensington Market"
5,M9R,Etobicoke,"Kingsview Village, Martin Grove Gardens, Richv..."
6,M5E,Downtown Toronto,Berczy Park
7,M5L,Downtown Toronto,"Commerce Court, Victoria Hotel"
8,M5R,Central Toronto,"The Annex, North Midtown, Yorkville"
9,M8W,Etobicoke,"Alderwood, Long Branch"


In [27]:
#The number of rows, columns in my table
df2.shape

(103, 3)

### Part II: Get the geographical coordinates for my table

In [28]:
#Look at where my files are being picked up from
import os
os.getcwd()

'/home/dsxuser/work'

In [52]:
# The code was removed by Watson Studio for sharing.

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [64]:
#Ensure that my postal code columns are named the same
df3 = df3.rename({'Postal Code':'PostalCode'}, axis=1)

#Now I want to merge data frames by postal codes
df4 = pd.DataFrame.merge(df2,df3)
df4.head()

Unnamed: 0,PostalCode,Borough,Neighbourhood,Latitude,Longitude
0,M3K,North York,"CFB Toronto, Downsview East",43.737473,-79.464763
1,M6K,West Toronto,"Brockton, Exhibition Place, Parkdale Village",43.636847,-79.428191
2,M4G,East York,Leaside,43.70906,-79.363452
3,M3B,North York,Don Mills North,43.745906,-79.352188
4,M5T,Downtown Toronto,"Chinatown, Grange Park, Kensington Market",43.653206,-79.400049


### Part III: Explore and Cluster Toronto Neighbourhoods

In [71]:
latitude = 43.6532
longitude = -79.3832

# create map of the Greater Toronto Area startng from the latitude and longitude of Toronto
GTA = folium.Map(location=[latitude,longitude], zoom_start=10)

# add markers
for lat, lng, Borough, Neighbourhood in zip(df4['Latitude'], df4['Longitude'], df4['Borough'], df4['Neighbourhood']):
    label = '{}, {}'.format(Neighbourhood, Borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(GTA)  
    
GTA

In [72]:
#Copying the NYC lab, I want to use the Foursquare API to explore Boroughs in Toronto and segment them

CLIENT_ID = 'XHFUDADMDCEHSIKIBLDQAHS31KWHMEOH3ZLLJGUSFFKJY4DU' 
CLIENT_SECRET = 'JBII31TTL300VB1E4XTDJ13DS5PYJGZHHDZQ00CUURLXPY2T' 
VERSION = '20180605' # Foursquare API version

print('Your credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET:' + CLIENT_SECRET)

Your credentails:
CLIENT_ID: XHFUDADMDCEHSIKIBLDQAHS31KWHMEOH3ZLLJGUSFFKJY4DU
CLIENT_SECRET:JBII31TTL300VB1E4XTDJ13DS5PYJGZHHDZQ00CUURLXPY2T


In [73]:
#I am going to use the first borough as an example

df4.loc[0, 'Borough']

'North York'

In [74]:
#Now I am going to capture the long / latitude of the first borough

borough_lat = df4.loc[0, 'Latitude'] # borough latitude value
borough_long = df4.loc[0, 'Longitude'] # borough longitude value

borough_name = df4.loc[0, 'Borough'] # borough name

print('Latitude and longitude values of {} are {}, {}.'.format(borough_name, 
                                                               borough_lat, 
                                                               borough_long))

Latitude and longitude values of North York are 43.737473200000004, -79.46476329999999.


In [75]:
#Let us explore the top 50 venues in this borough within 500 meters

LIMIT = 50
radius = 500

url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
    CLIENT_ID, 
    CLIENT_SECRET, 
    VERSION, 
    borough_lat, 
    borough_long, 
    radius, 
    LIMIT)
url

'https://api.foursquare.com/v2/venues/explore?&client_id=XHFUDADMDCEHSIKIBLDQAHS31KWHMEOH3ZLLJGUSFFKJY4DU&client_secret=JBII31TTL300VB1E4XTDJ13DS5PYJGZHHDZQ00CUURLXPY2T&v=20180605&ll=43.737473200000004,-79.46476329999999&radius=500&limit=50'

In [76]:
#Resulting Output
results = requests.get(url).json()
results

{'meta': {'code': 200, 'requestId': '5dd9a0ce949393001baab01f'},
  'headerLocation': 'Clanton Park',
  'headerFullLocation': 'Clanton Park, Toronto',
  'headerLocationGranularity': 'neighborhood',
  'totalResults': 3,
  'suggestedBounds': {'ne': {'lat': 43.741973204500006,
    'lng': -79.45854667937631},
   'sw': {'lat': 43.7329731955, 'lng': -79.47097992062366}},
  'groups': [{'type': 'Recommended Places',
    'name': 'recommended',
    'items': [{'reasons': {'count': 0,
       'items': [{'summary': 'This spot is popular',
         'type': 'general',
         'reasonName': 'globalInteractionReason'}]},
      'venue': {'id': '4bcb76143740b7133e926265',
       'name': 'Toronto Downsview Airport (YZD)',
       'location': {'address': 'Garratt Blvd',
        'lat': 43.738882611749744,
        'lng': -79.47011109314882,
        'labeledLatLngs': [{'label': 'display',
          'lat': 43.738882611749744,
          'lng': -79.47011109314882}],
        'distance': 457,
        'cc': 'CA',
   

In [77]:
# Extract the category of the venue
def get_category_type(row):
    try:
        categories_list = row['categories']
    except:
        categories_list = row['venue.categories']
        
    if len(categories_list) == 0:
        return None
    else:
        return categories_list[0]['name']

In [83]:
venues = results['response']['groups'][0]['items']
    
nearby_venues = json_normalize(venues) # flatten JSON

# filter columns
filtered_columns = ['venue.name', 'venue.categories', 'venue.location.lat', 'venue.location.lng']
nearby_venues =nearby_venues.loc[:, filtered_columns]

# filter the category for each row
nearby_venues['venue.categories'] = nearby_venues.apply(get_category_type, axis=1)

# clean columns
nearby_venues.columns = [col.split(".")[-1] for col in nearby_venues.columns]

nearby_venues.head()

Unnamed: 0,name,categories,lat,lng
0,Toronto Downsview Airport (YZD),Airport,43.738883,-79.470111
1,First Class Realty Ltd,Electronics Store,43.737133,-79.463298
2,Ancaster Park,Park,43.734706,-79.464777


In [84]:
print('{} venues were returned by Foursquare.'.format(nearby_venues.shape[0]))

3 venues were returned by Foursquare.


In [94]:
#What if we look at the second borough now

df4.loc[1, 'Borough']

'West Toronto'

In [95]:
#Let us capture the long / latitude of each
borough_lat = df4.loc[1, 'Latitude'] # borough latitude value
borough_long = df4.loc[1, 'Longitude'] # borough longitude value

borough_name = df4.loc[1, 'Borough'] # borough name

In [96]:
#Let us explore the top 50 venues in each borough within 500 meters

LIMIT = 50
radius = 500

url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
    CLIENT_ID, 
    CLIENT_SECRET, 
    VERSION, 
    borough_lat, 
    borough_long, 
    radius, 
    LIMIT)
url

'https://api.foursquare.com/v2/venues/explore?&client_id=XHFUDADMDCEHSIKIBLDQAHS31KWHMEOH3ZLLJGUSFFKJY4DU&client_secret=JBII31TTL300VB1E4XTDJ13DS5PYJGZHHDZQ00CUURLXPY2T&v=20180605&ll=43.6368472,-79.42819140000002&radius=500&limit=50'

In [97]:
#Resulting Output
results = requests.get(url).json()
results

{'meta': {'code': 200, 'requestId': '5dd9a1d702a172001bd0087c'},
 'response': {'suggestedFilters': {'header': 'Tap to show:',
   'filters': [{'name': 'Open now', 'key': 'openNow'}]},
  'headerLocation': 'Parkdale',
  'headerFullLocation': 'Parkdale, Toronto',
  'headerLocationGranularity': 'neighborhood',
  'totalResults': 22,
  'suggestedBounds': {'ne': {'lat': 43.6413472045, 'lng': -79.42198519939873},
   'sw': {'lat': 43.632347195499996, 'lng': -79.4343976006013}},
  'groups': [{'type': 'Recommended Places',
    'name': 'recommended',
    'items': [{'reasons': {'count': 0,
       'items': [{'summary': 'This spot is popular',
         'type': 'general',
         'reasonName': 'globalInteractionReason'}]},
      'venue': {'id': '544e9ee6498e2c55cc71bd07',
       'name': 'Starbucks',
       'location': {'address': '1200 King St W',
        'crossStreet': 'Dufferin Street',
        'lat': 43.63909,
        'lng': -79.427622,
        'labeledLatLngs': [{'label': 'display',
          'lat

In [98]:
venues = results['response']['groups'][0]['items']
    
nearby_venues = json_normalize(venues) # flatten JSON

# filter columns
filtered_columns = ['venue.name', 'venue.categories', 'venue.location.lat', 'venue.location.lng']
nearby_venues =nearby_venues.loc[:, filtered_columns]

# filter the category for each row
nearby_venues['venue.categories'] = nearby_venues.apply(get_category_type, axis=1)

# clean columns
nearby_venues.columns = [col.split(".")[-1] for col in nearby_venues.columns]

nearby_venues.head()

Unnamed: 0,name,categories,lat,lng
0,Starbucks,Coffee Shop,43.63909,-79.427622
1,Reebok Crossfit Liberty Village,Gym,43.637036,-79.424802
2,Pharmacy,Bar,43.63809,-79.43181
3,Louie Craft Coffee,Coffee Shop,43.639284,-79.42562
4,The Abbott,Café,43.637996,-79.430717


In [99]:
print('{} venues were returned by Foursquare.'.format(nearby_venues.shape[0]))

22 venues were returned by Foursquare.


#### It was interesting to see how North York had less venues versus West Toronto

In [103]:
#Wondering to capture any trends, define respective Foursquare URL
url = 'https://api.foursquare.com/v2/venues/trending?client_id={}&client_secret={}&ll={},{}&v={}'.format(CLIENT_ID, CLIENT_SECRET, latitude, longitude, VERSION)

# send GET request and get trending venues
results = requests.get(url).json()
results

{'meta': {'code': 200, 'requestId': '5dd9a3260be7b4001bc69501'},
 'response': {'venues': []}}

In [104]:
if len(results['response']['venues']) == 0:
    trending_venues_df = 'No trending venues are available at the moment!'
    
else:
    trending_venues = results['response']['venues']
    trending_venues_df = json_normalize(trending_venues)

    # filter columns
    columns_filtered = ['name', 'categories'] + ['location.distance', 'location.city', 'location.postalCode', 'location.state', 'location.country', 'location.lat', 'location.lng']
    trending_venues_df = trending_venues_df.loc[:, columns_filtered]

    # filter the category for each row
    trending_venues_df['categories'] = trending_venues_df.apply(get_category_type, axis=1)

In [105]:
# display trending venues
trending_venues_df

'No trending venues are available at the moment!'

#### No trending venues! 

# This is the end of this assignment