### Create a dataframe from Wikipedia page 

In [1]:
import pandas as pd # library for data analsysis
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe

import urllib.request # library to handle requests
import requests # library to handle requests

import numpy as np # library to handle data in a vectorized manner
import json # library to handle JSON files

# import k-means from clustering stage
from sklearn.cluster import KMeans

import seaborn as sns # plotting tools
import matplotlib.pyplot as plt # plotting tools
import folium # map rendering library

from bs4 import BeautifulSoup

### Use of Pandas read Wikipedia page table

In [2]:
# Use pandas.read_html to obtain table from Wikipedia
tables = pd.read_html('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M', header=0)

# Save the first table as dataframe, df
df = tables[0]

# Print the shape of original daraframe
print('Shape of df: ' + str(df.shape))
df.head()

Shape of df: (287, 3)


Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront


### Use of web scraper for Wikipedia page table

In [3]:
# Download 'List of postal codes of Canada: M' as 'toronto_zipcode.html'
# Extract heads and columns and writing into 'toronto_zipcode.txt'

url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
req = urllib.request.urlopen(url)
article = req.read().decode()

with open('toronto_zipcode.html', 'w') as fo:
    fo.write(article)

# Load article, turn into soup and get the <table>s.
article = open('toronto_zipcode.html').read()
soup = BeautifulSoup(article, 'html.parser')
tables = soup.find_all('table', class_='sortable')

# Search through the tables for the one with the headings
for table in tables:
    ths = table.find_all('th')
    headings = [th.text.strip() for th in ths]
    if headings[:3] == ['Postcode', 'Borough', 'Neighbourhood']:
        break

# Extract the columns we want and write to a semicolon-delimited text file.
with open('toronto_zipcode.txt', 'w') as fo:
    for tr in table.find_all('tr'):
        tds = tr.find_all('td')
        if not tds:
            continue
        Postcode, Borough, Neighbourhood = [td.text.strip() for td in tds[:3]]
        print('; '.join([Postcode, Borough, Neighbourhood]), file=fo)

# Convert to DataFrame and name the columns
df = pd.read_csv('toronto_zipcode.txt', sep=";", header=None)
df.columns = ['Postcode', 'Borough', 'Neighbourhood']

# Print the shape of original daraframe
print('Shape of df: ' + str(df.shape))
df.head()

Shape of df: (287, 3)


Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront


In [175]:
# Select rows with only Borough assigned
not_assigned = df.iloc[0,1]
df1 = df[df['Borough'] != not_assigned].reset_index(drop=True)

# Set not assigned neighbourhood to match borough
df1['Neighbourhood'] = df1['Neighbourhood'].apply(lambda x: " Queen's Park" if x == not_assigned else x)

# Combine boroughs and neighbourhoods with the same postcode and separated with comma ','
df2 = df1.groupby(['Postcode','Borough'], sort=False).agg( ','.join).reset_index()

# Shape of modified data frame
print('Shape of df: ' + str(df2.shape))
df2.head()

Shape of df: (103, 3)


Unnamed: 0,Postcode,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Harbourfront
3,M6A,North York,"Lawrence Heights, Lawrence Manor"
4,M7A,Downtown Toronto,Queen's Park


In [176]:
geospatial_data = pd.read_csv('Geospatial_Coordinates.csv')
geospatial_data.rename(columns={'Postal Code':'Postcode'}, inplace=True)

toronto_geospatial = pd.merge(df1, geospatial_data, on='Postcode', how='left')

print('Shape of toronto_geospatial: ' + str(toronto_geospatial.shape))
toronto_geospatial.head()

Shape of toronto_geospatial: (210, 5)


Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,Harbourfront,43.65426,-79.360636
3,M6A,North York,Lawrence Heights,43.718518,-79.464763
4,M6A,North York,Lawrence Manor,43.718518,-79.464763


### Creat a map of Toronto with clusters of neighbourhoods

In [177]:
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

In [178]:
# Obtain the geographical coordinate of Downtown Toronto

address = 'Downtown Toronto, Ontario, Canada'

geolocator = Nominatim(user_agent="ny_explorer")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Toronro are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Toronro are 43.6563221, -79.3809161.


In [205]:
# Create map of Toronro using latitude and longitude values

# toronto_data = toronto_geospatial[ toronto_geospatial['Borough'].\
#                isin([' Downtown Toronto', ' East Toronto', ' West Toronto', ' Central Toronto'])].\
#              reset_index(drop=True)

toronto_data = toronto_geospatial.reset_index(drop=True)


map_toronto = folium.Map(location=[latitude, longitude], zoom_start=12)

# add markers to map
for lat, lng, borough, neighborhood in zip(toronto_data['Latitude'], toronto_data['Longitude'], toronto_data['Borough'], toronto_data['Neighbourhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_toronto)  
    
map_toronto

In [206]:
# Define Foursquare Credentials and Version

CLIENT_ID = 'ZNHCFQTVVNOVAXXA3ZDMCM4BONBX5RUAZCXCZCTRCGPET3S1' # your Foursquare ID
CLIENT_SECRET = 'PEH5LRKDELCYE4TSTP2D2M5PYJ0EWGN3YMS5KHFCAUFABQFR' # your Foursquare Secret
VERSION = '20200202' # Foursquare API version

In [207]:
# Obtain geographical coordinate of ScotiaBank Arena

scotiabank_arena = 'Scotiabank Arena'
geolocator = Nominatim(user_agent="ny_explorer")
arena_location = geolocator.geocode(scotiabank_arena)
arena_latitude = location.latitude
arena_longitude = location.longitude
print('The geograpical coordinate of Toronro are {}, {}.'.format(arena_latitude, arena_longitude))

The geograpical coordinate of Toronro are 43.6563221, -79.3809161.


In [208]:
# Request top 100 results near ScotiaBank Arena with a radius of 500m

search_query = ''
radius = 500
LIMIT = 200

url = 'https://api.foursquare.com/v2/venues/search?client_id={}&client_secret={}&ll={},{}&v={}&radius={}&limit={}'\
      .format(CLIENT_ID, CLIENT_SECRET, latitude, longitude, VERSION, radius, LIMIT)
results = requests.get(url).json()
results

{'meta': {'code': 200, 'requestId': '5e4ca009edbcad001bda18d0'},
 'response': {'venues': [{'id': '524a40b411d2aabd790873de',
    'name': 'Shark Club Sports Bar & Grill',
    'location': {'address': '10 Dundas St E',
     'crossStreet': 'at Yonge St',
     'lat': 43.6563806118487,
     'lng': -79.38092110037203,
     'labeledLatLngs': [{'label': 'display',
       'lat': 43.6563806118487,
       'lng': -79.38092110037203}],
     'distance': 6,
     'cc': 'CA',
     'city': 'Toronto',
     'state': 'ON',
     'country': 'Canada',
     'formattedAddress': ['10 Dundas St E (at Yonge St)',
      'Toronto ON',
      'Canada']},
    'categories': [{'id': '4bf58dd8d48988d11d941735',
      'name': 'Sports Bar',
      'pluralName': 'Sports Bars',
      'shortName': 'Sports Bar',
      'icon': {'prefix': 'https://ss3.4sqi.net/img/categories_v2/nightlife/sportsbar_',
       'suffix': '.png'},
      'primary': True}],
    'referralId': 'v-1582080037',
    'hasPerk': False},
   {'id': '4fe4a773e4b0ef

In [209]:
# function that extracts the category of the venue
def get_category_type(row):
    try:
        categories_list = row['categories']
    except:
        categories_list = row['venue.categories']
        
    if len(categories_list) == 0:
        return None
    else:
        return categories_list[0]['name']

In [210]:
venues = results['response']['venues']
    
nearby_venues = json_normalize(venues) # flatten JSON

# filter columns
filtered_columns = ['name', 'categories', 'location.lat', 'location.lng']
nearby_venues =nearby_venues.loc[:, filtered_columns]

# filter the category for each row
nearby_venues['venue.categories'] = nearby_venues.apply(get_category_type, axis=1)

# clean columns
nearby_venues.columns = [col.split(".")[-1] for col in nearby_venues.columns]

nearby_venues.head()

Unnamed: 0,name,categories,lat,lng,categories.1
0,Shark Club Sports Bar & Grill,"[{'id': '4bf58dd8d48988d11d941735', 'name': 'S...",43.656381,-79.380921,Sports Bar
1,Spring Sushi,"[{'id': '4bf58dd8d48988d1d2941735', 'name': 'S...",43.656253,-79.38066,Sushi Restaurant
2,Harvey's,"[{'id': '4bf58dd8d48988d16e941735', 'name': 'F...",43.656341,-79.380947,Fast Food Restaurant
3,Rolltation Sushi Burrito,"[{'id': '4bf58dd8d48988d142941735', 'name': 'A...",43.656709,-79.38092,Asian Restaurant
4,Dundas Subway Station,"[{'id': '4bf58dd8d48988d1fd931735', 'name': 'M...",43.656096,-79.380785,Metro Station


In [211]:
print('{} venues were returned by Foursquare.'.format(nearby_venues.shape[0]))

132 venues were returned by Foursquare.


In [212]:
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

In [215]:
# Venues near downtown Toronto
toronto_venues = getNearbyVenues(names=toronto_data['Neighbourhood'],
                                   latitudes=toronto_data['Latitude'],
                                   longitudes=toronto_data['Longitude']
                                  )

 Parkwoods
 Victoria Village
 Harbourfront
 Lawrence Heights
 Lawrence Manor
 Queen's Park
 Queen's Park
 Rouge
 Malvern
 Don Mills North
 Woodbine Gardens
 Parkview Hill
 Ryerson
 Garden District
 Glencairn
 Cloverdale
 Islington
 Martin Grove
 Princess Gardens
 West Deane Park
 Highland Creek
 Rouge Hill
 Port Union
 Flemingdon Park
 Don Mills South
 Woodbine Heights
 St. James Town
 Humewood-Cedarvale
 Bloordale Gardens
 Eringate
 Markland Wood
 Old Burnhamthorpe
 Guildwood
 Morningside
 West Hill
 The Beaches
 Berczy Park
 Caledonia-Fairbanks
 Woburn
 Leaside
 Central Bay Street
 Christie
 Cedarbrae
 Hillcrest Village
 Bathurst Manor
 Downsview North
 Wilson Heights
 Thorncliffe Park
 Adelaide
 King
 Richmond
 Dovercourt Village
 Dufferin
 Scarborough Village
 Fairview
 Henry Farm
 Oriole
 Northwood Park
 York University
 East Toronto
 Harbourfront East
 Toronto Islands
 Union Station
 Little Portugal
 Trinity
 East Birchmount Park
 Ionview
 Kennedy Park
 Bayview Village
 CFB Toron

In [216]:
toronto_venues['Neighborhood']

0               Parkwoods
1               Parkwoods
2        Victoria Village
3        Victoria Village
4        Victoria Village
              ...        
4305       South of Bloor
4306       South of Bloor
4307       South of Bloor
4308       South of Bloor
4309       South of Bloor
Name: Neighborhood, Length: 4310, dtype: object

In [217]:
# one hot encoding
toronto_onehot = pd.get_dummies(toronto_venues[['Venue Category']], prefix="", prefix_sep="")

# add neighborhood column back to dataframe
toronto_onehot['Neighbourhood'] = toronto_venues['Neighborhood'] 

# move neighborhood column to the first column
fixed_columns = [toronto_onehot.columns[-1]] + list(toronto_onehot.columns[:-1])
toronto_onehot = toronto_onehot[fixed_columns]

# group rows by neighborhood and by taking the mean of the frequency of occurrence of each category
toronto_grouped = toronto_onehot.groupby('Neighbourhood').mean().reset_index()

# top 5 most common venues in each neighbourhood
num_top_venues = 5

for hood in toronto_grouped['Neighbourhood']:
    print("----"+hood+"----")
    temp = toronto_grouped[toronto_grouped['Neighbourhood'] == hood].T.reset_index()
    temp.columns = ['venue','freq']
    temp = temp.iloc[1:]
    temp['freq'] = temp['freq'].astype(float)
    temp = temp.round({'freq': 2})
    print(temp.sort_values('freq', ascending=False).reset_index(drop=True).head(num_top_venues))
    print('\n')

---- Adelaide----
             venue  freq
0      Coffee Shop  0.06
1       Steakhouse  0.04
2             Café  0.04
3              Bar  0.04
4  Thai Restaurant  0.04


---- Agincourt----
                       venue  freq
0             Breakfast Spot   0.2
1                     Lounge   0.2
2             Clothing Store   0.2
3  Latin American Restaurant   0.2
4               Skating Rink   0.2


---- Agincourt North----
                venue  freq
0          Playground   0.5
1                Park   0.5
2  Miscellaneous Shop   0.0
3       Movie Theater   0.0
4               Motel   0.0


---- Albion Gardens----
                  venue  freq
0         Grocery Store  0.22
1  Fast Food Restaurant  0.11
2   Fried Chicken Joint  0.11
3           Coffee Shop  0.11
4            Beer Store  0.11


---- Alderwood----
          venue  freq
0   Pizza Place  0.25
1  Skating Rink  0.12
2   Coffee Shop  0.12
3           Pub  0.12
4           Gym  0.12


---- Bathurst Manor----
                venue

In [218]:
# sort the venues in descending order

def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

In [281]:
# display the top 10 venues for each neighborhood

num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighborhood']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighborhood'] = toronto_grouped['Neighbourhood']

for ind in np.arange(toronto_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(toronto_grouped.iloc[ind, :], num_top_venues)

neighborhoods_venues_sorted.head()

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Adelaide,Coffee Shop,Steakhouse,Bar,Café,Thai Restaurant,Burger Joint,Bakery,Sushi Restaurant,Cosmetics Shop,Restaurant
1,Agincourt,Lounge,Latin American Restaurant,Skating Rink,Clothing Store,Breakfast Spot,Falafel Restaurant,Event Space,Farmers Market,Ethiopian Restaurant,Discount Store
2,Agincourt North,Playground,Park,Yoga Studio,Eastern European Restaurant,Discount Store,Dog Run,Doner Restaurant,Donut Shop,Drugstore,Dumpling Restaurant
3,Albion Gardens,Grocery Store,Pizza Place,Fried Chicken Joint,Coffee Shop,Beer Store,Pharmacy,Fast Food Restaurant,Sandwich Place,Ethiopian Restaurant,Empanada Restaurant
4,Alderwood,Pizza Place,Pub,Pharmacy,Gym,Sandwich Place,Coffee Shop,Skating Rink,Yoga Studio,Diner,Discount Store


In [279]:
# set number of clusters
kclusters = 6

toronto_grouped_clustering = toronto_grouped.drop('Neighbourhood', 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=4).fit(toronto_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_

array([1, 1, 5, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 5, 1, 1, 5, 1, 1, 1,
       1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 5, 1,
       1, 1, 1, 1, 5, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 3, 1, 1, 1, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 3, 1,
       3, 1, 5, 1, 1, 1, 1, 0, 1, 1, 1, 4, 1, 1, 1, 5, 3, 1, 1, 5, 1, 1,
       1, 1, 1, 2, 1, 0, 1, 1, 1, 0, 1, 5, 3, 1, 1, 1, 1, 5, 1, 1, 1, 1,
       1, 1, 1, 1, 5, 1, 4, 1, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 5, 1, 1, 1, 1, 1, 3, 1, 1, 1, 1, 1, 1, 1, 1, 5, 3, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 5, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1,
       5, 1, 1], dtype=int32)

In [278]:
# drop clustering labels if exists
neighborhoods_venues_sorted.drop('Cluster Labels', 1, inplace=True)

# add clustering labels
neighborhoods_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)


toronto_merged = toronto_data

toronto_merged.rename(columns={'Neighbourhood':'Neighborhood'}, inplace=True)

# merge toronto_grouped with toronto_data to add latitude/longitude for each neighborhood
toronto_merged = pd.merge(toronto_merged, neighborhoods_venues_sorted.set_index('Neighborhood'), on='Neighborhood')

toronto_merged

Unnamed: 0,Postcode,Borough,Neighborhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,M3A,North York,Parkwoods,43.753259,-79.329656,5,Park,Food & Drink Shop,Yoga Studio,Eastern European Restaurant,Dog Run,Doner Restaurant,Donut Shop,Drugstore,Dumpling Restaurant,Empanada Restaurant
1,M4A,North York,Victoria Village,43.725882,-79.315572,1,Portuguese Restaurant,Pizza Place,Hockey Arena,Coffee Shop,Intersection,Dumpling Restaurant,Discount Store,Dog Run,Doner Restaurant,Donut Shop
2,M5A,Downtown Toronto,Harbourfront,43.654260,-79.360636,1,Coffee Shop,Park,Bakery,Café,Pub,Breakfast Spot,Restaurant,Mexican Restaurant,Theater,Performing Arts Venue
3,M6A,North York,Lawrence Heights,43.718518,-79.464763,1,Clothing Store,Furniture / Home Store,Accessories Store,Boutique,Event Space,Miscellaneous Shop,Coffee Shop,Women's Store,Vietnamese Restaurant,Airport Terminal
4,M6A,North York,Lawrence Manor,43.718518,-79.464763,1,Clothing Store,Furniture / Home Store,Accessories Store,Boutique,Event Space,Miscellaneous Shop,Coffee Shop,Women's Store,Vietnamese Restaurant,Airport Terminal
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
199,M8Z,Etobicoke,Kingsway Park South West,43.628841,-79.520999,1,Hardware Store,Discount Store,Social Club,Burger Joint,Burrito Place,Sandwich Place,Supplement Shop,Bakery,Fast Food Restaurant,Grocery Store
200,M8Z,Etobicoke,Mimico NW,43.628841,-79.520999,1,Hardware Store,Discount Store,Social Club,Burger Joint,Burrito Place,Sandwich Place,Supplement Shop,Bakery,Fast Food Restaurant,Grocery Store
201,M8Z,Etobicoke,The Queensway West,43.628841,-79.520999,1,Hardware Store,Discount Store,Social Club,Burger Joint,Burrito Place,Sandwich Place,Supplement Shop,Bakery,Fast Food Restaurant,Grocery Store
202,M8Z,Etobicoke,Royal York South West,43.628841,-79.520999,1,Hardware Store,Discount Store,Social Club,Burger Joint,Burrito Place,Sandwich Place,Supplement Shop,Bakery,Fast Food Restaurant,Grocery Store


In [263]:
# create map with clustering results
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(toronto_merged['Latitude'], toronto_merged['Longitude'], toronto_merged['Neighborhood'], toronto_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters