In [2]:
!pip install beautifulsoup4
from bs4 import BeautifulSoup # library for scraping from a website

import numpy as np # library to handle data in a vectorized manner

import pandas as pd # library for data analsysis
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

import json # library to handle JSON files

!pip install geopy
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

import requests # library to handle requests
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans

!pip install folium
import folium # map rendering library

!pip install geocoder
import geocoder # to get longitude and latitude



In [3]:
# Get the webpage
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
page = requests.get(url).text

# Extract only the table
soup = BeautifulSoup(page, 'html.parser')
table = soup.find('table', class_='sortable')

# Get the values from the Wikipedia table and store into a dataframe
row = [] # initialize row list

for tr in table.find_all('tr'):                                           # for every row in the original table
    if tr.find_all('th') == []:                                           # unless it's a header
        row.append([td.get_text(strip=True) for td in tr.find_all('td')]) # every item w/i 'td' tag appended to row list

# Assign columns names and turn row into a dataframe of neighborhoods
column_names = ['Postcode', 'Borough', 'Neighborhood']

neighborhoods_raw = pd.DataFrame(row, columns=column_names) # create a raw table of neighborhoods
neighborhoods_raw.head(10)

Unnamed: 0,Postcode,Borough,Neighborhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M6A,North York,Lawrence Heights
6,M6A,North York,Lawrence Manor
7,M7A,Downtown Toronto,Queen's Park
8,M8A,Not assigned,Not assigned
9,M9A,Queen's Park,Not assigned


In [4]:
# Rows containing 'Not assigned' boroughs are dropped.
drop_index = neighborhoods_raw[neighborhoods_raw['Borough'] == 'Not assigned'].index # get indexes of rows containing 'Not assigned' borough
neighborhoods = neighborhoods_raw.drop(drop_index, axis=0)                           # rows dropped to create neighborhoods table
neighborhoods.reset_index(drop=True, inplace=True)                                   # resets the index after dropping the rows

In [5]:

# Assign borough names to neighborhoods when neighborhood is 'Not assigned'
nh_na = neighborhoods[neighborhoods['Neighborhood'] == 'Not assigned'].index # index of those rows
neighborhoods.iloc[nh_na, 2] = neighborhoods['Borough'][nh_na]

In [6]:

# Neighborhoods with a same postcode are merged into a single cell, separated by commas
neighborhoods = neighborhoods.groupby(['Postcode', 'Borough'], as_index=False).agg(lambda x: ', '.join(x))
neighborhoods

Unnamed: 0,Postcode,Borough,Neighborhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae
5,M1J,Scarborough,Scarborough Village
6,M1K,Scarborough,"East Birchmount Park, Ionview, Kennedy Park"
7,M1L,Scarborough,"Clairlea, Golden Mile, Oakridge"
8,M1M,Scarborough,"Cliffcrest, Cliffside, Scarborough Village West"
9,M1N,Scarborough,"Birch Cliff, Cliffside West"


In [7]:
neighborhoods.shape

(103, 3)

In [8]:
# Initialize variables
lat = []
lng = []
lat_lng_coords = None

# Get postcodes from neighborhoods table
postal_code = neighborhoods['Postcode']

# Store latitude and longitude values in lat and lng
for pc in postal_code:
    g = geocoder.arcgis('{}, Toronto, Ontario'.format(pc))
    lat_lng_coords = g.latlng
    lat.append(lat_lng_coords[0])
    lng.append(lat_lng_coords[1])

In [9]:
nh_complete = neighborhoods
nh_complete['Latitude'] = lat
nh_complete['Longitude'] = lng

In [10]:
nh_complete

Unnamed: 0,Postcode,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge, Malvern",43.811525,-79.195517
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union",43.785665,-79.158725
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.765815,-79.175193
3,M1G,Scarborough,Woburn,43.768369,-79.21759
4,M1H,Scarborough,Cedarbrae,43.769688,-79.23944
5,M1J,Scarborough,Scarborough Village,43.743125,-79.23175
6,M1K,Scarborough,"East Birchmount Park, Ionview, Kennedy Park",43.726276,-79.263625
7,M1L,Scarborough,"Clairlea, Golden Mile, Oakridge",43.713054,-79.285055
8,M1M,Scarborough,"Cliffcrest, Cliffside, Scarborough Village West",43.724235,-79.227925
9,M1N,Scarborough,"Birch Cliff, Cliffside West",43.69677,-79.259967


In [12]:
# New dataframe with only the original city included
toronto = nh_complete[nh_complete['Borough'].str.find('Toronto') != -1].reset_index(drop=True)
toronto.shape

(39, 5)

In [13]:
# Get the latitude and longitude of Toronto
g = geocoder.arcgis('Toronto, Ontario')
lat_tor = g.latlng[0]
lng_tor = g.latlng[1]

# Create a map of Toronto
map_toronto = folium.Map(location=[lat_tor, lng_tor], zoom_start=11)

# Add markers to map
for lat, lng, bor, postcode in zip(toronto['Latitude'], toronto['Longitude'], toronto['Borough'], toronto['Postcode']):
    label = '{}, {}'.format(postcode, bor)        # popup labels with postcode and borough
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker([lat, lng],
                        radius=5,
                        popup=label,
                        color='blue',
                        fill=True,
                        fill_color='#3186cc',
                        fill_opacity=0.7,
                        parse_html=False).add_to(map_toronto)  
    
map_toronto

In [14]:
# The code was removed by Watson Studio for sharing.


In [40]:
def getNearbyVenues(names, latitudes, longitudes, radius=500):    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):        
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id=HEDPDQO2VWJF1GR21YKJRDLWCDDWSRM10JXXVRFTHA3L3SRM&client_secret=2WV4TH5E3IDMERCQIK0B0HXVZFTF5H4422I0W2HRUP13IG40&v=20180602&ll=43.648690,-79.489550&radius=500&limit=100'
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(name,
                             lat,
                             lng,
                             v['venue']['name'],
                             v['venue']['location']['lat'],
                             v['venue']['location']['lng'],
                             v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Postcode', 
                  'Latitude', 
                  'Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

In [41]:
# Limit the number of venues to 100
LIMIT = 100

# Get the venues for each postcode
toronto_venues = getNearbyVenues(names=toronto['Postcode'],
                                 latitudes=toronto['Latitude'],
                                 longitudes=toronto['Longitude'],
                                 radius=700) # limit radius to 700 meters

In [42]:
# Check the venues dataframe
print(toronto_venues.shape)
toronto_venues.head(10)

(819, 7)


Unnamed: 0,Postcode,Latitude,Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,M4E,43.676531,-79.295425,The Good Fork,43.649565,-79.484023,Food
1,M4E,43.676531,-79.295425,Old Mill Toronto,43.651011,-79.493222,American Restaurant
2,M4E,43.676531,-79.295425,Wibke's Espresso Bar,43.649132,-79.484802,Coffee Shop
3,M4E,43.676531,-79.295425,Bryden's Pub,43.649259,-79.484651,Pub
4,M4E,43.676531,-79.295425,Goodfellas Wood Oven Pizza,43.648224,-79.486356,Italian Restaurant
5,M4E,43.676531,-79.295425,Asa Sushi,43.649902,-79.484611,Sushi Restaurant
6,M4E,43.676531,-79.295425,Booster Juice,43.649802,-79.483683,Juice Bar
7,M4E,43.676531,-79.295425,Subway,43.649622,-79.484028,Sandwich Place
8,M4E,43.676531,-79.295425,Humber Cinema,43.649118,-79.484818,Indie Movie Theater
9,M4E,43.676531,-79.295425,Tim Hortons,43.648526,-79.485066,Coffee Shop


In [43]:
# Check the number of venues for each postcode
num_venues = pd.DataFrame()
num_venues['Postcode'] = toronto_venues.groupby('Postcode').count().reset_index()['Postcode']
num_venues['Venue Count'] = toronto_venues.groupby('Postcode').count().reset_index()['Venue']
num_venues


Unnamed: 0,Postcode,Venue Count
0,M4E,21
1,M4K,21
2,M4L,21
3,M4M,21
4,M4N,21
5,M4P,21
6,M4R,21
7,M4S,21
8,M4T,21
9,M4V,21


In [44]:
# One hot encoding
toronto_onehot = pd.get_dummies(toronto_venues[['Venue Category']], prefix="", prefix_sep="")

# Add Postcode column
toronto_onehot['Postcode'] = toronto_venues['Postcode'] 

# Move postcode column to the first column
fixed_columns = [toronto_onehot.columns[-1]] + list(toronto_onehot.columns[:-1])
toronto_onehot = toronto_onehot[fixed_columns]

toronto_grouped = toronto_onehot.groupby('Postcode').mean().reset_index()
toronto_grouped.head(10)

Unnamed: 0,Postcode,American Restaurant,Coffee Shop,Diner,Food,Gastropub,Gym,Indie Movie Theater,Italian Restaurant,Juice Bar,Metro Station,Pizza Place,Pub,Restaurant,Sandwich Place,Spa,Sushi Restaurant,Tennis Court
0,M4E,0.047619,0.142857,0.095238,0.047619,0.047619,0.047619,0.047619,0.047619,0.047619,0.047619,0.095238,0.047619,0.047619,0.047619,0.047619,0.047619,0.047619
1,M4K,0.047619,0.142857,0.095238,0.047619,0.047619,0.047619,0.047619,0.047619,0.047619,0.047619,0.095238,0.047619,0.047619,0.047619,0.047619,0.047619,0.047619
2,M4L,0.047619,0.142857,0.095238,0.047619,0.047619,0.047619,0.047619,0.047619,0.047619,0.047619,0.095238,0.047619,0.047619,0.047619,0.047619,0.047619,0.047619
3,M4M,0.047619,0.142857,0.095238,0.047619,0.047619,0.047619,0.047619,0.047619,0.047619,0.047619,0.095238,0.047619,0.047619,0.047619,0.047619,0.047619,0.047619
4,M4N,0.047619,0.142857,0.095238,0.047619,0.047619,0.047619,0.047619,0.047619,0.047619,0.047619,0.095238,0.047619,0.047619,0.047619,0.047619,0.047619,0.047619
5,M4P,0.047619,0.142857,0.095238,0.047619,0.047619,0.047619,0.047619,0.047619,0.047619,0.047619,0.095238,0.047619,0.047619,0.047619,0.047619,0.047619,0.047619
6,M4R,0.047619,0.142857,0.095238,0.047619,0.047619,0.047619,0.047619,0.047619,0.047619,0.047619,0.095238,0.047619,0.047619,0.047619,0.047619,0.047619,0.047619
7,M4S,0.047619,0.142857,0.095238,0.047619,0.047619,0.047619,0.047619,0.047619,0.047619,0.047619,0.095238,0.047619,0.047619,0.047619,0.047619,0.047619,0.047619
8,M4T,0.047619,0.142857,0.095238,0.047619,0.047619,0.047619,0.047619,0.047619,0.047619,0.047619,0.095238,0.047619,0.047619,0.047619,0.047619,0.047619,0.047619
9,M4V,0.047619,0.142857,0.095238,0.047619,0.047619,0.047619,0.047619,0.047619,0.047619,0.047619,0.095238,0.047619,0.047619,0.047619,0.047619,0.047619,0.047619


In [45]:
# Define a function that returns the sorted categories
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]                                       # row of the table of interest
    row_categories_sorted = row_categories.sort_values(ascending=False) # sort the mean frequency of occurrences
    
    return row_categories_sorted.index.values[0:num_top_venues]         # return the index (category) of the mena frequency values