### Import Library

In [1]:
from urllib.request import urlopen
from bs4 import BeautifulSoup
import pandas as pd

### Scrape the Wikipedia Page

In [2]:
# Define column names
col = ['PostalCode', 'Borough', 'Neighborhood']

# Create a dafaframe
df = pd.DataFrame(columns = col)

# Scrape the page
html = urlopen("https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M")
soup = BeautifulSoup(html, "html.parser")

# Get each row in table
row_list = soup.findAll("div", {'class':'mw-content-ltr'})[0].find("tbody").findAll("tr") 
print("Original table's length:",len(row_list)-1)

for i, row in enumerate(row_list):
    
    # skip header
    if i == 0:
        continue
    
    # Get each value of cloumns
    td_list = row.findAll('td')
    postalcode = td_list[0].text
    borough = td_list[1].text
    neigh = td_list[2].text.replace("\n", "")
    
    # Ignore cells with a borough that is Not assigned.
    if borough == 'Not assigned':
        continue
    
    df = df.append({'PostalCode':postalcode , 'Borough':borough, 'Neighborhood':neigh} , ignore_index=True)

print("Table's length after ignore cells with a borough that is not assigned:", len(df))
df.head()

Original table's length: 287
Table's length after ignore cells with a borough that is not assigned: 210


Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Harbourfront
3,M6A,North York,Lawrence Heights
4,M6A,North York,Lawrence Manor


### Processing

* Combined two rows with same postalcode into one row with the neighborhoods, and separated with a comma
* Changed a **Not assigned** neighborhood which has a borough with the borough

In [3]:
new_df = pd.DataFrame(columns = col)

group_df = df.groupby(['PostalCode','Borough'])
for group in group_df:
    #g = group[1].tolist()
    new_post = group[0][0]
    new_bor = group[0][1]
    new_neigh = group[1]['Neighborhood'].values
    
    if new_neigh[0] == 'Not assigned':
        new_neigh[0] = new_bor
    
    new_df = new_df.append({'PostalCode':new_post , 'Borough':new_bor, 'Neighborhood':new_neigh} , ignore_index=True)
    
new_df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1B,Scarborough,"[Rouge, Malvern]"
1,M1C,Scarborough,"[Highland Creek, Rouge Hill, Port Union]"
2,M1E,Scarborough,"[Guildwood, Morningside, West Hill]"
3,M1G,Scarborough,[Woburn]
4,M1H,Scarborough,[Cedarbrae]


In [4]:
new_df.shape

(103, 3)

### Add The Latitude And The Longitude

In [5]:
# Use csv file that has the geographica
geo_file='./dataset/Geospatial_Coordinates.csv'

geo_df = pd.read_csv(geo_file)
geo_df.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [6]:
# Rename column of Postal code, then we can merge two tables
geo_df = geo_df.rename(columns = {'Postal Code':'PostalCode'})
new_df_geo = pd.merge(new_df, geo_df, on='PostalCode')
new_df_geo.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,"[Rouge, Malvern]",43.806686,-79.194353
1,M1C,Scarborough,"[Highland Creek, Rouge Hill, Port Union]",43.784535,-79.160497
2,M1E,Scarborough,"[Guildwood, Morningside, West Hill]",43.763573,-79.188711
3,M1G,Scarborough,[Woburn],43.770992,-79.216917
4,M1H,Scarborough,[Cedarbrae],43.773136,-79.239476


### Explore And Cluster The Neighborhoods in Toronto

In [7]:
from geopy.geocoders import Nominatim
import matplotlib.cm as cm
import matplotlib.colors as colors

import folium # map rendering library

import requests # library to handle requests
from pandas.io.json import json_normalize # tranf
import json
import numpy as np

from sklearn.cluster import KMeans

#### Use geopy library to get the latitude and longitude values of Toronto

In [8]:
address = 'Toronto, Ontario'

geolocator = Nominatim(user_agent="ny_explorer")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Toronto are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Toronto are 43.653963, -79.387207.


#### Create a map of New York with neighborhoods superimposed on top.

In [13]:
# create map of Toronto using latitude and longitude values
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=10)

# add markers to map
for lat, lng, borough, neighborhood in zip(new_df_geo['Latitude'], new_df_geo['Longitude'], new_df_geo['Borough'], new_df_geo['Neighborhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_toronto)  
    
map_toronto

#### Define Foursquare Credentials and Version

In [10]:
CLIENT_ID = '5KIT1XEF0XJXAHMNA1ID2GDMJFWFS4D3L35F21OUNCH1R2YR' # your Foursquare ID
CLIENT_SECRET = 'PXSB5SMQDAELPPZXFDYIMTGLGNDOV0C3V2NVFQVHJ0M4DXWL' # your Foursquare Secret
VERSION = '20180605' # Foursquare API version

print('Your credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET:' + CLIENT_SECRET)

LIMIT = 100 # limit of number of venues returned by Foursquare API
radius = 500 # define radius


Your credentails:
CLIENT_ID: 5KIT1XEF0XJXAHMNA1ID2GDMJFWFS4D3L35F21OUNCH1R2YR
CLIENT_SECRET:PXSB5SMQDAELPPZXFDYIMTGLGNDOV0C3V2NVFQVHJ0M4DXWL


#### Explore Neighborhoods

In [11]:
def getNearbyVenues(posts, latitudes, longitudes, radius=500):
    
    venues_list=[]
    for post, lat, lng in zip(posts, latitudes, longitudes):
        #print(post)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            post, 
            #lat, 
            #lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['PostalCode', 
                  #'Neighborhood Latitude', 
                  #'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

In [14]:
toronto_venues = getNearbyVenues(posts=new_df_geo['PostalCode'],
                                   latitudes=new_df_geo['Latitude'],
                                   longitudes=new_df_geo['Longitude']
                                  )
# check the size of the resulting dataframe
print(toronto_venues.shape)
toronto_venues.head()

(2213, 5)


Unnamed: 0,PostalCode,Venue,Venue Latitude,Venue Longitude,Venue Category
0,M1B,Wendy's,43.807448,-79.199056,Fast Food Restaurant
1,M1B,Interprovincial Group,43.80563,-79.200378,Print Shop
2,M1C,Chris Effects Painting,43.784343,-79.163742,Construction & Landscaping
3,M1C,Royal Canadian Legion,43.782533,-79.163085,Bar
4,M1C,Affordable Toronto Movers,43.787919,-79.162977,Moving Target


In [15]:
# check how many unique categories can be curated from all the returned venues
print('There are {} uniques categories.'.format(len(toronto_venues['Venue Category'].unique())))

# check how many venues were returned for each neighborhood
venue_count_df = toronto_venues['Venue'].groupby(toronto_venues['PostalCode']).count().to_frame().reset_index()
print("There are total ", venue_count_df.Venue.sum(), "venues.")
print("The average number of venue is", venue_count_df.Venue.mean())
print("The std number of venue is", venue_count_df.Venue.std())
venue_count_df.head()

There are 266 uniques categories.
There are total  2213 venues.
The average number of venue is 21.91089108910891
The std number of venue is 28.946882046224246


Unnamed: 0,PostalCode,Venue
0,M1B,2
1,M1C,3
2,M1E,7
3,M1G,3
4,M1H,8


#### Analyze Each Neighborhood

In [16]:
# one hot encoding
toronto_onehot = pd.get_dummies(toronto_venues[['Venue Category']], prefix="", prefix_sep="")

# add PostalCode column back to dataframe
toronto_onehot = pd.concat([toronto_venues['PostalCode'], toronto_onehot], axis=1)

print("Dataframe shape =",toronto_onehot.shape)
toronto_onehot.head()

Dataframe shape = (2213, 267)


Unnamed: 0,PostalCode,Accessories Store,Afghan Restaurant,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,...,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Video Store,Vietnamese Restaurant,Warehouse Store,Wine Bar,Wings Joint,Women's Store,Yoga Studio
0,M1B,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,M1B,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,M1C,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,M1C,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,M1C,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [17]:
# group rows by neighborhood and by taking the mean of the frequency of occurrence of each category
toronto_grouped = toronto_onehot.groupby('PostalCode').mean().reset_index()
print("Dataframe's shape =", toronto_grouped.shape)
toronto_grouped.head()

Dataframe's shape = (101, 267)


Unnamed: 0,PostalCode,Accessories Store,Afghan Restaurant,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,...,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Video Store,Vietnamese Restaurant,Warehouse Store,Wine Bar,Wings Joint,Women's Store,Yoga Studio
0,M1B,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,M1C,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,M1E,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,M1G,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,M1H,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


First, let's write a function to sort the venues in descending order.

In [18]:
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

Now let's create the new dataframe and display the top 10 venues for each neighborhood.

In [19]:
num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['PostalCode']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['PostalCode'] = toronto_grouped['PostalCode']

for ind in np.arange(toronto_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(toronto_grouped.iloc[ind, :], num_top_venues)

neighborhoods_venues_sorted.head()

Unnamed: 0,PostalCode,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,M1B,Fast Food Restaurant,Print Shop,Yoga Studio,Donut Shop,Dim Sum Restaurant,Diner,Discount Store,Dog Run,Doner Restaurant,Drugstore
1,M1C,Bar,Moving Target,Construction & Landscaping,Yoga Studio,Donut Shop,Diner,Discount Store,Dog Run,Doner Restaurant,Drugstore
2,M1E,Medical Center,Intersection,Electronics Store,Breakfast Spot,Pizza Place,Mexican Restaurant,Rental Car Location,Dim Sum Restaurant,Diner,Discount Store
3,M1G,Coffee Shop,Korean Restaurant,Yoga Studio,Drugstore,Diner,Discount Store,Dog Run,Doner Restaurant,Donut Shop,Eastern European Restaurant
4,M1H,Bakery,Hakka Restaurant,Fried Chicken Joint,Caribbean Restaurant,Athletics & Sports,Thai Restaurant,Gas Station,Bank,Dog Run,Dim Sum Restaurant


#### Cluster Neighborhoods

Run *k*-means to cluster the neighborhood into 5 clusters.

In [20]:
toronto_grouped_clustering = toronto_grouped.drop('PostalCode', 1)

# set number of clusters
kclusters = 5

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(toronto_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10] 

array([4, 4, 4, 4, 4, 2, 4, 4, 4, 4])

Let's create a new dataframe that includes the cluster as well as the top 10 venues for each neighborhood.

In [21]:
# add clustering labels
neighborhoods_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

toronto_merged = new_df_geo

# merge toronto_grouped with toronto_data to add latitude/longitude for each neighborhood
toronto_merged = pd.merge(toronto_merged, neighborhoods_venues_sorted, on='PostalCode', how='left')

toronto_merged.head() # check the last columns!

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,M1B,Scarborough,"[Rouge, Malvern]",43.806686,-79.194353,4.0,Fast Food Restaurant,Print Shop,Yoga Studio,Donut Shop,Dim Sum Restaurant,Diner,Discount Store,Dog Run,Doner Restaurant,Drugstore
1,M1C,Scarborough,"[Highland Creek, Rouge Hill, Port Union]",43.784535,-79.160497,4.0,Bar,Moving Target,Construction & Landscaping,Yoga Studio,Donut Shop,Diner,Discount Store,Dog Run,Doner Restaurant,Drugstore
2,M1E,Scarborough,"[Guildwood, Morningside, West Hill]",43.763573,-79.188711,4.0,Medical Center,Intersection,Electronics Store,Breakfast Spot,Pizza Place,Mexican Restaurant,Rental Car Location,Dim Sum Restaurant,Diner,Discount Store
3,M1G,Scarborough,[Woburn],43.770992,-79.216917,4.0,Coffee Shop,Korean Restaurant,Yoga Studio,Drugstore,Diner,Discount Store,Dog Run,Doner Restaurant,Donut Shop,Eastern European Restaurant
4,M1H,Scarborough,[Cedarbrae],43.773136,-79.239476,4.0,Bakery,Hakka Restaurant,Fried Chicken Joint,Caribbean Restaurant,Athletics & Sports,Thai Restaurant,Gas Station,Bank,Dog Run,Dim Sum Restaurant


Check if there is any borough without any venue when explore

In [22]:
without_venue_df = toronto_merged[pd.isnull(toronto_merged).any(axis=1)]
without_venue_df

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
16,M1X,Scarborough,[Upper Rouge],43.836125,-79.205636,,,,,,,,,,,
93,M9A,Queen's Park,[Queen's Park],43.667856,-79.532242,,,,,,,,,,,


In [23]:
# Drop out the borough without venue
toronto_merged = toronto_merged.drop(toronto_merged[pd.isnull(toronto_merged).any(axis=1)].index)
print(len(toronto_merged))

# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=10)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(toronto_merged['Latitude'], toronto_merged['Longitude'], toronto_merged['Neighborhood'], toronto_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[int(cluster)-1],
        fill=True,
        fill_color=rainbow[int(cluster)-1],
        fill_opacity=0.7).add_to(map_clusters)
       
# add borough without venue to map
for lat, lon, poi, cluster in zip(without_venue_df['Latitude'], without_venue_df['Longitude'], without_venue_df['Neighborhood'], without_venue_df['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color='black',
        fill=True,
        fill_color='black',
        fill_opacity=0.7).add_to(map_clusters)

map_clusters

101
