## Week 3 Assignment Part 3

In [26]:
from bs4 import BeautifulSoup
import requests
import numpy as np
import pandas as pd
from geopy.geocoders import Nominatim
import folium
import json
from pandas import json_normalize # tranform JSON file into a pandas dataframe
from sklearn.cluster import KMeans
import matplotlib.cm as cm
import matplotlib.colors as colors

In [2]:
# Checking the given url
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
result = requests.get(url)
print(result.status_code)

200


Status code 200 means that the query was successful.

In [3]:
# Getting the table on the webpage
soup = BeautifulSoup(result.content, 'html.parser')
table = soup.find('table')
trs = table.find_all('tr')
rows = []
for tr in trs:
    i = tr.find_all('td')
    if i:
        rows.append(i)
        
post_list = []
for row in rows:
    postalcode = row[0].text.rstrip()
    borough = row[1].text.rstrip()
    neighborhood = row[2].text.rstrip()
    if borough != 'Not assigned':
        if neighborhood == 'Not assigned':
            neighborhood = borough
        post_list.append([postalcode, borough, neighborhood])
        

In [4]:
# Converting the list of lists to a dataframe
data = pd.DataFrame(post_list, columns=['PostalCode','Borough','Neighborhood'])
data.head(10)

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
5,M9A,Etobicoke,Islington Avenue
6,M1B,Scarborough,"Malvern, Rouge"
7,M3B,North York,Don Mills
8,M4B,East York,"Parkview Hill, Woodbine Gardens"
9,M5B,Downtown Toronto,"Garden District, Ryerson"


In [5]:
# Checking for a value
data.loc[data.PostalCode == 'M5G']

Unnamed: 0,PostalCode,Borough,Neighborhood
24,M5G,Downtown Toronto,Central Bay Street


In [6]:
data.shape

(103, 3)

In [7]:
# Reading geo data
geo_df = pd.read_csv('http://cocl.us/Geospatial_data')
geo_df.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [76]:
data_geo = data.copy()

In [77]:
# Create Latitude and Longitude columns in df_assigned
data_geo['Latitude'] = np.nan
data_geo['Longitude'] = np.nan

# For each postcode in df_assigned, find corresponding coordinates in df_cord and assign it to df_assigned
for idx in data_geo.index:
    cord_idx = geo_df['Postal Code'] == data_geo.loc[idx, 'PostalCode']
    data_geo.at[idx, 'Latitude'] = geo_df.loc[cord_idx, 'Latitude'].values
    data_geo.at[idx, 'Longitude'] = geo_df.loc[cord_idx, 'Longitude'].values

# Display the results
data_geo.head(20)

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494
5,M9A,Etobicoke,Islington Avenue,43.667856,-79.532242
6,M1B,Scarborough,"Malvern, Rouge",43.806686,-79.194353
7,M3B,North York,Don Mills,43.745906,-79.352188
8,M4B,East York,"Parkview Hill, Woodbine Gardens",43.706397,-79.309937
9,M5B,Downtown Toronto,"Garden District, Ryerson",43.657162,-79.378937


Getting a map of Toronto & Labelling all the different Postal Codes on it.

In [13]:
geolocator = Nominatim(user_agent="coursera")
address = 'Toronto'
try:
    location = geolocator.geocode(address)
    latitude = location.latitude
    longitude = location.longitude
    print('The geograpical coordinates of {} are {}, {}.'.format(address, latitude, longitude))
except AttributeError:
    print('Cannot find: {}, will drop index: {}'.format(address, index))

my_map = folium.Map(location=[latitude, longitude], zoom_start=11)

# add markers to map
for lat, lng, label in zip(data_geo['Latitude'], data_geo['Longitude'], data_geo['PostalCode']):
    label = folium.Popup(label)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill_color='#3186cc',
        fill_opacity=0.7).add_to(my_map)  
    
my_map

The geograpical coordinates of Toronto are 43.6534817, -79.3839347.


### FourSquare Credentials & Functions

In [15]:
CLIENT_ID = '0VPXNS01CBOXV1UH2GE2EHLPDQE3HZUPO3L4O1KVE41N2A3A' # Foursquare ID
CLIENT_SECRET = 'XWM4RE0LH5AT1PZKON0QCF1VIDAO4HL2F4H1TBNAN2MF0L4R' # Foursquare Secret
VERSION = '20180605' # API version

In [16]:
# function that extracts the category of the venue
def get_category_type(row):
    try:
        categories_list = row['categories']
    except:
        categories_list = row['venue.categories']
        
    if len(categories_list) == 0:
        return None
    else:
        return categories_list[0]['name']

In [32]:
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

_

Lets get the venues available for Parkwoods in North York (M3A)

In [78]:
# Setting the Postal Code as Index for easier searching
data_geo.set_index('PostalCode', inplace = True) 

In [19]:
# Getting parameters for a venues call
neighborhood_latitude = data_geo.loc['M3A']['Latitude']
neighborhood_longitude = data_geo.loc['M3A']['Longitude']

In [20]:
# Creating URL
LIMIT = 100 # limit of number of venues returned by Foursquare API
radius = 500 # define radius
# create URL
url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
    CLIENT_ID, 
    CLIENT_SECRET, 
    VERSION, 
    neighborhood_latitude, 
    neighborhood_longitude, 
    radius, 
    LIMIT)
url # display URL

'https://api.foursquare.com/v2/venues/explore?&client_id=0VPXNS01CBOXV1UH2GE2EHLPDQE3HZUPO3L4O1KVE41N2A3A&client_secret=XWM4RE0LH5AT1PZKON0QCF1VIDAO4HL2F4H1TBNAN2MF0L4R&v=20180605&ll=43.7532586,-79.3296565&radius=500&limit=100'

In [21]:
results = requests.get(url).json()

In [22]:
venues = results['response']['groups'][0]['items']

In [30]:
nearby_venues = json_normalize(venues) # flatten JSON

In [31]:
# Cleaning it up a bit
# filter columns
filtered_columns = ['venue.name', 'venue.categories', 'venue.location.lat', 'venue.location.lng']
nearby_venues = nearby_venues.loc[:, filtered_columns]

# filter the category for each row
nearby_venues['venue.categories'] = nearby_venues.apply(get_category_type, axis=1)

# clean columns
nearby_venues.columns = [col.split(".")[-1] for col in nearby_venues.columns]


print('{} venues were returned by Foursquare.'.format(nearby_venues.shape[0]))
nearby_venues.head()

3 venues were returned by Foursquare.


Unnamed: 0,name,categories,lat,lng
0,Brookbanks Park,Park,43.751976,-79.33214
1,Variety Store,Food & Drink Shop,43.751974,-79.333114
2,TTC stop - 44 Valley Woods,Bus Stop,43.755402,-79.333741


Lets analyze the venues in different neighbourhoods. (We will limit the analysis to just the boroughs containing the word 'Toronto').

First, lets give each neighborhood its own row.

In [79]:
data_geo.head()

Unnamed: 0_level_0,Borough,Neighborhood,Latitude,Longitude
PostalCode,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
M3A,North York,Parkwoods,43.753259,-79.329656
M4A,North York,Victoria Village,43.725882,-79.315572
M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494


In [80]:
neighborhood = data_geo.pop('Neighborhood')
data_geo.insert(0, 'NeighborHood', neighborhood)
data_geo.head()

Unnamed: 0_level_0,NeighborHood,Borough,Latitude,Longitude
PostalCode,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
M3A,Parkwoods,North York,43.753259,-79.329656
M4A,Victoria Village,North York,43.725882,-79.315572
M5A,"Regent Park, Harbourfront",Downtown Toronto,43.65426,-79.360636
M6A,"Lawrence Manor, Lawrence Heights",North York,43.718518,-79.464763
M7A,"Queen's Park, Ontario Provincial Government",Downtown Toronto,43.662301,-79.389494


In [81]:
data_geo['consolidated'] = data_geo[data_geo.columns[1:]].apply(
    lambda x: ','.join(x.dropna().astype(str)),
    axis=1
)
data_geo.head()

Unnamed: 0_level_0,NeighborHood,Borough,Latitude,Longitude,consolidated
PostalCode,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
M3A,Parkwoods,North York,43.753259,-79.329656,"North York,43.7532586,-79.3296565"
M4A,Victoria Village,North York,43.725882,-79.315572,"North York,43.725882299999995,-79.31557159999998"
M5A,"Regent Park, Harbourfront",Downtown Toronto,43.65426,-79.360636,"Downtown Toronto,43.6542599,-79.3606359"
M6A,"Lawrence Manor, Lawrence Heights",North York,43.718518,-79.464763,"North York,43.718517999999996,-79.46476329999999"
M7A,"Queen's Park, Ontario Provincial Government",Downtown Toronto,43.662301,-79.389494,"Downtown Toronto,43.6623015,-79.3894938"


In [84]:
new_df = pd.DataFrame(data_geo.NeighborHood.str.split(',').tolist(), index=data_geo.consolidated).stack()
new_df = new_df.reset_index([0, 'consolidated'])
new_df.columns = ['consolidated', 'Neighborhood']
new_df.head(20)

Unnamed: 0,consolidated,Neighborhood
0,"North York,43.7532586,-79.3296565",Parkwoods
1,"North York,43.725882299999995,-79.31557159999998",Victoria Village
2,"Downtown Toronto,43.6542599,-79.3606359",Regent Park
3,"Downtown Toronto,43.6542599,-79.3606359",Harbourfront
4,"North York,43.718517999999996,-79.46476329999999",Lawrence Manor
5,"North York,43.718517999999996,-79.46476329999999",Lawrence Heights
6,"Downtown Toronto,43.6623015,-79.3894938",Queen's Park
7,"Downtown Toronto,43.6623015,-79.3894938",Ontario Provincial Government
8,"Etobicoke,43.6678556,-79.53224240000002",Islington Avenue
9,"Scarborough,43.806686299999996,-79.19435340000001",Malvern


In [86]:
new_df[['Borough','Latitude','Longitude']] = new_df.consolidated.str.split(',', expand=True)
new_df.head()

Unnamed: 0,consolidated,Neighborhood,Borough,Latitude,Longitude
0,"North York,43.7532586,-79.3296565",Parkwoods,North York,43.7532586,-79.3296565
1,"North York,43.725882299999995,-79.31557159999998",Victoria Village,North York,43.7258823,-79.31557159999998
2,"Downtown Toronto,43.6542599,-79.3606359",Regent Park,Downtown Toronto,43.6542599,-79.3606359
3,"Downtown Toronto,43.6542599,-79.3606359",Harbourfront,Downtown Toronto,43.6542599,-79.3606359
4,"North York,43.718517999999996,-79.46476329999999",Lawrence Manor,North York,43.718518,-79.46476329999999


In [88]:
new_df[['Latitude', 'Longitude']] = new_df[['Latitude', 'Longitude']].apply(pd.to_numeric)
new_df.dtypes

consolidated     object
Neighborhood     object
Borough          object
Latitude        float64
Longitude       float64
dtype: object

In [96]:
new_df = new_df.drop(columns=['consolidated'])

Now lets filter only relevant boroughs.

In [98]:
borough_list = ['Downtown Toronto', 'East Toronto', 'West Toronto', 'Central Toronto']

In [101]:
data_geo_filtered = new_df.loc[new_df['Borough'].isin(borough_list)]
data_geo_filtered.head()

Unnamed: 0,Neighborhood,Borough,Latitude,Longitude
2,Regent Park,Downtown Toronto,43.65426,-79.360636
3,Harbourfront,Downtown Toronto,43.65426,-79.360636
6,Queen's Park,Downtown Toronto,43.662301,-79.389494
7,Ontario Provincial Government,Downtown Toronto,43.662301,-79.389494
14,Garden District,Downtown Toronto,43.657162,-79.378937


Now that we have our final dataset. We can start the analysis.

In [103]:
toronto_venues = getNearbyVenues(names=data_geo_filtered['Neighborhood'],
                                   latitudes=data_geo_filtered['Latitude'],
                                   longitudes=data_geo_filtered['Longitude']
                                  )

Regent Park
 Harbourfront
Queen's Park
 Ontario Provincial Government
Garden District
 Ryerson
St. James Town
The Beaches
Berczy Park
Central Bay Street
Christie
Richmond
 Adelaide
 King
Dufferin
 Dovercourt Village
Harbourfront East
 Union Station
 Toronto Islands
Little Portugal
 Trinity
The Danforth West
 Riverdale
Toronto Dominion Centre
 Design Exchange
Brockton
 Parkdale Village
 Exhibition Place
India Bazaar
 The Beaches West
Commerce Court
 Victoria Hotel
Studio District
Lawrence Park
Roselawn
Davisville North
Forest Hill North & West
High Park
 The Junction South
North Toronto West
The Annex
 North Midtown
 Yorkville
Parkdale
 Roncesvalles
Davisville
University of Toronto
 Harbord
Runnymede
 Swansea
Moore Park
 Summerhill East
Kensington Market
 Chinatown
 Grange Park
Summerhill West
 Rathnelly
 South Hill
 Forest Hill SE
 Deer Park
CN Tower
 King and Spadina
 Railway Lands
 Harbourfront West
 Bathurst Quay
 South Niagara
 Island airport
Rosedale
Stn A PO Boxes
St. James Town


In [104]:
print(toronto_venues.shape)
toronto_venues.head()

(3140, 7)


Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,Regent Park,43.65426,-79.360636,Roselle Desserts,43.653447,-79.362017,Bakery
1,Regent Park,43.65426,-79.360636,Tandem Coffee,43.653559,-79.361809,Coffee Shop
2,Regent Park,43.65426,-79.360636,Morning Glory Cafe,43.653947,-79.361149,Breakfast Spot
3,Regent Park,43.65426,-79.360636,Cooper Koo Family YMCA,43.653249,-79.358008,Distribution Center
4,Regent Park,43.65426,-79.360636,Body Blitz Spa East,43.654735,-79.359874,Spa


In [105]:
print('There are {} uniques categories.'.format(len(toronto_venues['Venue Category'].unique())))

There are 233 uniques categories.


In [106]:
# Adding OneHot Encoding
# one hot encoding
toronto_onehot = pd.get_dummies(toronto_venues[['Venue Category']], prefix="", prefix_sep="")

# add neighborhood column back to dataframe
toronto_onehot['Neighborhood'] = toronto_venues['Neighborhood'] 

# move neighborhood column to the first column
fixed_columns = [toronto_onehot.columns[-1]] + list(toronto_onehot.columns[:-1])
toronto_onehot = toronto_onehot[fixed_columns]

toronto_onehot.head()

Unnamed: 0,Yoga Studio,Afghan Restaurant,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,Antique Shop,...,Theater,Theme Restaurant,Toy / Game Store,Trail,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Vietnamese Restaurant,Wine Bar,Women's Store
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [107]:
toronto_onehot.shape

(3140, 233)

In [108]:
# Taking mean frequency & grouping by Neighborhoods
toronto_grouped = toronto_onehot.groupby('Neighborhood').mean().reset_index()
toronto_grouped.sample(5)

Unnamed: 0,Neighborhood,Yoga Studio,Afghan Restaurant,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,...,Theater,Theme Restaurant,Toy / Game Store,Trail,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Vietnamese Restaurant,Wine Bar,Women's Store
28,The Beaches West,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
31,Trinity,0.02381,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.02381,0.0,0.0,0.0,0.0,0.047619,0.0,0.047619,0.02381,0.0
25,South Niagara,0.0,0.0,0.0625,0.0625,0.0625,0.125,0.125,0.125,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
34,Victoria Hotel,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.04,...,0.0,0.0,0.0,0.0,0.0,0.02,0.0,0.0,0.01,0.0
5,Design Exchange,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.03,...,0.01,0.0,0.0,0.0,0.01,0.01,0.0,0.0,0.01,0.0


In [109]:
# Printing top 5 in each neighborhood
num_top_venues = 5

for hood in toronto_grouped['Neighborhood']:
    print("----"+hood+"----")
    temp = toronto_grouped[toronto_grouped['Neighborhood'] == hood].T.reset_index()
    temp.columns = ['venue','freq']
    temp = temp.iloc[1:]
    temp['freq'] = temp['freq'].astype(float)
    temp = temp.round({'freq': 2})
    print(temp.sort_values('freq', ascending=False).reset_index(drop=True).head(num_top_venues))
    print('\n')

---- Adelaide----
            venue  freq
0     Coffee Shop  0.10
1            Café  0.05
2      Restaurant  0.04
3           Hotel  0.03
4  Clothing Store  0.03


---- Bathurst Quay----
              venue  freq
0    Airport Lounge  0.12
1   Airport Service  0.12
2  Airport Terminal  0.12
3   Harbor / Marina  0.06
4  Sculpture Garden  0.06


---- Cabbagetown----
                venue  freq
0         Coffee Shop  0.08
1          Restaurant  0.06
2              Bakery  0.04
3  Chinese Restaurant  0.04
4                Park  0.04


---- Chinatown----
                   venue  freq
0                   Café  0.09
1            Coffee Shop  0.07
2  Vietnamese Restaurant  0.05
3                 Bakery  0.05
4     Mexican Restaurant  0.05


---- Deer Park----
              venue  freq
0       Coffee Shop  0.12
1               Pub  0.12
2        Sports Bar  0.06
3              Bank  0.06
4  Sushi Restaurant  0.06


---- Design Exchange----
                 venue  freq
0          Coffee Shop  0.

            venue  freq
0     Coffee Shop  0.10
1            Café  0.05
2      Restaurant  0.04
3           Hotel  0.03
4  Clothing Store  0.03


----Rosedale----
                 venue  freq
0                 Park  0.50
1           Playground  0.25
2                Trail  0.25
3  Moroccan Restaurant  0.00
4         Liquor Store  0.00


----Roselawn----
                 venue  freq
0         Home Service  0.33
1               Garden  0.33
2          Music Venue  0.33
3          Yoga Studio  0.00
4  Moroccan Restaurant  0.00


----Runnymede----
              venue  freq
0       Pizza Place  0.11
1       Coffee Shop  0.09
2              Café  0.09
3  Sushi Restaurant  0.06
4               Pub  0.06


----St. James Town----
                venue  freq
0         Coffee Shop  0.07
1                Café  0.06
2          Restaurant  0.04
3  Italian Restaurant  0.03
4           Gastropub  0.03


----Stn A PO Boxes----
                venue  freq
0         Coffee Shop  0.11
1                Caf

In [110]:
# Function for most common venues
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

In [111]:
num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighborhood']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighborhood'] = toronto_grouped['Neighborhood']

for ind in np.arange(toronto_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(toronto_grouped.iloc[ind, :], num_top_venues)

neighborhoods_venues_sorted.head()

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Adelaide,Coffee Shop,Café,Restaurant,Deli / Bodega,Thai Restaurant,Clothing Store,Hotel,Gym,Seafood Restaurant,Sushi Restaurant
1,Bathurst Quay,Airport Lounge,Airport Service,Airport Terminal,Coffee Shop,Harbor / Marina,Sculpture Garden,Boat or Ferry,Rental Car Location,Plane,Boutique
2,Cabbagetown,Coffee Shop,Restaurant,Bakery,Pizza Place,Chinese Restaurant,Italian Restaurant,Park,Pub,Pet Store,Café
3,Chinatown,Café,Coffee Shop,Mexican Restaurant,Vietnamese Restaurant,Bakery,Dessert Shop,Gaming Cafe,Bar,Vegetarian / Vegan Restaurant,Pharmacy
4,Deer Park,Coffee Shop,Pub,Supermarket,Fried Chicken Joint,Sports Bar,Bagel Shop,Sushi Restaurant,Bank,Pizza Place,American Restaurant


## Clustering Neighborhoods

In [112]:
# set number of clusters
kclusters = 5

toronto_grouped_clustering = toronto_grouped.drop('Neighborhood', 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(toronto_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10] 

array([1, 4, 1, 1, 0, 1, 1, 1, 0, 1])

In [113]:
# add clustering labels
neighborhoods_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

toronto_merged = data_geo_filterd

# merge toronto_grouped with toronto_data to add latitude/longitude for each neighborhood
toronto_merged = toronto_merged.join(neighborhoods_venues_sorted.set_index('Neighborhood'), on='Neighborhood')

toronto_merged.head()

Unnamed: 0,Neighborhood,Borough,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
2,Regent Park,Downtown Toronto,43.65426,-79.360636,1,Coffee Shop,Bakery,Park,Pub,Café,Theater,Breakfast Spot,Restaurant,Electronics Store,Event Space
3,Harbourfront,Downtown Toronto,43.65426,-79.360636,1,Coffee Shop,Bakery,Park,Pub,Café,Theater,Breakfast Spot,Restaurant,Electronics Store,Event Space
6,Queen's Park,Downtown Toronto,43.662301,-79.389494,1,Coffee Shop,Sushi Restaurant,Beer Bar,Restaurant,Bank,Bar,Café,Diner,Discount Store,Yoga Studio
7,Ontario Provincial Government,Downtown Toronto,43.662301,-79.389494,1,Coffee Shop,Sushi Restaurant,Beer Bar,Restaurant,Bank,Bar,Café,Diner,Discount Store,Yoga Studio
14,Garden District,Downtown Toronto,43.657162,-79.378937,1,Clothing Store,Coffee Shop,Café,Bubble Tea Shop,Cosmetics Shop,Restaurant,Japanese Restaurant,Italian Restaurant,Middle Eastern Restaurant,Tea Room


In [114]:
# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(toronto_merged['Latitude'], toronto_merged['Longitude'], toronto_merged['Neighborhood'], toronto_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters

## Examining Clusters

### Cluster 1

In [120]:
toronto_merged.loc[toronto_merged['Cluster Labels'] == 0, toronto_merged.columns[[1] + list(range(5, toronto_merged.shape[1]))]]

Unnamed: 0,Borough,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
36,East Toronto,Trail,Health Food Store,Pub,Women's Store,Cupcake Shop,Eastern European Restaurant,Donut Shop,Doner Restaurant,Dog Run,Distribution Center
156,Central Toronto,Coffee Shop,Pub,Supermarket,Fried Chicken Joint,Sports Bar,Bagel Shop,Sushi Restaurant,Bank,Pizza Place,American Restaurant
157,Central Toronto,Coffee Shop,Pub,Supermarket,Fried Chicken Joint,Sports Bar,Bagel Shop,Sushi Restaurant,Bank,Pizza Place,American Restaurant
158,Central Toronto,Coffee Shop,Pub,Supermarket,Fried Chicken Joint,Sports Bar,Bagel Shop,Sushi Restaurant,Bank,Pizza Place,American Restaurant
159,Central Toronto,Coffee Shop,Pub,Supermarket,Fried Chicken Joint,Sports Bar,Bagel Shop,Sushi Restaurant,Bank,Pizza Place,American Restaurant
160,Central Toronto,Coffee Shop,Pub,Supermarket,Fried Chicken Joint,Sports Bar,Bagel Shop,Sushi Restaurant,Bank,Pizza Place,American Restaurant


### Cluster 2

In [116]:
toronto_merged.loc[toronto_merged['Cluster Labels'] == 1, toronto_merged.columns[[1] + list(range(5, toronto_merged.shape[1]))]]

Unnamed: 0,Borough,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
2,Downtown Toronto,Coffee Shop,Bakery,Park,Pub,Café,Theater,Breakfast Spot,Restaurant,Electronics Store,Event Space
3,Downtown Toronto,Coffee Shop,Bakery,Park,Pub,Café,Theater,Breakfast Spot,Restaurant,Electronics Store,Event Space
6,Downtown Toronto,Coffee Shop,Sushi Restaurant,Beer Bar,Restaurant,Bank,Bar,Café,Diner,Discount Store,Yoga Studio
7,Downtown Toronto,Coffee Shop,Sushi Restaurant,Beer Bar,Restaurant,Bank,Bar,Café,Diner,Discount Store,Yoga Studio
14,Downtown Toronto,Clothing Store,Coffee Shop,Café,Bubble Tea Shop,Cosmetics Shop,Restaurant,Japanese Restaurant,Italian Restaurant,Middle Eastern Restaurant,Tea Room
15,Downtown Toronto,Clothing Store,Coffee Shop,Café,Bubble Tea Shop,Cosmetics Shop,Restaurant,Japanese Restaurant,Italian Restaurant,Middle Eastern Restaurant,Tea Room
27,Downtown Toronto,Coffee Shop,Café,Restaurant,American Restaurant,Italian Restaurant,Gastropub,Pharmacy,Bakery,Cocktail Bar,Park
37,Downtown Toronto,Coffee Shop,Cocktail Bar,Restaurant,Seafood Restaurant,Bakery,Beer Bar,Cheese Shop,Café,Gourmet Shop,Pub
41,Downtown Toronto,Coffee Shop,Italian Restaurant,Café,Sandwich Place,Burger Joint,Bar,Japanese Restaurant,Salad Place,Thai Restaurant,Bubble Tea Shop
42,Downtown Toronto,Grocery Store,Café,Park,Candy Store,Italian Restaurant,Diner,Restaurant,Baby Store,Athletics & Sports,Nightclub


### Cluster 3

In [117]:
toronto_merged.loc[toronto_merged['Cluster Labels'] == 2, toronto_merged.columns[[1] + list(range(5, toronto_merged.shape[1]))]]

Unnamed: 0,Borough,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
110,Central Toronto,Park,Bus Line,Swim School,Ethiopian Restaurant,Eastern European Restaurant,Donut Shop,Doner Restaurant,Dog Run,Distribution Center,Discount Store
120,Central Toronto,Park,Jewelry Store,Trail,Sushi Restaurant,Cupcake Shop,Eastern European Restaurant,Donut Shop,Doner Restaurant,Dog Run,Distribution Center
181,Downtown Toronto,Park,Trail,Playground,Creperie,Donut Shop,Doner Restaurant,Dog Run,Distribution Center,Discount Store,Diner


### Cluster 4

In [118]:
toronto_merged.loc[toronto_merged['Cluster Labels'] == 3, toronto_merged.columns[[1] + list(range(5, toronto_merged.shape[1]))]]

Unnamed: 0,Borough,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
147,Central Toronto,Playground,Women's Store,Cupcake Shop,Electronics Store,Eastern European Restaurant,Donut Shop,Doner Restaurant,Dog Run,Distribution Center,Discount Store
148,Central Toronto,Playground,Women's Store,Cupcake Shop,Electronics Store,Eastern European Restaurant,Donut Shop,Doner Restaurant,Dog Run,Distribution Center,Discount Store


### Cluster 5

In [119]:
toronto_merged.loc[toronto_merged['Cluster Labels'] == 4, toronto_merged.columns[[1] + list(range(5, toronto_merged.shape[1]))]]

Unnamed: 0,Borough,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
161,Downtown Toronto,Airport Lounge,Airport Service,Airport Terminal,Coffee Shop,Harbor / Marina,Sculpture Garden,Boat or Ferry,Rental Car Location,Plane,Boutique
162,Downtown Toronto,Airport Lounge,Airport Service,Airport Terminal,Coffee Shop,Harbor / Marina,Sculpture Garden,Boat or Ferry,Rental Car Location,Plane,Boutique
163,Downtown Toronto,Airport Lounge,Airport Service,Airport Terminal,Coffee Shop,Harbor / Marina,Sculpture Garden,Boat or Ferry,Rental Car Location,Plane,Boutique
164,Downtown Toronto,Airport Lounge,Airport Service,Airport Terminal,Coffee Shop,Harbor / Marina,Sculpture Garden,Boat or Ferry,Rental Car Location,Plane,Boutique
165,Downtown Toronto,Airport Lounge,Airport Service,Airport Terminal,Coffee Shop,Harbor / Marina,Sculpture Garden,Boat or Ferry,Rental Car Location,Plane,Boutique
166,Downtown Toronto,Airport Lounge,Airport Service,Airport Terminal,Coffee Shop,Harbor / Marina,Sculpture Garden,Boat or Ferry,Rental Car Location,Plane,Boutique
167,Downtown Toronto,Airport Lounge,Airport Service,Airport Terminal,Coffee Shop,Harbor / Marina,Sculpture Garden,Boat or Ferry,Rental Car Location,Plane,Boutique
