# Segmenting and Clustering Neighborhoods in Toronto #

In [65]:
#Importing libraries
from bs4 import BeautifulSoup
import requests
import numpy as np
import pandas as pd
from geopy.geocoders import Nominatim
import folium
import json
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe
from sklearn.cluster import KMeans
import matplotlib.cm as cm
import matplotlib.colors as colors

In [66]:
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
result = requests.get(url)
print(url)
print(result.status_code)
print(result.headers)

https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M
200
{'Date': 'Wed, 22 Apr 2020 17:20:39 GMT', 'Vary': 'Accept-Encoding,Cookie,Authorization', 'Server': 'ATS/8.0.7', 'Content-Type': 'text/html; charset=UTF-8', 'X-Content-Type-Options': 'nosniff', 'P3P': 'CP="See https://en.wikipedia.org/wiki/Special:CentralAutoLogin/P3P for more info."', 'Content-language': 'en', 'Last-Modified': 'Sat, 18 Apr 2020 18:27:33 GMT', 'Content-Encoding': 'gzip', 'Age': '16759', 'X-Cache': 'cp1081 miss, cp1081 hit/30', 'X-Cache-Status': 'hit-front', 'Server-Timing': 'cache;desc="hit-front"', 'Strict-Transport-Security': 'max-age=106384710; includeSubDomains; preload', 'Set-Cookie': 'WMF-Last-Access=22-Apr-2020;Path=/;HttpOnly;secure;Expires=Sun, 24 May 2020 12:00:00 GMT, WMF-Last-Access-Global=22-Apr-2020;Path=/;Domain=.wikipedia.org;HttpOnly;secure;Expires=Sun, 24 May 2020 12:00:00 GMT, GeoIP=US:::37.75:-97.82:v4; Path=/; secure; Domain=.wikipedia.org', 'X-Client-IP': '169.60.39.180', 'Cache-C

In [67]:
#retrieving table
soup = BeautifulSoup(result.content, 'html.parser')
table = soup.find('table')
trs = table.find_all('tr')
rows = []
for tr in trs:
    i = tr.find_all('td')
    if i:
        rows.append(i)
        
lst = []
for row in rows:
    postalcode = row[0].text.rstrip()
    borough = row[1].text.rstrip()
    neighborhood = row[2].text.rstrip()
    if borough != 'Not assigned':
        if neighborhood == 'Not assigned':
            neighborhood = borough
        lst.append([postalcode, borough, neighborhood])

In [70]:
#convert table to data frame
cols = ['PostalCode', 'Borough', 'Neighborhood']
df = pd.DataFrame(lst, columns=cols)
print(df.shape)
df.head(3)

(103, 3)


Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Regent Park / Harbourfront


## Parte 2 ##

In [71]:
#get latitude and longitude file
latlong= pd.read_csv('http://cocl.us/Geospatial_data')

In [73]:
latlong.head(3)

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711


In [74]:
#Rename so both tables have same column names
df.rename(columns = {'PostalCode':'Postal Code'}, inplace = True)
df.head(2)

Unnamed: 0,Postal Code,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village


In [75]:
#merge both tables
df2 = pd.merge(df, latlong, on="Postal Code", how='left')

In [76]:
df2.head(3)

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,Regent Park / Harbourfront,43.65426,-79.360636


## Part 3 ##

In [4]:
#Install more libraries
!conda install -c conda-forge geopy --yes
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

import requests # library to handle requests
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans

!conda install -c conda-forge folium=0.5.0 --yes # uncomment this line if you haven't completed the Foursquare API lab
import folium # map rendering library

print('Libraries imported.')

Solving environment: done

## Package Plan ##

  environment location: /opt/conda/envs/Python36

  added / updated specs: 
    - geopy


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    geographiclib-1.50         |             py_0          34 KB  conda-forge
    ca-certificates-2020.4.5.1 |       hecc5488_0         146 KB  conda-forge
    python_abi-3.6             |          1_cp36m           4 KB  conda-forge
    certifi-2020.4.5.1         |   py36h9f0ad1d_0         151 KB  conda-forge
    geopy-1.21.0               |             py_0          58 KB  conda-forge
    openssl-1.1.1g             |       h516909a_0         2.1 MB  conda-forge
    ------------------------------------------------------------
                                           Total:         2.5 MB

The following NEW packages will be INSTALLED:

    geographiclib:   1.50-py_0         conda-forge
    geopy:           1

In [14]:
import pandas as pd

In [5]:
address = 'Toronto'

geolocator = Nominatim(user_agent="Toronto_explorer")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Toronto are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Toronto are 43.6534817, -79.3839347.


In [80]:
# create map of Toronto with latitude and longitude values
map_Tor = folium.Map(location=[latitude, longitude], zoom_start=10)

# add markers to map
for lat, lng, label in zip(df2['Latitude'], df2['Longitude'], df2['Postal Code']):
    label = folium.Popup(label)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='steelblue',
        fill_color='#3186cc',
        fill_opacity=0.7).add_to(map_Tor)  
    
map_Tor

In [84]:
#Segmenting map
Etobicoke_data = df2[df2['Borough'] == 'Etobicoke'].reset_index(drop=True)
Etobicoke_data.head()

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude
0,M9A,Etobicoke,Islington Avenue,43.667856,-79.532242
1,M9B,Etobicoke,West Deane Park / Princess Gardens / Martin Gr...,43.650943,-79.554724
2,M9C,Etobicoke,Eringate / Bloordale Gardens / Old Burnhamthor...,43.643515,-79.577201
3,M9P,Etobicoke,Westmount,43.696319,-79.532242
4,M9R,Etobicoke,Kingsview Village / St. Phillips / Martin Grov...,43.688905,-79.554724


In [87]:
#Getting geographical coordinates
address = 'Etobicoke, Tor'

geolocator = Nominatim(user_agent="Toronto_explorer")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Etobicoke are {}, {}.'.format(latitude, longitude))

AttributeError: 'NoneType' object has no attribute 'latitude'

In [85]:
#Define Foursquare credentials
CLIENT_ID = 'EL0CDRI4EYC441RJAOWH2RTYRVIWGHZA2O3VGDNFAGQM5JRF' # your Foursquare ID
CLIENT_SECRET = 'NBE0GPVWLIENIYWBTPMDL2EPGQTWJQX5PWRUA0JKMC0R4QP1' # your Foursquare Secret
VERSION = '20180605' # Foursquare API version

print('Your credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET:' + CLIENT_SECRET)

Your credentails:
CLIENT_ID: EL0CDRI4EYC441RJAOWH2RTYRVIWGHZA2O3VGDNFAGQM5JRF
CLIENT_SECRET:NBE0GPVWLIENIYWBTPMDL2EPGQTWJQX5PWRUA0JKMC0R4QP1


In [89]:
#DEFINING coorinates of North York
df2.set_index('Postal Code', inplace = True) 
neighborhood_latitude = df2.loc['M4A']['Latitude']
neighborhood_longitude = df2.loc['M4A']['Longitude']

In [90]:
#Getting credentials
LIMIT = 100 # limit of number of venues returned by Foursquare API
radius = 500 # define radius
# create URL
url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
    CLIENT_ID, 
    CLIENT_SECRET, 
    VERSION, 
    neighborhood_latitude, 
    neighborhood_longitude, 
    radius, 
    LIMIT)
url # display URL

'https://api.foursquare.com/v2/venues/explore?&client_id=EL0CDRI4EYC441RJAOWH2RTYRVIWGHZA2O3VGDNFAGQM5JRF&client_secret=NBE0GPVWLIENIYWBTPMDL2EPGQTWJQX5PWRUA0JKMC0R4QP1&v=20180605&ll=43.725882299999995,-79.31557159999998&radius=500&limit=100'

In [91]:

results = requests.get(url).json()
# results

In [92]:
# function that extracts the category of the venue
def get_category_type(row):
    try:
        categories_list = row['categories']
    except:
        categories_list = row['venue.categories']
        
    if len(categories_list) == 0:
        return None
    else:
        return categories_list[0]['name']

In [93]:
venues = results['response']['groups'][0]['items']

In [94]:
nearby_venues = json_normalize(venues) # flatten JSON
nearby_venues

Unnamed: 0,reasons.count,reasons.items,referralId,venue.categories,venue.id,venue.location.address,venue.location.cc,venue.location.city,venue.location.country,venue.location.crossStreet,venue.location.distance,venue.location.formattedAddress,venue.location.labeledLatLngs,venue.location.lat,venue.location.lng,venue.location.postalCode,venue.location.state,venue.name,venue.photos.count,venue.photos.groups
0,0,"[{'summary': 'This spot is popular', 'type': '...",e-0-4c633acb86b6be9a61268e34-0,"[{'id': '4bf58dd8d48988d185941735', 'name': 'H...",4c633acb86b6be9a61268e34,,CA,,Canada,,267,[Canada],"[{'label': 'display', 'lat': 43.72348055545508...",43.723481,-79.315635,,,Victoria Village Arena,0,[]
1,0,"[{'summary': 'This spot is popular', 'type': '...",e-0-4bbe904a85fbb713420d7167-1,"[{'id': '4bf58dd8d48988d1e0931735', 'name': 'C...",4bbe904a85fbb713420d7167,1733 Eglinton Ave East,CA,Toronto,Canada,at Bermondsey Rd,202,"[1733 Eglinton Ave East (at Bermondsey Rd), To...","[{'label': 'display', 'lat': 43.72551663171475...",43.725517,-79.313103,M4A 1J8,ON,Tim Hortons,0,[]
2,0,"[{'summary': 'This spot is popular', 'type': '...",e-0-4f3ecce6e4b0587016b6f30d-2,"[{'id': '4def73e84765ae376e57713a', 'name': 'P...",4f3ecce6e4b0587016b6f30d,1733 Eglinton Avenue East,CA,Toronto,Canada,Bermondsey,224,"[1733 Eglinton Avenue East (Bermondsey), Toron...","[{'label': 'display', 'lat': 43.72581876267242...",43.725819,-79.312785,,ON,Portugril,0,[]
3,0,"[{'summary': 'This spot is popular', 'type': '...",e-0-4d689350b6f46dcb77ee15b2-3,"[{'id': '4bf58dd8d48988d10c941735', 'name': 'F...",4d689350b6f46dcb77ee15b2,,CA,,Canada,,197,[Canada],"[{'label': 'display', 'lat': 43.72705130603407...",43.727051,-79.317418,,,The Frig,0,[]


In [95]:
# filter columns
filtered_columns = ['venue.name', 'venue.categories', 'venue.location.lat', 'venue.location.lng']
nearby_venues = nearby_venues.loc[:, filtered_columns]

# filter the category for each row
nearby_venues['venue.categories'] = nearby_venues.apply(get_category_type, axis=1)

# clean columns
nearby_venues.columns = [col.split(".")[-1] for col in nearby_venues.columns]

nearby_venues.head()

Unnamed: 0,name,categories,lat,lng
0,Victoria Village Arena,Hockey Arena,43.723481,-79.315635
1,Tim Hortons,Coffee Shop,43.725517,-79.313103
2,Portugril,Portuguese Restaurant,43.725819,-79.312785
3,The Frig,French Restaurant,43.727051,-79.317418


In [96]:
print('{} venues were returned by Foursquare.'.format(nearby_venues.shape[0]))

4 venues were returned by Foursquare.


In [97]:
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

In [98]:
venues = getNearbyVenues(names=df2['Borough'],latitudes=df2['Latitude'],longitudes=df2['Longitude'])

North York
North York
Downtown Toronto
North York
Downtown Toronto
Etobicoke
Scarborough
North York
East York
Downtown Toronto
North York
Etobicoke
Scarborough
North York
East York
Downtown Toronto
York
Etobicoke
Scarborough
East Toronto
Downtown Toronto
York
Scarborough
East York
Downtown Toronto
Downtown Toronto
Scarborough
North York
North York
East York
Downtown Toronto
West Toronto
Scarborough
North York
North York
East York
Downtown Toronto
West Toronto
Scarborough
North York
North York
East Toronto
Downtown Toronto
West Toronto
Scarborough
North York
North York
East Toronto
Downtown Toronto
North York
North York
Scarborough
North York
North York
East Toronto
North York
York
North York
Scarborough
North York
North York
Central Toronto
Central Toronto
York
York
Scarborough
North York
Central Toronto
Central Toronto
West Toronto
Etobicoke
Scarborough
North York
Central Toronto
Central Toronto
West Toronto
Mississauga
Etobicoke
Scarborough
Central Toronto
Downtown Toronto
West Toron

In [99]:

print(venues.shape)
venues.head()

(2115, 7)


Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,North York,43.753259,-79.329656,Brookbanks Park,43.751976,-79.33214,Park
1,North York,43.753259,-79.329656,Variety Store,43.751974,-79.333114,Food & Drink Shop
2,North York,43.725882,-79.315572,Victoria Village Arena,43.723481,-79.315635,Hockey Arena
3,North York,43.725882,-79.315572,Tim Hortons,43.725517,-79.313103,Coffee Shop
4,North York,43.725882,-79.315572,Portugril,43.725819,-79.312785,Portuguese Restaurant


In [102]:
# one hot encoding
onehot = pd.get_dummies(venues[['Venue Category']], prefix="", prefix_sep="")

# add neighborhood column back to dataframe
onehot['Neighborhood'] = venues['Neighborhood'] 

onehot.head()

Unnamed: 0,Accessories Store,Airport,Airport Food Court,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,Antique Shop,Aquarium,Art Gallery,...,Vegetarian / Vegan Restaurant,Video Game Store,Video Store,Vietnamese Restaurant,Warehouse Store,Wine Bar,Wine Shop,Wings Joint,Women's Store,Yoga Studio
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [103]:
grouped = onehot.groupby('Neighborhood').mean().reset_index()
grouped

Unnamed: 0,Neighborhood,Accessories Store,Airport,Airport Food Court,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,Antique Shop,Aquarium,...,Vegetarian / Vegan Restaurant,Video Game Store,Video Store,Vietnamese Restaurant,Warehouse Store,Wine Bar,Wine Shop,Wings Joint,Women's Store,Yoga Studio
0,Central Toronto,0.0,0.0,0.0,0.0,0.0,0.0,0.018182,0.0,0.0,...,0.009091,0.0,0.0,0.009091,0.0,0.0,0.0,0.0,0.0,0.009091
1,Downtown Toronto,0.0,0.000816,0.000816,0.001631,0.002447,0.001631,0.015498,0.001631,0.004078,...,0.009788,0.001631,0.0,0.003263,0.0,0.004894,0.0,0.0,0.000816,0.00571
2,East Toronto,0.0,0.0,0.0,0.0,0.0,0.0,0.02439,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.00813,0.0,0.0,0.0,0.02439
3,East York,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.013158,0.0,0.013158,0.0,0.0,0.0,0.0,0.013158
4,Etobicoke,0.0,0.0,0.0,0.0,0.0,0.0,0.014085,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.014085,0.0,0.0
5,Mississauga,0.0,0.0,0.0,0.0,0.0,0.0,0.076923,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,North York,0.008584,0.004292,0.0,0.0,0.0,0.0,0.008584,0.0,0.0,...,0.0,0.004292,0.004292,0.008584,0.0,0.0,0.0,0.0,0.008584,0.0
7,Scarborough,0.0,0.0,0.0,0.0,0.0,0.0,0.010753,0.0,0.0,...,0.0,0.0,0.0,0.010753,0.0,0.0,0.0,0.0,0.0,0.0
8,West Toronto,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.019608,0.0,0.0,0.013072,0.0,0.006536,0.006536,0.0,0.0,0.013072
9,York,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.058824,0.0


In [104]:
num_top_venues = 5

for hood in grouped['Neighborhood']:
    print("----"+hood+"----")
    temp = grouped[grouped['Neighborhood'] == hood].T.reset_index()
    temp.columns = ['venue','freq']
    temp = temp.iloc[1:]
    temp['freq'] = temp['freq'].astype(float)
    temp = temp.round({'freq': 2})
    print(temp.sort_values('freq', ascending=False).reset_index(drop=True).head(num_top_venues))
    print('\n')

----Central Toronto----
            venue  freq
0     Coffee Shop  0.07
1  Sandwich Place  0.06
2            Café  0.05
3            Park  0.05
4     Pizza Place  0.05


----Downtown Toronto----
                venue  freq
0         Coffee Shop  0.10
1                Café  0.05
2          Restaurant  0.04
3  Italian Restaurant  0.03
4               Hotel  0.03


----East Toronto----
                venue  freq
0    Greek Restaurant  0.07
1         Coffee Shop  0.05
2             Brewery  0.04
3  Italian Restaurant  0.04
4                Café  0.04


----East York----
                 venue  freq
0                 Bank  0.05
1          Coffee Shop  0.05
2  Sporting Goods Shop  0.04
3         Burger Joint  0.04
4                 Park  0.04


----Etobicoke----
            venue  freq
0     Pizza Place  0.11
1     Coffee Shop  0.07
2  Sandwich Place  0.07
3             Gym  0.04
4        Pharmacy  0.04


----Mississauga----
                 venue  freq
0         Intersection  0.15
1       

In [105]:
grouped = onehot.groupby('Neighborhood').mean().reset_index()
grouped

Unnamed: 0,Neighborhood,Accessories Store,Airport,Airport Food Court,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,Antique Shop,Aquarium,...,Vegetarian / Vegan Restaurant,Video Game Store,Video Store,Vietnamese Restaurant,Warehouse Store,Wine Bar,Wine Shop,Wings Joint,Women's Store,Yoga Studio
0,Central Toronto,0.0,0.0,0.0,0.0,0.0,0.0,0.018182,0.0,0.0,...,0.009091,0.0,0.0,0.009091,0.0,0.0,0.0,0.0,0.0,0.009091
1,Downtown Toronto,0.0,0.000816,0.000816,0.001631,0.002447,0.001631,0.015498,0.001631,0.004078,...,0.009788,0.001631,0.0,0.003263,0.0,0.004894,0.0,0.0,0.000816,0.00571
2,East Toronto,0.0,0.0,0.0,0.0,0.0,0.0,0.02439,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.00813,0.0,0.0,0.0,0.02439
3,East York,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.013158,0.0,0.013158,0.0,0.0,0.0,0.0,0.013158
4,Etobicoke,0.0,0.0,0.0,0.0,0.0,0.0,0.014085,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.014085,0.0,0.0
5,Mississauga,0.0,0.0,0.0,0.0,0.0,0.0,0.076923,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,North York,0.008584,0.004292,0.0,0.0,0.0,0.0,0.008584,0.0,0.0,...,0.0,0.004292,0.004292,0.008584,0.0,0.0,0.0,0.0,0.008584,0.0
7,Scarborough,0.0,0.0,0.0,0.0,0.0,0.0,0.010753,0.0,0.0,...,0.0,0.0,0.0,0.010753,0.0,0.0,0.0,0.0,0.0,0.0
8,West Toronto,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.019608,0.0,0.0,0.013072,0.0,0.006536,0.006536,0.0,0.0,0.013072
9,York,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.058824,0.0


In [106]:
num_top_venues = 5

for hood in grouped['Neighborhood']:
    print("----"+hood+"----")
    temp = grouped[grouped['Neighborhood'] == hood].T.reset_index()
    temp.columns = ['venue','freq']
    temp = temp.iloc[1:]
    temp['freq'] = temp['freq'].astype(float)
    temp = temp.round({'freq': 2})
    print(temp.sort_values('freq', ascending=False).reset_index(drop=True).head(num_top_venues))
    print('\n')

----Central Toronto----
            venue  freq
0     Coffee Shop  0.07
1  Sandwich Place  0.06
2            Café  0.05
3            Park  0.05
4     Pizza Place  0.05


----Downtown Toronto----
                venue  freq
0         Coffee Shop  0.10
1                Café  0.05
2          Restaurant  0.04
3  Italian Restaurant  0.03
4               Hotel  0.03


----East Toronto----
                venue  freq
0    Greek Restaurant  0.07
1         Coffee Shop  0.05
2             Brewery  0.04
3  Italian Restaurant  0.04
4                Café  0.04


----East York----
                 venue  freq
0                 Bank  0.05
1          Coffee Shop  0.05
2  Sporting Goods Shop  0.04
3         Burger Joint  0.04
4                 Park  0.04


----Etobicoke----
            venue  freq
0     Pizza Place  0.11
1     Coffee Shop  0.07
2  Sandwich Place  0.07
3             Gym  0.04
4        Pharmacy  0.04


----Mississauga----
                 venue  freq
0         Intersection  0.15
1       

In [107]:
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

In [108]:
num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighborhood']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighborhood'] = grouped['Neighborhood']

for ind in np.arange(grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(grouped.iloc[ind, :], num_top_venues)

neighborhoods_venues_sorted

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Central Toronto,Coffee Shop,Sandwich Place,Café,Park,Pizza Place,Sushi Restaurant,Dessert Shop,Restaurant,Clothing Store,Gym
1,Downtown Toronto,Coffee Shop,Café,Restaurant,Hotel,Italian Restaurant,Japanese Restaurant,Park,Seafood Restaurant,Bakery,Gym
2,East Toronto,Greek Restaurant,Coffee Shop,Café,Italian Restaurant,Brewery,Ice Cream Shop,Pizza Place,Yoga Studio,Park,Restaurant
3,East York,Bank,Coffee Shop,Sandwich Place,Pizza Place,Pharmacy,Park,Sporting Goods Shop,Burger Joint,Fast Food Restaurant,Restaurant
4,Etobicoke,Pizza Place,Sandwich Place,Coffee Shop,Gym,Pharmacy,Liquor Store,Grocery Store,Fast Food Restaurant,Discount Store,Fried Chicken Joint
5,Mississauga,Coffee Shop,Intersection,Hotel,Gym,Burrito Place,Sandwich Place,Mediterranean Restaurant,Middle Eastern Restaurant,Fried Chicken Joint,American Restaurant
6,North York,Coffee Shop,Clothing Store,Restaurant,Bank,Japanese Restaurant,Pizza Place,Park,Sandwich Place,Café,Grocery Store
7,Scarborough,Coffee Shop,Chinese Restaurant,Breakfast Spot,Fast Food Restaurant,Bank,Bakery,Intersection,Pizza Place,Pharmacy,Park
8,West Toronto,Café,Bar,Italian Restaurant,Coffee Shop,Restaurant,Grocery Store,Pizza Place,Bakery,Park,Vegetarian / Vegan Restaurant
9,York,Park,Convenience Store,Trail,Sandwich Place,Fast Food Restaurant,Field,Bus Line,Restaurant,Pool,Tennis Court


In [109]:
# set number of clusters
kclusters = 5

grouped_clustering = grouped.drop('Neighborhood', 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10]

array([1, 1, 1, 3, 4, 2, 1, 3, 1, 0], dtype=int32)

In [110]:
merged = grouped

# add clustering labels
merged['Cluster Labels'] = kmeans.labels_

# merge toronto_grouped with toronto_data to add latitude/longitude for each neighborhood
merged = merged.join(neighborhoods_venues_sorted.set_index('Neighborhood'), on='Neighborhood')

merged.head() # check the last columns!

Unnamed: 0,Neighborhood,Accessories Store,Airport,Airport Food Court,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,Antique Shop,Aquarium,...,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Central Toronto,0.0,0.0,0.0,0.0,0.0,0.0,0.018182,0.0,0.0,...,Coffee Shop,Sandwich Place,Café,Park,Pizza Place,Sushi Restaurant,Dessert Shop,Restaurant,Clothing Store,Gym
1,Downtown Toronto,0.0,0.000816,0.000816,0.001631,0.002447,0.001631,0.015498,0.001631,0.004078,...,Coffee Shop,Café,Restaurant,Hotel,Italian Restaurant,Japanese Restaurant,Park,Seafood Restaurant,Bakery,Gym
2,East Toronto,0.0,0.0,0.0,0.0,0.0,0.0,0.02439,0.0,0.0,...,Greek Restaurant,Coffee Shop,Café,Italian Restaurant,Brewery,Ice Cream Shop,Pizza Place,Yoga Studio,Park,Restaurant
3,East York,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,Bank,Coffee Shop,Sandwich Place,Pizza Place,Pharmacy,Park,Sporting Goods Shop,Burger Joint,Fast Food Restaurant,Restaurant
4,Etobicoke,0.0,0.0,0.0,0.0,0.0,0.0,0.014085,0.0,0.0,...,Pizza Place,Sandwich Place,Coffee Shop,Gym,Pharmacy,Liquor Store,Grocery Store,Fast Food Restaurant,Discount Store,Fried Chicken Joint
