# Segmenting & Clustering Toronto

First we need to import libraries

In [1]:
import pandas as pd
from geopy.geocoders import Nominatim
import requests
import matplotlib.cm as cm
import matplotlib.colors as colors
from sklearn.cluster import KMeans
#import folium
!conda install -c conda-forge folium=0.5.0 --yes 
print('Libraries imported!')

Fetching package metadata .............
Solving package specifications: .

Package plan for installation in environment /opt/conda/envs/DSX-Python35:

The following NEW packages will be INSTALLED:

    altair:  2.2.2-py35_1 conda-forge
    branca:  0.3.1-py_0   conda-forge
    folium:  0.5.0-py_0   conda-forge
    vincent: 0.4.4-py_1   conda-forge

altair-2.2.2-p 100% |################################| Time: 0:00:00  54.02 MB/s
branca-0.3.1-p 100% |################################| Time: 0:00:00  35.21 MB/s
vincent-0.4.4- 100% |################################| Time: 0:00:00  41.03 MB/s
folium-0.5.0-p 100% |################################| Time: 0:00:00  48.25 MB/s
Libraries imported!


In [6]:
#import pd from previous task
newdf_toronto = pd.read_pickle('newdf_toronto.pkl')

In [7]:
newdf_toronto.shape

(103, 4)

In [8]:
newdf_toronto.head()

Unnamed: 0_level_0,Borough,Neighborhood,Latitude,Longitude
PostalCode,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
M3A,North York,Parkwoods,43.753259,-79.329656
M4A,North York,Victoria Village,43.725882,-79.315572
M5A,Downtown Toronto,"Harbourfront, Regent Park",43.65426,-79.360636
M6A,North York,"Lawrence Heights, Lawrence Manor",43.718518,-79.464763
M7A,Queen's Park,Queen's Park,43.662301,-79.389494


In [10]:
#Find the Boroughs that contain the word "Toronto". I will need these lis later when choosing some Borough to explore

borough_list = []
for i in range(1,102):
    temp = newdf_toronto.ix[i,'Borough']
    if temp.find('Toronto') != -1:
        borough_list.append(temp)
borough_list = sorted(set(borough_list))
borough_list

['Central Toronto', 'Downtown Toronto', 'East Toronto', 'West Toronto']

I will use geopy library to find the coordinates of Toronto

In [11]:
address = 'Toronto, CA'

geolocator = Nominatim(user_agent="ny_explorer")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Toronto are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Toronto are 43.653963, -79.387207.


# Create a map of Toronto

In [12]:
#Create map
import folium
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=10)

#add markers
for lat, lng, borough, neighborhood in zip(newdf_toronto['Latitude'], newdf_toronto['Longitude'], newdf_toronto['Borough'], newdf_toronto['Neighborhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_toronto)  

map_toronto

I will explore the Boroughs containig the word Toronto. As I have previously generated a lift with them, they are: Central Toronto, Downtown Toronto, East Toronto, West Toronto. Below I will explore these 4 Boroughs.

# Central Toronto

Extracting the data for borough Central Toronto

In [18]:
central_toronto_data = newdf_toronto[newdf_toronto['Borough'] == 'Central Toronto'].reset_index(drop=True)
central_toronto_data.head()

Unnamed: 0,Borough,Neighborhood,Latitude,Longitude
0,Central Toronto,Lawrence Park,43.72802,-79.38879
1,Central Toronto,Roselawn,43.711695,-79.416936
2,Central Toronto,Davisville North,43.712751,-79.390197
3,Central Toronto,"Forest Hill North, Forest Hill West",43.696948,-79.411307
4,Central Toronto,North Toronto West,43.715383,-79.405678


I will extract latitude and longitude for Central Toronto but, as we will see, they are the same for all Boroughs of Toronto as for Toronto. So, I will not repeat this step for the other 3 Boroughs.

In [13]:
address = 'Central Toronto, CA'

geolocator = Nominatim(user_agent="ny_explorer")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Central Toronto are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Central Toronto are 43.653963, -79.387207.


I will generate the map containing the marks only for those Neighborhoods located in the Borough Central Toronto

In [19]:
map_central_toronto = folium.Map(location=[latitude, longitude], zoom_start=10)


for lat, lng, borough, neighborhood in zip(central_toronto_data['Latitude'], central_toronto_data['Longitude'], central_toronto_data['Borough'], central_toronto_data['Neighborhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_central_toronto)  

map_central_toronto

# Downtown Toronto

I will repeat the steps of extractng Neighborhoods in Downtown Toronto and generate them on the map

In [20]:
downtown_toronto_data = newdf_toronto[newdf_toronto['Borough'] == 'Downtown Toronto'].reset_index(drop=True)
downtown_toronto_data.head()

Unnamed: 0,Borough,Neighborhood,Latitude,Longitude
0,Downtown Toronto,"Harbourfront, Regent Park",43.65426,-79.360636
1,Downtown Toronto,"Ryerson, Garden District",43.657162,-79.378937
2,Downtown Toronto,St. James Town,43.651494,-79.375418
3,Downtown Toronto,Berczy Park,43.644771,-79.373306
4,Downtown Toronto,Central Bay Street,43.657952,-79.387383


In [17]:

map_downtown_toronto = folium.Map(location=[latitude, longitude], zoom_start=10)


for lat, lng, borough, neighborhood in zip(downtown_toronto_data['Latitude'], downtown_toronto_data['Longitude'], downtown_toronto_data['Borough'], downtown_toronto_data['Neighborhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_downtown_toronto)  

map_downtown_toronto

# East Toronto

Same steps to generate a map containig only marks for East Toronto

In [22]:
east_toronto_data = newdf_toronto[newdf_toronto['Borough'] == 'East Toronto'].reset_index(drop=True)
east_toronto_data.head()

Unnamed: 0,Borough,Neighborhood,Latitude,Longitude
0,East Toronto,The Beaches,43.676357,-79.293031
1,East Toronto,"The Danforth West, Riverdale",43.679557,-79.352188
2,East Toronto,"The Beaches West, India Bazaar",43.668999,-79.315572
3,East Toronto,Studio District,43.659526,-79.340923
4,East Toronto,Business Reply Mail Processing Centre 969 Eastern,43.662744,-79.321558


In [23]:
map_east_toronto = folium.Map(location=[latitude, longitude], zoom_start=10)


for lat, lng, borough, neighborhood in zip(east_toronto_data['Latitude'], east_toronto_data['Longitude'], east_toronto_data['Borough'], east_toronto_data['Neighborhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_east_toronto)  

map_east_toronto

# West Toronto

Finally, a amap with venues situated only in West Toronto

In [24]:
west_toronto_data = newdf_toronto[newdf_toronto['Borough'] == 'West Toronto'].reset_index(drop=True)
west_toronto_data.head()

Unnamed: 0,Borough,Neighborhood,Latitude,Longitude
0,West Toronto,"Dovercourt Village, Dufferin",43.669005,-79.442259
1,West Toronto,"Little Portugal, Trinity",43.647927,-79.41975
2,West Toronto,"Brockton, Exhibition Place, Parkdale Village",43.636847,-79.428191
3,West Toronto,"High Park, The Junction South",43.661608,-79.464763
4,West Toronto,"Parkdale, Roncesvalles",43.64896,-79.456325


In [25]:
map_west_toronto = folium.Map(location=[latitude, longitude], zoom_start=10)


for lat, lng, borough, neighborhood in zip(west_toronto_data['Latitude'], west_toronto_data['Longitude'], west_toronto_data['Borough'], west_toronto_data['Neighborhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_west_toronto)  

map_west_toronto

Initiating Foursquare Credentials:

In [28]:
CLIENT_ID = 'YIQJW1Z1QKNDA31QQJXLXX40CNMY3UEZQV3TG4MTWBXTRHZZ' # your Foursquare ID
CLIENT_SECRET = 'H4QD52ATEFVNLIK3MLRPL1XILQUD3G3ZLMPT3EBLCOG0I1BQ' # your Foursquare Secret
VERSION = '20180605' # Foursquare API version

print('Your credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET:' + CLIENT_SECRET)

Your credentails:
CLIENT_ID: YIQJW1Z1QKNDA31QQJXLXX40CNMY3UEZQV3TG4MTWBXTRHZZ
CLIENT_SECRET:H4QD52ATEFVNLIK3MLRPL1XILQUD3G3ZLMPT3EBLCOG0I1BQ


In [29]:
import json
from pandas.io.json import json_normalize

Only for Central Toronro I will make an analyze of venues by there type as in previous week lab

Finding the first Neighborhood in Central Toronto list created above

In [30]:
central_toronto_data.loc[0, 'Neighborhood']

'Lawrence Park'

In [31]:
neighborhood_latitude = central_toronto_data.loc[0, 'Latitude'] # neighborhood latitude value
neighborhood_longitude = central_toronto_data.loc[0, 'Longitude'] # neighborhood longitude value

neighborhood_name = central_toronto_data.loc[0, 'Neighborhood'] # neighborhood name

print('Latitude and longitude values of {} are {}, {}.'.format(neighborhood_name, 
                                                               neighborhood_latitude, 
                                                               neighborhood_longitude))

Latitude and longitude values of Lawrence Park are 43.7280205, -79.3887901.


Asking Forsquare to explore Lawrence Park. We will ask for 100 venues on a radius of 500

In [33]:
limit = 100
radius = 500

url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            limit)


In [34]:

#Foursquare results

results = requests.get(url).json()
results

{'meta': {'code': 200, 'requestId': '5c97803d1ed219272ec90c0f'},
 'response': {'groups': [{'items': [{'reasons': {'count': 0,
       'items': [{'reasonName': 'globalInteractionReason',
         'summary': 'This spot is popular',
         'type': 'general'}]},
      'referralId': 'e-0-4e5ffdf962e13e3bcd932a0a-0',
      'venue': {'categories': [{'icon': {'prefix': 'https://ss3.4sqi.net/img/categories_v2/food/default_',
          'suffix': '.png'},
         'id': '4d4b7105d754a06374d81259',
         'name': 'Food',
         'pluralName': 'Food',
         'primary': True,
         'shortName': 'Food'}],
       'id': '4e5ffdf962e13e3bcd932a0a',
       'location': {'address': '2432 Bloor St. W',
        'cc': 'CA',
        'city': 'Toronto',
        'country': 'Canada',
        'crossStreet': 'Jane',
        'distance': 225,
        'formattedAddress': ['2432 Bloor St. W (Jane)',
         'Toronto ON M6S 1P9',
         'Canada'],
        'labeledLatLngs': [{'label': 'display',
          'lat

In [35]:
def get_category_type(row):
    try:
        categories_list = row['categories']
    except:
        categories_list = row['venue.categories']
        
    if len(categories_list) == 0:
        return None
    else:
        return categories_list[0]['name']

Create a dataframe with venues returned by Foursquare and see the number of returned venues

In [37]:

venues = results['response']['groups'][0]['items']
    
nearby_venues = json_normalize(venues) # flatten JSON

# filter columns
filtered_columns = ['venue.name', 'venue.categories', 'venue.location.lat', 'venue.location.lng']
nearby_venues =nearby_venues.loc[:, filtered_columns]

# filter the category for each row
nearby_venues['venue.categories'] = nearby_venues.apply(get_category_type, axis=1)

# clean columns
nearby_venues.columns = [col.split(".")[-1] for col in nearby_venues.columns]
nearby_venues.head()

Unnamed: 0,name,categories,lat,lng
0,The Good Fork,Food,43.649565,-79.484023
1,Coffee Tree Roastery,Café,43.649647,-79.483436
2,The One That Got Away,Fish & Chips Shop,43.649842,-79.482615
3,Fat Bastard Burrito,Burrito Place,43.649779,-79.482894
4,Asa Sushi,Sushi Restaurant,43.649902,-79.484611


In [38]:
print('{} venues were returned by Foursquare.'.format(nearby_venues.shape[0]))

37 venues were returned by Foursquare.


Create a function to generate venues for all Neighborhoods in Central Toronto

In [39]:
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            limit)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

In [40]:
#start creating the dataframe central_toronto_venues
central_toronto_venues = getNearbyVenues(names=central_toronto_data['Neighborhood'],
                                   latitudes=central_toronto_data['Latitude'],
                                   longitudes=central_toronto_data['Longitude']
                                  )

Lawrence Park
Roselawn
Davisville North
Forest Hill North, Forest Hill West
North Toronto West
The Annex, North Midtown, Yorkville
Davisville
Moore Park, Summerhill East
Deer Park, Forest Hill SE, Rathnelly, South Hill, Summerhill West


In [83]:
print(central_toronto_venues.shape)
central_toronto_venues.head()

(116, 7)


Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,Lawrence Park,43.72802,-79.38879,Lawrence Park Ravine,43.726963,-79.394382,Park
1,Lawrence Park,43.72802,-79.38879,Zodiac Swim School,43.728532,-79.38286,Swim School
2,Lawrence Park,43.72802,-79.38879,TTC Bus #162 - Lawrence-Donway,43.728026,-79.382805,Bus Line
3,Roselawn,43.711695,-79.416936,Ceiling Champions,43.713891,-79.420702,Home Service
4,Roselawn,43.711695,-79.416936,Rosalind's Garden Oasis,43.712189,-79.411978,Garden


Group the venues in each Neighborhood

In [85]:
central_toronto_venues.groupby('Neighborhood').count()

Unnamed: 0_level_0,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
Neighborhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Davisville,36,36,36,36,36,36
Davisville North,9,9,9,9,9,9
"Deer Park, Forest Hill SE, Rathnelly, South Hill, Summerhill West",15,15,15,15,15,15
"Forest Hill North, Forest Hill West",4,4,4,4,4,4
Lawrence Park,3,3,3,3,3,3
"Moore Park, Summerhill East",2,2,2,2,2,2
North Toronto West,21,21,21,21,21,21
Roselawn,2,2,2,2,2,2
"The Annex, North Midtown, Yorkville",24,24,24,24,24,24


Let's find out how many unique categories are

In [86]:
print('There are {} uniques categories.'.format(len(central_toronto_venues['Venue Category'].unique())))

There are 64 uniques categories.


# Analyze each neighborhood

Convert text label in numbers with onehot

In [42]:
# one hot encoding for venue category
central_toronto_onehot = pd.get_dummies(central_toronto_venues[['Venue Category']], prefix="", prefix_sep="")

# add neighborhood column back to dataframe
central_toronto_onehot['Neighborhood'] = central_toronto_venues['Neighborhood'] 

# move neighborhood column to the first column
fixed_columns = [central_toronto_onehot.columns[-1]] + list(central_toronto_onehot.columns[:-1])
central_toronto_onehot = central_toronto_onehot[fixed_columns]

central_toronto_onehot.head()

Unnamed: 0,Neighborhood,American Restaurant,Asian Restaurant,BBQ Joint,Bagel Shop,Breakfast Spot,Brewery,Burger Joint,Bus Line,Café,...,Supermarket,Sushi Restaurant,Swim School,Tennis Court,Thai Restaurant,Toy / Game Store,Trail,Vegetarian / Vegan Restaurant,Vietnamese Restaurant,Yoga Studio
0,Lawrence Park,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,Lawrence Park,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
2,Lawrence Park,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
3,Roselawn,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,Roselawn,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [43]:
central_toronto_onehot.shape

(116, 65)

Group rows and take the mean of frequency to find out the frequency with which a venue type is present in th neighborhood

In [44]:
central_toronto_grouped = central_toronto_onehot.groupby('Neighborhood').mean().reset_index()
central_toronto_grouped

Unnamed: 0,Neighborhood,American Restaurant,Asian Restaurant,BBQ Joint,Bagel Shop,Breakfast Spot,Brewery,Burger Joint,Bus Line,Café,...,Supermarket,Sushi Restaurant,Swim School,Tennis Court,Thai Restaurant,Toy / Game Store,Trail,Vegetarian / Vegan Restaurant,Vietnamese Restaurant,Yoga Studio
0,Davisville,0.0,0.027778,0.0,0.0,0.0,0.027778,0.027778,0.0,0.055556,...,0.0,0.055556,0.0,0.0,0.027778,0.027778,0.0,0.0,0.0,0.0
1,Davisville North,0.0,0.0,0.0,0.0,0.111111,0.0,0.111111,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,"Deer Park, Forest Hill SE, Rathnelly, South Hi...",0.066667,0.0,0.0,0.066667,0.0,0.0,0.0,0.0,0.0,...,0.066667,0.066667,0.0,0.0,0.0,0.0,0.0,0.0,0.066667,0.0
3,"Forest Hill North, Forest Hill West",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.25,0.0,0.0,0.0,0.0,0.25,0.0,0.0,0.0
4,Lawrence Park,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.333333,0.0,...,0.0,0.0,0.333333,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,"Moore Park, Summerhill East",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.5,0.0,0.0,0.0,0.0,0.0,0.0
6,North Toronto West,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.047619
7,Roselawn,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,"The Annex, North Midtown, Yorkville",0.041667,0.0,0.041667,0.0,0.0,0.0,0.041667,0.0,0.125,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.041667,0.0,0.0


In [45]:
central_toronto_grouped.shape

(9, 65)

In [46]:
Print each neighborhood along with the top 5 most common venues

SyntaxError: invalid syntax (<ipython-input-46-0bef2339cb0d>, line 1)

In [97]:
num_top_venues = 5

for hood in central_toronto_grouped['Neighborhood']:
    print("----"+hood+"----")
    temp = central_toronto_grouped[central_toronto_grouped['Neighborhood'] == hood].T.reset_index()
    temp.columns = ['venue','freq']
    temp = temp.iloc[1:]
    temp['freq'] = temp['freq'].astype(float)
    temp = temp.round({'freq': 2})
    print(temp.sort_values('freq', ascending=False).reset_index(drop=True).head(num_top_venues))
    print('\n')

----Davisville----
                venue  freq
0         Pizza Place  0.11
1      Sandwich Place  0.08
2        Dessert Shop  0.08
3                Café  0.06
4  Italian Restaurant  0.06


----Davisville North----
               venue  freq
0              Hotel  0.11
1       Burger Joint  0.11
2  Food & Drink Shop  0.11
3       Dance Studio  0.11
4               Park  0.11


----Deer Park, Forest Hill SE, Rathnelly, South Hill, Summerhill West----
                 venue  freq
0                  Pub  0.13
1          Coffee Shop  0.13
2  American Restaurant  0.07
3          Supermarket  0.07
4          Pizza Place  0.07


----Forest Hill North, Forest Hill West----
                 venue  freq
0        Jewelry Store  0.25
1                Trail  0.25
2                 Park  0.25
3     Sushi Restaurant  0.25
4  American Restaurant  0.00


----Lawrence Park----
                 venue  freq
0          Swim School  0.33
1             Bus Line  0.33
2                 Park  0.33
3  American Re

Create a dataframe with these

In [47]:
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]


In [48]:
import numpy as np

num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighborhood']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighborhood'] = central_toronto_grouped['Neighborhood']

for ind in np.arange(central_toronto_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(central_toronto_grouped.iloc[ind, :], num_top_venues)

neighborhoods_venues_sorted.head()

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Davisville,Pizza Place,Sandwich Place,Dessert Shop,Coffee Shop,Italian Restaurant,Café,Sushi Restaurant,Restaurant,Diner,Indian Restaurant
1,Davisville North,Gym,Food & Drink Shop,Hotel,Dance Studio,Gym / Fitness Center,Breakfast Spot,Burger Joint,Sandwich Place,Park,Farmers Market
2,"Deer Park, Forest Hill SE, Rathnelly, South Hi...",Pub,Coffee Shop,Sports Bar,Vietnamese Restaurant,Fried Chicken Joint,Light Rail Station,Medical Center,Pizza Place,Convenience Store,American Restaurant
3,"Forest Hill North, Forest Hill West",Trail,Park,Jewelry Store,Sushi Restaurant,Yoga Studio,Food & Drink Shop,Dessert Shop,Diner,Farmers Market,Fast Food Restaurant
4,Lawrence Park,Bus Line,Park,Swim School,Yoga Studio,Diner,Farmers Market,Fast Food Restaurant,Food & Drink Shop,Fried Chicken Joint,Furniture / Home Store


# Cluster Neighborhoods

In [52]:
# set number of clusters
kclusters = 5

central_toronto_grouped_clustering = central_toronto_grouped.drop('Neighborhood', 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(central_toronto_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10] 

array([0, 0, 0, 4, 3, 2, 0, 1, 0], dtype=int32)

In [53]:
neighborhoods_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

ct_merged = central_toronto_data

# merge toronto_grouped with toronto_data to add latitude/longitude for each neighborhood
ct_merged = ct_merged.join(neighborhoods_venues_sorted.set_index('Neighborhood'), on='Neighborhood')

ct_merged.head() # check the last columns!

Unnamed: 0,Borough,Neighborhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Central Toronto,Lawrence Park,43.72802,-79.38879,3,Bus Line,Park,Swim School,Yoga Studio,Diner,Farmers Market,Fast Food Restaurant,Food & Drink Shop,Fried Chicken Joint,Furniture / Home Store
1,Central Toronto,Roselawn,43.711695,-79.416936,1,Home Service,Garden,Vietnamese Restaurant,Gym / Fitness Center,Gym,Grocery Store,Greek Restaurant,Gourmet Shop,Furniture / Home Store,Fried Chicken Joint
2,Central Toronto,Davisville North,43.712751,-79.390197,0,Gym,Food & Drink Shop,Hotel,Dance Studio,Gym / Fitness Center,Breakfast Spot,Burger Joint,Sandwich Place,Park,Farmers Market
3,Central Toronto,"Forest Hill North, Forest Hill West",43.696948,-79.411307,4,Trail,Park,Jewelry Store,Sushi Restaurant,Yoga Studio,Food & Drink Shop,Dessert Shop,Diner,Farmers Market,Fast Food Restaurant
4,Central Toronto,North Toronto West,43.715383,-79.405678,0,Coffee Shop,Sporting Goods Shop,Clothing Store,Salon / Barbershop,Gym / Fitness Center,Furniture / Home Store,Fast Food Restaurant,Diner,Dessert Shop,Mexican Restaurant


Visualize clusters

In [54]:
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(ct_merged['Latitude'], ct_merged['Longitude'], ct_merged['Neighborhood'], ct_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters

Cluster 1 examination (Red)

In [104]:
ct_merged.loc[ct_merged['Cluster Labels'] == 0, ct_merged.columns[[1] + list(range(5, ct_merged.shape[1]))]]

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
2,Davisville North,Gym,Food & Drink Shop,Hotel,Dance Studio,Gym / Fitness Center,Breakfast Spot,Burger Joint,Sandwich Place,Park,Farmers Market
4,North Toronto West,Coffee Shop,Sporting Goods Shop,Clothing Store,Salon / Barbershop,Gym / Fitness Center,Furniture / Home Store,Fast Food Restaurant,Diner,Dessert Shop,Mexican Restaurant
5,"The Annex, North Midtown, Yorkville",Sandwich Place,Café,Coffee Shop,Pizza Place,American Restaurant,History Museum,Indian Restaurant,Jewish Restaurant,Liquor Store,Park
6,Davisville,Pizza Place,Sandwich Place,Dessert Shop,Coffee Shop,Italian Restaurant,Café,Sushi Restaurant,Restaurant,Diner,Indian Restaurant
8,"Deer Park, Forest Hill SE, Rathnelly, South Hi...",Pub,Coffee Shop,Sports Bar,Vietnamese Restaurant,Fried Chicken Joint,Light Rail Station,Medical Center,Pizza Place,Convenience Store,American Restaurant


Cluster 2 examination

In [105]:
ct_merged.loc[ct_merged['Cluster Labels'] == 1, ct_merged.columns[[1] + list(range(5, ct_merged.shape[1]))]]

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
1,Roselawn,Home Service,Garden,Vietnamese Restaurant,Gym / Fitness Center,Gym,Grocery Store,Greek Restaurant,Gourmet Shop,Furniture / Home Store,Fried Chicken Joint


and so on

Thank you!