## Franks - Segmenting and Clustering Neighborhoods in Toronto

In [79]:
import numpy as np # library to handle data in a vectorized manner
import pandas as pd # library for data analsysis

In [80]:
# get the data from webpage
url="https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"
df1=pd.read_html(url)
df1

[    Postcode           Borough  \
 0        M1A      Not assigned   
 1        M2A      Not assigned   
 2        M3A        North York   
 3        M4A        North York   
 4        M5A  Downtown Toronto   
 5        M5A  Downtown Toronto   
 6        M6A        North York   
 7        M6A        North York   
 8        M7A      Queen's Park   
 9        M8A      Not assigned   
 10       M9A         Etobicoke   
 11       M1B       Scarborough   
 12       M1B       Scarborough   
 13       M2B      Not assigned   
 14       M3B        North York   
 15       M4B         East York   
 16       M4B         East York   
 17       M5B  Downtown Toronto   
 18       M5B  Downtown Toronto   
 19       M6B        North York   
 20       M7B      Not assigned   
 21       M8B      Not assigned   
 22       M9B         Etobicoke   
 23       M9B         Etobicoke   
 24       M9B         Etobicoke   
 25       M9B         Etobicoke   
 26       M9B         Etobicoke   
 27       M1C       

In [81]:
# we are only interested in the first table
df2=df1[0]
df2

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M5A,Downtown Toronto,Regent Park
6,M6A,North York,Lawrence Heights
7,M6A,North York,Lawrence Manor
8,M7A,Queen's Park,Not assigned
9,M8A,Not assigned,Not assigned


In [82]:
# check to have a working pandas dataframe
#df2['Postcode']
#df2['Borough']
df2['Neighbourhood']

0                                           Not assigned
1                                           Not assigned
2                                              Parkwoods
3                                       Victoria Village
4                                           Harbourfront
5                                            Regent Park
6                                       Lawrence Heights
7                                         Lawrence Manor
8                                           Not assigned
9                                           Not assigned
10                                      Islington Avenue
11                                                 Rouge
12                                               Malvern
13                                          Not assigned
14                                       Don Mills North
15                                      Woodbine Gardens
16                                         Parkview Hill
17                             

## Only process the cells that have an assigned borough. Ignore cells with a borough that is Not assigned.

In [83]:
# clean up the dataframe
#df2.dropna(subset = ['borough'])
# df[df.line_race != 0]
df3=df2[df2['Borough'] != 'Not assigned']

## If a cell has a borough but a Not assigned neighborhood, then the neighborhood will be the same as the borough. 

In [84]:
#df3['Neighbourhood']
#df3['Neighbourhood'].iloc[df3['Neighbourhood'].length]
for index, row in df3.iterrows():
    #print(row['Neighbourhood'], row['Borough'])
    if row['Neighbourhood'] == 'Not assigned':
        row['Neighbourhood'] = row['Borough']
        print(row['Neighbourhood'], row['Borough'])

# show changed data
df3    

Queen's Park Queen's Park


Unnamed: 0,Postcode,Borough,Neighbourhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M5A,Downtown Toronto,Regent Park
6,M6A,North York,Lawrence Heights
7,M6A,North York,Lawrence Manor
8,M7A,Queen's Park,Queen's Park
10,M9A,Etobicoke,Islington Avenue
11,M1B,Scarborough,Rouge
12,M1B,Scarborough,Malvern


## More than one neighborhood can exist in one postal code area.
## These two rows will be combined into one row with the neighborhoods separated with a comma.

In [85]:
# do more data cleaning
for index, row in df3.iterrows():
    for index2, row2 in df3.iterrows():
        #print(row['Neighbourhood'], row['Borough'])
        if (row['Borough'] != 'Not assigned') and (row['Borough'] == row2['Borough']) and (index!=index2):
            row['Neighbourhood'] = row['Neighbourhood'] + ',' + row2['Neighbourhood']
            row2['Borough'] = 'Not assigned'
            print(row['Borough'], row['Neighbourhood'])
#df3
# delete the data no longer needed - Not assigned
df4=df3[df3['Borough'] != 'Not assigned']
df4

North York Parkwoods,Victoria Village
North York Parkwoods,Victoria Village,Lawrence Heights
North York Parkwoods,Victoria Village,Lawrence Heights,Lawrence Manor
North York Parkwoods,Victoria Village,Lawrence Heights,Lawrence Manor,Don Mills North
North York Parkwoods,Victoria Village,Lawrence Heights,Lawrence Manor,Don Mills North,Glencairn
North York Parkwoods,Victoria Village,Lawrence Heights,Lawrence Manor,Don Mills North,Glencairn,Flemingdon Park
North York Parkwoods,Victoria Village,Lawrence Heights,Lawrence Manor,Don Mills North,Glencairn,Flemingdon Park,Don Mills South
North York Parkwoods,Victoria Village,Lawrence Heights,Lawrence Manor,Don Mills North,Glencairn,Flemingdon Park,Don Mills South,Hillcrest Village
North York Parkwoods,Victoria Village,Lawrence Heights,Lawrence Manor,Don Mills North,Glencairn,Flemingdon Park,Don Mills South,Hillcrest Village,Bathurst Manor
North York Parkwoods,Victoria Village,Lawrence Heights,Lawrence Manor,Don Mills North,Glencairn,Flemingdon P

Unnamed: 0,Postcode,Borough,Neighbourhood
2,M3A,North York,"Parkwoods,Victoria Village,Lawrence Heights,La..."
4,M5A,Downtown Toronto,"Harbourfront,Regent Park,Ryerson,Garden Distri..."
8,M7A,Queen's Park,Queen's Park
10,M9A,Etobicoke,"Islington Avenue,Cloverdale,Islington,Martin G..."
11,M1B,Scarborough,"Rouge,Malvern,Highland Creek,Rouge Hill,Port U..."
15,M4B,East York,"Woodbine Gardens,Parkview Hill,Woodbine Height..."
35,M6C,York,"Humewood-Cedarvale,Caledonia-Fairbanks,Del Ray..."
47,M4E,East Toronto,"The Beaches,The Danforth West,Riverdale,The Be..."
71,M6H,West Toronto,"Dovercourt Village,Dufferin,Little Portugal,Tr..."
144,M4N,Central Toronto,"Lawrence Park,Roselawn,Davisville North,Forest..."


In [86]:
df4.shape

(11, 3)

## Get LAT and LONG for Bourgh using CSV file 

In [87]:
dflatlong=pd.read_csv('http://cocl.us/Geospatial_data')
dflatlong

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476
5,M1J,43.744734,-79.239476
6,M1K,43.727929,-79.262029
7,M1L,43.711112,-79.284577
8,M1M,43.716316,-79.239476
9,M1N,43.692657,-79.264848


In [88]:
# add columns for lat and long data
df4["Latitude"] = np.zeros
df4["Longitude"] = np.zeros

#rowdata=dflatlong.loc[dflatlong['Postal Code']=='M7R']
#rowdata
#rowdata.iloc[0,0]
#rowdata.iloc[0,1]
#rowdata['Postal Code']
#idx=dflatlong['Postal Code']=='M7R'
#idx

for index, row in df4.iterrows():
    rowdata=dflatlong.loc[dflatlong['Postal Code']==row['Postcode']]
    #row["Latitude"]  = rowdata.iloc[0,1]
    #row["Longitude"] = rowdata.iloc[0,2]
    tmp1  = rowdata.iloc[0,1]
    tmp2  = rowdata.iloc[0,2]
    print('ADD',row['Postcode'],tmp1,tmp2)
    row["Latitude"]  = tmp1
    row["Longitude"]  = tmp2

# show new dataframe
df4

ADD M3A 43.7532586 -79.3296565
ADD M5A 43.6542599 -79.3606359
ADD M7A 43.6623015 -79.3894938
ADD M9A 43.6678556 -79.53224240000002
ADD M1B 43.806686299999996 -79.19435340000001
ADD M4B 43.7063972 -79.309937
ADD M6C 43.6937813 -79.42819140000002
ADD M4E 43.67635739999999 -79.2930312
ADD M6H 43.66900510000001 -79.4422593
ADD M4N 43.7280205 -79.3887901
ADD M7R 43.6369656 -79.61581899999999


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from ipykernel import kernelapp as app
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  app.launch_new_instance()


Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
2,M3A,North York,"Parkwoods,Victoria Village,Lawrence Heights,La...",43.7533,-79.3297
4,M5A,Downtown Toronto,"Harbourfront,Regent Park,Ryerson,Garden Distri...",43.6543,-79.3606
8,M7A,Queen's Park,Queen's Park,43.6623,-79.3895
10,M9A,Etobicoke,"Islington Avenue,Cloverdale,Islington,Martin G...",43.6679,-79.5322
11,M1B,Scarborough,"Rouge,Malvern,Highland Creek,Rouge Hill,Port U...",43.8067,-79.1944
15,M4B,East York,"Woodbine Gardens,Parkview Hill,Woodbine Height...",43.7064,-79.3099
35,M6C,York,"Humewood-Cedarvale,Caledonia-Fairbanks,Del Ray...",43.6938,-79.4282
47,M4E,East Toronto,"The Beaches,The Danforth West,Riverdale,The Be...",43.6764,-79.293
71,M6H,West Toronto,"Dovercourt Village,Dufferin,Little Portugal,Tr...",43.669,-79.4423
144,M4N,Central Toronto,"Lawrence Park,Roselawn,Davisville North,Forest...",43.728,-79.3888


In [89]:
df4.shape

(11, 5)

## Doing clustering 

In [90]:
# select Toronto boroughs
#for index, row in df4.iterrows():
#    if row['Borough'].contains('Toronto'):
#        print(row)
toronto_data=df4[df4['Borough'].str.contains('Toronto')]
toronto_data.drop(['Postcode'], axis=1, inplace=True)
column_names = ['Borough', 'Neighborhood', 'Latitude', 'Longitude']
toronto_data.columns=column_names
toronto_data

Unnamed: 0,Borough,Neighborhood,Latitude,Longitude
4,Downtown Toronto,"Harbourfront,Regent Park,Ryerson,Garden Distri...",43.6543,-79.3606
47,East Toronto,"The Beaches,The Danforth West,Riverdale,The Be...",43.6764,-79.293
71,West Toronto,"Dovercourt Village,Dufferin,Little Portugal,Tr...",43.669,-79.4423
144,Central Toronto,"Lawrence Park,Roselawn,Davisville North,Forest...",43.728,-79.3888


## Use processing from New York clustering

In [91]:
import json # library to handle JSON files

#!conda install -c conda-forge geopy --yes # uncomment this line if you haven't completed the Foursquare API lab
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

import requests # library to handle requests
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans

#!conda install -c conda-forge folium=0.5.0 --yes # uncomment this line if you haven't completed the Foursquare API lab
import folium # map rendering library

print('Libraries imported.')

Libraries imported.


In [92]:
# define the dataframe columns
#column_names = ['Borough', 'Neighborhood', 'Latitude', 'Longitude'] 

# instantiate the dataframe
#neighborhoods = pd.DataFrame(columns=column_names)

#neighborhoods

#for data in neighborhoods_data:
#    borough = neighborhood_name = data['properties']['borough'] 
#    neighborhood_name = data['properties']['name']
#        
#    neighborhood_latlon = data['geometry']['coordinates']
#    neighborhood_lat = neighborhood_latlon[1]
#    neighborhood_lon = neighborhood_latlon[0]
#    
#    neighborhoods = neighborhoods.append({'Borough': borough,
#                                          'Neighborhood': neighborhood_name,
#                                          'Latitude': neighborhood_lat,
#                                          'Longitude': neighborhood_lon}, ignore_index=True)

#neighborhoods.head()



In [93]:
address = 'Toronto'

geolocator = Nominatim(user_agent="ny_explorer")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Toronto are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Toronto are 43.653963, -79.387207.


In [94]:
# create map of Manhattan using latitude and longitude values
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=11)

# add markers to map
for lat, lng, label in zip(toronto_data['Latitude'], toronto_data['Longitude'], toronto_data['Neighborhood']):
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_toronto)  
    
map_toronto

## Define Foursquare Credentials and Version

In [95]:
# The code was removed by Watson Studio for sharing.

In [96]:
LIMIT=100

def getNearbyVenues(names, latitudes, longitudes, radius=500):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

## Explore Neighborhoods

In [97]:
toronto_venues = getNearbyVenues(names=toronto_data['Neighborhood'],
                                   latitudes=toronto_data['Latitude'],
                                   longitudes=toronto_data['Longitude']
                                  )
print(toronto_venues.shape)
toronto_venues.head()

Harbourfront,Regent Park,Ryerson,Garden District,St. James Town,Berczy Park,Central Bay Street,Christie,Adelaide,King,Richmond,Harbourfront East,Toronto Islands,Union Station,Design Exchange,Toronto Dominion Centre,Commerce Court,Victoria Hotel,Harbord,University of Toronto,Chinatown,Grange Park,Kensington Market,CN Tower,Bathurst Quay,Island airport,Harbourfront West,King and Spadina,Railway Lands,South Niagara,Rosedale,Stn A PO Boxes 25 The Esplanade,Cabbagetown,St. James Town,First Canadian Place,Underground city,Church and Wellesley
The Beaches,The Danforth West,Riverdale,The Beaches West,India Bazaar,Studio District,Business Reply Mail Processing Centre 969 Eastern
Dovercourt Village,Dufferin,Little Portugal,Trinity,Brockton,Exhibition Place,Parkdale Village,High Park,The Junction South,Parkdale,Roncesvalles,Runnymede,Swansea
Lawrence Park,Roselawn,Davisville North,Forest Hill North,Forest Hill West,North Toronto West,The Annex,North Midtown,Yorkville,Davisville,Moore Park,Summerh

Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,"Harbourfront,Regent Park,Ryerson,Garden Distri...",43.65426,-79.360636,Roselle Desserts,43.653447,-79.362017,Bakery
1,"Harbourfront,Regent Park,Ryerson,Garden Distri...",43.65426,-79.360636,Tandem Coffee,43.653559,-79.361809,Coffee Shop
2,"Harbourfront,Regent Park,Ryerson,Garden Distri...",43.65426,-79.360636,Toronto Cooper Koo Family Cherry St YMCA Centre,43.653191,-79.357947,Gym / Fitness Center
3,"Harbourfront,Regent Park,Ryerson,Garden Distri...",43.65426,-79.360636,Body Blitz Spa East,43.654735,-79.359874,Spa
4,"Harbourfront,Regent Park,Ryerson,Garden Distri...",43.65426,-79.360636,Morning Glory Cafe,43.653947,-79.361149,Breakfast Spot


In [98]:
toronto_venues.groupby('Neighborhood').count()

Unnamed: 0_level_0,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
Neighborhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
"Dovercourt Village,Dufferin,Little Portugal,Trinity,Brockton,Exhibition Place,Parkdale Village,High Park,The Junction South,Parkdale,Roncesvalles,Runnymede,Swansea",21,21,21,21,21,21
"Harbourfront,Regent Park,Ryerson,Garden District,St. James Town,Berczy Park,Central Bay Street,Christie,Adelaide,King,Richmond,Harbourfront East,Toronto Islands,Union Station,Design Exchange,Toronto Dominion Centre,Commerce Court,Victoria Hotel,Harbord,University of Toronto,Chinatown,Grange Park,Kensington Market,CN Tower,Bathurst Quay,Island airport,Harbourfront West,King and Spadina,Railway Lands,South Niagara,Rosedale,Stn A PO Boxes 25 The Esplanade,Cabbagetown,St. James Town,First Canadian Place,Underground city,Church and Wellesley",50,50,50,50,50,50
"Lawrence Park,Roselawn,Davisville North,Forest Hill North,Forest Hill West,North Toronto West,The Annex,North Midtown,Yorkville,Davisville,Moore Park,Summerhill East,Deer Park,Forest Hill SE,Rathnelly,South Hill,Summerhill West",3,3,3,3,3,3
"The Beaches,The Danforth West,Riverdale,The Beaches West,India Bazaar,Studio District,Business Reply Mail Processing Centre 969 Eastern",5,5,5,5,5,5


In [99]:
print('There are {} uniques categories.'.format(len(toronto_venues['Venue Category'].unique())))

There are 48 uniques categories.


## Analyze Each Neighborhood

In [101]:
# one hot encoding
toronto_onehot = pd.get_dummies(toronto_venues[['Venue Category']], prefix="", prefix_sep="")

# add neighborhood column back to dataframe
toronto_onehot['Neighborhood'] = toronto_venues['Neighborhood'] 

# move neighborhood column to the first column
fixed_columns = [toronto_onehot.columns[-1]] + list(toronto_onehot.columns[:-1])
toronto_onehot = toronto_onehot[fixed_columns]

toronto_onehot.head()

Unnamed: 0,Yoga Studio,Antique Shop,Art Gallery,Bakery,Bank,Bar,Beer Store,Brazilian Restaurant,Breakfast Spot,Brewery,...,Pizza Place,Portuguese Restaurant,Pub,Restaurant,Shoe Store,Spa,Supermarket,Swim School,Theater,Trail
0,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
4,0,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0


In [102]:
toronto_grouped = toronto_onehot.groupby('Neighborhood').mean().reset_index()
toronto_grouped

Unnamed: 0,Neighborhood,Yoga Studio,Antique Shop,Art Gallery,Bakery,Bank,Bar,Beer Store,Brazilian Restaurant,Breakfast Spot,...,Pizza Place,Portuguese Restaurant,Pub,Restaurant,Shoe Store,Spa,Supermarket,Swim School,Theater,Trail
0,"Dovercourt Village,Dufferin,Little Portugal,Tr...",0.0,0.0,0.0,0.095238,0.047619,0.047619,0.0,0.047619,0.0,...,0.047619,0.047619,0.0,0.0,0.0,0.0,0.095238,0.0,0.0,0.0
1,"Harbourfront,Regent Park,Ryerson,Garden Distri...",0.02,0.02,0.02,0.06,0.02,0.0,0.02,0.0,0.04,...,0.0,0.0,0.04,0.04,0.02,0.02,0.0,0.0,0.04,0.0
2,"Lawrence Park,Roselawn,Davisville North,Forest...",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.333333,0.0,0.0
3,"The Beaches,The Danforth West,Riverdale,The Be...",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.2,0.0,0.0,0.0,0.0,0.0,0.0,0.2


In [104]:
toronto_grouped.shape

(4, 48)

In [105]:
num_top_venues = 5

for hood in toronto_grouped['Neighborhood']:
    print("----"+hood+"----")
    temp = toronto_grouped[toronto_grouped['Neighborhood'] == hood].T.reset_index()
    temp.columns = ['venue','freq']
    temp = temp.iloc[1:]
    temp['freq'] = temp['freq'].astype(float)
    temp = temp.round({'freq': 2})
    print(temp.sort_values('freq', ascending=False).reset_index(drop=True).head(num_top_venues))
    print('\n')

----Dovercourt Village,Dufferin,Little Portugal,Trinity,Brockton,Exhibition Place,Parkdale Village,High Park,The Junction South,Parkdale,Roncesvalles,Runnymede,Swansea----
                  venue  freq
0                Bakery  0.10
1              Pharmacy  0.10
2           Supermarket  0.10
3  Gym / Fitness Center  0.05
4                  Café  0.05


----Harbourfront,Regent Park,Ryerson,Garden District,St. James Town,Berczy Park,Central Bay Street,Christie,Adelaide,King,Richmond,Harbourfront East,Toronto Islands,Union Station,Design Exchange,Toronto Dominion Centre,Commerce Court,Victoria Hotel,Harbord,University of Toronto,Chinatown,Grange Park,Kensington Market,CN Tower,Bathurst Quay,Island airport,Harbourfront West,King and Spadina,Railway Lands,South Niagara,Rosedale,Stn A PO Boxes 25 The Esplanade,Cabbagetown,St. James Town,First Canadian Place,Underground city,Church and Wellesley----
         venue  freq
0  Coffee Shop  0.18
1         Park  0.06
2       Bakery  0.06
3          

In [106]:
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

In [107]:
num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighborhood']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighborhood'] = toronto_grouped['Neighborhood']

for ind in np.arange(toronto_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(toronto_grouped.iloc[ind, :], num_top_venues)

neighborhoods_venues_sorted.head()

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,"Dovercourt Village,Dufferin,Little Portugal,Tr...",Supermarket,Bakery,Pharmacy,Gym / Fitness Center,Park,Fast Food Restaurant,Café,Bus Stop,Brewery,Liquor Store
1,"Harbourfront,Regent Park,Ryerson,Garden Distri...",Coffee Shop,Bakery,Park,Gym / Fitness Center,Breakfast Spot,Mexican Restaurant,Theater,Café,Pub,Restaurant
2,"Lawrence Park,Roselawn,Davisville North,Forest...",Park,Bus Line,Swim School,Event Space,Electronics Store,Discount Store,Dessert Shop,Cosmetics Shop,Coffee Shop,Chocolate Shop
3,"The Beaches,The Danforth West,Riverdale,The Be...",Trail,Other Great Outdoors,Health Food Store,Pub,Event Space,Electronics Store,Discount Store,Dessert Shop,Cosmetics Shop,Coffee Shop


## Cluster Neighborhoods

In [109]:
# set number of clusters
kclusters = 4

toronto_grouped_clustering = toronto_grouped.drop('Neighborhood', 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(toronto_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10] 

array([0, 3, 1, 2], dtype=int32)

In [110]:
# add clustering labels
neighborhoods_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

toronto_merged = toronto_data

# merge toronto_grouped with toronto_data to add latitude/longitude for each neighborhood
toronto_merged = toronto_merged.join(neighborhoods_venues_sorted.set_index('Neighborhood'), on='Neighborhood')

toronto_merged.head() # check the last columns!

Unnamed: 0,Borough,Neighborhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
4,Downtown Toronto,"Harbourfront,Regent Park,Ryerson,Garden Distri...",43.6543,-79.3606,3,Coffee Shop,Bakery,Park,Gym / Fitness Center,Breakfast Spot,Mexican Restaurant,Theater,Café,Pub,Restaurant
47,East Toronto,"The Beaches,The Danforth West,Riverdale,The Be...",43.6764,-79.293,2,Trail,Other Great Outdoors,Health Food Store,Pub,Event Space,Electronics Store,Discount Store,Dessert Shop,Cosmetics Shop,Coffee Shop
71,West Toronto,"Dovercourt Village,Dufferin,Little Portugal,Tr...",43.669,-79.4423,0,Supermarket,Bakery,Pharmacy,Gym / Fitness Center,Park,Fast Food Restaurant,Café,Bus Stop,Brewery,Liquor Store
144,Central Toronto,"Lawrence Park,Roselawn,Davisville North,Forest...",43.728,-79.3888,1,Park,Bus Line,Swim School,Event Space,Electronics Store,Discount Store,Dessert Shop,Cosmetics Shop,Coffee Shop,Chocolate Shop


In [111]:
# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(toronto_merged['Latitude'], toronto_merged['Longitude'], toronto_merged['Neighborhood'], toronto_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters

In [112]:
toronto_merged.loc[toronto_merged['Cluster Labels'] == 0, toronto_merged.columns[[1] + list(range(5, toronto_merged.shape[1]))]]

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
71,"Dovercourt Village,Dufferin,Little Portugal,Tr...",Supermarket,Bakery,Pharmacy,Gym / Fitness Center,Park,Fast Food Restaurant,Café,Bus Stop,Brewery,Liquor Store


In [113]:
toronto_merged.loc[toronto_merged['Cluster Labels'] == 1, toronto_merged.columns[[1] + list(range(5, toronto_merged.shape[1]))]]

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
144,"Lawrence Park,Roselawn,Davisville North,Forest...",Park,Bus Line,Swim School,Event Space,Electronics Store,Discount Store,Dessert Shop,Cosmetics Shop,Coffee Shop,Chocolate Shop


In [114]:
toronto_merged.loc[toronto_merged['Cluster Labels'] == 2, toronto_merged.columns[[1] + list(range(5, toronto_merged.shape[1]))]]

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
47,"The Beaches,The Danforth West,Riverdale,The Be...",Trail,Other Great Outdoors,Health Food Store,Pub,Event Space,Electronics Store,Discount Store,Dessert Shop,Cosmetics Shop,Coffee Shop


In [115]:
toronto_merged.loc[toronto_merged['Cluster Labels'] == 3, toronto_merged.columns[[1] + list(range(5, toronto_merged.shape[1]))]]

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
4,"Harbourfront,Regent Park,Ryerson,Garden Distri...",Coffee Shop,Bakery,Park,Gym / Fitness Center,Breakfast Spot,Mexican Restaurant,Theater,Café,Pub,Restaurant
