<h2>Project Capstone Assignment: Segmenting and Clustering Neighborhoods in Toronto</h2>

In assignment, required to explore, segment, and cluster the neighborhoods in the city of Toronto based on the postalcode and borough information.

In [32]:
# Import libraries
import pandas as pd
import numpy as np
import requests
from bs4 import BeautifulSoup
from IPython.display import display_html
from sklearn.cluster import KMeans

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

print("Imported!")

Imported!


<h2>Question 1</h2>
<p>Scraping Wikipedia page.</p>

In [2]:
# Webscraping
source = requests.get('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M').text

soup = BeautifulSoup(source, 'html5lib')

In [3]:
# View information
print(soup.prettify())

<!DOCTYPE html>
<html class="client-nojs" dir="ltr" lang="en">
 <head>
  <meta charset="utf-8"/>
  <title>
   List of postal codes of Canada: M - Wikipedia
  </title>
  <script>
   document.documentElement.className="client-js";RLCONF={"wgBreakFrames":!1,"wgSeparatorTransformTable":["",""],"wgDigitTransformTable":["",""],"wgDefaultDateFormat":"dmy","wgMonthNames":["","January","February","March","April","May","June","July","August","September","October","November","December"],"wgRequestId":"42e74c67-f679-4c90-8733-f925cbdc0afe","wgCSPNonce":!1,"wgCanonicalNamespace":"","wgCanonicalSpecialPageName":!1,"wgNamespaceNumber":0,"wgPageName":"List_of_postal_codes_of_Canada:_M","wgTitle":"List of postal codes of Canada: M","wgCurRevisionId":1032600019,"wgRevisionId":1032600019,"wgArticleId":539066,"wgIsArticle":!0,"wgIsRedirect":!1,"wgAction":"view","wgUserName":null,"wgUserGroups":["*"],"wgCategories":["Articles with short description","Short description is different from Wikidata","Communica

In [4]:
# Initialize an empty dictionary to save the data in
postal_codes_dict = {}

for table_cell in soup.find_all('td'):
    try:
        postal_code = table_cell.p.text # get the postal code
        postal_code_investigate = table_cell.span.text
        neighborhoods_data = table_cell.span.text # get the rest(รับส่วนที่เหลือ) of the data in the cell
        borough = neighborhoods_data.split('(')[0] # get the borough in the cell
        
        # if the cell is not assigned then ignore it
        if neighborhoods_data == 'Not assigned':
            neighborhoods = []
        # else process the data and add it to the dictionary
        else:
            postal_codes_dict[postal_code] = {}
            
            try:
                neighborhoods = neighborhoods_data.split('(')[1]
            
                # remove parantheses from neighborhoods string
                neighborhoods = neighborhoods.replace('(', ' ')
                neighborhoods = neighborhoods.replace(')', ' ')

                neighborhoods_names = neighborhoods.split('/')
                neighborhoods_clean = ', '.join([name.strip() for name in neighborhoods_names])
            except:
                borough = borough.strip('\n')
                neighborhoods_clean = borough
 
            # add borough and neighborhood to dictionary
            postal_codes_dict[postal_code]['borough'] = borough
            postal_codes_dict[postal_code]['neighborhoods'] = neighborhoods_clean
    except:
        pass
    
# Create an empty dataframe
columns = ['Postal Code', 'Borough', 'Neighborhood']
toronto_data = pd.DataFrame(columns=columns)
toronto_data

# Populate dataframe with data from dictionary
for ind, postal_code in enumerate(postal_codes_dict):
    borough = postal_codes_dict[postal_code]['borough']
    neighborhood = postal_codes_dict[postal_code]['neighborhoods']
    toronto_data = toronto_data.append({"Postal Code": postal_code[0:3], # show Postal code in 3-character
                                        "Borough": borough, 
                                        "Neighborhood": neighborhood},
                                        ignore_index=True)

# Print number of rows,column of dataframe
print(toronto_data.shape)

# Print number of rows of dataframe
print(toronto_data.shape[0])

(103, 3)
103


In [5]:
toronto_data.head(10)

Unnamed: 0,Postal Code,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Queen's Park,Ontario Provincial Government
5,M9A,Etobicoke,Islington Avenue
6,M1B,Scarborough,"Malvern, Rouge"
7,M3B,North York,Don Mills North
8,M4B,East York,"Parkview Hill, Woodbine Gardens"
9,M5B,Downtown Toronto,"Garden District, Ryerson"


<h2>Question 2</h2>
<p>Get the latitude and the longitude coordinates of each neighborhood. </p>

In [6]:
# Import geocoder
import geocoder

# Convert an address into latitude and longitude values
from geopy.geocoders import Nominatim

# Map rendering library
import folium

In [7]:
# Importing the CSV file from the URL
geo_spatial = pd.read_csv("https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/IBMDeveloperSkillsNetwork-DS0701EN-SkillsNetwork/labs_v1/Geospatial_Coordinates.csv")

# data = pd.read_csv("https://cocl.us/Geospatial_data") --another link

# Print number of rows,column of dataframe
print("Rows and column of dataframe is:", geo_spatial.shape)

geo_spatial.head()

Rows and column of dataframe is: (103, 3)


Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


<b>Checking the column types of both the dataframes, especially Postal Code column since we are trying to join on it.</b>

In [8]:
toronto_data.dtypes

Postal Code     object
Borough         object
Neighborhood    object
dtype: object

In [9]:
geo_spatial.dtypes

Postal Code     object
Latitude       float64
Longitude      float64
dtype: object

<b>Since the dimensions are the same, we can try to join on the postal codes to get the required data.</b>

In [10]:
# Combined data
combined_data = toronto_data.join(geo_spatial.set_index('Postal Code'), on='Postal Code', how='inner')

# Print number of rows,column of combined_data
print("Rows and column of combined data is:", combined_data.shape)

combined_data.head(10)

Rows and column of combined data is: (103, 5)


Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
4,M7A,Queen's Park,Ontario Provincial Government,43.662301,-79.389494
5,M9A,Etobicoke,Islington Avenue,43.667856,-79.532242
6,M1B,Scarborough,"Malvern, Rouge",43.806686,-79.194353
7,M3B,North York,Don Mills North,43.745906,-79.352188
8,M4B,East York,"Parkview Hill, Woodbine Gardens",43.706397,-79.309937
9,M5B,Downtown Toronto,"Garden District, Ryerson",43.657162,-79.378937


In [11]:
# Find Postal Code = M5G
combined_data[combined_data['Postal Code'] == 'M5G']

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude
24,M5G,Downtown Toronto,Central Bay Street,43.657952,-79.387383


<h2>Question 3</h2>
<p>Explore and cluster the neighborhoods in Toronto.</p>

<b>Use geopy library to get the latitude and longitude values of Toronto.</b>

In order to define an instance of the geocoder, we need to define a user_agent. We will name our agent <em>tr_explorer</em>, as shown below.

In [12]:
address = 'Toronto, Ontario'

geolocator = Nominatim(user_agent='tr_explorer')
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude

print('The geograpical coordinate of Toronto are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Toronto are 43.6534817, -79.3839347.


<b>Create a map of Toronto with neighborhoods superimposed on top.</b>

In [13]:
# Create map of Toronto using latitude and longitude values
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=10)

# Add markers
for lat, lng, borough, neighborhood in zip(combined_data['Latitude'], combined_data['Longitude'], combined_data['Borough'], combined_data['Neighborhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue', # color line of the circle
        fill=True,
        fill_color='#3186cc', # color fill in the circle
        fill_opacity=0.7,
        parse_html=False).add_to(map_toronto)
    
map_toronto

<b>Work with only boroughs that contain the word Toronto.</b>

In [14]:
# Find boroughs that contain the word Toronto
toronto_data = combined_data[combined_data['Borough'].str.contains('Toronto')].reset_index(drop=True) # ให้นับ index ใหม่

print("Rows and column of toronto data is:", toronto_data.shape)

toronto_data.head()

Rows and column of toronto data is: (39, 5)


Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude
0,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
1,M5B,Downtown Toronto,"Garden District, Ryerson",43.657162,-79.378937
2,M5C,Downtown Toronto,St. James Town,43.651494,-79.375418
3,M4E,East Toronto,The Beaches,43.676357,-79.293031
4,M5E,Downtown Toronto,Berczy Park,43.644771,-79.373306


<b>Create a map of boroughs that contain the word Toronto.</b>

In [15]:
# Create map of Toronto using latitude and longitude values
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=11)

# Add markers
for lat, lng, borough, neighborhood in zip(toronto_data['Latitude'], toronto_data['Longitude'], toronto_data['Borough'],toronto_data['Neighborhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue', # color line of the circle
        fill=True,
        fill_color='#3186cc', # color fill in the circle
        fill_opacity=0.7,
        parse_html=False).add_to(map_toronto)  

map_toronto

Next, we are going to start utilizing the Foursquare API to explore the neighborhoods and segment them.

<b>Define Foursquare Credentials and Version.</b>

In [16]:
CLIENT_ID = 'YHQ1QKGPYP2R0CWK1IWMUKFJOG0PRFXTGBTCTQ5KMDIHSIMN' # your Foursquare ID
CLIENT_SECRET = 'UXI2VPFT3TAQSSSWVOELV3XDLCQC4IFI05JUXCDIYU5PGT0Z' # your Foursquare Secret
VERSION = '20210927' # Foursquare API version (update)
LIMIT = 100 # A default Foursquare API limit value
radius = 500 # Define radius

print('Your credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET:' + CLIENT_SECRET)

Your credentails:
CLIENT_ID: YHQ1QKGPYP2R0CWK1IWMUKFJOG0PRFXTGBTCTQ5KMDIHSIMN
CLIENT_SECRET:UXI2VPFT3TAQSSSWVOELV3XDLCQC4IFI05JUXCDIYU5PGT0Z


<h2><b>1. Exploring Neighbourhood in Toronto.</b></h2>

Now, let's get the top 100 venues that are in Toronto within a radius of 500 meters.

In [17]:
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    
    venues_list = []
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
        
        # Create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID,
            CLIENT_SECRET,
            VERSION,
            lat,
            lng,
            radius,
            LIMIT)
        
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for ech nearby venue
        venues_list.append([(
            name,
            lat,
            lng,
            v['venue']['name'],
            v['venue']['location']['lat'],
            v['venue']['location']['lng'],
            v['venue']['categories'][0]['name']) for v in results])
        
    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

<b>Now write the code to run the above function on each neighborhood and create a new dataframe called toronto_venues.</b>

In [18]:
# toronto_data: boroughs that contain the word Toronto
df = toronto_data

# Use function getNearbyVenues
toronto_venues = getNearbyVenues(names=df['Neighborhood'],
                                 latitudes=df['Latitude'],
                                 longitudes=df['Longitude']
                                )

Regent Park, Harbourfront
Garden District, Ryerson
St. James Town
The Beaches
Berczy Park
Central Bay Street
Christie
Richmond, Adelaide, King
Dufferin, Dovercourt Village
The Danforth  East
Harbourfront East, Union Station, Toronto Islands
Little Portugal, Trinity
The Danforth West, Riverdale
Toronto Dominion Centre, Design Exchange
Brockton, Parkdale Village, Exhibition Place
India Bazaar, The Beaches West
Commerce Court, Victoria Hotel
Studio District
Lawrence Park
Roselawn
Davisville North
Forest Hill North & West
High Park, The Junction South
North Toronto West
The Annex, North Midtown, Yorkville
Parkdale, Roncesvalles
Davisville
University of Toronto, Harbord
Runnymede, Swansea
Moore Park, Summerhill East
Kensington Market, Chinatown, Grange Park
Summerhill West, Rathnelly, South Hill, Forest Hill SE, Deer Park
CN Tower, King and Spadina, Railway Lands, Harbourfront West, Bathurst Quay, South Niagara, Island airport
Rosedale
Enclave of M5E
St. James Town, Cabbagetown
First Canadi

<b>Let's check the size of the resulting dataframe</b>

In [19]:
print(toronto_venues.shape)

toronto_venues.head()

(1633, 7)


Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,"Regent Park, Harbourfront",43.65426,-79.360636,Roselle Desserts,43.653447,-79.362017,Bakery
1,"Regent Park, Harbourfront",43.65426,-79.360636,Tandem Coffee,43.653559,-79.361809,Coffee Shop
2,"Regent Park, Harbourfront",43.65426,-79.360636,Cooper Koo Family YMCA,43.653249,-79.358008,Distribution Center
3,"Regent Park, Harbourfront",43.65426,-79.360636,Body Blitz Spa East,43.654735,-79.359874,Spa
4,"Regent Park, Harbourfront",43.65426,-79.360636,The Extension Room,43.653313,-79.359725,Gym / Fitness Center


Let's check how many venues were returned for each neighborhood.

In [20]:
toronto_venues.groupby('Neighborhood').count()

Unnamed: 0_level_0,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
Neighborhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Berczy Park,59,59,59,59,59,59
"Brockton, Parkdale Village, Exhibition Place",24,24,24,24,24,24
"CN Tower, King and Spadina, Railway Lands, Harbourfront West, Bathurst Quay, South Niagara, Island airport",18,18,18,18,18,18
Central Bay Street,69,69,69,69,69,69
Christie,15,15,15,15,15,15
Church and Wellesley,80,80,80,80,80,80
"Commerce Court, Victoria Hotel",100,100,100,100,100,100
Davisville,35,35,35,35,35,35
Davisville North,9,9,9,9,9,9
"Dufferin, Dovercourt Village",15,15,15,15,15,15


<b>Let's find out how many unique categories can be curated from all the returned venues.</b>

In [21]:
print('There are {} uniques categories.'.format(len(toronto_venues['Venue Category'].unique())))

There are 234 uniques categories.


<h2><b>2. Analyze Each Neighborhood.</b></h2>

In [22]:
# One hot encoding
toronto_onehot = pd.get_dummies(toronto_venues[['Venue Category']], prefix="", prefix_sep="")

# Add neighborhood column back to dataframe
toronto_onehot['Neighborhood'] = toronto_venues['Neighborhood']

# Move neighborhood column to the first column
fixed_columns = [toronto_onehot.columns[-1]] + list(toronto_onehot.columns[:-1])
toronto_onehot = toronto_onehot[fixed_columns]

toronto_onehot.head()

Unnamed: 0,Yoga Studio,Afghan Restaurant,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,Antique Shop,...,Theater,Theme Restaurant,Tibetan Restaurant,Toy / Game Store,Trail,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Vietnamese Restaurant,Wine Bar
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


And let's examine the new dataframe size.

In [23]:
toronto_onehot.shape

(1633, 234)

<b>Next, let's group rows by neighborhood and by taking the mean of the frequency of occurrence of each category.</b>

In [24]:
toronto_grouped = toronto_onehot.groupby('Neighborhood').mean().reset_index()

toronto_grouped

Unnamed: 0,Neighborhood,Yoga Studio,Afghan Restaurant,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,...,Theater,Theme Restaurant,Tibetan Restaurant,Toy / Game Store,Trail,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Vietnamese Restaurant,Wine Bar
0,Berczy Park,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.016949,0.0,0.0,0.0
1,"Brockton, Parkdale Village, Exhibition Place",0.083333,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,"CN Tower, King and Spadina, Railway Lands, Har...",0.0,0.0,0.055556,0.055556,0.055556,0.111111,0.166667,0.111111,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,Central Bay Street,0.014493,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.014493,0.0,0.0,0.0
4,Christie,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,Church and Wellesley,0.025,0.0125,0.0,0.0,0.0,0.0,0.0,0.0,0.0125,...,0.0125,0.0125,0.0,0.0,0.0,0.0,0.0,0.0,0.0125,0.0
6,"Commerce Court, Victoria Hotel",0.01,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.03,...,0.0,0.0,0.0,0.0,0.0,0.0,0.03,0.0,0.0,0.01
7,Davisville,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.028571,0.0,0.0,0.0,0.0,0.0,0.0
8,Davisville North,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,"Dufferin, Dovercourt Village",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


<b>Let's confirm the new size.</b>

In [25]:
toronto_grouped.shape

(39, 234)

<b>Let's print each neighborhood along with the top 5 most common venues.</b>

In [26]:
num_top_venues = 5

for hood in toronto_grouped['Neighborhood']:
    print("----"+hood+"----")
    temp = toronto_grouped[toronto_grouped['Neighborhood'] == hood].T.reset_index()
    temp.columns = ['venue', 'freq']
    temp = temp.iloc[1:]
    temp['freq'] = temp['freq'].astype(float)
    temp = temp.round({'freq':2})
    print(temp.sort_values('freq', ascending=False).reset_index(drop=True).head(num_top_venues))
    print('\n')

----Berczy Park----
                venue  freq
0         Coffee Shop  0.07
1            Pharmacy  0.03
2          Restaurant  0.03
3         Cheese Shop  0.03
4  Seafood Restaurant  0.03


----Brockton, Parkdale Village, Exhibition Place----
            venue  freq
0            Café  0.12
1     Yoga Studio  0.08
2  Breakfast Spot  0.08
3     Coffee Shop  0.08
4          Bakery  0.04


----CN Tower, King and Spadina, Railway Lands, Harbourfront West, Bathurst Quay, South Niagara, Island airport----
              venue  freq
0   Airport Service  0.17
1    Airport Lounge  0.11
2  Airport Terminal  0.11
3             Plane  0.06
4     Boat or Ferry  0.06


----Central Bay Street----
              venue  freq
0       Coffee Shop  0.20
1    Sandwich Place  0.06
2              Café  0.04
3   Bubble Tea Shop  0.03
4  Department Store  0.03


----Christie----
           venue  freq
0  Grocery Store  0.27
1           Café  0.20
2    Coffee Shop  0.13
3           Park  0.13
4     Baby Store  0.0

<b>Let's put that into a pandas dataframe.</b>

First, let's write a function to sort the venues in descending order.

In [27]:
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

<b>Now let's create the new dataframe and display the top 10 venues for each neighborhood.</b>

In [28]:
num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# Create columns according to number of top venues
columns = ['Neighborhood']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venues'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venues'.format(ind+1))
        
# Create a new dataframe
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighborhood'] = toronto_grouped['Neighborhood']

for ind in np.arange(toronto_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(toronto_grouped.iloc[ind, :], num_top_venues)
    
print(neighborhoods_venues_sorted.shape)

neighborhoods_venues_sorted.head()

(39, 11)


Unnamed: 0,Neighborhood,1st Most Common Venues,2nd Most Common Venues,3rd Most Common Venues,4th Most Common Venues,5th Most Common Venues,6th Most Common Venues,7th Most Common Venues,8th Most Common Venues,9th Most Common Venues,10th Most Common Venues
0,Berczy Park,Coffee Shop,Pharmacy,Restaurant,Cheese Shop,Seafood Restaurant,Farmers Market,Cocktail Bar,Bakery,Beer Bar,Pub
1,"Brockton, Parkdale Village, Exhibition Place",Café,Yoga Studio,Breakfast Spot,Coffee Shop,Bakery,Burrito Place,Climbing Gym,Convenience Store,Furniture / Home Store,Grocery Store
2,"CN Tower, King and Spadina, Railway Lands, Har...",Airport Service,Airport Lounge,Airport Terminal,Plane,Boat or Ferry,Airport,Airport Food Court,Airport Gate,Coffee Shop,Harbor / Marina
3,Central Bay Street,Coffee Shop,Sandwich Place,Café,Bubble Tea Shop,Department Store,Restaurant,Thai Restaurant,Salad Place,Burger Joint,Japanese Restaurant
4,Christie,Grocery Store,Café,Coffee Shop,Park,Baby Store,Nightclub,Restaurant,Candy Store,Modern European Restaurant,Museum


<h2><b>4. Cluster Neighborhoods</b></h2>

Run k-means to cluster the neighborhood into 5 clusters.

In [29]:
# Set number of clusters
kclusters = 5

toronto_grouped_clustering = toronto_grouped.drop('Neighborhood', 1)

# Run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(toronto_grouped_clustering)

# Check cluster labels generated for each row in the dataframe
kmeans.labels_[0:50] 

  toronto_grouped_clustering = toronto_grouped.drop('Neighborhood', 1)


array([3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 3, 3, 3, 3, 3, 0, 3, 1,
       3, 3, 3, 3, 0, 2, 3, 3, 3, 3, 3, 3, 0, 0, 3, 3, 3], dtype=int32)

Let's create a new dataframe that includes the cluster as well as the top 10 venues for each neighborhood.

In [30]:
# add clustering labels
neighborhoods_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

toronto_merged = toronto_data

# merge manhattan_grouped with manhattan_data to add latitude/longitude for each neighborhood
toronto_merged = toronto_merged.join(neighborhoods_venues_sorted.set_index('Neighborhood'), on='Neighborhood')

toronto_merged.head() # check the last columns!

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude,Cluster Labels,1st Most Common Venues,2nd Most Common Venues,3rd Most Common Venues,4th Most Common Venues,5th Most Common Venues,6th Most Common Venues,7th Most Common Venues,8th Most Common Venues,9th Most Common Venues,10th Most Common Venues
0,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636,3,Coffee Shop,Park,Café,Pub,Bakery,Breakfast Spot,Theater,Restaurant,French Restaurant,Spa
1,M5B,Downtown Toronto,"Garden District, Ryerson",43.657162,-79.378937,3,Clothing Store,Coffee Shop,Cosmetics Shop,Japanese Restaurant,Café,Bubble Tea Shop,Middle Eastern Restaurant,Burger Joint,Ramen Restaurant,Fast Food Restaurant
2,M5C,Downtown Toronto,St. James Town,43.651494,-79.375418,3,Coffee Shop,Restaurant,Clothing Store,Cocktail Bar,Café,Hotel,Cosmetics Shop,Bakery,Italian Restaurant,Gym
3,M4E,East Toronto,The Beaches,43.676357,-79.293031,0,Health Food Store,Trail,Pub,Yoga Studio,Museum,Martial Arts School,Mediterranean Restaurant,Men's Store,Mexican Restaurant,Middle Eastern Restaurant
4,M5E,Downtown Toronto,Berczy Park,43.644771,-79.373306,3,Coffee Shop,Pharmacy,Restaurant,Cheese Shop,Seafood Restaurant,Farmers Market,Cocktail Bar,Bakery,Beer Bar,Pub


Finally, let's visualize the resulting clusters.

In [33]:
# Create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# Set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# Add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(toronto_merged['Latitude'], toronto_merged['Longitude'], toronto_merged['Neighborhood'], toronto_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)

map_clusters

<h2><b>5. Examine Clusters</b></h2>

Now, you can examine each cluster and determine the discriminating venue categories that distinguish each cluster. Based on the defining categories, you can then assign a name to each cluster. I will leave this exercise to you.

<b>Cluster 1</b>

In [35]:
toronto_merged.loc[toronto_merged['Cluster Labels'] == 0, toronto_merged.columns[[1] + list(range(5, toronto_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venues,2nd Most Common Venues,3rd Most Common Venues,4th Most Common Venues,5th Most Common Venues,6th Most Common Venues,7th Most Common Venues,8th Most Common Venues,9th Most Common Venues,10th Most Common Venues
3,East Toronto,0,Health Food Store,Trail,Pub,Yoga Studio,Museum,Martial Arts School,Mediterranean Restaurant,Men's Store,Mexican Restaurant,Middle Eastern Restaurant
9,East YorkEast Toronto,0,Convenience Store,Park,Coffee Shop,Yoga Studio,Music Venue,Mediterranean Restaurant,Men's Store,Mexican Restaurant,Middle Eastern Restaurant,Miscellaneous Shop
18,Central Toronto,0,Park,Bus Line,Dim Sum Restaurant,Swim School,Yoga Studio,Movie Theater,Martial Arts School,Mediterranean Restaurant,Men's Store,Mexican Restaurant
33,Downtown Toronto,0,Park,Playground,Trail,Yoga Studio,Movie Theater,Market,Martial Arts School,Mediterranean Restaurant,Men's Store,Mexican Restaurant


<b>Cluster 2</b>

In [36]:
toronto_merged.loc[toronto_merged['Cluster Labels'] == 1, toronto_merged.columns[[1] + list(range(5, toronto_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venues,2nd Most Common Venues,3rd Most Common Venues,4th Most Common Venues,5th Most Common Venues,6th Most Common Venues,7th Most Common Venues,8th Most Common Venues,9th Most Common Venues,10th Most Common Venues
29,Central Toronto,1,Tennis Court,Yoga Studio,Museum,Market,Martial Arts School,Mediterranean Restaurant,Men's Store,Mexican Restaurant,Middle Eastern Restaurant,Miscellaneous Shop


<b>Cluster 3</b>

In [38]:
toronto_merged.loc[toronto_merged['Cluster Labels'] == 2, toronto_merged.columns[[1] + list(range(5, toronto_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venues,2nd Most Common Venues,3rd Most Common Venues,4th Most Common Venues,5th Most Common Venues,6th Most Common Venues,7th Most Common Venues,8th Most Common Venues,9th Most Common Venues,10th Most Common Venues
19,Central Toronto,2,Fast Food Restaurant,Garden,Ice Cream Shop,Home Service,Miscellaneous Shop,Moroccan Restaurant,Monument / Landmark,Molecular Gastronomy Restaurant,Modern European Restaurant,Middle Eastern Restaurant


<b>Cluster 4</b>

In [39]:
toronto_merged.loc[toronto_merged['Cluster Labels'] == 3, toronto_merged.columns[[1] + list(range(5, toronto_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venues,2nd Most Common Venues,3rd Most Common Venues,4th Most Common Venues,5th Most Common Venues,6th Most Common Venues,7th Most Common Venues,8th Most Common Venues,9th Most Common Venues,10th Most Common Venues
0,Downtown Toronto,3,Coffee Shop,Park,Café,Pub,Bakery,Breakfast Spot,Theater,Restaurant,French Restaurant,Spa
1,Downtown Toronto,3,Clothing Store,Coffee Shop,Cosmetics Shop,Japanese Restaurant,Café,Bubble Tea Shop,Middle Eastern Restaurant,Burger Joint,Ramen Restaurant,Fast Food Restaurant
2,Downtown Toronto,3,Coffee Shop,Restaurant,Clothing Store,Cocktail Bar,Café,Hotel,Cosmetics Shop,Bakery,Italian Restaurant,Gym
4,Downtown Toronto,3,Coffee Shop,Pharmacy,Restaurant,Cheese Shop,Seafood Restaurant,Farmers Market,Cocktail Bar,Bakery,Beer Bar,Pub
5,Downtown Toronto,3,Coffee Shop,Sandwich Place,Café,Bubble Tea Shop,Department Store,Restaurant,Thai Restaurant,Salad Place,Burger Joint,Japanese Restaurant
6,Downtown Toronto,3,Grocery Store,Café,Coffee Shop,Park,Baby Store,Nightclub,Restaurant,Candy Store,Modern European Restaurant,Museum
7,Downtown Toronto,3,Coffee Shop,Café,Hotel,Restaurant,Gym,Vegetarian / Vegan Restaurant,Thai Restaurant,Sushi Restaurant,Bakery,Bar
8,West Toronto,3,Pharmacy,Bakery,Pet Store,Music Venue,Middle Eastern Restaurant,Café,Supermarket,Bar,Bank,Brewery
10,Downtown Toronto,3,Coffee Shop,Aquarium,Café,Hotel,Restaurant,Brewery,Fried Chicken Joint,Scenic Lookout,Bar,Baseball Stadium
11,West Toronto,3,Bar,Café,Men's Store,Vietnamese Restaurant,Coffee Shop,Restaurant,Pizza Place,Bakery,Asian Restaurant,Ice Cream Shop


<b>Cluster 5</b>

In [40]:
toronto_merged.loc[toronto_merged['Cluster Labels'] == 4, toronto_merged.columns[[1] + list(range(5, toronto_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venues,2nd Most Common Venues,3rd Most Common Venues,4th Most Common Venues,5th Most Common Venues,6th Most Common Venues,7th Most Common Venues,8th Most Common Venues,9th Most Common Venues,10th Most Common Venues
21,Central Toronto,4,Trail,Mexican Restaurant,Jewelry Store,Sushi Restaurant,Yoga Studio,Movie Theater,Market,Martial Arts School,Mediterranean Restaurant,Men's Store
