---

## Segmenting and Clustering Neighborhoods in Toronto (Week 3 Assignment)

---



* All the three parts of the assignment are on this single notebook (marked by their respective titles)

## PART 1 (Data Scraping)

In [1]:
# Import libraries

import pandas as pd
from bs4 import BeautifulSoup
import requests

print("Libraries imported successfully")

Libraries imported successfully



#### Scrape the data using BeautifulSoup
---

In [2]:
# Get the xml from the source
url = "https://en.wikipedia.org/w/index.php?title=List_of_postal_codes_of_Canada:_M&oldid=890001695"
source = requests.get(url).text
soup = BeautifulSoup(source, 'xml')

In [3]:
# Extract the table from the Wikipedia page
table = soup.find('table',{'class':'wikitable sortable'})

In [4]:
# Fetch the data from the table
table_rows = table.find_all('tr')
data = []
for row in table_rows:
    td=[]
    for t in row.find_all('td'):
        td.append(t.text.strip())
    data.append(td)

In [5]:
# Put the data in a Dataframe
df = pd.DataFrame(data, columns=['PostalCode', 'Borough', 'Neighborhood'])
df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,,,
1,M1A,Not assigned,Not assigned
2,M2A,Not assigned,Not assigned
3,M3A,North York,Parkwoods
4,M4A,North York,Victoria Village


##### Data Cleaning
---

In [6]:
# Remove rows where Borough is "Not assigned"
df=df[df['Borough']!='Not assigned']

In [7]:
# If a cell has a borough but Neigborhood = "Not assigned" , then the neighborhood = borough.
df['Neighborhood'] = df.apply(
    lambda row: 
    row['Borough'] if row['Neighborhood'] == 'Not assigned' 
    else row['Neighborhood'],
    axis=1)

In [8]:
# More than one neighborhood can exist in one postal code area. .  
# The rows from the neighborhoods will be combined into one row with the neighborhoods separated with a comma.
df = df.groupby(['PostalCode', 'Borough'])['Neighborhood'].\
    apply(', '.join).to_frame()
df.reset_index(inplace=True)

In [9]:
df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


In [10]:
df.shape

(103, 3)

In [11]:
data=df

## PART 2 (Latitude and Longitude)

##### Getting the latitude and longitude information using Geocoder

In [12]:
!pip install geocoder 
import geocoder
print("Geocoder correctly imported")

Geocoder correctly imported


In [13]:
# Get Toronto Latitude and Longitudes from Geocoder
def get_latlng(postal_code):
    # initialize your variable to None
    lat_lng_coords = None
    # loop until you get the coordinates
    while(lat_lng_coords is None):
        g = geocoder.arcgis('{}, Toronto, Ontario'.format(postal_code))
        lat_lng_coords = g.latlng
    return lat_lng_coords

postal_codes = df['PostalCode']    
coords = [ get_latlng(postal_code) for postal_code in postal_codes.tolist() ]

##### Add the Toronto latitudes and logitudes to the Dataframe


In [14]:
df_coords = pd.DataFrame(coords, columns=['Latitude', 'Longitude'])
df['Latitude'] = df_coords['Latitude']
df['Longitude'] = df_coords['Longitude']
df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge, Malvern",43.811525,-79.195517
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union",43.785665,-79.158725
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.765815,-79.175193
3,M1G,Scarborough,Woburn,43.768369,-79.21759
4,M1H,Scarborough,Cedarbrae,43.769688,-79.23944


## PART 3 (Exploring and Clustering)

###### Download dependencies

In [15]:
import numpy as np

!pip install geopy 
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans

!pip install folium
import folium # map rendering library

import json
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe


print('Libraries imported')

Libraries imported


In [16]:
address = 'Toronto, ON'

geolocator = Nominatim(user_agent="Toronto")
location = geolocator.geocode(address)
lat_toronto = location.latitude
long_toronto = location.longitude
print('The geographical coordinates of Toronto are {}, {}.'.format(lat_toronto, long_toronto))

The geographical coordinates of Toronto are 43.653963, -79.387207.


Create a map of Toronto superimposed on top

In [17]:
# create map of Toronto using latitude and longitude values
map_toronto = folium.Map(location=[lat_toronto, long_toronto], zoom_start=10)

# add markers to map
for lat, lng, borough, neighborhood in zip(df['Latitude'], df['Longitude'], df['Borough'], df['Neighborhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_toronto)  
    
map_toronto

#### Define Foursquare Credentials and Version

In [18]:
CLIENT_ID = '3WPY3MFYVUUTVB0H4HNW2OP0OFXCKVSFUR24SB5ZTC2PSUGM' # your Foursquare ID
CLIENT_SECRET = 'NKG2GJSCFIXAW4ZZX5BX2RNTFMIHNN0MVP2ZZIFFRXXVVNZT' # your Foursquare Secret
VERSION = '20180605' # Foursquare API version

print('Your credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET:' + CLIENT_SECRET)

Your credentails:
CLIENT_ID: 3WPY3MFYVUUTVB0H4HNW2OP0OFXCKVSFUR24SB5ZTC2PSUGM
CLIENT_SECRET:NKG2GJSCFIXAW4ZZX5BX2RNTFMIHNN0MVP2ZZIFFRXXVVNZT


###### Explore Toronto top 20 within 500 meters from Toronto's given latitude and logitude (in Toronto's downtown core near Yonge-Dundas square)

In [19]:
LIMIT = 20 # limit of number of venues returned by Foursquare API

radius = 500 # define radius

# create URL
url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
    CLIENT_ID, 
    CLIENT_SECRET, 
    VERSION, 
    lat_toronto, 
    long_toronto, 
    radius, 
    LIMIT)
url # display URL

'https://api.foursquare.com/v2/venues/explore?&client_id=3WPY3MFYVUUTVB0H4HNW2OP0OFXCKVSFUR24SB5ZTC2PSUGM&client_secret=NKG2GJSCFIXAW4ZZX5BX2RNTFMIHNN0MVP2ZZIFFRXXVVNZT&v=20180605&ll=43.653963,-79.387207&radius=500&limit=20'

In [20]:
results = requests.get(url).json()
results

{'meta': {'code': 200, 'requestId': '5e7e7f8f29ce6a001c7cdff5'},
 'response': {'suggestedFilters': {'header': 'Tap to show:',
   'filters': [{'name': 'Open now', 'key': 'openNow'}]},
  'headerLocation': 'Bay Street Corridor',
  'headerFullLocation': 'Bay Street Corridor, Toronto',
  'headerLocationGranularity': 'neighborhood',
  'totalResults': 74,
  'suggestedBounds': {'ne': {'lat': 43.6584630045, 'lng': -79.38099903084075},
   'sw': {'lat': 43.649462995499995, 'lng': -79.39341496915925}},
  'groups': [{'type': 'Recommended Places',
    'name': 'recommended',
    'items': [{'reasons': {'count': 0,
       'items': [{'summary': 'This spot is popular',
         'type': 'general',
         'reasonName': 'globalInteractionReason'}]},
      'venue': {'id': '5227bb01498e17bf485e6202',
       'name': 'Downtown Toronto',
       'location': {'lat': 43.65323167517444,
        'lng': -79.38529600606677,
        'labeledLatLngs': [{'label': 'display',
          'lat': 43.65323167517444,
          

In [21]:
# function that extracts the category of the venue
def get_category_type(row):
    try:
        categories_list = row['categories']
    except:
        categories_list = row['venue.categories']
        
    if len(categories_list) == 0:
        return None
    else:
        return categories_list[0]['name']

In [22]:
venues = results['response']['groups'][0]['items']
    
nearby_venues = json_normalize(venues) # flatten JSON

# filter columns
filtered_columns = ['venue.name', 'venue.categories', 'venue.location.lat', 'venue.location.lng']
nearby_venues =nearby_venues.loc[:, filtered_columns]

# filter the category for each row
nearby_venues['venue.categories'] = nearby_venues.apply(get_category_type, axis=1)

# clean columns
nearby_venues.columns = [col.split(".")[-1] for col in nearby_venues.columns]


nearby_venues.set_index('name', inplace=True)


print("Top 20 Nearby Venues:")
nearby_venues.head(20)

Top 20 Nearby Venues:


Unnamed: 0_level_0,categories,lat,lng
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Downtown Toronto,Neighborhood,43.653232,-79.385296
Japango,Sushi Restaurant,43.655268,-79.385165
Poke Guys,Poke Place,43.654895,-79.385052
Rolltation,Japanese Restaurant,43.654918,-79.387424
Sansotei Ramen 三草亭,Ramen Restaurant,43.655157,-79.386501
Karine's,Breakfast Spot,43.653699,-79.390743
Manpuku まんぷく,Japanese Restaurant,43.653612,-79.390613
Fugo Desserts,Ice Cream Shop,43.654923,-79.387382
Chatime 日出茶太,Bubble Tea Shop,43.655542,-79.384684
The Library Specialty Coffee,Coffee Shop,43.654413,-79.390902


Now, this process is extended to find nearby venues for all the neighborhoods in our dataframe

In [23]:
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood', 
                  'Neighbourhood Latitude', 
                  'Neighbourhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

In [24]:
toronto_venues = getNearbyVenues(names=df['Neighborhood'],
                                   latitudes=df['Latitude'],
                                   longitudes=df['Longitude']
                                  )

Rouge, Malvern
Highland Creek, Rouge Hill, Port Union
Guildwood, Morningside, West Hill
Woburn
Cedarbrae
Scarborough Village
East Birchmount Park, Ionview, Kennedy Park
Clairlea, Golden Mile, Oakridge
Cliffcrest, Cliffside, Scarborough Village West
Birch Cliff, Cliffside West
Dorset Park, Scarborough Town Centre, Wexford Heights
Maryvale, Wexford
Agincourt
Clarks Corners, Sullivan, Tam O'Shanter
Agincourt North, L'Amoreaux East, Milliken, Steeles East
L'Amoreaux West
Upper Rouge
Hillcrest Village
Fairview, Henry Farm, Oriole
Bayview Village
Silver Hills, York Mills
Newtonbrook, Willowdale
Willowdale South
York Mills West
Willowdale West
Parkwoods
Don Mills North
Flemingdon Park, Don Mills South
Bathurst Manor, Downsview North, Wilson Heights
Northwood Park, York University
CFB Toronto, Downsview East
Downsview West
Downsview Central
Downsview Northwest
Victoria Village
Woodbine Gardens, Parkview Hill
Woodbine Heights
The Beaches
Leaside
Thorncliffe Park
East Toronto
The Danforth West, 

In [25]:
toronto_venues.head()

Unnamed: 0,Neighborhood,Neighbourhood Latitude,Neighbourhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,"Rouge, Malvern",43.811525,-79.195517,Wood Bison Paddock,43.811732,-79.200708,Zoo Exhibit
1,"Highland Creek, Rouge Hill, Port Union",43.785665,-79.158725,Royal Canadian Legion,43.782533,-79.163085,Bar
2,"Guildwood, Morningside, West Hill",43.765815,-79.175193,Homestead Roofing Repair,43.76514,-79.178663,Construction & Landscaping
3,"Guildwood, Morningside, West Hill",43.765815,-79.175193,Heron Park Community Centre,43.768867,-79.176958,Gym / Fitness Center
4,"Guildwood, Morningside, West Hill",43.765815,-79.175193,Heron Park,43.769327,-79.177201,Park


In [26]:
toronto_venues.shape

(1071, 7)

Let's check how many venues were returned for each neighborhood

In [27]:
toronto_venues.groupby('Neighborhood').count()

Unnamed: 0_level_0,Neighbourhood Latitude,Neighbourhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
Neighborhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
"Adelaide, King, Richmond",20,20,20,20,20,20
Agincourt,15,15,15,15,15,15
"Agincourt North, L'Amoreaux East, Milliken, Steeles East",2,2,2,2,2,2
"Albion Gardens, Beaumond Heights, Humbergate, Jamestown, Mount Olive, Silverstone, South Steeles, Thistletown",15,15,15,15,15,15
"Alderwood, Long Branch",4,4,4,4,4,4
Bayview Village,3,3,3,3,3,3
"Bedford Park, Lawrence Manor East",20,20,20,20,20,20
Berczy Park,20,20,20,20,20,20
"Birch Cliff, Cliffside West",6,6,6,6,6,6
"Bloordale Gardens, Eringate, Markland Wood, Old Burnhamthorpe",6,6,6,6,6,6


#### Analyze Each Neighborhood

In [28]:
# one hot encoding
toronto_onehot = pd.get_dummies(toronto_venues[['Venue Category']], prefix="", prefix_sep="")

# add neighborhood column back to dataframe
toronto_onehot['Neighborhood'] = toronto_venues['Neighborhood'] 

# move neighborhood column to the first column
fixed_columns = [toronto_onehot.columns[-1]] + list(toronto_onehot.columns[:-1])
toronto_onehot = toronto_onehot[fixed_columns]

toronto_onehot.head()

Unnamed: 0,Zoo Exhibit,Airport,American Restaurant,Antique Shop,Art Gallery,Arts & Crafts Store,Asian Restaurant,Athletics & Sports,Auto Dealership,Auto Garage,...,Toy / Game Store,Trail,Train Station,Vegetarian / Vegan Restaurant,Video Store,Vietnamese Restaurant,Wine Bar,Wine Shop,Women's Store,Yoga Studio
0,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [29]:
toronto_onehot.shape

(1071, 201)

Next, let's group rows by neighborhood and by taking the mean of the frequency of occurrence of each category¶


In [30]:
toronto_grouped = toronto_onehot.groupby('Neighborhood').mean().reset_index()
toronto_grouped.head()

Unnamed: 0,Neighborhood,Zoo Exhibit,Airport,American Restaurant,Antique Shop,Art Gallery,Arts & Crafts Store,Asian Restaurant,Athletics & Sports,Auto Dealership,...,Toy / Game Store,Trail,Train Station,Vegetarian / Vegan Restaurant,Video Store,Vietnamese Restaurant,Wine Bar,Wine Shop,Women's Store,Yoga Studio
0,"Adelaide, King, Richmond",0.0,0.0,0.05,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.05,0.0,0.0,0.0,0.0,0.0,0.0
1,Agincourt,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.066667,0.0,0.0,0.0,0.0
2,"Agincourt North, L'Amoreaux East, Milliken, St...",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,"Albion Gardens, Beaumond Heights, Humbergate, ...",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.066667,0.0,0.0,0.0,0.0,0.0
4,"Alderwood, Long Branch",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


Let's print each neighborhood along with the top 5 most common venues¶


In [31]:

num_top_venues = 5

for hood in toronto_grouped['Neighborhood']:
    print("----"+hood+"----")
    temp = toronto_grouped[toronto_grouped['Neighborhood'] == hood].T.reset_index()
    temp.columns = ['venue','freq']
    temp = temp.iloc[1:]
    temp['freq'] = temp['freq'].astype(float)
    temp = temp.round({'freq': 2})
    print(temp.sort_values('freq', ascending=False).reset_index(drop=True).head(num_top_venues))
    print('\n')

----Adelaide, King, Richmond----
                venue  freq
0                Café  0.15
1  Seafood Restaurant  0.10
2          Restaurant  0.10
3         Coffee Shop  0.10
4    Greek Restaurant  0.05


----Agincourt----
                  venue  freq
0         Shopping Mall  0.13
1    Chinese Restaurant  0.13
2  Hong Kong Restaurant  0.07
3   Shanghai Restaurant  0.07
4                Bakery  0.07


----Agincourt North, L'Amoreaux East, Milliken, Steeles East----
                venue  freq
0            Pharmacy   0.5
1    Sushi Restaurant   0.5
2  Miscellaneous Shop   0.0
3              Lounge   0.0
4              Market   0.0


----Albion Gardens, Beaumond Heights, Humbergate, Jamestown, Mount Olive, Silverstone, South Steeles, Thistletown----
                 venue  freq
0        Grocery Store  0.13
1             Gym Pool  0.07
2           Beer Store  0.07
3  Fried Chicken Joint  0.07
4         Liquor Store  0.07


----Alderwood, Long Branch----
                        venue  freq
0

            venue  freq
0  Clothing Store  0.15
1       Juice Bar  0.10
2     Coffee Shop  0.10
3         Theater  0.05
4  Chocolate Shop  0.05


----First Canadian Place, Underground city----
                    venue  freq
0                    Café  0.15
1             Coffee Shop  0.15
2              Restaurant  0.15
3               Gastropub  0.05
4  Gluten-free Restaurant  0.05


----Flemingdon Park, Don Mills South----
             venue  freq
0       Beer Store  0.14
1  Bubble Tea Shop  0.14
2              Gym  0.14
3    Grocery Store  0.14
4      Coffee Shop  0.14


----Forest Hill North, Forest Hill West----
                        venue  freq
0                        Park   1.0
1                 Zoo Exhibit   0.0
2  Modern European Restaurant   0.0
3                      Lounge   0.0
4                      Market   0.0


----Glencairn----
                  venue  freq
0           Pizza Place  0.23
1      Sushi Restaurant  0.08
2        Ice Cream Shop  0.08
3   Japanese Restaur

4             Burrito Place  0.05


----Studio District----
                 venue  freq
0              Brewery  0.15
1          Coffee Shop  0.10
2                Diner  0.10
3   Italian Restaurant  0.05
4  American Restaurant  0.05


----The Annex, North Midtown, Yorkville----
                venue  freq
0      Sandwich Place  0.10
1                Café  0.10
2       Historic Site  0.05
3   French Restaurant  0.05
4  Italian Restaurant  0.05


----The Beaches----
               venue  freq
0               Park   0.2
1  Health Food Store   0.2
2              Trail   0.2
3                Pub   0.2
4        Zoo Exhibit   0.0


----The Beaches West, India Bazaar----
                venue  freq
0                Park  0.11
1  Italian Restaurant  0.06
2           Pet Store  0.06
3         Pizza Place  0.06
4         Coffee Shop  0.06


----The Danforth West, Riverdale----
            venue  freq
0  Discount Store   0.2
1            Park   0.2
2   Grocery Store   0.2
3        Bus Line   0.2


Let's put that into a pandas dataframe


In [32]:
# Sort venues in descending order

def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

In [33]:
# Create a dataframe with top 10 venues from each neighborhood
import numpy as np
num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighborhood']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighborhood'] = toronto_grouped['Neighborhood']

for ind in np.arange(toronto_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(toronto_grouped.iloc[ind, :], num_top_venues)

neighborhoods_venues_sorted.head()

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,"Adelaide, King, Richmond",Café,Coffee Shop,Restaurant,Seafood Restaurant,Gym / Fitness Center,Bakery,Greek Restaurant,Pizza Place,Food Court,Steakhouse
1,Agincourt,Shopping Mall,Chinese Restaurant,Bakery,Pool,Discount Store,Bubble Tea Shop,Shanghai Restaurant,Skating Rink,Supermarket,Sushi Restaurant
2,"Agincourt North, L'Amoreaux East, Milliken, St...",Pharmacy,Sushi Restaurant,Yoga Studio,Dessert Shop,Falafel Restaurant,Electronics Store,Eastern European Restaurant,Dumpling Restaurant,Dog Run,Doctor's Office
3,"Albion Gardens, Beaumond Heights, Humbergate, ...",Grocery Store,Park,Pharmacy,Liquor Store,Fast Food Restaurant,Caribbean Restaurant,Beer Store,Fried Chicken Joint,Pizza Place,Sandwich Place
4,"Alderwood, Long Branch",Performing Arts Venue,Gym,Convenience Store,Pub,Department Store,Electronics Store,Eastern European Restaurant,Dumpling Restaurant,Dog Run,Doctor's Office


### Cluster Neighborhoods

Run k-means to cluster the neighborhood into 5 clusters.



In [34]:
# set number of clusters
kclusters = 5

toronto_grouped_clustering = toronto_grouped.drop('Neighborhood', 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(toronto_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_ [0:10] 


array([0, 2, 2, 2, 0, 4, 2, 0, 0, 2])

Let's create a new dataframe that includes the cluster as well as the top 10 venues for each neighborhood.



In [35]:
# add clustering labels
neighborhoods_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

toronto_merged = df

# merge toronto_grouped with toronto_data to add latitude/longitude for each neighborhood
toronto_merged = toronto_merged.join(neighborhoods_venues_sorted.set_index('Neighborhood'), on='Neighborhood')

toronto_merged.head() 

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,M1B,Scarborough,"Rouge, Malvern",43.811525,-79.195517,1.0,Zoo Exhibit,Dessert Shop,Farm,Falafel Restaurant,Electronics Store,Eastern European Restaurant,Dumpling Restaurant,Dog Run,Doctor's Office,Distribution Center
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union",43.785665,-79.158725,3.0,Bar,Yoga Studio,Diner,Farm,Falafel Restaurant,Electronics Store,Eastern European Restaurant,Dumpling Restaurant,Dog Run,Doctor's Office
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.765815,-79.175193,4.0,Gym / Fitness Center,Bus Stop,Construction & Landscaping,Park,Gluten-free Restaurant,Department Store,Eastern European Restaurant,Dumpling Restaurant,Dog Run,Doctor's Office
3,M1G,Scarborough,Woburn,43.768369,-79.21759,0.0,Park,Korean Restaurant,Coffee Shop,Business Service,Diner,Falafel Restaurant,Electronics Store,Eastern European Restaurant,Dumpling Restaurant,Dog Run
4,M1H,Scarborough,Cedarbrae,43.769688,-79.23944,0.0,Playground,Trail,Yoga Studio,Department Store,Falafel Restaurant,Electronics Store,Eastern European Restaurant,Dumpling Restaurant,Dog Run,Doctor's Office


In [39]:
# Drop neighborhoods that have not available data

toronto_merged=toronto_merged.dropna()
toronto_merged['Cluster Labels'] = toronto_merged['Cluster Labels'].astype(int)

#### Cluster Visualization

In [42]:
# create map
map_clusters = folium.Map(location=[lat_toronto, long_toronto], zoom_start=11)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(toronto_merged['Latitude'], toronto_merged['Longitude'], toronto_merged['Neighborhood'], toronto_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters

#### Examine Each Cluster

Cluster 1

In [44]:
toronto_merged.loc[toronto_merged['Cluster Labels'] == 0, toronto_merged.columns[[1] + list(range(5, toronto_merged.shape[1]))]]


Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
3,Scarborough,0,Park,Korean Restaurant,Coffee Shop,Business Service,Diner,Falafel Restaurant,Electronics Store,Eastern European Restaurant,Dumpling Restaurant,Dog Run
4,Scarborough,0,Playground,Trail,Yoga Studio,Department Store,Falafel Restaurant,Electronics Store,Eastern European Restaurant,Dumpling Restaurant,Dog Run,Doctor's Office
6,Scarborough,0,Coffee Shop,Convenience Store,Hobby Shop,Chinese Restaurant,Discount Store,Department Store,Yoga Studio,Diner,Falafel Restaurant,Electronics Store
7,Scarborough,0,Bakery,Bus Line,Intersection,Metro Station,Bus Station,Soccer Field,Coffee Shop,Creperie,Distribution Center,Electronics Store
9,Scarborough,0,College Stadium,Skating Rink,Gym Pool,General Entertainment,Park,Gym,Arts & Crafts Store,Asian Restaurant,Farmers Market,Farm
10,Scarborough,0,Bakery,Gift Shop,Rental Service,Brewery,Convenience Store,Diner,Farm,Falafel Restaurant,Electronics Store,Eastern European Restaurant
11,Scarborough,0,Convenience Store,Auto Garage,Yoga Studio,Diner,Farm,Falafel Restaurant,Electronics Store,Eastern European Restaurant,Dumpling Restaurant,Dog Run
21,North York,0,Café,Middle Eastern Restaurant,Pizza Place,Korean Restaurant,Coffee Shop,Hookah Bar,Dessert Shop,Ramen Restaurant,Bus Line,Sandwich Place
22,North York,0,Café,Shopping Mall,Fried Chicken Joint,Juice Bar,Electronics Store,Lounge,Fast Food Restaurant,Indonesian Restaurant,Ice Cream Shop,Ramen Restaurant
23,North York,0,Bank,Park,Speakeasy,Convenience Store,Yoga Studio,Falafel Restaurant,Electronics Store,Eastern European Restaurant,Dumpling Restaurant,Dog Run


Cluster 2

In [45]:
toronto_merged.loc[toronto_merged['Cluster Labels'] == 1, toronto_merged.columns[[1] + list(range(5, toronto_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Scarborough,1,Zoo Exhibit,Dessert Shop,Farm,Falafel Restaurant,Electronics Store,Eastern European Restaurant,Dumpling Restaurant,Dog Run,Doctor's Office,Distribution Center


Cluster 3

In [46]:
toronto_merged.loc[toronto_merged['Cluster Labels'] == 2, toronto_merged.columns[[1] + list(range(5, toronto_merged.shape[1]))]]


Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
5,Scarborough,2,Grocery Store,Train Station,Restaurant,Indian Restaurant,Yoga Studio,Dessert Shop,Electronics Store,Eastern European Restaurant,Dumpling Restaurant,Dog Run
8,Scarborough,2,Bank,Liquor Store,Gift Shop,Pharmacy,Coffee Shop,Sandwich Place,Bistro,Dessert Shop,Eastern European Restaurant,Dumpling Restaurant
12,Scarborough,2,Shopping Mall,Chinese Restaurant,Bakery,Pool,Discount Store,Bubble Tea Shop,Shanghai Restaurant,Skating Rink,Supermarket,Sushi Restaurant
13,Scarborough,2,Pharmacy,Coffee Shop,Convenience Store,Fast Food Restaurant,Thai Restaurant,Chinese Restaurant,Hobby Shop,Fried Chicken Joint,Shopping Mall,Pizza Place
14,Scarborough,2,Pharmacy,Sushi Restaurant,Yoga Studio,Dessert Shop,Falafel Restaurant,Electronics Store,Eastern European Restaurant,Dumpling Restaurant,Dog Run,Doctor's Office
15,Scarborough,2,Fast Food Restaurant,Chinese Restaurant,Bank,Pizza Place,Sandwich Place,Cosmetics Shop,Burger Joint,Coffee Shop,Other Great Outdoors,Grocery Store
17,North York,2,Residential Building (Apartment / Condo),Dog Run,Yoga Studio,Falafel Restaurant,Electronics Store,Eastern European Restaurant,Dumpling Restaurant,Doctor's Office,Distribution Center,Discount Store
18,North York,2,Clothing Store,Juice Bar,Coffee Shop,Shopping Mall,Theater,Food Court,Bank,Liquor Store,Tea Room,Restaurant
27,North York,2,Grocery Store,Coffee Shop,Intersection,Beer Store,Supermarket,Bubble Tea Shop,Gym,Cosmetics Shop,Creperie,Farm
29,North York,2,Bar,Furniture / Home Store,Fast Food Restaurant,Massage Studio,Japanese Restaurant,Restaurant,Caribbean Restaurant,Bank,Falafel Restaurant,Pizza Place


Cluster 4

In [47]:
toronto_merged.loc[toronto_merged['Cluster Labels'] == 3, toronto_merged.columns[[1] + list(range(5, toronto_merged.shape[1]))]]


Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
1,Scarborough,3,Bar,Yoga Studio,Diner,Farm,Falafel Restaurant,Electronics Store,Eastern European Restaurant,Dumpling Restaurant,Dog Run,Doctor's Office


Cluster 5

In [49]:
toronto_merged.loc[toronto_merged['Cluster Labels'] == 4, toronto_merged.columns[[1] + list(range(5, toronto_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
2,Scarborough,4,Gym / Fitness Center,Bus Stop,Construction & Landscaping,Park,Gluten-free Restaurant,Department Store,Eastern European Restaurant,Dumpling Restaurant,Dog Run,Doctor's Office
19,North York,4,Park,Construction & Landscaping,Trail,Department Store,Falafel Restaurant,Electronics Store,Eastern European Restaurant,Dumpling Restaurant,Dog Run,Doctor's Office
25,North York,4,Park,Food & Drink Shop,Yoga Studio,Department Store,Electronics Store,Eastern European Restaurant,Dumpling Restaurant,Dog Run,Doctor's Office,Distribution Center
34,North York,4,Grocery Store,Park,Department Store,Falafel Restaurant,Electronics Store,Eastern European Restaurant,Dumpling Restaurant,Dog Run,Doctor's Office,Distribution Center
46,Central Toronto,4,Park,Gym Pool,Playground,Garden,Deli / Bodega,Eastern European Restaurant,Dumpling Restaurant,Dog Run,Doctor's Office,Distribution Center
50,Downtown Toronto,4,Grocery Store,Playground,Candy Store,Park,Dessert Shop,Falafel Restaurant,Electronics Store,Eastern European Restaurant,Dumpling Restaurant,Dog Run
64,Central Toronto,4,Park,Yoga Studio,Department Store,Falafel Restaurant,Electronics Store,Eastern European Restaurant,Dumpling Restaurant,Dog Run,Doctor's Office,Distribution Center
73,York,4,Grocery Store,Park,Field,Trail,Hockey Arena,Department Store,Eastern European Restaurant,Dumpling Restaurant,Dog Run,Doctor's Office
79,North York,4,Park,Basketball Court,Bakery,Concert Hall,Diner,Falafel Restaurant,Electronics Store,Eastern European Restaurant,Dumpling Restaurant,Dog Run
90,Etobicoke,4,Park,Lounge,Dessert Shop,Falafel Restaurant,Electronics Store,Eastern European Restaurant,Dumpling Restaurant,Dog Run,Doctor's Office,Distribution Center
