# Create Dataframe

In [1]:
import pandas as pd

In [2]:
table_TO = pd.read_html('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M')

In [3]:
print(f'Total tables: {len(table_TO)}')

Total tables: 3


In [4]:
#Guess that it is the first table
df = table_TO[0]
df.head()

Unnamed: 0,Postal Code,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"


In [5]:
#Size before cleaning
df.shape

(180, 3)

In [6]:
#drop non asigned rows
df2=df[df.Borough != 'Not assigned']
df2.shape

(103, 3)

In [None]:
#See if there are any "Not assigned" Neigborhoods
for i in df2.columns:
    df2[i].astype('str').apply(lambda x: print(df2[i].name) if x.startswith('Not assigned') else 'pass')

In [None]:
#Combine rows with same Borough
df2['Neighbourhood'] = df.groupby(['Postal Code'])['Neighbourhood'].transform(lambda x : ','.join(x)) 
df2 = df2.drop_duplicates(subset=['Neighbourhood'])  
df2=df.loc[(df['Postal Code'] == 'M5G') | (df['Postal Code'] == 'M2H')| (df['Postal Code'] == 'M4B') 
               | (df['Postal Code'] == 'M1J') | (df['Postal Code'] == 'M4G') | (df['Postal Code'] == 'M4M')
                | (df['Postal Code'] == 'M1R') | (df['Postal Code'] == 'M9V') | (df['Postal Code'] == 'M9L')
                   | (df['Postal Code'] == 'M5V') | (df['Postal Code'] == 'M1B') | (df['Postal Code'] == 'M5A')]
df2.reset_index(drop=True)

In [7]:
df2.shape

(103, 3)

# Add Zipcodes

In [8]:
PCLATLON=pd.read_csv('GSPC.csv')

In [9]:
PCLATLON.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [10]:
zips = pd.merge(df2,
                 PCLATLON[['Postal Code','Latitude', 'Longitude']],on='Postal Code')
zips.head()

Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494


# Explore and Cluster

In [11]:
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

address = 'Toronto, ON'

geolocator = Nominatim(user_agent="ny_explorer")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Toronto are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Toronto are 43.6534817, -79.3839347.


### Create a map of Toronto with neighborhoods

In [12]:
import folium # map rendering library

# create map of New York using latitude and longitude values
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=10)

# add markers to map
for lat, lng, borough, neighborhood in zip(zips['Latitude'], zips['Longitude'], zips['Borough'], zips['Neighbourhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_toronto)  
    
map_toronto

#### Define Foursquare Credentials and Version

In [13]:
CLIENT_ID = 'S3IVNSJ3WG4Q2JY222OV4EZE4NFJP0AWDY3OX5ZXJ4ZAS0RG' # your Foursquare ID
CLIENT_SECRET = 'V3UPS1CTX4KSEPQ2ETFBDRQIRT3T4KEAJKQ2NG24EOZRICND' # your Foursquare Secret
VERSION = '20180605' # Foursquare API version
LIMIT = 100 # A default Foursquare API limit value

print('Your credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET:' + CLIENT_SECRET)

Your credentails:
CLIENT_ID: S3IVNSJ3WG4Q2JY222OV4EZE4NFJP0AWDY3OX5ZXJ4ZAS0RG
CLIENT_SECRET:V3UPS1CTX4KSEPQ2ETFBDRQIRT3T4KEAJKQ2NG24EOZRICND


#### Let's explore the first neighborhood in our dataframe.

In [14]:
zips.loc[0, 'Neighbourhood']

'Parkwoods'

In [15]:
#Get the neighborhood's latitude and longitude values.
neighborhood_latitude = zips.loc[0, 'Latitude'] # neighborhood latitude value
neighborhood_longitude = zips.loc[0, 'Longitude'] # neighborhood longitude value

neighborhood_name = ['Regent Park'] # neighborhood name

print('Latitude and longitude values of {} are {}, {}.'.format(neighborhood_name, 
                                                               neighborhood_latitude, 
                                                               neighborhood_longitude))

Latitude and longitude values of ['Regent Park'] are 43.7532586, -79.3296565.


In [16]:
LIMIT = 100 # limit of number of venues returned by Foursquare API

radius = 500 # define radius
# create URL
url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
    CLIENT_ID, 
    CLIENT_SECRET, 
    VERSION, 
    neighborhood_latitude, 
    neighborhood_longitude, 
    radius, 
    LIMIT)
url # display URL

'https://api.foursquare.com/v2/venues/explore?&client_id=S3IVNSJ3WG4Q2JY222OV4EZE4NFJP0AWDY3OX5ZXJ4ZAS0RG&client_secret=V3UPS1CTX4KSEPQ2ETFBDRQIRT3T4KEAJKQ2NG24EOZRICND&v=20180605&ll=43.7532586,-79.3296565&radius=500&limit=100'

In [17]:
#send GET request
import requests # library to handle requests
results = requests.get(url).json()
results

{'meta': {'code': 200, 'requestId': '6007139dcb7cea4da4402109'},
  'headerLocation': 'Parkwoods - Donalda',
  'headerFullLocation': 'Parkwoods - Donalda, Toronto',
  'headerLocationGranularity': 'neighborhood',
  'totalResults': 2,
  'suggestedBounds': {'ne': {'lat': 43.757758604500005,
    'lng': -79.32343823984928},
   'sw': {'lat': 43.7487585955, 'lng': -79.33587476015072}},
  'groups': [{'type': 'Recommended Places',
    'name': 'recommended',
    'items': [{'reasons': {'count': 0,
       'items': [{'summary': 'This spot is popular',
         'type': 'general',
         'reasonName': 'globalInteractionReason'}]},
      'venue': {'id': '4e8d9dcdd5fbbbb6b3003c7b',
       'name': 'Brookbanks Park',
       'location': {'address': 'Toronto',
        'lat': 43.751976046055574,
        'lng': -79.33214044722958,
        'labeledLatLngs': [{'label': 'display',
          'lat': 43.751976046055574,
          'lng': -79.33214044722958}],
        'distance': 245,
        'cc': 'CA',
        'c

In [18]:
# function that extracts the category of the venue
def get_category_type(row):
    try:
        categories_list = row['categories']
    except:
        categories_list = row['venue.categories']
        
    if len(categories_list) == 0:
        return None
    else:
        return categories_list[0]['name']

In [19]:
import json # library to handle JSON files

venues = results['response']['groups'][0]['items']
    
nearby_venues = pd.json_normalize(venues) # flatten JSON

# filter columns
filtered_columns = ['venue.name', 'venue.categories', 'venue.location.lat', 'venue.location.lng']
nearby_venues =nearby_venues.loc[:, filtered_columns]

# filter the category for each row
nearby_venues['venue.categories'] = nearby_venues.apply(get_category_type, axis=1)

# clean columns
nearby_venues.columns = [col.split(".")[-1] for col in nearby_venues.columns]

nearby_venues.head()

Unnamed: 0,name,categories,lat,lng
0,Brookbanks Park,Park,43.751976,-79.33214
1,Variety Store,Food & Drink Shop,43.751974,-79.333114


In [20]:
print('{} venues were returned by Foursquare.'.format(nearby_venues.shape[0]))

2 venues were returned by Foursquare.


#### Let's create a function to repeat the same process to all the neighborhoods in Toronto

In [21]:
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

In [22]:
toronto_venues = getNearbyVenues(names=zips['Neighbourhood'],
                                   latitudes=zips['Latitude'],
                                   longitudes=zips['Longitude']
                                  )

Parkwoods
Victoria Village
Regent Park, Harbourfront
Lawrence Manor, Lawrence Heights
Queen's Park, Ontario Provincial Government
Islington Avenue, Humber Valley Village
Malvern, Rouge
Don Mills
Parkview Hill, Woodbine Gardens
Garden District, Ryerson
Glencairn
West Deane Park, Princess Gardens, Martin Grove, Islington, Cloverdale
Rouge Hill, Port Union, Highland Creek
Don Mills
Woodbine Heights
St. James Town
Humewood-Cedarvale
Eringate, Bloordale Gardens, Old Burnhamthorpe, Markland Wood
Guildwood, Morningside, West Hill
The Beaches
Berczy Park
Caledonia-Fairbanks
Woburn
Leaside
Central Bay Street
Christie
Cedarbrae
Hillcrest Village
Bathurst Manor, Wilson Heights, Downsview North
Thorncliffe Park
Richmond, Adelaide, King
Dufferin, Dovercourt Village
Scarborough Village
Fairview, Henry Farm, Oriole
Northwood Park, York University
East Toronto, Broadview North (Old East York)
Harbourfront East, Union Station, Toronto Islands
Little Portugal, Trinity
Kennedy Park, Ionview, East Birchmo

In [23]:
#Lets check the dataframe
print(toronto_venues.shape)
toronto_venues.head()

(2129, 7)


Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,Parkwoods,43.753259,-79.329656,Brookbanks Park,43.751976,-79.33214,Park
1,Parkwoods,43.753259,-79.329656,Variety Store,43.751974,-79.333114,Food & Drink Shop
2,Victoria Village,43.725882,-79.315572,Victoria Village Arena,43.723481,-79.315635,Hockey Arena
3,Victoria Village,43.725882,-79.315572,Portugril,43.725819,-79.312785,Portuguese Restaurant
4,Victoria Village,43.725882,-79.315572,Tim Hortons,43.725517,-79.313103,Coffee Shop


In [24]:
#group by venue
toronto_venues.groupby('Neighborhood').count()

Unnamed: 0_level_0,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
Neighborhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Agincourt,5,5,5,5,5,5
"Alderwood, Long Branch",6,6,6,6,6,6
"Bathurst Manor, Wilson Heights, Downsview North",21,21,21,21,21,21
Bayview Village,4,4,4,4,4,4
"Bedford Park, Lawrence Manor East",22,22,22,22,22,22
...,...,...,...,...,...,...
"Willowdale, Willowdale East",35,35,35,35,35,35
"Willowdale, Willowdale West",5,5,5,5,5,5
Woburn,3,3,3,3,3,3
Woodbine Heights,7,7,7,7,7,7


In [25]:
#How many unique values
print('There are {} uniques categories.'.format(len(toronto_venues['Venue Category'].unique())))

There are 271 uniques categories.


## Analyze Each Neighborhood

In [26]:
# one hot encoding
toronto_onehot = pd.get_dummies(toronto_venues[['Venue Category']], prefix="", prefix_sep="")

# add neighborhood column back to dataframe
toronto_onehot['Neighborhood'] = toronto_venues['Neighborhood'] 

# move neighborhood column to the first column
fixed_columns = [toronto_onehot.columns[-1]] + list(toronto_onehot.columns[:-1])
toronto_onehot = toronto_onehot[fixed_columns]

toronto_onehot.head()

Unnamed: 0,Yoga Studio,Accessories Store,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,Antique Shop,...,Trail,Train Station,Turkish Restaurant,Vegetarian / Vegan Restaurant,Video Game Store,Vietnamese Restaurant,Warehouse Store,Wine Bar,Wings Joint,Women's Store
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [27]:
#What is the size of the new dataframe
toronto_onehot.shape

(2129, 271)

In [28]:
#Next, let's group rows by neighborhood and by taking the mean of the frequency of occurrence of each category
toronto_grouped = toronto_onehot.groupby('Neighborhood').mean().reset_index()
toronto_grouped

Unnamed: 0,Neighborhood,Yoga Studio,Accessories Store,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,...,Trail,Train Station,Turkish Restaurant,Vegetarian / Vegan Restaurant,Video Game Store,Vietnamese Restaurant,Warehouse Store,Wine Bar,Wings Joint,Women's Store
0,Agincourt,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0
1,"Alderwood, Long Branch",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0
2,"Bathurst Manor, Wilson Heights, Downsview North",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0
3,Bayview Village,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0
4,"Bedford Park, Lawrence Manor East",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.045455,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
92,"Willowdale, Willowdale East",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.028571,0.0,0.0,0.0,0.0
93,"Willowdale, Willowdale West",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0
94,Woburn,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0
95,Woodbine Heights,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0


In [29]:
#size
toronto_grouped.shape

(97, 271)

In [30]:
#Let's print each neighborhood along with the top 5 most common venues
num_top_venues = 5

for hood in toronto_grouped['Neighborhood']:
    print("----"+hood+"----")
    temp = toronto_grouped[toronto_grouped['Neighborhood'] == hood].T.reset_index()
    temp.columns = ['venue','freq']
    temp = temp.iloc[1:]
    temp['freq'] = temp['freq'].astype(float)
    temp = temp.round({'freq': 2})
    print(temp.sort_values('freq', ascending=False).reset_index(drop=True).head(num_top_venues))
    print('\n')

----Agincourt----
                       venue  freq
0                     Lounge   0.2
1               Skating Rink   0.2
2             Breakfast Spot   0.2
3             Clothing Store   0.2
4  Latin American Restaurant   0.2


----Alderwood, Long Branch----
            venue  freq
0     Pizza Place  0.33
1             Gym  0.17
2  Sandwich Place  0.17
3     Coffee Shop  0.17
4             Pub  0.17


----Bathurst Manor, Wilson Heights, Downsview North----
                venue  freq
0         Coffee Shop  0.10
1                Bank  0.10
2      Ice Cream Shop  0.05
3  Frozen Yogurt Shop  0.05
4    Sushi Restaurant  0.05


----Bayview Village----
                 venue  freq
0                 Café  0.25
1  Japanese Restaurant  0.25
2                 Bank  0.25
3   Chinese Restaurant  0.25
4        Movie Theater  0.00


----Bedford Park, Lawrence Manor East----
                venue  freq
0         Coffee Shop  0.09
1      Sandwich Place  0.09
2  Italian Restaurant  0.09
3     Thai Re

4         Yoga Studio  0.00


----Leaside----
                 venue  freq
0          Coffee Shop  0.09
1  Sporting Goods Shop  0.09
2         Burger Joint  0.06
3        Shopping Mall  0.06
4                 Bank  0.06


----Little Portugal, Trinity----
                   venue  freq
0                    Bar  0.12
1            Coffee Shop  0.07
2             Restaurant  0.05
3       Asian Restaurant  0.05
4  Vietnamese Restaurant  0.05


----Malvern, Rouge----
                             venue  freq
0             Fast Food Restaurant   0.5
1                       Print Shop   0.5
2                      Yoga Studio   0.0
3               Mexican Restaurant   0.0
4  Molecular Gastronomy Restaurant   0.0


----Milliken, Agincourt North, Steeles East, L'Amoreaux East----
                 venue  freq
0         Intersection  0.33
1                 Park  0.33
2           Playground  0.33
3   Mexican Restaurant  0.00
4  Monument / Landmark  0.00


----Mimico NW, The Queensway West, South of B

4             Sandwich Place  0.14


----Willowdale, Newtonbrook----
                             venue  freq
0                        Piano Bar   0.5
1                             Park   0.5
2                      Yoga Studio   0.0
3                    Metro Station   0.0
4  Molecular Gastronomy Restaurant   0.0


----Willowdale, Willowdale East----
              venue  freq
0  Ramen Restaurant  0.09
1       Pizza Place  0.06
2       Coffee Shop  0.06
3    Sandwich Place  0.06
4              Café  0.06


----Willowdale, Willowdale West----
           venue  freq
0       Pharmacy   0.2
1    Pizza Place   0.2
2    Coffee Shop   0.2
3        Butcher   0.2
4  Grocery Store   0.2


----Woburn----
                       venue  freq
0                Coffee Shop  0.67
1      Korean BBQ Restaurant  0.33
2                Yoga Studio  0.00
3  Middle Eastern Restaurant  0.00
4        Monument / Landmark  0.00


----Woodbine Heights----
          venue  freq
0  Dance Studio  0.14
1          Park  

In [31]:
#### Let's put that into a _pandas_ dataframe
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

In [32]:
#Now let's create the new dataframe and display the top 10 venues for each neighborhood.
import numpy as np
num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighborhood']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighborhood'] = toronto_grouped['Neighborhood']

for ind in np.arange(toronto_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(toronto_grouped.iloc[ind, :], num_top_venues)

neighborhoods_venues_sorted.head()

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Agincourt,Lounge,Latin American Restaurant,Clothing Store,Breakfast Spot,Skating Rink,Electronics Store,Eastern European Restaurant,Dumpling Restaurant,Drugstore,Department Store
1,"Alderwood, Long Branch",Pizza Place,Gym,Sandwich Place,Pub,Coffee Shop,Airport Gate,Ethiopian Restaurant,Electronics Store,Eastern European Restaurant,Dumpling Restaurant
2,"Bathurst Manor, Wilson Heights, Downsview North",Coffee Shop,Bank,Ice Cream Shop,Fried Chicken Joint,Sandwich Place,Bridal Shop,Diner,Restaurant,Deli / Bodega,Middle Eastern Restaurant
3,Bayview Village,Café,Bank,Japanese Restaurant,Chinese Restaurant,Women's Store,Department Store,Dim Sum Restaurant,Diner,Discount Store,Distribution Center
4,"Bedford Park, Lawrence Manor East",Coffee Shop,Sandwich Place,Italian Restaurant,Thai Restaurant,Liquor Store,Juice Bar,Pub,Butcher,Café,Sushi Restaurant


## Cluster Neighborhoods

In [64]:
from sklearn.cluster import KMeans
#Run _k_-means to cluster the neighborhood into 4 clusters.
# set number of clusters
kclusters = 4

toronto_grouped_clustering = toronto_grouped.drop('Neighborhood', 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0, verbose=1,algorithm='full',max_iter=1000).fit(toronto_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10] 

Initialization complete
Iteration 0, inertia 17.140031777018763
Iteration 1, inertia 14.275944620763806
Iteration 2, inertia 14.19765197377997
Converged at iteration 2: strict convergence.
Initialization complete
Iteration 0, inertia 16.270358328416787
Iteration 1, inertia 13.228692847956578
Converged at iteration 1: strict convergence.
Initialization complete
Iteration 0, inertia 18.8079735690669
Iteration 1, inertia 15.02251828611972
Iteration 2, inertia 14.960615243022778
Converged at iteration 2: strict convergence.
Initialization complete
Iteration 0, inertia 17.61366055326252
Iteration 1, inertia 13.487559910881346
Iteration 2, inertia 13.27417204014001
Iteration 3, inertia 13.157751774298003
Iteration 4, inertia 13.120872500367339
Iteration 5, inertia 13.1073640875928
Iteration 6, inertia 13.03437698255465
Converged at iteration 6: strict convergence.
Initialization complete
Iteration 0, inertia 24.47245880393033
Iteration 1, inertia 14.086269696717082
Iteration 2, inertia 13.93

array([1, 0, 1, 1, 1, 1, 1, 1, 1, 1])

In [65]:
# merge dataframes for plot
zips.rename(columns = {"Neighbourhood": "Neighborhood"}, inplace=True)

In [57]:
toronto_merged

Unnamed: 0,Cluster Labels,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue,Latitude,Longitude
0,1,Agincourt,Lounge,Latin American Restaurant,Clothing Store,Breakfast Spot,Skating Rink,Electronics Store,Eastern European Restaurant,Dumpling Restaurant,Drugstore,Department Store,43.794200,-79.262029
1,0,"Alderwood, Long Branch",Pizza Place,Gym,Sandwich Place,Pub,Coffee Shop,Airport Gate,Ethiopian Restaurant,Electronics Store,Eastern European Restaurant,Dumpling Restaurant,43.602414,-79.543484
2,1,"Bathurst Manor, Wilson Heights, Downsview North",Coffee Shop,Bank,Ice Cream Shop,Fried Chicken Joint,Sandwich Place,Bridal Shop,Diner,Restaurant,Deli / Bodega,Middle Eastern Restaurant,43.754328,-79.442259
3,1,Bayview Village,Café,Bank,Japanese Restaurant,Chinese Restaurant,Women's Store,Department Store,Dim Sum Restaurant,Diner,Discount Store,Distribution Center,43.786947,-79.385975
4,1,"Bedford Park, Lawrence Manor East",Coffee Shop,Sandwich Place,Italian Restaurant,Thai Restaurant,Liquor Store,Juice Bar,Pub,Butcher,Café,Sushi Restaurant,43.733283,-79.419750
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
96,1,"Willowdale, Willowdale East",Ramen Restaurant,Sushi Restaurant,Sandwich Place,Shopping Mall,Café,Coffee Shop,Restaurant,Pizza Place,Fast Food Restaurant,Hotel,43.770120,-79.408493
97,0,"Willowdale, Willowdale West",Grocery Store,Coffee Shop,Pharmacy,Butcher,Pizza Place,Women's Store,Distribution Center,Department Store,Dessert Shop,Dim Sum Restaurant,43.782736,-79.442259
98,1,Woburn,Coffee Shop,Korean BBQ Restaurant,Doner Restaurant,Dessert Shop,Dim Sum Restaurant,Diner,Discount Store,Distribution Center,Dog Run,Women's Store,43.770992,-79.216917
99,1,Woodbine Heights,Park,Skating Rink,Dance Studio,Beer Store,Athletics & Sports,Curling Ice,Bus Stop,Distribution Center,Dim Sum Restaurant,Diner,43.695344,-79.318389


In [66]:
toronto_merged = pd.merge(neighborhoods_venues_sorted,
                 zips[['Neighborhood','Latitude', 'Longitude']],on='Neighborhood')
toronto_merged.head() # check the last columns!

Unnamed: 0,Cluster Labels,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue,Latitude,Longitude
0,1,Agincourt,Lounge,Latin American Restaurant,Clothing Store,Breakfast Spot,Skating Rink,Electronics Store,Eastern European Restaurant,Dumpling Restaurant,Drugstore,Department Store,43.7942,-79.262029
1,0,"Alderwood, Long Branch",Pizza Place,Gym,Sandwich Place,Pub,Coffee Shop,Airport Gate,Ethiopian Restaurant,Electronics Store,Eastern European Restaurant,Dumpling Restaurant,43.602414,-79.543484
2,1,"Bathurst Manor, Wilson Heights, Downsview North",Coffee Shop,Bank,Ice Cream Shop,Fried Chicken Joint,Sandwich Place,Bridal Shop,Diner,Restaurant,Deli / Bodega,Middle Eastern Restaurant,43.754328,-79.442259
3,1,Bayview Village,Café,Bank,Japanese Restaurant,Chinese Restaurant,Women's Store,Department Store,Dim Sum Restaurant,Diner,Discount Store,Distribution Center,43.786947,-79.385975
4,1,"Bedford Park, Lawrence Manor East",Coffee Shop,Sandwich Place,Italian Restaurant,Thai Restaurant,Liquor Store,Juice Bar,Pub,Butcher,Café,Sushi Restaurant,43.733283,-79.41975


In [67]:
import matplotlib.cm as cm
import matplotlib.colors as colors
# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(toronto_merged['Latitude'], toronto_merged['Longitude'], toronto_merged['Neighborhood'], toronto_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters

# Examine Clusters

In [84]:
toronto_merged.loc[toronto_merged['Cluster Labels'] == 0, toronto_merged.columns[[2] + list(range(5, toronto_merged.shape[1]))]]

Unnamed: 0,1st Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue,Latitude,Longitude
1,Pizza Place,Pub,Coffee Shop,Airport Gate,Ethiopian Restaurant,Electronics Store,Eastern European Restaurant,Dumpling Restaurant,43.602414,-79.543484
36,Japanese Restaurant,Pub,Pizza Place,Women's Store,Dessert Shop,Dim Sum Restaurant,Diner,Discount Store,43.709577,-79.445073
42,Intersection,Women's Store,Distribution Center,Dessert Shop,Dim Sum Restaurant,Diner,Discount Store,Dog Run,43.756303,-79.565963
46,Pizza Place,Department Store,Dessert Shop,Dim Sum Restaurant,Diner,Discount Store,Distribution Center,Doner Restaurant,43.667856,-79.532242
65,Pizza Place,Gym / Fitness Center,Intersection,Pharmacy,Athletics & Sports,Gastropub,Flea Market,Diner,43.706397,-79.309937
74,Convenience Store,Brewery,Women's Store,Dessert Shop,Dim Sum Restaurant,Diner,Discount Store,Distribution Center,43.673185,-79.487262
90,Hockey Arena,Portuguese Restaurant,Women's Store,Department Store,Dessert Shop,Dim Sum Restaurant,Diner,Discount Store,43.725882,-79.315572
92,Pizza Place,Intersection,Middle Eastern Restaurant,Sandwich Place,Chinese Restaurant,Women's Store,Dessert Shop,Dim Sum Restaurant,43.696319,-79.532242
97,Grocery Store,Butcher,Pizza Place,Women's Store,Distribution Center,Department Store,Dessert Shop,Dim Sum Restaurant,43.782736,-79.442259


In [85]:
toronto_merged.loc[toronto_merged['Cluster Labels'] == 1, toronto_merged.columns[[2] + list(range(5, toronto_merged.shape[1]))]]

Unnamed: 0,1st Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue,Latitude,Longitude
0,Lounge,Breakfast Spot,Skating Rink,Electronics Store,Eastern European Restaurant,Dumpling Restaurant,Drugstore,Department Store,43.794200,-79.262029
2,Coffee Shop,Fried Chicken Joint,Sandwich Place,Bridal Shop,Diner,Restaurant,Deli / Bodega,Middle Eastern Restaurant,43.754328,-79.442259
3,Café,Chinese Restaurant,Women's Store,Department Store,Dim Sum Restaurant,Diner,Discount Store,Distribution Center,43.786947,-79.385975
4,Coffee Shop,Thai Restaurant,Liquor Store,Juice Bar,Pub,Butcher,Café,Sushi Restaurant,43.733283,-79.419750
5,Coffee Shop,Restaurant,Bakery,Farmers Market,Beer Bar,Cheese Shop,Beach,Shopping Mall,43.644771,-79.373306
...,...,...,...,...,...,...,...,...,...,...
91,Bakery,Distribution Center,Dessert Shop,Dim Sum Restaurant,Diner,Discount Store,Dog Run,Deli / Bodega,43.650943,-79.554724
94,Middle Eastern Restaurant,Auto Garage,Smoke Shop,Shopping Mall,Drugstore,Donut Shop,Doner Restaurant,Dog Run,43.750072,-79.295849
96,Ramen Restaurant,Shopping Mall,Café,Coffee Shop,Restaurant,Pizza Place,Fast Food Restaurant,Hotel,43.770120,-79.408493
98,Coffee Shop,Dessert Shop,Dim Sum Restaurant,Diner,Discount Store,Distribution Center,Dog Run,Women's Store,43.770992,-79.216917


In [86]:
toronto_merged.loc[toronto_merged['Cluster Labels'] == 2, toronto_merged.columns[[2] + list(range(5, toronto_merged.shape[1]))]]

Unnamed: 0,1st Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue,Latitude,Longitude
10,Park,Eastern European Restaurant,Dumpling Restaurant,Drugstore,Donut Shop,Doner Restaurant,Electronics Store,Dance Studio,43.689026,-79.453512
30,Park,Coffee Shop,Dim Sum Restaurant,Diner,Discount Store,Distribution Center,Dog Run,Doner Restaurant,43.685347,-79.338106
55,Park,Distribution Center,Department Store,Dessert Shop,Dim Sum Restaurant,Diner,Discount Store,Dog Run,43.815252,-79.284577
66,Park,Department Store,Dessert Shop,Dim Sum Restaurant,Diner,Discount Store,Dog Run,Ethiopian Restaurant,43.753259,-79.329656
70,Park,Dumpling Restaurant,Drugstore,Donut Shop,Eastern European Restaurant,Doner Restaurant,Dog Run,Curling Ice,43.679563,-79.377529
86,River,Deli / Bodega,Department Store,Dessert Shop,Dim Sum Restaurant,Diner,Discount Store,Dog Run,43.653654,-79.506944
93,Park,Eastern European Restaurant,Dumpling Restaurant,Drugstore,Donut Shop,Doner Restaurant,Dog Run,Distribution Center,43.706876,-79.518188
95,Park,Deli / Bodega,Department Store,Dessert Shop,Dim Sum Restaurant,Diner,Discount Store,Women's Store,43.789053,-79.408493
100,Park,Distribution Center,Department Store,Dessert Shop,Dim Sum Restaurant,Diner,Discount Store,Dog Run,43.752758,-79.400049


In [87]:
toronto_merged.loc[toronto_merged['Cluster Labels'] == 3, toronto_merged.columns[[2] + list(range(5, toronto_merged.shape[1]))]]

Unnamed: 0,1st Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue,Latitude,Longitude
43,Paper / Office Supplies Store,Dog Run,Dessert Shop,Dim Sum Restaurant,Diner,Discount Store,Distribution Center,Doner Restaurant,43.724766,-79.532242
63,Baseball Field,Dim Sum Restaurant,Diner,Discount Store,Distribution Center,Dog Run,Doner Restaurant,Event Space,43.636258,-79.498509


Cluster 1 could be the "Pizza Cluster" as 4 out of 10 of the most common venue are pizza places.
Cluster 2 could be the "Coffee Shop Cluster" as 4 out of 10 of the most common venues are coffee places.
Cluster 3 is clearly the "Park" Cluster, as 9 out of 10 are parks
