In [164]:
import pandas as pd
import numpy as np
import urllib.request

from bs4 import BeautifulSoup

# 1. Data Collection
We load the list of postal codes in Canada from Wikipedia, clean the table, and convert it into a dataframe of postal codes, boroughs and neighbourhoods.

In [165]:
#load the webpage with beautifulsoup4
website_url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'

# the context manager closes the HTTP connection as soon as the page is parsed
with urllib.request.urlopen(website_url) as webpage:
    soup = BeautifulSoup(webpage, 'lxml')

#select the postal code table by its CSS classes
pstl_table = soup.find('table', {'class':'wikitable sortable'})

# fail here with a clear message instead of a NoneType error in the parsing cell
if pstl_table is None:
    raise ValueError("No table with class 'wikitable sortable' found at {}".format(website_url))

In [166]:
#convert from an html table into a pandas dataframe
# Only rows with exactly three data cells are kept; each cell contributes its
# first text node unchanged (so trailing newlines are still present here).
table_rows = [
    [cell.find(text=True) for cell in cells]
    for cells in (row.findAll('td') for row in pstl_table.findAll('tr'))
    if len(cells) == 3
]

df = pd.DataFrame(table_rows, columns=['Postal Code', 'Borough', 'Neighbourhood'])
df

Unnamed: 0,Postal Code,Borough,Neighbourhood
0,M1A\n,Not assigned\n,Not assigned\n
1,M2A\n,Not assigned\n,Not assigned\n
2,M3A\n,North York\n,Parkwoods\n
3,M4A\n,North York\n,Victoria Village\n
4,M5A\n,Downtown Toronto\n,"Regent Park, Harbourfront\n"
...,...,...,...
175,M5Z\n,Not assigned\n,Not assigned\n
176,M6Z\n,Not assigned\n,Not assigned\n
177,M7Z\n,Not assigned\n,Not assigned\n
178,M8Z\n,Etobicoke\n,"Mimico NW, The Queensway West, South of Bloor,..."


In [167]:
#check whether any row has an assigned borough but an unassigned neighbourhood,
#since such neighbourhoods could be identified from their borough
#(the cell text has not been cleaned yet, hence the trailing '\n' in the literals)
df[(df['Borough'] != 'Not assigned\n') & (df['Neighbourhood'] == 'Not assigned\n')]

Unnamed: 0,Postal Code,Borough,Neighbourhood


There are no unassigned neighbourhoods with an assigned borough, so we don't need to handle that case. Moving on to cleaning up the dataframe:

In [168]:
# Clean-up pipeline:
#   1. strip the newline characters left over from the HTML cells
#   2. discard rows whose borough is not assigned
#   3. collapse to one row per (postal code, borough), comma-joining the
#      neighbourhood names -- this assumes a postal code never spans
#      more than one borough
df = (
    df.replace('\\n', '', regex=True)
      .loc[lambda frame: frame['Borough'] != 'Not assigned']
      .reset_index(drop=True)
      .groupby(['Postal Code', 'Borough'])['Neighbourhood']
      .apply(','.join)
      .reset_index()
)
df

Unnamed: 0,Postal Code,Borough,Neighbourhood
0,M1B,Scarborough,"Malvern, Rouge"
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae
...,...,...,...
98,M9N,York,Weston
99,M9P,Etobicoke,Westmount
100,M9R,Etobicoke,"Kingsview Village, St. Phillips, Martin Grove ..."
101,M9V,Etobicoke,"South Steeles, Silverstone, Humbergate, Jamest..."


# 2. Location Data
We now use geocoder to get the lat/long coordinates of the post codes we have in our dataframe.

In [169]:
import geocoder

def get_lat_long(postal_code, max_attempts=10):
    """Look up the latitude and longitude of a Toronto postal code.

    Parameters
    ----------
    postal_code : str
        Postal code to geocode; ', Toronto, Ontario' is appended to the query.
    max_attempts : int, optional
        Maximum number of geocoder requests before giving up (default 10).
        Bounding the retries keeps the call from hanging forever when the
        service never returns coordinates.

    Returns
    -------
    tuple
        ``(latitude, longitude)`` as returned by the geocoder.

    Raises
    ------
    RuntimeError
        If no coordinates were obtained after ``max_attempts`` requests.
    """
    query = '{}, Toronto, Ontario'.format(postal_code)

    for _ in range(max_attempts):
        lat_lng_coords = geocoder.google(query).latlng
        # the geocoder reports a failed lookup as None; try again in that case
        if lat_lng_coords is not None:
            latitude = lat_lng_coords[0]
            longitude = lat_lng_coords[1]
            return latitude, longitude

    raise RuntimeError(
        'No coordinates returned for {!r} after {} attempts'.format(postal_code, max_attempts)
    )

#df[['Latitude', 'Longitude']] = zip(*df['Postal Code'].apply(get_lat_long))
#df

Geocoder isn't working: it freezes on the first attempt to find coordinates. So, we use the coordinates file provided in the course materials instead.

In [170]:
import os
from pathlib import Path
# Read the course-provided coordinates file from the current working directory
path = Path(os.getcwd()).joinpath('Geospatial_Coordinates.csv')

geo_df = pd.read_csv(path)
geo_df

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476
...,...,...,...
98,M9N,43.706876,-79.518188
99,M9P,43.696319,-79.532242
100,M9R,43.688905,-79.554724
101,M9V,43.739416,-79.588437


In [171]:
# Left-join the coordinates onto the postal code table, matching on 'Postal Code'
df = df.merge(geo_df, on='Postal Code', how='left')
df

Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude
0,M1B,Scarborough,"Malvern, Rouge",43.806686,-79.194353
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476
...,...,...,...,...,...
98,M9N,York,Weston,43.706876,-79.518188
99,M9P,Etobicoke,Westmount,43.696319,-79.532242
100,M9R,Etobicoke,"Kingsview Village, St. Phillips, Martin Grove ...",43.688905,-79.554724
101,M9V,Etobicoke,"South Steeles, Silverstone, Humbergate, Jamest...",43.739416,-79.588437


For this analysis, we will exclude everything outside the city of Toronto itself, keeping only the three boroughs with Toronto in the name. 

In [172]:
import folium
from sklearn.cluster import KMeans


In [173]:
#we'll limit this to the city of Toronto itself, and exclude the other boroughs
df['Borough'].value_counts()
keep_list = ['Downtown Toronto', 'East Toronto', 'West Toronto']
in_toronto_proper = df['Borough'].isin(keep_list)
df = df.loc[in_toronto_proper]

## Mapping
Now we'll put those points on a map and see how they're distributed.

In [174]:
#To create the map, we need the lat/long coords of Toronto, which we can get from geopy, or just average
#the values we have to get close enough

lat, lon = df.Latitude.mean(), df.Longitude.mean()
map_toronto = folium.Map(location = [lat, lon])

# The loop variables deliberately avoid the names `lat`/`lon`, so the map
# centre computed above is not overwritten by the last marker's coordinates
# and stays valid for any later cell that reuses it.
for point_lat, point_lng, borough, neighborhood in zip(df['Latitude'], df['Longitude'],
                                                       df['Borough'], df['Neighbourhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker([point_lat, point_lng],
                        radius=4,
                        popup=label,
                        color='red',
                        fill=True,
                        fill_color='red',
                        fill_opacity=0.4,
                        parse_html=False).add_to(map_toronto)

map_toronto

# Preparing the data
We want to cluster those points, but first we have some preparation to do. We'll one-hot encode the borough names, and then get some information on venues within each neighbourhood.

In [175]:

#we can one-hot encode the boroughs
# one indicator column per borough, named 'Borough_<name>'
borough_dummies = pd.get_dummies(df['Borough'], prefix='Borough')
# reset_index() keeps the previous row labels in a new 'index' column
df = df.join(borough_dummies).reset_index()
df.head()

Unnamed: 0,index,Postal Code,Borough,Neighbourhood,Latitude,Longitude,Borough_Downtown Toronto,Borough_East Toronto,Borough_West Toronto
0,37,M4E,East Toronto,The Beaches,43.676357,-79.293031,0,1,0
1,41,M4K,East Toronto,"The Danforth West, Riverdale",43.679557,-79.352188,0,1,0
2,42,M4L,East Toronto,"India Bazaar, The Beaches West",43.668999,-79.315572,0,1,0
3,43,M4M,East Toronto,Studio District,43.659526,-79.340923,0,1,0
4,50,M4W,Downtown Toronto,Rosedale,43.679563,-79.377529,1,0,0


In [176]:
#we kept the borough column so that we can join data to it, but we will need to drop it before running any ML algorithm on
#the data
import requests

# Foursquare credentials are read from a local CSV so they are not hard-coded here
creds = pd.read_csv(Path(os.getcwd())/'foursquare_creds.csv')

# Explicit positional access to the first data row: client id, client secret, API version.
# (.iloc avoids relying on integer keys falling back to positions on a label-indexed row.)
CLIENT_ID = creds.iloc[0, 0]
CLIENT_SECRET = creds.iloc[0, 1]
version = creds.iloc[0, 2]
LIMIT = 100

def get_location_data(neighbourhood_latitude, neighbourhood_longitude, radius, limit):
    """Query the Foursquare venues/explore endpoint around one point.

    Parameters
    ----------
    neighbourhood_latitude, neighbourhood_longitude : float
        Coordinates sent as the ``ll`` parameter.
    radius : int
        Value of the ``radius`` request parameter.
    limit : int
        Value of the ``limit`` request parameter.

    Returns
    -------
    dict
        The decoded JSON body of the response.

    Notes
    -----
    Uses the module-level ``CLIENT_ID``, ``CLIENT_SECRET`` and ``version``.
    The query string is built by ``requests`` from ``params`` so every value
    is URL-encoded, and a timeout stops the call from blocking indefinitely.
    """
    params = {
        'client_id': CLIENT_ID,
        'client_secret': CLIENT_SECRET,
        'v': version,
        'll': '{},{}'.format(neighbourhood_latitude, neighbourhood_longitude),
        'radius': radius,
        'limit': limit,
    }

    response = requests.get('https://api.foursquare.com/v2/venues/explore',
                            params=params,
                            timeout=30)

    return response.json()

# Fetch the venues around the first neighbourhood as a sample
results = get_location_data(df.loc[0, 'Latitude'], df.loc[0, 'Longitude'], 500, 100)
venues = results['response']['groups'][0]['items']

# transform the JSON items into a pandas dataframe
# pandas.io.json.json_normalize has been deprecated since pandas 1.0 in favour
# of pandas.json_normalize; fall back to the old location only on older pandas.
try:
    json_normalize = pd.json_normalize
except AttributeError:
    from pandas.io.json import json_normalize
nearby_venues = json_normalize(venues)

# function that extracts the category of the venue
def get_category_type(row):
    """Return the name of the first category of a venue record.

    Parameters
    ----------
    row : mapping-like (e.g. a DataFrame row or dict)
        Must provide the category list under 'categories' or, failing that,
        under 'venue.categories'.

    Returns
    -------
    The ``'name'`` of the first category, or ``None`` when the list is empty.

    Raises
    ------
    KeyError
        If neither 'categories' nor 'venue.categories' is present.
    """
    # only a missing key triggers the fallback; other errors are not hidden
    try:
        categories_list = row['categories']
    except KeyError:
        categories_list = row['venue.categories']

    if len(categories_list) == 0:
        return None
    return categories_list[0]['name']

# Replace each venue's category list with the name of its first category
category_names = nearby_venues.apply(get_category_type, axis=1)
nearby_venues['venue.categories'] = category_names

# Keep only the part of every column name after its last dot
nearby_venues.columns = [name.rsplit(".", 1)[-1] for name in nearby_venues.columns]

In [177]:
print('{} venues were returned by Foursquare.'.format(nearby_venues.shape[0]))

6 venues were returned by Foursquare.


Now we can get the lists of venues in our locations:

In [178]:
def getNearbyVenues(names, latitudes, longitudes, radius=500, limit=100):
    """Collect the Foursquare venues around each neighbourhood.

    Parameters
    ----------
    names, latitudes, longitudes : iterables of equal length
        Neighbourhood name and coordinates, iterated in parallel.
    radius : int, optional
        Value of the ``radius`` request parameter (default 500).
    limit : int, optional
        Value of the ``limit`` request parameter (default 100).

    Returns
    -------
    pandas.DataFrame
        One row per returned venue with the columns 'Neighbourhood',
        'Neighbourhood Latitude', 'Neighbourhood Longitude', 'Venue',
        'Venue Latitude', 'Venue Longitude' and 'Venue Category'.
        'Venue Category' is ``None`` for a venue without categories, and the
        frame is empty (but still has these columns) when nothing is returned.

    Notes
    -----
    Uses the module-level ``CLIENT_ID``, ``CLIENT_SECRET`` and ``version``.
    """
    column_names = ['Neighbourhood',
                    'Neighbourhood Latitude',
                    'Neighbourhood Longitude',
                    'Venue',
                    'Venue Latitude',
                    'Venue Longitude',
                    'Venue Category']

    venue_rows = []
    for name, lat, lng in zip(names, latitudes, longitudes):

        # let requests build and URL-encode the query string
        params = {
            'client_id': CLIENT_ID,
            'client_secret': CLIENT_SECRET,
            'v': version,
            'll': '{},{}'.format(lat, lng),
            'radius': radius,
            'limit': limit,
        }

        # make the GET request; the timeout keeps one stalled call from hanging the loop
        response = requests.get('https://api.foursquare.com/v2/venues/explore',
                                params=params,
                                timeout=30)
        results = response.json()["response"]['groups'][0]['items']

        # keep only the relevant information for each nearby venue
        for item in results:
            venue = item['venue']
            # a venue may come back with an empty category list
            categories = venue.get('categories') or []
            category = categories[0]['name'] if categories else None
            venue_rows.append((
                name,
                lat,
                lng,
                venue['name'],
                venue['location']['lat'],
                venue['location']['lng'],
                category))

    # passing the column names to the constructor also works when no venue was found
    nearby_venues = pd.DataFrame(venue_rows, columns=column_names)

    return(nearby_venues)

And finally we can join that venue data back to our neighbourhoods.

In [150]:
toronto_venues = getNearbyVenues(names=df['Neighbourhood'],
                                 latitudes=df['Latitude'],
                                 longitudes=df['Longitude'])

# one hot encoding: one indicator column per venue category
toronto_onehot = pd.get_dummies(toronto_venues[['Venue Category']], prefix="", prefix_sep="")

# A venue category literally named 'Neighborhood' would otherwise be silently
# overwritten by the neighbourhood-name column added below, so rename it first.
toronto_onehot = toronto_onehot.rename(columns={'Neighborhood': 'Neighborhood (Venue Category)'})

# add the neighbourhood name as the first column
toronto_onehot.insert(0, 'Neighborhood', toronto_venues['Neighbourhood'])

# per neighbourhood, the mean of each indicator is that category's share of its venues
toronto_groups = toronto_onehot.groupby('Neighborhood').mean().reset_index()

toronto_groups.head()

Unnamed: 0,Neighborhood,Yoga Studio,Afghan Restaurant,Airport,Airport Food Court,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,Antique Shop,...,Theme Restaurant,Toy / Game Store,Trail,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Vietnamese Restaurant,Wine Bar,Wine Shop,Women's Store
0,Berczy Park,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.017241,0.0,0.0,0.0,0.0,0.0
1,"Brockton, Parkdale Village, Exhibition Place",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,"Business reply mail Processing Centre, South C...",0.058824,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,"CN Tower, King and Spadina, Railway Lands, Har...",0.0,0.0,0.066667,0.066667,0.133333,0.2,0.066667,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,Central Bay Street,0.015625,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.015625,0.0,0.0,0.015625,0.0,0.0


Now, we look for the most common venues in each location, to get an idea of what the most important aspects of each neighbourhood are.

In [179]:
def return_most_common_venues(row, num_top_venues):
    """Return the labels of the highest-valued entries of ``row``.

    The first element of ``row`` is skipped; the remaining values are sorted
    in descending order and the index labels of the first ``num_top_venues``
    of them are returned as an array.
    """
    ranked = row.iloc[1:].sort_values(ascending=False)
    return ranked.index.values[:num_top_venues]

num_top_venues = 10

indicators = ['st', 'nd', 'rd']

def ordinal_label(rank):
    """Return ``rank`` followed by its English ordinal suffix.

    Examples: 1 -> '1st', 2 -> '2nd', 3 -> '3rd', 4 -> '4th',
    11 -> '11th', 21 -> '21st', 112 -> '112th'.
    """
    last_two = rank % 100
    last_one = rank % 10
    # 11th, 12th and 13th are irregular; every other number follows its last digit
    if 11 <= last_two <= 13 or not 1 <= last_one <= 3:
        return '{}th'.format(rank)
    return '{}{}'.format(rank, indicators[last_one - 1])

# create columns according to number of top venues
columns = ['Neighborhood'] + [
    '{} Most Common Venue'.format(ordinal_label(rank))
    for rank in range(1, num_top_venues + 1)
]

# Rank the venue categories of every neighbourhood row up front
top_venues_by_row = [
    return_most_common_venues(toronto_groups.iloc[position, :], num_top_venues)
    for position in range(toronto_groups.shape[0])
]

# Start from an empty frame with the ranking columns, add the neighbourhood
# names, then fill in each row's ranked categories
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighborhood'] = toronto_groups['Neighborhood']

for position, top_venues in enumerate(top_venues_by_row):
    neighborhoods_venues_sorted.iloc[position, 1:] = top_venues

neighborhoods_venues_sorted.head()

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Berczy Park,Coffee Shop,Café,Cheese Shop,Bakery,Seafood Restaurant,Farmers Market,Restaurant,Beer Bar,Cocktail Bar,Liquor Store
1,"Brockton, Parkdale Village, Exhibition Place",Café,Breakfast Spot,Coffee Shop,Performing Arts Venue,Climbing Gym,Burrito Place,Restaurant,Italian Restaurant,Intersection,Bar
2,"Business reply mail Processing Centre, South C...",Light Rail Station,Yoga Studio,Pizza Place,Fast Food Restaurant,Recording Studio,Burrito Place,Auto Workshop,Restaurant,Spa,Brewery
3,"CN Tower, King and Spadina, Railway Lands, Har...",Airport Service,Airport Lounge,Sculpture Garden,Plane,Airport,Airport Food Court,Airport Terminal,Boat or Ferry,Coffee Shop,Bar
4,Central Bay Street,Coffee Shop,Café,Italian Restaurant,Sandwich Place,Burger Joint,Department Store,Salad Place,Japanese Restaurant,Bubble Tea Shop,Hotel


# 3. Clustering
With our data prepared, we will group the neighbourhoods by the types of venues they contain.

In [180]:
# set number of clusters - 4 works well here
kclusters = 4

# the neighbourhood name is an identifier, not a feature, so it is left out of the clustering input
# (keyword form: the positional axis argument of drop() is no longer accepted by current pandas)
toronto_groups_clustering = toronto_groups.drop(columns='Neighborhood')

# run k-means clustering
# random_state fixes the initialisation; n_init is pinned explicitly so the
# result does not depend on the installed scikit-learn's default for it
kmeans = KMeans(n_clusters=kclusters, random_state=0, n_init=10).fit(toronto_groups_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10] 

array([2, 2, 2, 3, 0, 2, 0, 0, 2, 0])

In [181]:
# add clustering labels
# Remove labels left over from a previous run first, so re-executing this cell
# does not fail with "cannot insert Cluster Labels, already exists".
neighborhoods_venues_sorted = neighborhoods_venues_sorted.drop(columns='Cluster Labels', errors='ignore')
neighborhoods_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

# use the same spelling of the key column as the venue ranking table
toronto_merged = df.rename(columns={'Neighbourhood':'Neighborhood'})

# attach the cluster label and ranked venue categories to each neighbourhood's
# postal code and latitude/longitude; rows with no match get missing values
toronto_merged = toronto_merged.join(neighborhoods_venues_sorted.set_index('Neighborhood'), on='Neighborhood')

toronto_merged.head() # check the last columns!

Unnamed: 0,index,Postal Code,Borough,Neighborhood,Latitude,Longitude,Borough_Downtown Toronto,Borough_East Toronto,Borough_West Toronto,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,37,M4E,East Toronto,The Beaches,43.676357,-79.293031,0,1,0,0,Coffee Shop,Trail,Pub,Health Food Store,Asian Restaurant,Comfort Food Restaurant,Cupcake Shop,Donut Shop,Doner Restaurant,Dog Run
1,41,M4K,East Toronto,"The Danforth West, Riverdale",43.679557,-79.352188,0,1,0,2,Greek Restaurant,Coffee Shop,Italian Restaurant,Ice Cream Shop,Furniture / Home Store,Restaurant,Bubble Tea Shop,Bakery,Pub,Pizza Place
2,42,M4L,East Toronto,"India Bazaar, The Beaches West",43.668999,-79.315572,0,1,0,0,Fast Food Restaurant,Fish & Chips Shop,Brewery,Board Shop,Liquor Store,Restaurant,Italian Restaurant,Burrito Place,Pub,Ice Cream Shop
3,43,M4M,East Toronto,Studio District,43.659526,-79.340923,0,1,0,2,Café,Coffee Shop,Gastropub,American Restaurant,Brewery,Bakery,Latin American Restaurant,Diner,Seafood Restaurant,Bookstore
4,50,M4W,Downtown Toronto,Rosedale,43.679563,-79.377529,1,0,0,1,Park,Playground,Trail,Cuban Restaurant,Donut Shop,Doner Restaurant,Dog Run,Distribution Center,Discount Store,Diner


In [182]:
toronto_merged.dropna()['Cluster Labels'].unique()

array([0, 2, 1, 3], dtype=int64)

# Visualising Clusters
With our neighbourhoods clustered, let's see what the results look like on the map.

In [183]:
# create map
import matplotlib.cm as cm
import matplotlib.colors as colors

# Centre the map on the mean coordinates of the neighbourhoods. This is computed
# from the data here rather than read from `lat`/`lon`, so the cell does not
# depend on those names still holding the right values from an earlier cell.
map_clusters = folium.Map(location=[df['Latitude'].mean(), df['Longitude'].mean()], zoom_start=11)

# set color scheme for the clusters: one evenly spaced rainbow colour per cluster
colors_array = cm.rainbow(np.linspace(0, 1, kclusters))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
toronto_merged = toronto_merged.dropna() #rows without a cluster label cannot be coloured
for marker_lat, marker_lon, poi, cluster in zip(toronto_merged['Latitude'], toronto_merged['Longitude'],
                                                toronto_merged['Neighborhood'], toronto_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    # cluster - 1 wraps cluster 0 round to the last colour in the list
    marker_colour = rainbow[int(cluster-1)]
    folium.CircleMarker(
        [marker_lat, marker_lon],
        radius=5,
        popup=label,
        color=marker_colour,
        fill=True,
        fill_color=marker_colour,
        fill_opacity=0.7).add_to(map_clusters)

map_clusters

## Examination
And now at last we see what was in each cluster.

In [157]:

def show_cluster(label):
    """Return the rows of ``toronto_merged`` whose 'Cluster Labels' equals ``label``.

    Only the column at position 1 and the columns from position 5 onwards are
    kept. Reads the module-level ``toronto_merged`` frame.
    """
    in_cluster = toronto_merged['Cluster Labels'] == label
    shown_columns = toronto_merged.columns[[1] + list(range(5, toronto_merged.shape[1]))]
    # a single .loc selects both the rows and the columns
    return toronto_merged.loc[in_cluster, shown_columns]
show_cluster(0)

Unnamed: 0,Postal Code,Longitude,Borough_Downtown Toronto,Borough_East Toronto,Borough_West Toronto,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,M4E,-79.293031,0,1,0,0,Coffee Shop,Trail,Pub,Health Food Store,Asian Restaurant,Comfort Food Restaurant,Cupcake Shop,Donut Shop,Doner Restaurant,Dog Run
2,M4L,-79.315572,0,1,0,0,Fast Food Restaurant,Fish & Chips Shop,Brewery,Board Shop,Liquor Store,Restaurant,Italian Restaurant,Burrito Place,Pub,Ice Cream Shop
5,M4X,-79.367675,1,0,0,0,Coffee Shop,Café,Park,Bakery,Pub,Restaurant,Italian Restaurant,Pizza Place,Caribbean Restaurant,Pet Store
6,M4Y,-79.38316,1,0,0,0,Coffee Shop,Japanese Restaurant,Sushi Restaurant,Gay Bar,Restaurant,Yoga Studio,Dance Studio,Bubble Tea Shop,Pub,Hotel
7,M5A,-79.360636,1,0,0,0,Coffee Shop,Bakery,Park,Pub,Café,Theater,Breakfast Spot,Electronics Store,Hotel,Spa
8,M5B,-79.378937,1,0,0,0,Clothing Store,Coffee Shop,Bubble Tea Shop,Café,Japanese Restaurant,Cosmetics Shop,Italian Restaurant,Hotel,Pizza Place,Bookstore
11,M5G,-79.387383,1,0,0,0,Coffee Shop,Café,Italian Restaurant,Sandwich Place,Burger Joint,Department Store,Salad Place,Japanese Restaurant,Bubble Tea Shop,Hotel
12,M5H,-79.384568,1,0,0,0,Coffee Shop,Café,Clothing Store,Hotel,Restaurant,Gym,Thai Restaurant,Deli / Bodega,Steakhouse,American Restaurant
13,M5J,-79.381752,1,0,0,0,Coffee Shop,Aquarium,Café,Hotel,Fried Chicken Joint,Restaurant,Scenic Lookout,Brewery,Park,Music Venue
14,M5K,-79.381576,1,0,0,0,Coffee Shop,Hotel,Café,Restaurant,Seafood Restaurant,Japanese Restaurant,Salad Place,American Restaurant,Tea Room,Sporting Goods Shop


In [163]:
show_cluster(1)

Unnamed: 0,Postal Code,Longitude,Borough_Downtown Toronto,Borough_East Toronto,Borough_West Toronto,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
4,M4W,-79.377529,1,0,0,1,Park,Playground,Trail,Cuban Restaurant,Donut Shop,Doner Restaurant,Dog Run,Distribution Center,Discount Store,Diner


In [159]:
show_cluster(2)

Unnamed: 0,Postal Code,Longitude,Borough_Downtown Toronto,Borough_East Toronto,Borough_West Toronto,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
1,M4K,-79.352188,0,1,0,2,Greek Restaurant,Coffee Shop,Italian Restaurant,Ice Cream Shop,Furniture / Home Store,Restaurant,Bubble Tea Shop,Bakery,Pub,Pizza Place
3,M4M,-79.340923,0,1,0,2,Café,Coffee Shop,Gastropub,American Restaurant,Brewery,Bakery,Latin American Restaurant,Diner,Seafood Restaurant,Bookstore
9,M5C,-79.375418,1,0,0,2,Café,Coffee Shop,Restaurant,Clothing Store,American Restaurant,Cosmetics Shop,Cocktail Bar,Farmers Market,Gym,Italian Restaurant
10,M5E,-79.373306,1,0,0,2,Coffee Shop,Café,Cheese Shop,Bakery,Seafood Restaurant,Farmers Market,Restaurant,Beer Bar,Cocktail Bar,Liquor Store
16,M5S,-79.400049,1,0,0,2,Café,Japanese Restaurant,Restaurant,Bar,Bookstore,Sandwich Place,Bakery,Beer Bar,Beer Store,Italian Restaurant
17,M5T,-79.400049,1,0,0,2,Café,Coffee Shop,Vegetarian / Vegan Restaurant,Bar,Vietnamese Restaurant,Mexican Restaurant,Gaming Cafe,Park,Pizza Place,Grocery Store
21,M6G,-79.422564,1,0,0,2,Grocery Store,Café,Park,Restaurant,Baby Store,Athletics & Sports,Candy Store,Italian Restaurant,Diner,Nightclub
22,M6H,-79.442259,0,0,1,2,Pizza Place,Bakery,Pharmacy,Music Venue,Middle Eastern Restaurant,Supermarket,Bar,Café,Bank,Brewery
23,M6J,-79.41975,0,0,1,2,Bar,Coffee Shop,Vietnamese Restaurant,Asian Restaurant,Vegetarian / Vegan Restaurant,Restaurant,Café,Men's Store,Greek Restaurant,Italian Restaurant
24,M6K,-79.428191,0,0,1,2,Café,Breakfast Spot,Coffee Shop,Performing Arts Venue,Climbing Gym,Burrito Place,Restaurant,Italian Restaurant,Intersection,Bar


In [160]:
show_cluster(3)

Unnamed: 0,Postal Code,Longitude,Borough_Downtown Toronto,Borough_East Toronto,Borough_West Toronto,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
18,M5V,-79.39442,1,0,0,3,Airport Service,Airport Lounge,Sculpture Garden,Plane,Airport,Airport Food Court,Airport Terminal,Boat or Ferry,Coffee Shop,Bar


## Cluster Details

Looking at our clusters, we see that the park and the airport have been picked out as outliers, which makes sense, as they are quite different from the rest of the city. 

Our largest two groups, the red and light blue clusters, need a little more examination, as they are similar at first glance.

Cluster 0, the red group, contains many coffee shops and restaurants, but also some hotels, and what look like shopping areas or centres. This is consistent with a city centre, which we do see the red points are found in on the map.

Cluster 2, the light blue group, similarly contains a variety of restaurants and cafes, but we can see that there are also grocery stores, a train station, and a variety of other locations we might expect to be near residential areas - again, our blue points do appear to be located in areas that look residential.

From these points then, it looks like our algorithm has successfully separated the city centre from residential areas, while detecting the airport and parks as the outliers that they do indeed represent in this map.