# Clustering neighborhoods in Toronto #

## Part 1: data scraping ##

First, we install the requests and lxml packages; then we import BeautifulSoup, requests and pandas.

In [1]:
!pip install requests
!pip install lxml



In [2]:
from bs4 import BeautifulSoup
import requests
import pandas as pd

We download the Wikipedia page and scrape it for the table of postal codes.
We read the data into a dataframe df, drop the rows without an assigned borough, fill in missing neighborhood names with the borough name,
and then group the Neighborhoods by PostalCode and Borough, assigning the result to a new dataframe df_group.

In [3]:
# Download the Wikipedia page that lists the "M" postal codes.
response = requests.get("https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M")
# Fail loudly on an HTTP error instead of silently parsing an error page.
response.raise_for_status()
html_file = response.text
soup = BeautifulSoup(html_file, 'lxml')
# The first <table> element of the page is taken as the postal-code table.
toronto_html = soup.find('table')
if toronto_html is None:
    raise ValueError('No <table> element found on the postal codes page')
rows_html = toronto_html.find_all('tr')
# One list of cell texts per row; rows without any <td> cell (e.g. a header row) are skipped.
array_toronto = [
    [cell.text.rstrip() for cell in cells]
    for cells in (line.find_all('td') for line in rows_html)
    if len(cells) > 0
]
df = pd.DataFrame(array_toronto, columns=['PostalCode', 'Borough', 'Neighborhood'])
# Take an explicit copy of the filtered rows so the assignment below writes to df itself
# rather than to a slice of the unfiltered frame (avoids SettingWithCopyWarning).
df = df[df['Borough'] != 'Not assigned'].copy()
# A neighborhood that is "Not assigned" takes the name of its borough.
unnamed = df['Neighborhood'] == 'Not assigned'
df.loc[unnamed, 'Neighborhood'] = df.loc[unnamed, 'Borough']
# One row per (PostalCode, Borough), with its neighborhoods joined by commas.
df_group = df.groupby(['PostalCode', 'Borough'])['Neighborhood'].apply(','.join).reset_index()
df_group.head(20)

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1B,Scarborough,"Rouge,Malvern"
1,M1C,Scarborough,"Highland Creek,Rouge Hill,Port Union"
2,M1E,Scarborough,"Guildwood,Morningside,West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae
5,M1J,Scarborough,Scarborough Village
6,M1K,Scarborough,"East Birchmount Park,Ionview,Kennedy Park"
7,M1L,Scarborough,"Clairlea,Golden Mile,Oakridge"
8,M1M,Scarborough,"Cliffcrest,Cliffside,Scarborough Village West"
9,M1N,Scarborough,"Birch Cliff,Cliffside West"


In [4]:
# (rows, columns) of the grouped postal-code table
df_group.shape

(103, 3)

## Part 2: getting latitude, longitude using geocoder ##

We install and import geocoder

In [5]:
!pip install geocoder



In [6]:
import geocoder

We create a function queryCoordinates that takes the postal code as a parameter and returns the pair (latitude, longitude).
We use geocoder.geocodefarm because it is more reliable than Google.

In [7]:
def queryCoordinates(postal_code, max_attempts=10, retry_delay=1.0):
    """Look up the coordinates of a Toronto postal code with geocoder.geocodefarm.

    The query is retried while the geocoder reports no coordinates, but only
    up to ``max_attempts`` times, so an unreachable or exhausted service
    produces an error instead of an endless loop.

    Parameters
    ----------
    postal_code : str
        Postal code to look up; it is queried as "<postal_code>,Toronto, Ontario".
    max_attempts : int, optional
        Maximum number of queries before giving up.
    retry_delay : float, optional
        Seconds to wait between two consecutive attempts.

    Returns
    -------
    The ``latlng`` value reported by geocoder; item 0 is the latitude and
    item 1 the longitude.

    Raises
    ------
    RuntimeError
        If no coordinates were obtained after ``max_attempts`` queries.
    """
    import time  # local import: only needed for the pause between retries

    query = '{},Toronto, Ontario'.format(postal_code)
    for attempt in range(max_attempts):
        lat_lng_coords = geocoder.geocodefarm(query).latlng
        if lat_lng_coords is not None:
            return(lat_lng_coords)
        # Pause before asking again, except after the final attempt.
        if attempt + 1 < max_attempts:
            time.sleep(retry_delay)
    raise RuntimeError(
        'No coordinates found for {!r} after {} attempts'.format(postal_code, max_attempts))

We add 2 columns (latitude, longitude) to our dataframe, with default value 0.0

In [8]:
# Placeholder coordinate columns, filled with 0.0 until each postal code is geocoded.
for coord_column in ('Latitude', 'Longitude'):
    df_group[coord_column] = 0.0
df_group.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge,Malvern",0.0,0.0
1,M1C,Scarborough,"Highland Creek,Rouge Hill,Port Union",0.0,0.0
2,M1E,Scarborough,"Guildwood,Morningside,West Hill",0.0,0.0
3,M1G,Scarborough,Woburn,0.0,0.0
4,M1H,Scarborough,Cedarbrae,0.0,0.0


Now we iterate over df_group and get the coordinates for each postal code

In [9]:
# Geocode every postal code and store the coordinates on its row.
for index, row in df_group.iterrows():
    # Select the postal code by label: an integer key on a label-indexed Series
    # (row[0]) is positional access, which pandas has deprecated.
    postcode = row['PostalCode']
    coordinates = queryCoordinates(postcode)
    df_group.at[index, 'Latitude'] = coordinates[0]
    df_group.at[index, 'Longitude'] = coordinates[1]
df_group

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge,Malvern",43.810154,-79.194603
1,M1C,Scarborough,"Highland Creek,Rouge Hill,Port Union",43.784672,-79.158958
2,M1E,Scarborough,"Guildwood,Morningside,West Hill",43.766289,-79.172890
3,M1G,Scarborough,Woburn,43.768288,-79.214111
4,M1H,Scarborough,Cedarbrae,43.769180,-79.238770
5,M1J,Scarborough,Scarborough Village,43.743938,-79.231354
6,M1K,Scarborough,"East Birchmount Park,Ionview,Kennedy Park",43.725803,-79.262848
7,M1L,Scarborough,"Clairlea,Golden Mile,Oakridge",43.716656,-79.286537
8,M1M,Scarborough,"Cliffcrest,Cliffside,Scarborough Village West",43.723576,-79.234451
9,M1N,Scarborough,"Birch Cliff,Cliffside West",43.697884,-79.258759


## Part 3: clustering ##

First, we import Kmeans clustering from sklearn, and folium

In [10]:
!conda install -c conda-forge folium=0.5.0 --yes 
from sklearn.cluster import KMeans
import folium

Solving environment: / ^C
failed

CondaError: KeyboardInterrupt



#### Let's create a map of Toronto with Postal code areas superimposed on top ####
We create a map centered on Toronto.
We filter the Boroughs that have the text "Toronto" as part of their name and assign them to a new dataframe df_toronto

In [11]:
# Coordinates the Toronto maps are centred on.
latitude_t, longitude_t = 43.651070, -79.347015
map_toronto = folium.Map(location=[latitude_t, longitude_t], zoom_start=11)
# Keep only the postal codes whose borough name contains "Toronto".
in_toronto = df_group['Borough'].str.find('Toronto') != -1
df_toronto = df_group[in_toronto].reset_index(drop=True)
df_toronto

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M4E,East Toronto,The Beaches,43.679611,-79.295692
1,M4K,East Toronto,"The Danforth West,Riverdale",43.682327,-79.355797
2,M4L,East Toronto,"The Beaches West,India Bazaar",43.670906,-79.316269
3,M4M,East Toronto,Studio District,43.657566,-79.340492
4,M4N,Central Toronto,Lawrence Park,43.728504,-79.388443
5,M4P,Central Toronto,Davisville North,43.71302,-79.388565
6,M4R,Central Toronto,North Toronto West,43.714615,-79.406532
7,M4S,Central Toronto,Davisville,43.703163,-79.385895
8,M4T,Central Toronto,"Moore Park,Summerhill East",43.690735,-79.383003
9,M4V,Central Toronto,"Deer Park,Forest Hill SE,Rathnelly,South Hill,...",43.686035,-79.402267


Now we add one marker per Postal Code and display the map

In [12]:
# add one circle marker per postal code to the map
for lat, lng, borough, neighborhood in zip(df_toronto['Latitude'], df_toronto['Longitude'], df_toronto['Borough'], df_toronto['Neighborhood']):
    label = '{}, {}'.format(neighborhood, borough)
    # parse_html is an option of folium.Popup; it was previously also passed to
    # CircleMarker, where it is not one of the marker's style options.
    popup = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=popup,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7).add_to(map_toronto)

map_toronto

#### Define Foursquare Credentials and Version ####
Hidden cell with my keys.

In [13]:
# The code was removed by Watson Studio for sharing.

Let's define some utility functions. First, a function to get the category of a venue

In [14]:
# function that extracts the category of the venue
def get_category_type(row):
    """Return the name of the first category of a venue record.

    Parameters
    ----------
    row : mapping-like
        Venue record indexable by key. The categories are read from
        ``row['categories']`` or, when that key is missing, from
        ``row['venue.categories']``.

    Returns
    -------
    The ``'name'`` entry of the first category, or ``None`` when the record
    has no categories.

    Raises
    ------
    KeyError
        If neither of the two keys is present.
    """
    try:
        categories_list = row['categories']
    except KeyError:
        # Only a missing key triggers the fallback; any other error is a real
        # problem and is allowed to propagate.
        categories_list = row['venue.categories']

    if categories_list is None or len(categories_list) == 0:
        return None
    return categories_list[0]['name']

Another function to get venues in a Borough

In [15]:
def getNearbyVenues(codes, names, latitudes, longitudes, radius=500):
    """Query the Foursquare "explore" endpoint around each location.

    Relies on the notebook-level names CLIENT_ID, CLIENT_SECRET, VERSION and
    LIMIT for the API credentials, API version and per-request venue limit.

    Parameters
    ----------
    codes, names, latitudes, longitudes : iterables of equal length
        Postal code, neighborhood name, latitude and longitude of each
        location; they are consumed in parallel.
    radius : int, optional
        Search radius passed to the API as the ``radius`` query parameter.

    Returns
    -------
    pandas.DataFrame
        One row per venue with the columns PostalCode, Neighborhood,
        Neighborhood Latitude, Neighborhood Longitude, Venue, Venue Latitude,
        Venue Longitude and Venue Category. The frame has these columns even
        when no venue was found; a venue without categories gets a missing
        Venue Category.

    Raises
    ------
    requests.HTTPError
        If the API answers with an HTTP error status.
    """
    columns = ['PostalCode', 'Neighborhood',
               'Neighborhood Latitude',
               'Neighborhood Longitude',
               'Venue',
               'Venue Latitude',
               'Venue Longitude',
               'Venue Category']
    venues_list = []
    for code, name, lat, lng in zip(codes, names, latitudes, longitudes):
        print(code, name)
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID,
            CLIENT_SECRET,
            VERSION,
            lat,
            lng,
            radius,
            LIMIT)
        # make the GET request; surface HTTP errors instead of failing later
        # with an obscure KeyError on the JSON payload
        response = requests.get(url)
        response.raise_for_status()
        groups = response.json()["response"]['groups']
        results = groups[0]['items'] if groups else []
        # keep only the relevant information for each nearby venue
        for v in results:
            venue = v['venue']
            # a venue may come without any category; record it as missing
            categories = venue.get('categories') or []
            venues_list.append((
                code,
                name,
                lat,
                lng,
                venue['name'],
                venue['location']['lat'],
                venue['location']['lng'],
                categories[0]['name'] if categories else None))
    # naming the columns in the constructor also works when no venue was found
    nearby_venues = pd.DataFrame(venues_list, columns=columns)
    return(nearby_venues)

In [16]:
# Collect the nearby venues of every Toronto postal code (default search radius).
toronto_venues = getNearbyVenues(
    codes=df_toronto['PostalCode'],
    names=df_toronto['Neighborhood'],
    latitudes=df_toronto['Latitude'],
    longitudes=df_toronto['Longitude'],
)

M4E The Beaches
M4K The Danforth West,Riverdale
M4L The Beaches West,India Bazaar
M4M Studio District
M4N Lawrence Park
M4P Davisville North
M4R North Toronto West
M4S Davisville
M4T Moore Park,Summerhill East
M4V Deer Park,Forest Hill SE,Rathnelly,South Hill,Summerhill West
M4W Rosedale
M4X Cabbagetown,St. James Town
M4Y Church and Wellesley
M5A Harbourfront,Regent Park
M5B Ryerson,Garden District
M5C St. James Town
M5E Berczy Park
M5G Central Bay Street
M5H Adelaide,King,Richmond
M5J Harbourfront East,Toronto Islands,Union Station
M5K Design Exchange,Toronto Dominion Centre
M5L Commerce Court,Victoria Hotel
M5N Roselawn
M5P Forest Hill North,Forest Hill West
M5R The Annex,North Midtown,Yorkville
M5S Harbord,University of Toronto
M5T Chinatown,Grange Park,Kensington Market
M5V CN Tower,Bathurst Quay,Island airport,Harbourfront West,King and Spadina,Railway Lands,South Niagara
M5W Stn A PO Boxes 25 The Esplanade
M5X First Canadian Place,Underground city
M6G Christie
M6H Dovercourt Vill

__What is the shape of this dataframe (number of venues, columns)?__

Also, let's take a look at the first 5 rows

In [17]:
# Shape as (rows, columns), followed by a preview of the first five rows
print(toronto_venues.shape)
toronto_venues.head()

(1719, 8)


Unnamed: 0,PostalCode,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,M4E,The Beaches,43.679611,-79.295692,The Big Carrot Natural Food Market,43.678879,-79.297734,Health Food Store
1,M4E,The Beaches,43.679611,-79.295692,Glen Manor Ravine,43.676821,-79.293942,Trail
2,M4E,The Beaches,43.679611,-79.295692,Beaches Bake Shop,43.680363,-79.289692,Bakery
3,M4E,The Beaches,43.679611,-79.295692,Grover Pub and Grub,43.679181,-79.297215,Pub
4,M4E,The Beaches,43.679611,-79.295692,Domino's Pizza,43.679058,-79.297382,Pizza Place


__How many venues per neighborhood did we get from Foursquare?__

Let's take a look

In [18]:
# Non-null entries per column for each neighborhood name; when no value is
# missing, every column shows that neighborhood's number of venues.
toronto_venues.groupby('Neighborhood').count()

Unnamed: 0_level_0,PostalCode,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
Neighborhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
"Adelaide,King,Richmond",100,100,100,100,100,100,100
Berczy Park,15,15,15,15,15,15,15
"Brockton,Exhibition Place,Parkdale Village",66,66,66,66,66,66,66
Business Reply Mail Processing Centre 969 Eastern,100,100,100,100,100,100,100
"CN Tower,Bathurst Quay,Island airport,Harbourfront West,King and Spadina,Railway Lands,South Niagara",71,71,71,71,71,71,71
"Cabbagetown,St. James Town",46,46,46,46,46,46,46
Central Bay Street,99,99,99,99,99,99,99
"Chinatown,Grange Park,Kensington Market",93,93,93,93,93,93,93
Christie,10,10,10,10,10,10,10
Church and Wellesley,86,86,86,86,86,86,86


How many categories?

In [19]:
# Number of distinct venue categories; dropna=False mirrors len(unique()),
# which would also count a missing value as one entry.
category_count = toronto_venues['Venue Category'].nunique(dropna=False)
print('There are {} uniques categories.'.format(category_count))

There are 214 uniques categories.


### Analyzing Each Neighborhood

In [20]:
# one hot encoding: one indicator column per venue category
toronto_onehot = pd.get_dummies(toronto_venues[['Venue Category']], prefix="", prefix_sep="")
# A venue category named "Neighborhood" would clash with the label column added
# below, so it is removed when present; errors='ignore' keeps the cell working
# when no venue carries that category (a plain drop raises KeyError then).
toronto_onehot = toronto_onehot.drop(columns=['Neighborhood'], errors='ignore')
# put the postal code and neighborhood labels back as the two leftmost columns
toronto_onehot.insert(0, 'PostalCode', toronto_venues['PostalCode'])
toronto_onehot.insert(1, 'Neighborhood', toronto_venues['Neighborhood'])

toronto_onehot.head()

Unnamed: 0,PostalCode,Neighborhood,American Restaurant,Art Gallery,Art Museum,Arts & Crafts Store,Asian Restaurant,Athletics & Sports,BBQ Joint,Baby Store,...,Toy / Game Store,Trail,Train Station,Tunnel,Vegetarian / Vegan Restaurant,Video Game Store,Vietnamese Restaurant,Wine Bar,Wings Joint,Yoga Studio
0,M4E,The Beaches,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,M4E,The Beaches,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
2,M4E,The Beaches,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,M4E,The Beaches,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,M4E,The Beaches,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


And let's examine the new dataframe size.

In [21]:
# (rows, columns): one row per venue, two label columns plus one column per category
toronto_onehot.shape

(1719, 215)

__Next, let's group the rows by postal code and neighborhood, taking the mean of the frequency of occurrence of each category__

In [22]:
# Average the one-hot indicators per (PostalCode, Neighborhood): each value is
# the share of that area's venues falling in the category.
toronto_grouped = toronto_onehot.groupby(['PostalCode','Neighborhood']).mean().reset_index()
toronto_grouped

Unnamed: 0,PostalCode,Neighborhood,American Restaurant,Art Gallery,Art Museum,Arts & Crafts Store,Asian Restaurant,Athletics & Sports,BBQ Joint,Baby Store,...,Toy / Game Store,Trail,Train Station,Tunnel,Vegetarian / Vegan Restaurant,Video Game Store,Vietnamese Restaurant,Wine Bar,Wings Joint,Yoga Studio
0,M4E,The Beaches,0.0,0.0,0.0,0.0,0.1,0.0,0.0,0.0,...,0.0,0.1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,M4K,"The Danforth West,Riverdale",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,M4L,"The Beaches West,India Bazaar",0.0,0.0,0.0,0.0,0.034483,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,M4M,Studio District,0.032258,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,M4N,Lawrence Park,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,M4P,Davisville North,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,M4R,North Toronto West,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,M4S,Davisville,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.038462,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,M4T,"Moore Park,Summerhill East",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,M4V,"Deer Park,Forest Hill SE,Rathnelly,South Hill,...",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


__Size of this dataframe:__

In [23]:
# (rows, columns): one row per (PostalCode, Neighborhood) group
toronto_grouped.shape

(38, 215)

Top 5 venue categories per Neighborhood

In [24]:
num_top_venues = 5

# Walk the grouped frame row by row, so each postal code is reported from its
# own row even if two postal codes share a neighborhood name (filtering on the
# name would return several rows and break the two-column layout).
for _, grouped_row in toronto_grouped.iterrows():
    print("----"+grouped_row['PostalCode']+":"+grouped_row['Neighborhood']+"----")
    # category frequencies of this row, without the two label entries
    freq = grouped_row.drop(['PostalCode', 'Neighborhood']).astype(float).round(2)
    temp = (freq.sort_values(ascending=False)
                .head(num_top_venues)
                .rename_axis('venue')
                .reset_index(name='freq'))
    print(temp)
    print('\n')

----M4E:The Beaches----
         venue  freq
0          Pub   0.1
1       Bakery   0.1
2       Church   0.1
3  Coffee Shop   0.1
4         Park   0.1


----M4K:The Danforth West,Riverdale----
                 venue  freq
0                 Café  0.25
1       Discount Store  0.25
2                 Park  0.25
3        Grocery Store  0.25
4  American Restaurant  0.00


----M4L:The Beaches West,India Bazaar----
                venue  freq
0  Italian Restaurant  0.07
1           Pet Store  0.07
2                Café  0.07
3                 Pub  0.03
4       Movie Theater  0.03


----M4M:Studio District----
                       venue  freq
0                Coffee Shop  0.13
1         Italian Restaurant  0.06
2                       Café  0.06
3  Latin American Restaurant  0.03
4            Coworking Space  0.03


----M4N:Lawrence Park----
                       venue  freq
0                Swim School   0.5
1                   Bus Line   0.5
2        American Restaurant   0.0
3             

__Let's write a function to sort the venues in descending order__

In [25]:
def return_most_common_venues(row, num_top_venues):
    """Return the labels of the highest-valued entries of ``row``.

    The first entry of ``row`` is skipped; the remaining entries are sorted
    by value in descending order and the index labels of the first
    ``num_top_venues`` of them are returned as an array.
    """
    ranked = row.iloc[1:].sort_values(ascending=False)
    top_labels = ranked.index.values
    return top_labels[:num_top_venues]

__Now let's create the new dataframe and display the top 10 venues for each neighborhood__

In [26]:
import numpy as np
num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['PostalCode','Neighborhood']
for ind in np.arange(num_top_venues):
    # 1st, 2nd and 3rd take their own suffix, every later rank uses "th";
    # an explicit test replaces the bare except that used to pick the suffix
    suffix = indicators[ind] if ind < len(indicators) else 'th'
    columns.append('{}{} Most Common Venue'.format(ind+1, suffix))

# create a new dataframe with one row per grouped (PostalCode, Neighborhood)
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['PostalCode'] = toronto_grouped['PostalCode']
neighborhoods_venues_sorted['Neighborhood'] = toronto_grouped['Neighborhood']

# The slice passed to return_most_common_venues starts at the Neighborhood
# column, which that function skips before ranking the category frequencies.
for ind in np.arange(toronto_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 2:] = return_most_common_venues(toronto_grouped.iloc[ind, 1:], num_top_venues)

neighborhoods_venues_sorted.head()

Unnamed: 0,PostalCode,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,M4E,The Beaches,Other Great Outdoors,Pub,Health Food Store,Pizza Place,Asian Restaurant,Park,Church,Trail,Coffee Shop,Bakery
1,M4K,"The Danforth West,Riverdale",Café,Park,Grocery Store,Discount Store,Yoga Studio,Electronics Store,Flea Market,Fish Market,Fish & Chips Shop,Fast Food Restaurant
2,M4L,"The Beaches West,India Bazaar",Café,Pet Store,Italian Restaurant,Fish & Chips Shop,Ice Cream Shop,Burger Joint,Pub,Brewery,Movie Theater,Sushi Restaurant
3,M4M,Studio District,Coffee Shop,Café,Italian Restaurant,American Restaurant,Convenience Store,Cheese Shop,Chinese Restaurant,Seafood Restaurant,Clothing Store,Sandwich Place
4,M4N,Lawrence Park,Bus Line,Swim School,Yoga Studio,Ethiopian Restaurant,Flower Shop,Flea Market,Fish Market,Fish & Chips Shop,Fast Food Restaurant,Farmers Market


## Finally, We can do clustering! ##

#### We try k-means with k=5

In [27]:
# set number of clusters
kclusters = 5

# keep only the category frequencies as clustering features; columns= replaces
# the positional axis argument (drop(labels, 1)), which pandas 2.0 removed
toronto_grouped_clustering = toronto_grouped.drop(columns=['PostalCode','Neighborhood'])

# run k-means clustering (fixed random_state for reproducible labels)
kmeans = KMeans(n_clusters=kclusters, random_state=0, n_init =100).fit(toronto_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_ 

array([0, 2, 0, 0, 1, 4, 4, 0, 4, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       3, 4, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0], dtype=int32)

Number of elements per cluster:

In [28]:
from itertools import groupby
# Sorting puts equal labels next to each other, so every run produced by
# groupby covers all the rows of one cluster.
sorted_labels = sorted(kmeans.labels_)
results_count = {label: len(list(run)) for label, run in groupby(sorted_labels)}
results_count

{0: 29, 1: 1, 2: 2, 3: 1, 4: 5}

Let's create a new dataframe that includes the cluster as well as the top 10 venues for each neighborhood.

In [29]:
# add clustering labels as the first column; when the column already exists it
# is overwritten instead, so re-running this cell does not raise
# "cannot insert Cluster Labels, already exists"
if 'Cluster Labels' in neighborhoods_venues_sorted.columns:
    neighborhoods_venues_sorted['Cluster Labels'] = kmeans.labels_
else:
    neighborhoods_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

# merge df_toronto with neighborhoods_venues_sorted to add borough, latitude and
# longitude to each row; the default inner join keeps only the postal codes
# present in both frames
toronto_merged = pd.merge(df_toronto,neighborhoods_venues_sorted, on=['PostalCode','Neighborhood'])

toronto_merged.head() 

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,M4E,East Toronto,The Beaches,43.679611,-79.295692,0,Other Great Outdoors,Pub,Health Food Store,Pizza Place,Asian Restaurant,Park,Church,Trail,Coffee Shop,Bakery
1,M4K,East Toronto,"The Danforth West,Riverdale",43.682327,-79.355797,2,Café,Park,Grocery Store,Discount Store,Yoga Studio,Electronics Store,Flea Market,Fish Market,Fish & Chips Shop,Fast Food Restaurant
2,M4L,East Toronto,"The Beaches West,India Bazaar",43.670906,-79.316269,0,Café,Pet Store,Italian Restaurant,Fish & Chips Shop,Ice Cream Shop,Burger Joint,Pub,Brewery,Movie Theater,Sushi Restaurant
3,M4M,East Toronto,Studio District,43.657566,-79.340492,0,Coffee Shop,Café,Italian Restaurant,American Restaurant,Convenience Store,Cheese Shop,Chinese Restaurant,Seafood Restaurant,Clothing Store,Sandwich Place
4,M4N,Central Toronto,Lawrence Park,43.728504,-79.388443,1,Bus Line,Swim School,Yoga Studio,Ethiopian Restaurant,Flower Shop,Flea Market,Fish Market,Fish & Chips Shop,Fast Food Restaurant,Farmers Market


Finally, let's visualize the resulting clusters

In [30]:
import matplotlib.cm as cm
import matplotlib.colors as colors
# create map
map_clusters = folium.Map(location=[latitude_t, longitude_t], zoom_start=11)

# set color scheme for the clusters: one evenly spaced rainbow colour per label
colors_array = cm.rainbow(np.linspace(0, 1, kclusters))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
for lat, lon, poi, code, cluster in zip(toronto_merged['Latitude'], toronto_merged['Longitude'], 
                                        toronto_merged['Neighborhood'], toronto_merged['PostalCode'], toronto_merged['Cluster Labels']):
    label = folium.Popup(str(code)+':'+str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    # labels run from 0 to kclusters-1 and index the palette directly
    # (cluster-1 used to send label 0 to the last colour through index -1)
    cluster_color = rainbow[cluster]
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=cluster_color,
        fill=True,
        fill_color=cluster_color,
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters

## Let's examine the clusters

### Cluster 1

In [31]:
# Rows with k-means label 0 (shown as "Cluster 1"): postal code, neighborhood
# and the ranked venue columns.
cluster_rows = toronto_merged[toronto_merged['Cluster Labels'] == 0]
cluster_rows.iloc[:, [0, 2] + list(range(6, toronto_merged.shape[1]))]

Unnamed: 0,PostalCode,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,M4E,The Beaches,Other Great Outdoors,Pub,Health Food Store,Pizza Place,Asian Restaurant,Park,Church,Trail,Coffee Shop,Bakery
2,M4L,"The Beaches West,India Bazaar",Café,Pet Store,Italian Restaurant,Fish & Chips Shop,Ice Cream Shop,Burger Joint,Pub,Brewery,Movie Theater,Sushi Restaurant
3,M4M,Studio District,Coffee Shop,Café,Italian Restaurant,American Restaurant,Convenience Store,Cheese Shop,Chinese Restaurant,Seafood Restaurant,Clothing Store,Sandwich Place
7,M4S,Davisville,Dessert Shop,Pizza Place,Italian Restaurant,Coffee Shop,Sandwich Place,Café,Diner,Restaurant,Chinese Restaurant,Seafood Restaurant
9,M4V,"Deer Park,Forest Hill SE,Rathnelly,South Hill,...",Coffee Shop,Light Rail Station,Health & Beauty Service,Supermarket,Liquor Store,Electronics Store,Flea Market,Fish Market,Fish & Chips Shop,Fast Food Restaurant
11,M4X,"Cabbagetown,St. James Town",Coffee Shop,Café,Restaurant,Park,Pizza Place,Pub,Bakery,Italian Restaurant,Pharmacy,Sandwich Place
12,M4Y,Church and Wellesley,Coffee Shop,Gay Bar,Restaurant,Japanese Restaurant,Dance Studio,Gym,Men's Store,Park,Bubble Tea Shop,Hotel
13,M5A,"Harbourfront,Regent Park",Coffee Shop,Gym / Fitness Center,Breakfast Spot,Yoga Studio,Italian Restaurant,Café,Mexican Restaurant,Park,Spa,Food Truck
14,M5B,"Ryerson,Garden District",Coffee Shop,Clothing Store,Middle Eastern Restaurant,Café,Sandwich Place,Cosmetics Shop,Hotel,Diner,Furniture / Home Store,Tanning Salon
15,M5C,St. James Town,Coffee Shop,Café,Hotel,Restaurant,Seafood Restaurant,Bakery,Cocktail Bar,Italian Restaurant,Cosmetics Shop,Clothing Store


### Cluster 2

In [32]:
# Rows with k-means label 1 (shown as "Cluster 2"): postal code, neighborhood
# and the ranked venue columns.
cluster_rows = toronto_merged[toronto_merged['Cluster Labels'] == 1]
cluster_rows.iloc[:, [0, 2] + list(range(6, toronto_merged.shape[1]))]

Unnamed: 0,PostalCode,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
4,M4N,Lawrence Park,Bus Line,Swim School,Yoga Studio,Ethiopian Restaurant,Flower Shop,Flea Market,Fish Market,Fish & Chips Shop,Fast Food Restaurant,Farmers Market


### Cluster 3

In [33]:
# Rows with k-means label 2 (shown as "Cluster 3"): postal code, neighborhood
# and the ranked venue columns.
cluster_rows = toronto_merged[toronto_merged['Cluster Labels'] == 2]
cluster_rows.iloc[:, [0, 2] + list(range(6, toronto_merged.shape[1]))]

Unnamed: 0,PostalCode,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
1,M4K,"The Danforth West,Riverdale",Café,Park,Grocery Store,Discount Store,Yoga Studio,Electronics Store,Flea Market,Fish Market,Fish & Chips Shop,Fast Food Restaurant
30,M6G,Christie,Café,Grocery Store,Playground,Athletics & Sports,Baby Store,Coffee Shop,Ethiopian Restaurant,Flower Shop,Flea Market,Fish Market


### Cluster 4

In [34]:
# Rows with k-means label 3 (shown as "Cluster 4"): postal code, neighborhood
# and the ranked venue columns.
cluster_rows = toronto_merged[toronto_merged['Cluster Labels'] == 3]
cluster_rows.iloc[:, [0, 2] + list(range(6, toronto_merged.shape[1]))]

Unnamed: 0,PostalCode,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
22,M5N,Roselawn,Home Service,Garden,Electronics Store,Flower Shop,Flea Market,Fish Market,Fish & Chips Shop,Fast Food Restaurant,Farmers Market,Farm


### Cluster 5

In [35]:
# Rows with k-means label 4 (shown as "Cluster 5"): postal code, neighborhood
# and the ranked venue columns.
cluster_rows = toronto_merged[toronto_merged['Cluster Labels'] == 4]
cluster_rows.iloc[:, [0, 2] + list(range(6, toronto_merged.shape[1]))]

Unnamed: 0,PostalCode,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
5,M4P,Davisville North,Food & Drink Shop,Hotel,Park,Gym,Breakfast Spot,Clothing Store,Ethiopian Restaurant,Flower Shop,Flea Market,Fish Market
6,M4R,North Toronto West,Playground,Gym Pool,Park,Eastern European Restaurant,Flea Market,Fish Market,Fish & Chips Shop,Fast Food Restaurant,Farmers Market,Farm
8,M4T,"Moore Park,Summerhill East",Tennis Court,Convenience Store,Gym,Park,Eastern European Restaurant,Flea Market,Fish Market,Fish & Chips Shop,Fast Food Restaurant,Farmers Market
10,M4W,Rosedale,Playground,Building,Park,Candy Store,Bank,Electronics Store,Flea Market,Fish Market,Fish & Chips Shop,Fast Food Restaurant
23,M5P,"Forest Hill North,Forest Hill West",Construction & Landscaping,Locksmith,Park,Yoga Studio,Electronics Store,Flea Market,Fish Market,Fish & Chips Shop,Fast Food Restaurant,Farmers Market
