# Capstone Week 3

# Part 1 - Scrape the data into a Data Frame

Scrape the table of post codes from Wikipedia

In [1]:
# Import libaries

import pandas as pd

# Scrape list of Toronto postal codes and neighbourhoods from wikipedia

df = pd.read_html('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M')[0]

In [2]:
# check the table
df.head()

Unnamed: 0,Postal Code,Borough,Neighborhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"


In [3]:
#Check the shape of the table downloaded
df.shape

(180, 3)

Drop the the rows where Borough is Not assigned

In [4]:
# drop rows with no Borough

df.drop(df[df.Borough == "Not assigned"].index, inplace=True)

In [5]:
# Check the shape of the table 

df.shape

(103, 3)

Additional checks on the data to see if Neighborhoods are not assigned or if duplicated postal codes

In [6]:
#Check to see if any Neighborhoods are Not Assigned

df[df.Neighborhood == 'Not assigned'].shape[0]

0

In [7]:
#Check to see if any duplicated postal codes.

boolean = not df["Postal Code"].is_unique
boolean

False

# Part two Use a Geocoder to get the Lat / Long of the Post Code for the Neighbourhood

Install Geocoder

In [8]:
!pip install geocoder

Collecting geocoder
[?25l  Downloading https://files.pythonhosted.org/packages/4f/6b/13166c909ad2f2d76b929a4227c952630ebaf0d729f6317eb09cbceccbab/geocoder-1.38.1-py2.py3-none-any.whl (98kB)
[K     |████████████████████████████████| 102kB 6.8MB/s ta 0:00:011
[?25hCollecting ratelim (from geocoder)
  Downloading https://files.pythonhosted.org/packages/f2/98/7e6d147fd16a10a5f821db6e25f192265d6ecca3d82957a4fdd592cad49c/ratelim-0.1.6-py2.py3-none-any.whl
Installing collected packages: ratelim, geocoder
Successfully installed geocoder-1.38.1 ratelim-0.1.6


Test the Geocoder, Goolge doesnt work without API, so I used geocodefarm instead

In [9]:
 import geocoder
 g = geocoder.geocodefarm('Toronto, M5A')
 g.latlng

[43.6553535461715, -79.365043640113]

If you get a lat/lng then means that Geocodefarm is working a lot of the time it doesnt !! so now to get the lat/long for each of the postal codes in the Dataframe.

In [12]:
# Dont run if the above doesnt work, and even then it can be SLOW!

def get_geocoder(postal_code_from_df):
     # initialize your variable to None
     lat_lng_coords = None
     # loop until you get the coordinates
     while(lat_lng_coords is None):
       g = geocoder.geocodefarm('{}, Toronto, Ontario'.format(postal_code_from_df), maxRows=1)
       lat_lng_coords = g.latlng
     latitude = lat_lng_coords[0]
     longitude = lat_lng_coords[1]
     return latitude,longitude



df['Latitude'], df['Longitude'] = zip(*df['Postal Code'].apply(get_geocoder))

In [13]:
df.head()

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude
2,M3A,North York,Parkwoods,43.756123,-79.329636
3,M4A,North York,Victoria Village,43.72678,-79.310738
4,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.655354,-79.365044
5,M6A,North York,"Lawrence Manor, Lawrence Heights",43.721996,-79.445915
6,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.66391,-79.388733


 ## Note if the above runs then dont run the cells below, as these load the prefilled CSV. If the Geocoder didnt work then use the below instead


 So as an alternative download the file CSV with the coords from the url

In [242]:
import io
import requests
url="https://cocl.us/Geospatial_data"
s=requests.get(url).content
c=pd.read_csv(io.StringIO(s.decode('utf-8')))

In [243]:
c.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [244]:
# merge the data with the scraped data table

df = pd.merge(df, c, on='Postal Code', how='outer')
df.head()

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494


# Part 3 Explore and cluster the neighborhoods in Toronto.

First install Folium

In [14]:
#!conda install -c conda-forge folium=0.5.0
!pip install folium

Collecting folium
[?25l  Downloading https://files.pythonhosted.org/packages/a4/f0/44e69d50519880287cc41e7c8a6acc58daa9a9acf5f6afc52bcc70f69a6d/folium-0.11.0-py2.py3-none-any.whl (93kB)
[K     |████████████████████████████████| 102kB 10.2MB/s ta 0:00:01
Collecting branca>=0.3.0 (from folium)
  Downloading https://files.pythonhosted.org/packages/13/fb/9eacc24ba3216510c6b59a4ea1cd53d87f25ba76237d7f4393abeaf4c94e/branca-0.4.1-py3-none-any.whl
Installing collected packages: branca, folium
Successfully installed branca-0.4.1 folium-0.11.0


In [15]:
import folium

 ### First lets just plot all the Neighborhoods onto a Map of toronto

In [16]:
# Toronto Lat / long to center the map

latitude = 43.653963
longitude = -79.387207

# create map of Toronto using latitude and longitude values, and set the zoom level
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=11)

# add markers to map using the lat/long data in the dataframe 
for lat, lng, borough, neighborhood in zip(df['Latitude'], df['Longitude'], df['Borough'], df['Neighborhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=8,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3199cc',
        fill_opacity=0.3,
        parse_html=False).add_to(map_toronto)  

# display the map
map_toronto

## Lets concentrate on the Downtown Toronto area

In [17]:
# Create a new dataframe with only Borough that contains Toronto in the name
Toronto_data = df[df['Borough'].str.contains('Downtown Toronto')].reset_index(drop=True)
# check the dataframe
Toronto_data.head()

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude
0,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.655354,-79.365044
1,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.66391,-79.388733
2,M5B,Downtown Toronto,"Garden District, Ryerson",43.657478,-79.378632
3,M5C,Downtown Toronto,St. James Town,43.651112,-79.375732
4,M5E,Downtown Toronto,Berczy Park,43.647018,-79.374084


In [18]:
# lets see how many there are
Toronto_data.shape

(19, 5)

## Lets plot the neighborhoods on a Map

In [82]:
latitude = 43.653963
longitude = -79.387207

# create map of Toronto using latitude and longitude values, and set the zoom level
map_toronto_center = folium.Map(location=[latitude, longitude], zoom_start=14)

# add markers to map using the lat/long data in the dataframe 
for lat, lng, borough, neighborhood in zip(Toronto_data['Latitude'], Toronto_data['Longitude'], Toronto_data['Borough'], Toronto_data['Neighborhood']):
    label = '{}'.format(neighborhood)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=15,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3199cc',
        fill_opacity=0.3,
        parse_html=False).add_to(map_toronto_center)  

# display the map
map_toronto_center

## Lets set up the Foursquare Connection to get venue information about the Neighborhoods

In [23]:
CLIENT_ID = 'RPDH4QIYC4HUOWFKVAH0AF3NEV1SZUNGGXJYCZITGBOACQXX' # your Foursquare ID
CLIENT_SECRET = 'UZPWVY2XAAMPLXC3VA2NU51IWMHZBPXBZVESWHW0VZY3C1GV' # your Foursquare Secret
VERSION = '20180605' # Foursquare API version

print('Your credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET:' + CLIENT_SECRET)

Your credentails:
CLIENT_ID: RPDH4QIYC4HUOWFKVAH0AF3NEV1SZUNGGXJYCZITGBOACQXX
CLIENT_SECRET:UZPWVY2XAAMPLXC3VA2NU51IWMHZBPXBZVESWHW0VZY3C1GV


Import libaries to handle json and requests

In [24]:
import json # library to handle JSON files
import requests # library to handle requests
import numpy as np


Function to get venues in the neighbourhood from foursquare

In [25]:
def getNearbyVenues(names, latitudes, longitudes, radius=500, LIMIT = 100):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

In [26]:
# Call the function on each of the Neighborhoods to get venue data
DowntownToronto_venues = getNearbyVenues(names=Toronto_data['Neighborhood'],
                                   latitudes=Toronto_data['Latitude'],
                                   longitudes=Toronto_data['Longitude']
                                  )

Regent Park, Harbourfront
Queen's Park, Ontario Provincial Government
Garden District, Ryerson
St. James Town
Berczy Park
Central Bay Street
Christie
Richmond, Adelaide, King
Harbourfront East, Union Station, Toronto Islands
Toronto Dominion Centre, Design Exchange
Commerce Court, Victoria Hotel
University of Toronto, Harbord
Kensington Market, Chinatown, Grange Park
CN Tower, King and Spadina, Railway Lands, Harbourfront West, Bathurst Quay, South Niagara, Island airport
Rosedale
Stn A PO Boxes
St. James Town, Cabbagetown
First Canadian Place, Underground city
Church and Wellesley


In [27]:
# Check the dataframe shape and header

print(DowntownToronto_venues.shape)
DowntownToronto_venues.head()

(1279, 7)


Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,"Regent Park, Harbourfront",43.655354,-79.365044,Roselle Desserts,43.653447,-79.362017,Bakery
1,"Regent Park, Harbourfront",43.655354,-79.365044,Figs Breakfast & Lunch,43.655675,-79.364503,Breakfast Spot
2,"Regent Park, Harbourfront",43.655354,-79.365044,Tandem Coffee,43.653559,-79.361809,Coffee Shop
3,"Regent Park, Harbourfront",43.655354,-79.365044,The Yoga Lounge,43.655515,-79.364955,Yoga Studio
4,"Regent Park, Harbourfront",43.655354,-79.365044,Berkeley Church,43.655123,-79.365873,Event Space


Check how many venues were returned for each Neigborhood

In [28]:
DowntownToronto_venues.groupby('Neighborhood').count()

Unnamed: 0_level_0,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
Neighborhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Berczy Park,100,100,100,100,100,100
"CN Tower, King and Spadina, Railway Lands, Harbourfront West, Bathurst Quay, South Niagara, Island airport",64,64,64,64,64,64
Central Bay Street,62,62,62,62,62,62
Christie,22,22,22,22,22,22
Church and Wellesley,73,73,73,73,73,73
"Commerce Court, Victoria Hotel",100,100,100,100,100,100
"First Canadian Place, Underground city",100,100,100,100,100,100
"Garden District, Ryerson",100,100,100,100,100,100
"Harbourfront East, Union Station, Toronto Islands",52,52,52,52,52,52
"Kensington Market, Chinatown, Grange Park",54,54,54,54,54,54


Check how many unique catergories of venue there are

In [29]:
print('There are {} uniques categories.'.format(len(DowntownToronto_venues['Venue Category'].unique())))

There are 188 uniques categories.


## Analyise the Neighborhoods

In [30]:
# one hot encoding
Toronto_onehot = pd.get_dummies(DowntownToronto_venues[['Venue Category']], prefix="", prefix_sep="")

# add neighborhood column back to dataframe
Toronto_onehot['Neighborhood'] = DowntownToronto_venues['Neighborhood'] 

# move neighborhood column to the first column
# get a list of columns
cols = list(Toronto_onehot)
# move the column to head of list using index, pop and insert
cols.insert(0, cols.pop(cols.index('Neighborhood')))

# use ix to reorder
Toronto_onehot = Toronto_onehot.loc[:, cols]

Toronto_onehot.head()

Unnamed: 0,Neighborhood,American Restaurant,Antique Shop,Art Gallery,Art Museum,Arts & Crafts Store,Asian Restaurant,BBQ Joint,Baby Store,Bagel Shop,...,Toy / Game Store,Trail,Train Station,University,Vegetarian / Vegan Restaurant,Video Game Store,Vietnamese Restaurant,Wine Bar,Women's Store,Yoga Studio
0,"Regent Park, Harbourfront",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,"Regent Park, Harbourfront",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,"Regent Park, Harbourfront",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,"Regent Park, Harbourfront",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
4,"Regent Park, Harbourfront",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


## Next, let's group rows by neighborhood and by taking the mean of the frequency of occurrence of each category

In [31]:
Toronto_grouped = Toronto_onehot.groupby('Neighborhood').mean().reset_index()
Toronto_grouped

Unnamed: 0,Neighborhood,American Restaurant,Antique Shop,Art Gallery,Art Museum,Arts & Crafts Store,Asian Restaurant,BBQ Joint,Baby Store,Bagel Shop,...,Toy / Game Store,Trail,Train Station,University,Vegetarian / Vegan Restaurant,Video Game Store,Vietnamese Restaurant,Wine Bar,Women's Store,Yoga Studio
0,Berczy Park,0.01,0.01,0.02,0.0,0.0,0.0,0.01,0.0,0.01,...,0.0,0.0,0.0,0.0,0.01,0.0,0.0,0.0,0.0,0.01
1,"CN Tower, King and Spadina, Railway Lands, Har...",0.0,0.0,0.015625,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.015625,0.0,0.0,0.0,0.0,0.015625
2,Central Bay Street,0.0,0.0,0.0,0.016129,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.016129,0.0,0.0
3,Christie,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.045455,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,Church and Wellesley,0.013699,0.0,0.0,0.0,0.013699,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.013699
5,"Commerce Court, Victoria Hotel",0.04,0.0,0.01,0.0,0.0,0.01,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.01,0.0,0.0,0.01,0.01,0.0
6,"First Canadian Place, Underground city",0.03,0.0,0.01,0.0,0.0,0.03,0.0,0.0,0.0,...,0.0,0.0,0.01,0.0,0.01,0.0,0.0,0.01,0.01,0.0
7,"Garden District, Ryerson",0.0,0.0,0.01,0.0,0.0,0.0,0.0,0.0,0.0,...,0.01,0.0,0.0,0.0,0.0,0.0,0.01,0.01,0.0,0.0
8,"Harbourfront East, Union Station, Toronto Islands",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.019231,0.0,0.0,0.0,0.0,0.0
9,"Kensington Market, Chinatown, Grange Park",0.0,0.0,0.018519,0.0,0.037037,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.018519,0.037037,0.018519,0.037037,0.018519,0.0,0.0


## Sort the Venues

In [32]:
num_top_venues = 5

for hood in Toronto_grouped['Neighborhood']:
    print("----"+hood+"----")
    temp = Toronto_grouped[Toronto_grouped['Neighborhood'] == hood].T.reset_index()
    temp.columns = ['venue','freq']
    temp = temp.iloc[1:]
    temp['freq'] = temp['freq'].astype(float)
    temp = temp.round({'freq': 2})
    print(temp.sort_values('freq', ascending=False).reset_index(drop=True).head(num_top_venues))
    print('\n')

----Berczy Park----
                 venue  freq
0          Coffee Shop  0.08
1                 Café  0.05
2   Italian Restaurant  0.04
3  Japanese Restaurant  0.04
4             Beer Bar  0.03


----CN Tower, King and Spadina, Railway Lands, Harbourfront West, Bathurst Quay, South Niagara, Island airport----
                venue  freq
0               Hotel  0.08
1  Italian Restaurant  0.06
2         Coffee Shop  0.06
3                Café  0.05
4          Restaurant  0.05


----Central Bay Street----
                       venue  freq
0                Coffee Shop  0.18
1        Japanese Restaurant  0.05
2         Italian Restaurant  0.05
3            Bubble Tea Shop  0.03
4  Middle Eastern Restaurant  0.03


----Christie----
               venue  freq
0  Korean Restaurant  0.23
1               Café  0.14
2      Grocery Store  0.14
3        Coffee Shop  0.09
4         Baby Store  0.05


----Church and Wellesley----
                 venue  freq
0          Coffee Shop  0.10
1  Japanese 

## Function to get the top venues

In [33]:
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

## Create a new dataframe of the top venues

In [34]:
num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighborhood']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighborhood'] = Toronto_grouped['Neighborhood']

for ind in np.arange(Toronto_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(Toronto_grouped.iloc[ind, :], num_top_venues)

neighborhoods_venues_sorted.head()

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Berczy Park,Coffee Shop,Café,Japanese Restaurant,Italian Restaurant,Hotel,Beer Bar,Seafood Restaurant,Restaurant,Cocktail Bar,Grocery Store
1,"CN Tower, King and Spadina, Railway Lands, Har...",Hotel,Coffee Shop,Italian Restaurant,Café,Restaurant,Park,Grocery Store,Gym,Beer Bar,Yoga Studio
2,Central Bay Street,Coffee Shop,Japanese Restaurant,Italian Restaurant,Bubble Tea Shop,Bar,Café,Sandwich Place,Middle Eastern Restaurant,Clothing Store,Seafood Restaurant
3,Christie,Korean Restaurant,Café,Grocery Store,Coffee Shop,Baby Store,Sandwich Place,Candy Store,Japanese Restaurant,Playground,Karaoke Bar
4,Church and Wellesley,Coffee Shop,Restaurant,Sushi Restaurant,Japanese Restaurant,Pub,Gay Bar,Men's Store,Mediterranean Restaurant,Café,Hotel


## Build the clusters using KMeans

In [35]:
# import k-means from clustering stage
from sklearn.cluster import KMeans

# set number of clusters
kclusters = 5

Toronto_grouped_clustering = Toronto_grouped.drop('Neighborhood', 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(Toronto_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:20] 

array([4, 4, 1, 3, 2, 4, 4, 1, 1, 1, 2, 1, 4, 0, 4, 2, 4, 4, 4],
      dtype=int32)

## Add the cluster labels into the Toronto Data 

In [36]:
# add clustering labels
neighborhoods_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

Toronto_merged = Toronto_data

# merge toronto_grouped with toronto_data to add latitude/longitude for each neighborhood
Toronto_merged = Toronto_merged.join(neighborhoods_venues_sorted.set_index('Neighborhood'), on='Neighborhood')

In [37]:
Toronto_merged.head() # check the last columns!

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.655354,-79.365044,1,Coffee Shop,Italian Restaurant,Breakfast Spot,Yoga Studio,Bar,Diner,Discount Store,Electronics Store,Event Space,Sandwich Place
1,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.66391,-79.388733,2,Coffee Shop,Gym,Diner,Yoga Studio,Dance Studio,Café,College Cafeteria,Restaurant,Ramen Restaurant,College Theater
2,M5B,Downtown Toronto,"Garden District, Ryerson",43.657478,-79.378632,1,Coffee Shop,Clothing Store,Cosmetics Shop,Ramen Restaurant,Café,Italian Restaurant,Japanese Restaurant,Lingerie Store,Hotel,Fast Food Restaurant
3,M5C,Downtown Toronto,St. James Town,43.651112,-79.375732,4,Coffee Shop,Restaurant,Café,Seafood Restaurant,Clothing Store,Italian Restaurant,Cocktail Bar,Cosmetics Shop,American Restaurant,Department Store
4,M5E,Downtown Toronto,Berczy Park,43.647018,-79.374084,4,Coffee Shop,Café,Japanese Restaurant,Italian Restaurant,Hotel,Beer Bar,Seafood Restaurant,Restaurant,Cocktail Bar,Grocery Store


In [38]:
Toronto_Cluster3 = Toronto_merged[Toronto_merged['Cluster Labels']==3]

In [39]:
Toronto_Cluster3

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
6,M6G,Downtown Toronto,Christie,43.667717,-79.420197,3,Korean Restaurant,Café,Grocery Store,Coffee Shop,Baby Store,Sandwich Place,Candy Store,Japanese Restaurant,Playground,Karaoke Bar


## Show the clusters on a map of Toronto

In [71]:
# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=14)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(Toronto_merged['Latitude'], Toronto_merged['Longitude'], Toronto_merged['Neighborhood'], Toronto_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=10,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.9).add_to(map_clusters)
       
map_clusters

## View the clusters by Venues

In [41]:
Toronto_merged.loc[Toronto_merged['Cluster Labels'] == 0, Toronto_merged.columns[[2] + list(range(5, Toronto_merged.shape[1]))]]

Unnamed: 0,Neighborhood,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
14,Rosedale,0,Pie Shop,Coffee Shop,Park,Breakfast Spot,Trail,Japanese Restaurant,Bank,Sandwich Place,Fish & Chips Shop,Fast Food Restaurant


In [42]:
Toronto_merged.loc[Toronto_merged['Cluster Labels'] == 1, Toronto_merged.columns[[2] + list(range(5, Toronto_merged.shape[1]))]]

Unnamed: 0,Neighborhood,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,"Regent Park, Harbourfront",1,Coffee Shop,Italian Restaurant,Breakfast Spot,Yoga Studio,Bar,Diner,Discount Store,Electronics Store,Event Space,Sandwich Place
2,"Garden District, Ryerson",1,Coffee Shop,Clothing Store,Cosmetics Shop,Ramen Restaurant,Café,Italian Restaurant,Japanese Restaurant,Lingerie Store,Hotel,Fast Food Restaurant
5,Central Bay Street,1,Coffee Shop,Japanese Restaurant,Italian Restaurant,Bubble Tea Shop,Bar,Café,Sandwich Place,Middle Eastern Restaurant,Clothing Store,Seafood Restaurant
8,"Harbourfront East, Union Station, Toronto Islands",1,Coffee Shop,Boat or Ferry,Bar,Sporting Goods Shop,Restaurant,Hotel,Plaza,Fried Chicken Joint,Pizza Place,Liquor Store
12,"Kensington Market, Chinatown, Grange Park",1,Café,Coffee Shop,Pizza Place,Bakery,Vietnamese Restaurant,Clothing Store,Vegetarian / Vegan Restaurant,Park,Arts & Crafts Store,Mexican Restaurant


In [43]:
Toronto_merged.loc[Toronto_merged['Cluster Labels'] == 2, Toronto_merged.columns[[2] + list(range(5, Toronto_merged.shape[1]))]]

Unnamed: 0,Neighborhood,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
1,"Queen's Park, Ontario Provincial Government",2,Coffee Shop,Gym,Diner,Yoga Studio,Dance Studio,Café,College Cafeteria,Restaurant,Ramen Restaurant,College Theater
16,"St. James Town, Cabbagetown",2,Coffee Shop,Restaurant,Pizza Place,Grocery Store,Japanese Restaurant,Chinese Restaurant,Pub,Bakery,Café,Italian Restaurant
18,Church and Wellesley,2,Coffee Shop,Restaurant,Sushi Restaurant,Japanese Restaurant,Pub,Gay Bar,Men's Store,Mediterranean Restaurant,Café,Hotel


In [44]:
Toronto_merged.loc[Toronto_merged['Cluster Labels'] == 3, Toronto_merged.columns[[2] + list(range(5, Toronto_merged.shape[1]))]]

Unnamed: 0,Neighborhood,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
6,Christie,3,Korean Restaurant,Café,Grocery Store,Coffee Shop,Baby Store,Sandwich Place,Candy Store,Japanese Restaurant,Playground,Karaoke Bar


In [45]:
Toronto_merged.loc[Toronto_merged['Cluster Labels'] == 4, Toronto_merged.columns[[2] + list(range(5, Toronto_merged.shape[1]))]]

Unnamed: 0,Neighborhood,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
3,St. James Town,4,Coffee Shop,Restaurant,Café,Seafood Restaurant,Clothing Store,Italian Restaurant,Cocktail Bar,Cosmetics Shop,American Restaurant,Department Store
4,Berczy Park,4,Coffee Shop,Café,Japanese Restaurant,Italian Restaurant,Hotel,Beer Bar,Seafood Restaurant,Restaurant,Cocktail Bar,Grocery Store
7,"Richmond, Adelaide, King",4,Hotel,Café,Restaurant,Coffee Shop,Gym,Japanese Restaurant,Salad Place,Steakhouse,American Restaurant,Asian Restaurant
9,"Toronto Dominion Centre, Design Exchange",4,Coffee Shop,Hotel,Café,Restaurant,Gym,American Restaurant,Japanese Restaurant,Seafood Restaurant,Steakhouse,Tea Room
10,"Commerce Court, Victoria Hotel",4,Coffee Shop,Restaurant,Café,Hotel,Italian Restaurant,American Restaurant,Gym,Japanese Restaurant,Gastropub,Seafood Restaurant
11,"University of Toronto, Harbord",4,Café,Coffee Shop,Restaurant,Park,Bookstore,Museum,Office,Italian Restaurant,Dessert Shop,College Arts Building
13,"CN Tower, King and Spadina, Railway Lands, Har...",4,Hotel,Coffee Shop,Italian Restaurant,Café,Restaurant,Park,Grocery Store,Gym,Beer Bar,Yoga Studio
15,Stn A PO Boxes,4,Coffee Shop,Restaurant,Hotel,Café,Cocktail Bar,Beer Bar,Deli / Bodega,Japanese Restaurant,Sporting Goods Shop,Gastropub
17,"First Canadian Place, Underground city",4,Coffee Shop,Hotel,Café,Restaurant,Gym,Japanese Restaurant,American Restaurant,Deli / Bodega,Asian Restaurant,Salad Place


## Lets change the Labels to show the top 3 catergories for the cluster

In [52]:
df_Labels = Toronto_merged.groupby('Cluster Labels')['1st Most Common Venue'].apply(lambda x: x.mode().iat[0]).reset_index()

In [53]:
df_Labels

Unnamed: 0,Cluster Labels,1st Most Common Venue
0,0,Pie Shop
1,1,Coffee Shop
2,2,Coffee Shop
3,3,Korean Restaurant
4,4,Coffee Shop


In [54]:
df_lables2 = Toronto_merged.groupby('Cluster Labels')['2nd Most Common Venue'].apply(lambda x: x.mode().iat[0]).reset_index()

In [55]:
df_lables2

Unnamed: 0,Cluster Labels,2nd Most Common Venue
0,0,Coffee Shop
1,1,Boat or Ferry
2,2,Restaurant
3,3,Café
4,4,Restaurant


In [56]:
df_lables3 = Toronto_merged.groupby('Cluster Labels')['3rd Most Common Venue'].apply(lambda x: x.mode().iat[0]).reset_index()

In [57]:
df_lables3

Unnamed: 0,Cluster Labels,3rd Most Common Venue
0,0,Park
1,1,Bar
2,2,Diner
3,3,Grocery Store
4,4,Café


## Lets merge those together to one table

In [61]:
from functools import reduce
dfs = [df_Labels, df_lables2, df_lables3]
df_final = reduce(lambda left,right: pd.merge(left,right,on='Cluster Labels'), dfs)

In [62]:
df_final

Unnamed: 0,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue
0,0,Pie Shop,Coffee Shop,Park
1,1,Coffee Shop,Boat or Ferry,Bar
2,2,Coffee Shop,Restaurant,Diner
3,3,Korean Restaurant,Café,Grocery Store
4,4,Coffee Shop,Restaurant,Café


## And now create the Label as a contcatenation of the three most common Venues

In [69]:
cols = ['1st Most Common Venue', '2nd Most Common Venue', '3rd Most Common Venue']
df_final['Label'] = df_final[cols].apply(lambda row: ', '.join(row.values.astype(str)), axis=1)

In [70]:
df_final

Unnamed: 0,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,Label
0,0,Pie Shop,Coffee Shop,Park,"Pie Shop, Coffee Shop, Park"
1,1,Coffee Shop,Boat or Ferry,Bar,"Coffee Shop, Boat or Ferry, Bar"
2,2,Coffee Shop,Restaurant,Diner,"Coffee Shop, Restaurant, Diner"
3,3,Korean Restaurant,Café,Grocery Store,"Korean Restaurant, Café, Grocery Store"
4,4,Coffee Shop,Restaurant,Café,"Coffee Shop, Restaurant, Café"


## And now add the label into our Data Set for Mapping

In [72]:
Toronto_merged = pd.merge(Toronto_merged, df_final, on='Cluster Labels', how='outer')

## For some reason Cafe with an accent doesnt display in folium correctly so we will just get rid of the accent!

In [78]:
!pip install unidecode
import unidecode
Toronto_merged['Label'] = Toronto_merged['Label'].apply(unidecode.unidecode)

Collecting unidecode
[?25l  Downloading https://files.pythonhosted.org/packages/d0/42/d9edfed04228bacea2d824904cae367ee9efd05e6cce7ceaaedd0b0ad964/Unidecode-1.1.1-py2.py3-none-any.whl (238kB)
[K     |████████████████████████████████| 245kB 7.3MB/s eta 0:00:01
[?25hInstalling collected packages: unidecode
Successfully installed unidecode-1.1.1


In [79]:
Toronto_merged.head()

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue_x,2nd Most Common Venue_x,3rd Most Common Venue_x,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue,1st Most Common Venue_y,2nd Most Common Venue_y,3rd Most Common Venue_y,Label
0,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.655354,-79.365044,1,Coffee Shop,Italian Restaurant,Breakfast Spot,Yoga Studio,Bar,Diner,Discount Store,Electronics Store,Event Space,Sandwich Place,Coffee Shop,Boat or Ferry,Bar,"Coffee Shop, Boat or Ferry, Bar"
1,M5B,Downtown Toronto,"Garden District, Ryerson",43.657478,-79.378632,1,Coffee Shop,Clothing Store,Cosmetics Shop,Ramen Restaurant,Café,Italian Restaurant,Japanese Restaurant,Lingerie Store,Hotel,Fast Food Restaurant,Coffee Shop,Boat or Ferry,Bar,"Coffee Shop, Boat or Ferry, Bar"
2,M5G,Downtown Toronto,Central Bay Street,43.656761,-79.38649,1,Coffee Shop,Japanese Restaurant,Italian Restaurant,Bubble Tea Shop,Bar,Café,Sandwich Place,Middle Eastern Restaurant,Clothing Store,Seafood Restaurant,Coffee Shop,Boat or Ferry,Bar,"Coffee Shop, Boat or Ferry, Bar"
3,M5J,Downtown Toronto,"Harbourfront East, Union Station, Toronto Islands",43.640686,-79.376625,1,Coffee Shop,Boat or Ferry,Bar,Sporting Goods Shop,Restaurant,Hotel,Plaza,Fried Chicken Joint,Pizza Place,Liquor Store,Coffee Shop,Boat or Ferry,Bar,"Coffee Shop, Boat or Ferry, Bar"
4,M5T,Downtown Toronto,"Kensington Market, Chinatown, Grange Park",43.653248,-79.397064,1,Café,Coffee Shop,Pizza Place,Bakery,Vietnamese Restaurant,Clothing Store,Vegetarian / Vegan Restaurant,Park,Arts & Crafts Store,Mexican Restaurant,Coffee Shop,Boat or Ferry,Bar,"Coffee Shop, Boat or Ferry, Bar"


Lets plot the map with the label for the cluster

In [81]:
# create map
map_clusters2 = folium.Map(location=[latitude, longitude], zoom_start=14)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(Toronto_merged['Latitude'], Toronto_merged['Longitude'], Toronto_merged['Label'], Toronto_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.Marker(
        [lat, lon],
        radius=15,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.9).add_to(map_clusters)
       
map_clusters