## Question 1

###### 1. Importing concerned libraries

In [1]:
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
import requests # library to handle requests
import lxml # parse the website in lxml format

###### 2. Downloading, Scraping and Wrangling

In [2]:
# downloading and scraping data from website
source = requests.get('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M').text
soup = BeautifulSoup(source, 'lxml')
table = soup.find('table', class_='wikitable sortable')
#print(table.prettify())

In [3]:
#Preprocessing the table before the transformation of data in .csv file
table1 = ""
for tr in table.find_all('tr'):
    row = ""
    for tds in tr.find_all('td'):
        row = row + ";" + tds.text.strip()
    table1 = table1 + row[1:] + '\n'
print(table1)


M1A;Not assigned;Not assigned
M2A;Not assigned;Not assigned
M3A;North York;Parkwoods
M4A;North York;Victoria Village
M5A;Downtown Toronto;Regent Park, Harbourfront
M6A;North York;Lawrence Manor, Lawrence Heights
M7A;Downtown Toronto;Queen's Park, Ontario Provincial Government
M8A;Not assigned;Not assigned
M9A;Etobicoke;Islington Avenue, Humber Valley Village
M1B;Scarborough;Malvern, Rouge
M2B;Not assigned;Not assigned
M3B;North York;Don Mills
M4B;East York;Parkview Hill, Woodbine Gardens
M5B;Downtown Toronto;Garden District, Ryerson
M6B;North York;Glencairn
M7B;Not assigned;Not assigned
M8B;Not assigned;Not assigned
M9B;Etobicoke;West Deane Park, Princess Gardens, Martin Grove, Islington, Cloverdale
M1C;Scarborough;Rouge Hill, Port Union, Highland Creek
M2C;Not assigned;Not assigned
M3C;North York;Don Mills
M4C;East York;Woodbine Heights
M5C;Downtown Toronto;St. James Town
M6C;York;Humewood-Cedarvale
M7C;Not assigned;Not assigned
M8C;Not assigned;Not assigned
M9C;Etobicoke;Eringate, B

In [4]:
#Rewriting data in .csv format
filename = 'toronto.csv'

csv_file = open(filename,'wb')
csv_file.write(bytes(table1,encoding="ascii",errors="ignore"))


7231

In [5]:
#converting into a dataframe
df = pd.read_csv(filename, header = None, delimiter=';')
df.columns = ['Postalcode', 'Borough', 'Neighbourhood']
df.head()

Unnamed: 0,Postalcode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"


In [6]:
#Only process the cells that have an assigned borough. Ignore cells with a borough that is Not assigned.
Borough_only_assigned = df[df['Borough'] == 'Not assigned'].index
df.drop(Borough_only_assigned, inplace = True)
df.head(15)

Unnamed: 0,Postalcode,Borough,Neighbourhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
5,M6A,North York,"Lawrence Manor, Lawrence Heights"
6,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
8,M9A,Etobicoke,"Islington Avenue, Humber Valley Village"
9,M1B,Scarborough,"Malvern, Rouge"
11,M3B,North York,Don Mills
12,M4B,East York,"Parkview Hill, Woodbine Gardens"
13,M5B,Downtown Toronto,"Garden District, Ryerson"


In [7]:
#If a cell has a borough but a Not assigned neighborhood, then the neighborhood will be the same as the borough.
df.loc[df['Neighbourhood'] == 'Not assigned', 'Neighbourhood'] = df['Borough']
df.head()

Unnamed: 0,Postalcode,Borough,Neighbourhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
5,M6A,North York,"Lawrence Manor, Lawrence Heights"
6,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


In [8]:
# More than one neighborhood can exist in one postal code area. For example, in the table on the Wikipedia page, you will notice that M5A is listed twice and has two neighborhoods: Harbourfront and Regent Park. These two rows will be combined into one row with the neighborhoods separated with a comma as shown in row 11 in the above table.
df_group = df.groupby(['Postalcode', 'Borough'], sort = False).agg( ','.join)
df_group_new = df_group.reset_index()
df_group_new.head()

Unnamed: 0,Postalcode,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


In [9]:
# shape method to print the number of rows of your dataframe
df_group_new.shape

(103, 3)

## Question 2

###### 1. Creating a dataframe using the following geographical coordinates

In [10]:
#Use the following csv file which contains the geographical coordinates of each postal code to create its dataframe:
!wget -q -O 'Toronto_localisation.csv' http://cocl.us/Geospatial_data
df_toronto_localisation = pd.read_csv('Toronto_localisation.csv')
df_toronto_localisation.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [11]:
# Merge all data based on Postalcode
df_toronto_localisation = df_toronto_localisation.rename(columns={'Postal Code':'Postalcode'})
df_toronto_loc= pd.merge(df_group_new, df_toronto_localisation, on = 'Postalcode')
df_toronto_loc.head()

Unnamed: 0,Postalcode,Borough,Neighbourhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494


## Question 3

###### 1. Exploring and clustering neighborhoods in Toronto

In [12]:
# Importing necessary libraries
!conda install -c conda-forge geopy --yes 
from geopy.geocoders import Nominatim # module to convert an address into latitude and longitude values

# libraries for displaying images
from IPython.display import Image 
from IPython.core.display import HTML 
    
# tranforming json file into a pandas dataframe library
from pandas.io.json import json_normalize

!conda install -c conda-forge folium=0.5.0 --yes
import folium # plotting library

print('Folium installed')
print('Libraries imported.')

Solving environment: done

# All requested packages already installed.

Solving environment: done

# All requested packages already installed.

Folium installed
Libraries imported.


In [14]:
#Define Foursquare Credentials and Version
CLIENT_ID = 'DCWR3XPE3YXZPVHACVO5T0GFBPOW0PV0KPDWM4OPH2ZRDEJQ' # your Foursquare ID
CLIENT_SECRET = 'ZWSYUFGPDTVMVLNT1HSMJ1IID1CRA5UXAKOY01IG3VN4NZIA' # your Foursquare Secret
VERSION = '20200130'
radius=500
LIMIT = 100
print('Your credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET:' + CLIENT_SECRET)

Your credentails:
CLIENT_ID: DCWR3XPE3YXZPVHACVO5T0GFBPOW0PV0KPDWM4OPH2ZRDEJQ
CLIENT_SECRET:ZWSYUFGPDTVMVLNT1HSMJ1IID1CRA5UXAKOY01IG3VN4NZIA


In [15]:
# Getting Geogrphical coordinates of Toronto
address = 'Toronto, ON'

geolocator = Nominatim(user_agent="Toronto geo")
location = geolocator.geocode(address)
latitude_toronto = location.latitude
longitude_toronto = location.longitude
print('The geograpical coordinate of Toronto are {}, {}.'.format(latitude_toronto, longitude_toronto))

The geograpical coordinate of Toronto are 43.6534817, -79.3839347.


In [22]:
map_toronto = folium.Map(location=[latitude_toronto, longitude_toronto], zoom_start=10)

# add markers to map
for lat, lng, borough, Neighbourhood in zip(df_toronto_loc['Latitude'], df_toronto_loc['Longitude'], df_toronto_loc['Borough'], df_toronto_loc['Neighbourhood']):
    label = '{}, {}'.format(Neighbourhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_toronto)  
    
map_toronto

In [31]:
# Creating a function to explore all neighborhoods
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
        
       # creating the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # Sending GET request for result
        results = requests.get(url).json()['response']['groups'][0]['items']
        
        # Collecting useful features for each nearby venue
        venues_list.append([(
            name,
            lat,
            lng,
            v['venue']['name'],
            v['venue']['location']['lat'],
            v['venue']['location']['lng'],
            v['venue']['categories'][0]['name']) for v in results])
    

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

In [33]:
toronto_venues = getNearbyVenues(names=df_toronto_loc['Neighbourhood'],
                                latitudes=df_toronto_loc['Latitude'],
                                longitudes=df_toronto_loc['Longitude']
                                )
toronto_venues.head()

Parkwoods
Victoria Village
Regent Park, Harbourfront
Lawrence Manor, Lawrence Heights
Queen's Park, Ontario Provincial Government
Islington Avenue, Humber Valley Village
Malvern, Rouge
Don Mills
Parkview Hill, Woodbine Gardens
Garden District, Ryerson
Glencairn
West Deane Park, Princess Gardens, Martin Grove, Islington, Cloverdale
Rouge Hill, Port Union, Highland Creek
Don Mills
Woodbine Heights
St. James Town
Humewood-Cedarvale
Eringate, Bloordale Gardens, Old Burnhamthorpe, Markland Wood
Guildwood, Morningside, West Hill
The Beaches
Berczy Park
Caledonia-Fairbanks
Woburn
Leaside
Central Bay Street
Christie
Cedarbrae
Hillcrest Village
Bathurst Manor, Wilson Heights, Downsview North
Thorncliffe Park
Richmond, Adelaide, King
Dufferin, Dovercourt Village
Scarborough Village
Fairview, Henry Farm, Oriole
Northwood Park, York University
East Toronto, Broadview North (Old East York)
Harbourfront East, Union Station, Toronto Islands
Little Portugal, Trinity
Kennedy Park, Ionview, East Birchmo

Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,Parkwoods,43.753259,-79.329656,Brookbanks Park,43.751976,-79.33214,Park
1,Parkwoods,43.753259,-79.329656,Brookbanks Pool,43.751389,-79.332184,Pool
2,Parkwoods,43.753259,-79.329656,Variety Store,43.751974,-79.333114,Food & Drink Shop
3,Victoria Village,43.725882,-79.315572,Victoria Village Arena,43.723481,-79.315635,Hockey Arena
4,Victoria Village,43.725882,-79.315572,Portugril,43.725819,-79.312785,Portuguese Restaurant


In [34]:
#  printing the number of rows of your dataframe.
toronto_venues.shape

(2140, 7)

In [36]:
toronto_venues.groupby('Neighborhood').count()

Unnamed: 0_level_0,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
Neighborhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Agincourt,4,4,4,4,4,4
"Alderwood, Long Branch",8,8,8,8,8,8
"Bathurst Manor, Wilson Heights, Downsview North",20,20,20,20,20,20
Bayview Village,4,4,4,4,4,4
"Bedford Park, Lawrence Manor East",26,26,26,26,26,26
Berczy Park,57,57,57,57,57,57
"Birch Cliff, Cliffside West",5,5,5,5,5,5
"Brockton, Parkdale Village, Exhibition Place",22,22,22,22,22,22
"Business reply mail Processing Centre, South Central Letter Processing Plant Toronto",16,16,16,16,16,16
"CN Tower, King and Spadina, Railway Lands, Harbourfront West, Bathurst Quay, South Niagara, Island airport",16,16,16,16,16,16


###### 2. Analysing each Neighbourhood

In [40]:
# One-hot enconding by Venue Category
toronto_onehot = pd.get_dummies(toronto_venues[['Venue Category']], prefix="", prefix_sep="")

# Matching neighbourhood column to the initial dataframe
toronto_onehot['Neighborhood'] = toronto_venues['Neighborhood'] 

# Placing neighbourhood column at the beginning of the dataframe
new_columns = [toronto_onehot.columns[-1]] + list(toronto_onehot.columns[:-1])
toronto_onehot.head()

Unnamed: 0,Accessories Store,Afghan Restaurant,Airport,Airport Food Court,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,Antique Shop,Aquarium,...,Vegetarian / Vegan Restaurant,Video Game Store,Video Store,Vietnamese Restaurant,Warehouse Store,Wine Bar,Wine Shop,Wings Joint,Women's Store,Yoga Studio
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [41]:
toronto_onehot.shape

(2140, 272)

In [43]:
# Group by Neighbourhood
toronto_group_by_n = toronto_onehot.groupby('Neighborhood').mean().reset_index()
toronto_group_by_n

Unnamed: 0,Neighborhood,Accessories Store,Afghan Restaurant,Airport,Airport Food Court,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,Antique Shop,...,Vegetarian / Vegan Restaurant,Video Game Store,Video Store,Vietnamese Restaurant,Warehouse Store,Wine Bar,Wine Shop,Wings Joint,Women's Store,Yoga Studio
0,Agincourt,0.000000,0.000000,0.000000,0.0000,0.000,0.000,0.000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.00,0.000000,0.000000,0.0,0.000000,0.000000
1,"Alderwood, Long Branch",0.000000,0.000000,0.000000,0.0000,0.000,0.000,0.000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.00,0.000000,0.000000,0.0,0.000000,0.000000
2,"Bathurst Manor, Wilson Heights, Downsview North",0.000000,0.000000,0.000000,0.0000,0.000,0.000,0.000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.00,0.000000,0.000000,0.0,0.000000,0.000000
3,Bayview Village,0.000000,0.000000,0.000000,0.0000,0.000,0.000,0.000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.00,0.000000,0.000000,0.0,0.000000,0.000000
4,"Bedford Park, Lawrence Manor East",0.000000,0.000000,0.000000,0.0000,0.000,0.000,0.000,0.038462,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.00,0.000000,0.000000,0.0,0.000000,0.000000
5,Berczy Park,0.000000,0.000000,0.000000,0.0000,0.000,0.000,0.000,0.000000,0.000000,...,0.017544,0.000000,0.000000,0.000000,0.00,0.000000,0.000000,0.0,0.000000,0.000000
6,"Birch Cliff, Cliffside West",0.000000,0.000000,0.000000,0.0000,0.000,0.000,0.000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.00,0.000000,0.000000,0.0,0.000000,0.000000
7,"Brockton, Parkdale Village, Exhibition Place",0.000000,0.000000,0.000000,0.0000,0.000,0.000,0.000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.00,0.000000,0.000000,0.0,0.000000,0.000000
8,"Business reply mail Processing Centre, South C...",0.000000,0.000000,0.000000,0.0000,0.000,0.000,0.000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.00,0.000000,0.000000,0.0,0.000000,0.062500
9,"CN Tower, King and Spadina, Railway Lands, Har...",0.000000,0.000000,0.062500,0.0625,0.125,0.125,0.125,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.00,0.000000,0.000000,0.0,0.000000,0.000000


In [51]:
# Returning each neighbourhood with the most common venue listed in top 5

for nei in toronto_group_by_n['Neighborhood']:
    print("---"+nei+"---")
    temp = toronto_group_by_n[toronto_group_by_n['Neighborhood'] == nei].T.reset_index()
    temp.columns = ['venue','freq']
    temp = temp.iloc[1:]
    temp['freq'] = temp['freq'].astype(float)
    temp=temp.round({'freq':2})
    
    print('\n')

---Agincourt---


---Alderwood, Long Branch---


---Bathurst Manor, Wilson Heights, Downsview North---


---Bayview Village---


---Bedford Park, Lawrence Manor East---


---Berczy Park---


---Birch Cliff, Cliffside West---


---Brockton, Parkdale Village, Exhibition Place---


---Business reply mail Processing Centre, South Central Letter Processing Plant Toronto---


---CN Tower, King and Spadina, Railway Lands, Harbourfront West, Bathurst Quay, South Niagara, Island airport---


---Caledonia-Fairbanks---


---Canada Post Gateway Processing Centre---


---Cedarbrae---


---Central Bay Street---


---Christie---


---Church and Wellesley---


---Clarks Corners, Tam O'Shanter, Sullivan---


---Cliffside, Cliffcrest, Scarborough Village West---


---Commerce Court, Victoria Hotel---


---Davisville---


---Davisville North---


---Del Ray, Mount Dennis, Keelsdale and Silverthorn---


---Don Mills---


---Dorset Park, Wexford Heights, Scarborough Town Centre---


---Downsview---


---Du

In [85]:
# Putting all data into pandas dataframe to sort the venues in descending order
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

In [86]:
import numpy as np
num_top_venues = 5

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighborhood']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
neighbourhoods_venues_sorted = pd.DataFrame(columns=columns)
neighbourhoods_venues_sorted['Neighborhood'] = toronto_group_by_n['Neighborhood']

for ind in np.arange(toronto_group_by_n.shape[0]):
    neighbourhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(toronto_group_by_n.iloc[ind, :], num_top_venues)

neighbourhoods_venues_sorted.head()

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue
0,Agincourt,Lounge,Skating Rink,Latin American Restaurant,Breakfast Spot,Drugstore
1,"Alderwood, Long Branch",Pizza Place,Gym,Pool,Coffee Shop,Sandwich Place
2,"Bathurst Manor, Wilson Heights, Downsview North",Bank,Coffee Shop,Shopping Mall,Pizza Place,Sushi Restaurant
3,Bayview Village,Café,Bank,Japanese Restaurant,Chinese Restaurant,Yoga Studio
4,"Bedford Park, Lawrence Manor East",Italian Restaurant,Sandwich Place,Restaurant,Coffee Shop,Pizza Place


##### 3. Clustering Neighborhoods

In [97]:
from sklearn.cluster import KMeans


# Run K-means to cluster the neighbourhoods into 5 clusters
kclusters = 5

toronto_grouped_clustering = toronto_group_by_n.drop('Neighborhood', 1)

kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(toronto_grouped_clustering)

kmeans.labels_

array([2, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0, 2, 2, 2, 0, 2,
       2, 2, 2, 2, 3, 2, 2, 2, 3, 2, 0, 2, 2, 2, 2, 2, 0, 1, 2, 2, 2, 2,
       0, 2, 3, 2, 2, 2, 3, 2, 4, 2, 2, 2, 2, 2, 1, 2, 0, 3, 2, 2, 2, 3,
       2, 2, 2, 0, 3, 0, 2, 2, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0, 0, 3,
       2, 2, 0, 2, 2, 3, 2], dtype=int32)

In [106]:
# Creating a new data frame including the cluster indices
toronto_merged = df_toronto_loc
toronto_merged = toronto_merged.join(neighbourhoods_venues_sorted.set_index('Neighborhood'), on='Neighbourhood')
toronto_merged.head()

Unnamed: 0,Postalcode,Borough,Neighbourhood,Latitude,Longitude,Cluster_Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue
0,M3A,North York,Parkwoods,43.753259,-79.329656,3.0,Park,Food & Drink Shop,Pool,Dim Sum Restaurant,Diner
1,M4A,North York,Victoria Village,43.725882,-79.315572,0.0,Portuguese Restaurant,French Restaurant,Pizza Place,Coffee Shop,Hockey Arena
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636,2.0,Coffee Shop,Bakery,Pub,Park,Breakfast Spot
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763,2.0,Clothing Store,Accessories Store,Furniture / Home Store,Event Space,Vietnamese Restaurant
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494,2.0,Coffee Shop,Diner,Sushi Restaurant,Yoga Studio,Park


In [107]:
# dropping blank data
toronto_merged=toronto_merged.dropna()
toronto_merged['Cluster_Labels'] = toronto_merged.Cluster_Labels.astype(int)

In [123]:
import matplotlib.cm as cm
import matplotlib.colors as colors

# create map
map_clusters = folium.Map(location=[latitude_toronto, longitude_toronto], zoom_start=11)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(toronto_merged['Latitude'], toronto_merged['Longitude'], toronto_merged['Neighbourhood'], toronto_merged['Cluster_Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters


##### 5. Examining Clusters

In [124]:
# The 1st Cluster
toronto_merged.loc[toronto_merged['Cluster_Labels'] == 1, toronto_merged.columns[[1] + list(range(5, toronto_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster_Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue
57,North York,1,Baseball Field,Yoga Studio,Donut Shop,Diner,Discount Store
101,Etobicoke,1,Baseball Field,Yoga Studio,Donut Shop,Diner,Discount Store


In [125]:
# The 2nd Cluster
toronto_merged.loc[toronto_merged['Cluster_Labels'] == 1, toronto_merged.columns[[1] + list(range(5, toronto_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster_Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue
57,North York,1,Baseball Field,Yoga Studio,Donut Shop,Diner,Discount Store
101,Etobicoke,1,Baseball Field,Yoga Studio,Donut Shop,Diner,Discount Store


In [126]:
# The 3rd Cluster
toronto_merged.loc[toronto_merged['Cluster_Labels'] == 2, toronto_merged.columns[[1] + list(range(5, toronto_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster_Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue
2,Downtown Toronto,2,Coffee Shop,Bakery,Pub,Park,Breakfast Spot
3,North York,2,Clothing Store,Accessories Store,Furniture / Home Store,Event Space,Vietnamese Restaurant
4,Downtown Toronto,2,Coffee Shop,Diner,Sushi Restaurant,Yoga Studio,Park
6,Scarborough,2,Fast Food Restaurant,Donut Shop,Dim Sum Restaurant,Diner,Discount Store
7,North York,2,Gym,Asian Restaurant,Japanese Restaurant,Restaurant,Coffee Shop
9,Downtown Toronto,2,Clothing Store,Coffee Shop,Café,Cosmetics Shop,Japanese Restaurant
12,Scarborough,2,Bar,Yoga Studio,Donut Shop,Diner,Discount Store
13,North York,2,Gym,Asian Restaurant,Japanese Restaurant,Restaurant,Coffee Shop
14,East York,2,Skating Rink,Bus Stop,Park,Spa,Pharmacy
15,Downtown Toronto,2,Café,Coffee Shop,Gastropub,Restaurant,Cocktail Bar


In [127]:
# The 4th Cluster
toronto_merged.loc[toronto_merged['Cluster_Labels'] == 3, toronto_merged.columns[[1] + list(range(5, toronto_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster_Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue
0,North York,3,Park,Food & Drink Shop,Pool,Dim Sum Restaurant,Diner
32,Scarborough,3,Playground,Yoga Studio,Drugstore,Diner,Discount Store
35,East York,3,Park,Convenience Store,Drugstore,Diner,Discount Store
61,Central Toronto,3,Park,Swim School,Bus Line,Doner Restaurant,Diner
64,York,3,Park,Donut Shop,Dim Sum Restaurant,Diner,Discount Store
66,North York,3,Park,Convenience Store,Drugstore,Diner,Discount Store
68,Central Toronto,3,Park,Trail,Sushi Restaurant,Jewelry Store,Electronics Store
85,Scarborough,3,Park,Playground,Sculpture Garden,Dog Run,Dessert Shop
91,Downtown Toronto,3,Park,Playground,Trail,Dog Run,Dessert Shop


In [128]:
# The 5th Cluster
toronto_merged.loc[toronto_merged['Cluster_Labels'] == 4, toronto_merged.columns[[1] + list(range(5, toronto_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster_Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue
83,Central Toronto,4,Lawyer,Yoga Studio,Drugstore,Diner,Discount Store
