## PART 3 - Segmenting and Clustering Neighborhoods in Toronto

1. Import libraries

In [81]:
import pandas as pd # library for data analsysis
from bs4 import BeautifulSoup # library for pulling data out of HTML and XML files

from geopy.geocoders import Nominatim # convert an address into latitude and longitude values
import requests # library to handle requests

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans

!pip install folium
import folium # map rendering library

print('Libraries imported.')

Libraries imported.


2. Use the Notebook to build the code to scrape the Wikipedia page.

In [82]:
# Upload the Wikipedia page
source = requests.get('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M').text
soup = BeautifulSoup(source,'lxml')

In [83]:
# Find the Table on the Wikipedia page and iterate through tags for required information
table_can = soup.find('table')
col = table_can.find_all('td')

elem_cnt = len(col)

# create three lists to store table data
postcode = []
borough = []
neighborhood = []

for i in range(0, elem_cnt, 3):
    postcode.append(col[i].text.strip())
    borough.append(col[i+1].text.strip())
    neighborhood.append(col[i+2].text.strip())

In [84]:
# create a new DataFrame from our lists
toronto_df = pd.DataFrame({"PostalCode": postcode,
                           "Borough": borough,
                           "Neighborhood": neighborhood})

toronto_df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"


3.1. The dataframe will consist of three columns: PostalCode, Borough, and Neighborhood

In [85]:
# Normalize data and transform per given requirements
toronto_df_dropna = toronto_df[toronto_df.Borough != "Not assigned"].reset_index(drop=True)
toronto_df.loc[toronto_df.Neighborhood == 'Not assigned', "Neighborhood"] = toronto_df.Borough

#Group data by Postcode & Borough
toronto_df_grouped = toronto_df_dropna.groupby(["PostalCode", "Borough"], as_index=False).agg(lambda x: ", ".join(x))

# for Neighborhood="Not assigned", make the value the same as Borough
for index, row in toronto_df_grouped.iterrows():
    if row["Neighborhood"] == "Not assigned":
        row["Neighborhood"] = row["Borough"]

In [86]:
# create a new dataframe to check whether we met the assignement's requirements
column_names = ["PostalCode", "Borough", "Neighborhood"]
neighborhoods_df = pd.DataFrame(columns=column_names)

neighborhoods_list = ["M5G", "M2H", "M4B", "M1J", "M4G", "M4M", "M1R", "M9V", "M9L", "M5V", "M1B", "M5A"]

for postcode in neighborhoods_list:
    neighborhoods_df = neighborhoods_df.append(toronto_df_grouped[toronto_df_grouped["PostalCode"]==postcode], ignore_index=True)
    
neighborhoods_df

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M5G,Downtown Toronto,Central Bay Street
1,M2H,North York,Hillcrest Village
2,M4B,East York,"Parkview Hill, Woodbine Gardens"
3,M1J,Scarborough,Scarborough Village
4,M4G,East York,Leaside
5,M4M,East Toronto,Studio District
6,M1R,Scarborough,"Wexford, Maryvale"
7,M9V,Etobicoke,"South Steeles, Silverstone, Humbergate, Jamest..."
8,M9L,North York,Humber Summit
9,M5V,Downtown Toronto,"CN Tower, King and Spadina, Railway Lands, Har..."


In [87]:
# print the number of rows of the cleaned dataframe
toronto_df_grouped.shape

(103, 3)

3.2. Add coordinates

In [88]:
# load coordinates from the csv file 
coordinates = pd.read_csv("http://cocl.us/Geospatial_data")
coordinates.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [89]:
# rename the column "PostalCode"
coordinates.rename(columns={"Postal Code": "PostalCode"}, inplace=True)

# merge two table on the column "PostalCode" 
toronto_df_new = toronto_df_grouped.merge(coordinates, on="PostalCode", how="left")
toronto_df_new.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,"Malvern, Rouge",43.806686,-79.194353
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476


In [90]:
column_names = ["PostalCode", "Borough", "Neighborhood", "Latitude", "Longitude"]
neighborhoods_coord_df = pd.DataFrame(columns=column_names)

neighborhoods_list = ["M5G", "M2H", "M4B", "M1J", "M4G", "M4M", "M1R", "M9V", "M9L", "M5V", "M1B", "M5A"]

for postcode in neighborhoods_list:
    neighborhoods_coord_df = neighborhoods_coord_df.append(toronto_df_new[toronto_df_new["PostalCode"]==postcode], ignore_index=True)
    
neighborhoods_coord_df

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M5G,Downtown Toronto,Central Bay Street,43.657952,-79.387383
1,M2H,North York,Hillcrest Village,43.803762,-79.363452
2,M4B,East York,"Parkview Hill, Woodbine Gardens",43.706397,-79.309937
3,M1J,Scarborough,Scarborough Village,43.744734,-79.239476
4,M4G,East York,Leaside,43.70906,-79.363452
5,M4M,East Toronto,Studio District,43.659526,-79.340923
6,M1R,Scarborough,"Wexford, Maryvale",43.750072,-79.295849
7,M9V,Etobicoke,"South Steeles, Silverstone, Humbergate, Jamest...",43.739416,-79.588437
8,M9L,North York,Humber Summit,43.756303,-79.565963
9,M5V,Downtown Toronto,"CN Tower, King and Spadina, Railway Lands, Har...",43.628947,-79.39442


4. Use geopy library to get the latitude and longitude values of Toronto

In [91]:
address = 'Toronto, Ca'

geolocator = Nominatim(user_agent="ny_explorer")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Toronto are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Toronto are 43.6534817, -79.3839347.


5. Create a map of Toronto with neighborhoods

In [92]:
# create map of Toronto using latitude and longitude values
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=10)

# add markers to map
for lat, lng, borough, neighborhood in zip(neighborhoods_coord_df['Latitude'], neighborhoods_coord_df['Longitude'], neighborhoods_coord_df['Borough'], neighborhoods_coord_df['Neighborhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_toronto)  
    
map_toronto

6. Start utilizing the Foursquare API to explore the neighborhoods and segment them

In [93]:
# Foursquare Credentials
CLIENT_ID = 'UCUF14KI1T43FFS0COUVYXCZ2TASUOBXTREQ2ONZTGSYCQJ2' # your Foursquare ID
CLIENT_SECRET = 'LK1MLFWYGH5BUA4IIKMELJIKUIDBCVVOZH0C2YKRWZ3LSHMJ' # your Foursquare Secret
VERSION = '20180605' # Foursquare API version

print('Your credentials:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET:' + CLIENT_SECRET)

Your credentials:
CLIENT_ID: UCUF14KI1T43FFS0COUVYXCZ2TASUOBXTREQ2ONZTGSYCQJ2
CLIENT_SECRET:LK1MLFWYGH5BUA4IIKMELJIKUIDBCVVOZH0C2YKRWZ3LSHMJ


7. Let's explore the first neighborhood in our dataframe

In [94]:
neighborhoods_coord_df.loc[0, 'Neighborhood']

'Central Bay Street'

In [95]:
#Get the neighborhood's latitude and longitude values.
neighborhood_latitude = neighborhoods_coord_df.loc[0, 'Latitude'] # neighborhood latitude value
neighborhood_longitude = neighborhoods_coord_df.loc[0, 'Longitude'] # neighborhood longitude value

neighborhood_name = neighborhoods_coord_df.loc[0, 'Neighborhood'] # neighborhood name

print('Latitude and longitude values of {} are {}, {}.'.format(neighborhood_name, 
                                                               neighborhood_latitude, 
                                                               neighborhood_longitude))

Latitude and longitude values of Central Bay Street are 43.6579524, -79.3873826.


8. Explore Neighborhoods in Toronto. Let's explore all the neighborhoods in Toronto (let's get the top 50 venues that are in each Neighborhood within a radius of 500 meters)

In [96]:
# Set the limit of venues
LIMIT = 50 # limit of number of venues returned by Foursquare API
radius = 500 # define radius

# A function to explore the neighborhoods in Toronto
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

The code to run the above function on each neighborhood and create a new dataframe called toronto_venues

In [97]:
toronto_venues = getNearbyVenues(names=neighborhoods_coord_df['Neighborhood'],
                                   latitudes=neighborhoods_coord_df['Latitude'],
                                   longitudes=neighborhoods_coord_df['Longitude']
                                  )

Central Bay Street
Hillcrest Village
Parkview Hill, Woodbine Gardens
Scarborough Village
Leaside
Studio District
Wexford, Maryvale
South Steeles, Silverstone, Humbergate, Jamestown, Mount Olive, Beaumond Heights, Thistletown, Albion Gardens
Humber Summit
CN Tower, King and Spadina, Railway Lands, Harbourfront West, Bathurst Quay, South Niagara, Island airport
Malvern, Rouge
Regent Park, Harbourfront


Check the size of the resulting dataframe

In [98]:
print(toronto_venues.shape)
toronto_venues.head()

(216, 7)


Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,Central Bay Street,43.657952,-79.387383,Jimmy's Coffee,43.658421,-79.385613,Coffee Shop
1,Central Bay Street,43.657952,-79.387383,Tim Hortons,43.65857,-79.385123,Coffee Shop
2,Central Bay Street,43.657952,-79.387383,Somethin' 2 Talk About,43.658395,-79.385338,Middle Eastern Restaurant
3,Central Bay Street,43.657952,-79.387383,Hailed Coffee,43.658833,-79.383684,Coffee Shop
4,Central Bay Street,43.657952,-79.387383,NEO COFFEE BAR,43.66013,-79.38583,Coffee Shop


Check how many venues were returned for each neighborhood

In [99]:
toronto_venues.groupby('Neighborhood').count()

Unnamed: 0_level_0,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
Neighborhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
"CN Tower, King and Spadina, Railway Lands, Harbourfront West, Bathurst Quay, South Niagara, Island airport",17,17,17,17,17,17
Central Bay Street,50,50,50,50,50,50
Hillcrest Village,5,5,5,5,5,5
Humber Summit,2,2,2,2,2,2
Leaside,33,33,33,33,33,33
"Malvern, Rouge",2,2,2,2,2,2
"Parkview Hill, Woodbine Gardens",12,12,12,12,12,12
"Regent Park, Harbourfront",45,45,45,45,45,45
Scarborough Village,1,1,1,1,1,1
"South Steeles, Silverstone, Humbergate, Jamestown, Mount Olive, Beaumond Heights, Thistletown, Albion Gardens",8,8,8,8,8,8


Find out how many unique categories can be curated from all the returned venues

In [100]:
print('There are {} uniques categories.'.format(len(toronto_venues['Venue Category'].unique())))

There are 107 uniques categories.


9. Analyze Each Neighborhood

In [101]:
# one hot encoding
toronto_onehot = pd.get_dummies(toronto_venues[['Venue Category']], prefix="", prefix_sep="")

# add neighborhood column back to dataframe
toronto_onehot['Neighborhood'] = toronto_venues['Neighborhood'] 

# move neighborhood column to the first column
fixed_columns = [toronto_onehot.columns[-1]] + list(toronto_onehot.columns[:-1])
toronto_onehot = toronto_onehot[fixed_columns]

toronto_onehot.head()

Unnamed: 0,Yoga Studio,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,Antique Shop,Art Gallery,...,Spa,Sporting Goods Shop,Sports Bar,Stationery Store,Supermarket,Sushi Restaurant,Thai Restaurant,Theater,Vegetarian / Vegan Restaurant,Wine Bar
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [102]:
toronto_onehot.shape

(216, 107)

Let's group rows by neighborhood and by taking the mean of the frequency of occurrence of each category

In [103]:
toronto_grouped = toronto_onehot.groupby('Neighborhood').mean().reset_index()
toronto_grouped

Unnamed: 0,Neighborhood,Yoga Studio,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,Antique Shop,...,Spa,Sporting Goods Shop,Sports Bar,Stationery Store,Supermarket,Sushi Restaurant,Thai Restaurant,Theater,Vegetarian / Vegan Restaurant,Wine Bar
0,"CN Tower, King and Spadina, Railway Lands, Har...",0.0,0.058824,0.058824,0.058824,0.117647,0.117647,0.117647,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,Central Bay Street,0.02,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.02,0.0,0.0,0.0,0.0,0.02,0.02,0.0,0.02,0.02
2,Hillcrest Village,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,Humber Summit,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,Leaside,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.090909,0.030303,0.0,0.030303,0.030303,0.0,0.0,0.0,0.0
5,"Malvern, Rouge",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,"Parkview Hill, Woodbine Gardens",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,"Regent Park, Harbourfront",0.022222,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.022222,...,0.022222,0.0,0.0,0.0,0.0,0.0,0.0,0.044444,0.0,0.0
8,Scarborough Village,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,"South Steeles, Silverstone, Humbergate, Jamest...",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


Confirm the new size

In [104]:
num_top_venues = 5

for hood in toronto_grouped['Neighborhood']:
    print("----"+hood+"----")
    temp = toronto_grouped[toronto_grouped['Neighborhood'] == hood].T.reset_index()
    temp.columns = ['venue','freq']
    temp = temp.iloc[1:]
    temp['freq'] = temp['freq'].astype(float)
    temp = temp.round({'freq': 2})
    print(temp.sort_values('freq', ascending=False).reset_index(drop=True).head(num_top_venues))
    print('\n')

----CN Tower, King and Spadina, Railway Lands, Harbourfront West, Bathurst Quay, South Niagara, Island airport----
              venue  freq
0    Airport Lounge  0.12
1   Airport Service  0.12
2  Airport Terminal  0.12
3          Boutique  0.06
4               Bar  0.06


----Central Bay Street----
                venue  freq
0         Coffee Shop  0.16
1                Café  0.06
2      Sandwich Place  0.06
3  Italian Restaurant  0.04
4        Burger Joint  0.04


----Hillcrest Village----
                      venue  freq
0               Golf Course   0.2
1                      Pool   0.2
2      Fast Food Restaurant   0.2
3  Mediterranean Restaurant   0.2
4                   Dog Run   0.2


----Humber Summit----
                    venue  freq
0  Furniture / Home Store   0.5
1             Pizza Place   0.5
2               Pet Store   0.0
3   Performing Arts Venue   0.0
4                    Park   0.0


----Leaside----
                    venue  freq
0     Sporting Goods Shop  0.09
1 

 Put that into a pandas dataframe

In [105]:
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

In [106]:
num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighborhood']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighborhood'] = toronto_grouped['Neighborhood']

for ind in np.arange(toronto_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(toronto_grouped.iloc[ind, :], num_top_venues)

neighborhoods_venues_sorted.head()

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,"CN Tower, King and Spadina, Railway Lands, Har...",Airport Lounge,Airport Service,Airport Terminal,Coffee Shop,Sculpture Garden,Airport,Airport Food Court,Airport Gate,Bar,Rental Car Location
1,Central Bay Street,Coffee Shop,Sandwich Place,Café,Bubble Tea Shop,Burger Joint,Italian Restaurant,Wine Bar,Modern European Restaurant,Middle Eastern Restaurant,Juice Bar
2,Hillcrest Village,Fast Food Restaurant,Golf Course,Pool,Dog Run,Mediterranean Restaurant,Wine Bar,Coworking Space,Cheese Shop,Chocolate Shop,Clothing Store
3,Humber Summit,Pizza Place,Furniture / Home Store,Dessert Shop,Café,Cheese Shop,Chocolate Shop,Clothing Store,Coffee Shop,Comfort Food Restaurant,Comic Shop
4,Leaside,Coffee Shop,Sporting Goods Shop,Burger Joint,Bank,Furniture / Home Store,Department Store,Sandwich Place,Beer Store,Electronics Store,Bagel Shop


10. Cluster Neighborhoods

In [107]:
# set number of clusters
kclusters = 3

toronto_grouped_clustering = toronto_grouped.drop('Neighborhood', 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(toronto_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10]

array([0, 0, 0, 0, 0, 1, 0, 0, 2, 0], dtype=int32)

In [108]:
# add clustering labels
neighborhoods_venues_sorted.insert(0, 'Cluster Label', kmeans.labels_)

toronto_merged = neighborhoods_coord_df

# merge toronto_grouped with toronto_data to add latitude/longitude for each neighborhood
toronto_merged = toronto_merged.join(neighborhoods_venues_sorted.set_index('Neighborhood'), on='Neighborhood')

toronto_merged.head() # check the last columns!

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude,Cluster Label,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,M5G,Downtown Toronto,Central Bay Street,43.657952,-79.387383,0,Coffee Shop,Sandwich Place,Café,Bubble Tea Shop,Burger Joint,Italian Restaurant,Wine Bar,Modern European Restaurant,Middle Eastern Restaurant,Juice Bar
1,M2H,North York,Hillcrest Village,43.803762,-79.363452,0,Fast Food Restaurant,Golf Course,Pool,Dog Run,Mediterranean Restaurant,Wine Bar,Coworking Space,Cheese Shop,Chocolate Shop,Clothing Store
2,M4B,East York,"Parkview Hill, Woodbine Gardens",43.706397,-79.309937,0,Pizza Place,Pet Store,Gym / Fitness Center,Pharmacy,Intersection,Bank,Bus Line,Athletics & Sports,Café,Flea Market
3,M1J,Scarborough,Scarborough Village,43.744734,-79.239476,2,Playground,Wine Bar,Department Store,Café,Cheese Shop,Chocolate Shop,Clothing Store,Coffee Shop,Comfort Food Restaurant,Comic Shop
4,M4G,East York,Leaside,43.70906,-79.363452,0,Coffee Shop,Sporting Goods Shop,Burger Joint,Bank,Furniture / Home Store,Department Store,Sandwich Place,Beer Store,Electronics Store,Bagel Shop


In [109]:
# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(toronto_merged['Latitude'], toronto_merged['Longitude'], toronto_merged['Neighborhood'], toronto_merged['Cluster Label']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters

11. Examine clusters. Let's examine each cluster and determine the discriminating venue categories that distinguish each cluster. Based on the defining categories, let's then assign a name to each cluster.

In [110]:
# Cluster 1 
toronto_merged.loc[toronto_merged['Cluster Label'] == 0, toronto_merged.columns[[1] + list(range(5, toronto_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster Label,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Downtown Toronto,0,Coffee Shop,Sandwich Place,Café,Bubble Tea Shop,Burger Joint,Italian Restaurant,Wine Bar,Modern European Restaurant,Middle Eastern Restaurant,Juice Bar
1,North York,0,Fast Food Restaurant,Golf Course,Pool,Dog Run,Mediterranean Restaurant,Wine Bar,Coworking Space,Cheese Shop,Chocolate Shop,Clothing Store
2,East York,0,Pizza Place,Pet Store,Gym / Fitness Center,Pharmacy,Intersection,Bank,Bus Line,Athletics & Sports,Café,Flea Market
4,East York,0,Coffee Shop,Sporting Goods Shop,Burger Joint,Bank,Furniture / Home Store,Department Store,Sandwich Place,Beer Store,Electronics Store,Bagel Shop
5,East Toronto,0,Coffee Shop,Brewery,Gastropub,Bakery,Café,American Restaurant,Yoga Studio,Clothing Store,Pet Store,Middle Eastern Restaurant
6,Scarborough,0,Sandwich Place,Middle Eastern Restaurant,Bakery,Auto Garage,Wine Bar,Chocolate Shop,Clothing Store,Coffee Shop,Comfort Food Restaurant,Comic Shop
7,Etobicoke,0,Grocery Store,Pharmacy,Sandwich Place,Fried Chicken Joint,Pizza Place,Beer Store,Fast Food Restaurant,Airport Terminal,American Restaurant,Chocolate Shop
8,North York,0,Pizza Place,Furniture / Home Store,Dessert Shop,Café,Cheese Shop,Chocolate Shop,Clothing Store,Coffee Shop,Comfort Food Restaurant,Comic Shop
9,Downtown Toronto,0,Airport Lounge,Airport Service,Airport Terminal,Coffee Shop,Sculpture Garden,Airport,Airport Food Court,Airport Gate,Bar,Rental Car Location
11,Downtown Toronto,0,Coffee Shop,Bakery,Pub,Café,Park,Breakfast Spot,Theater,Art Gallery,Electronics Store,Mexican Restaurant


In [111]:
# Cluster 2
toronto_merged.loc[toronto_merged['Cluster Label'] == 1, toronto_merged.columns[[1] + list(range(5, toronto_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster Label,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
10,Scarborough,1,Fast Food Restaurant,Print Shop,Wine Bar,Department Store,Café,Cheese Shop,Chocolate Shop,Clothing Store,Coffee Shop,Comfort Food Restaurant


In [112]:
# Cluster 3
toronto_merged.loc[toronto_merged['Cluster Label'] == 2, toronto_merged.columns[[1] + list(range(5, toronto_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster Label,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
3,Scarborough,2,Playground,Wine Bar,Department Store,Café,Cheese Shop,Chocolate Shop,Clothing Store,Coffee Shop,Comfort Food Restaurant,Comic Shop
