# Segmenting and clustering neighborhoods to find the best nightlife hotspots in Toronto



### This is the IBM Capstone projects  'Segmenting and Clustering Neighborhoods in Toronto' workbook

In [2]:
#Import useful libraries

import numpy as np # library to handle data in a vectorized manner

import pandas as pd # library for data analsysis
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

import json # library to handle JSON files

#!conda install -c conda-forge geopy --yes # uncomment this line if you haven't completed the Foursquare API lab
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

import requests # library to handle requests
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans

#!conda install -c conda-forge folium=0.5.0 --yes # uncomment this line if you haven't completed the Foursquare API lab
import folium # map rendering library

# Needed for webscraping
!pip install bs4
from bs4 import BeautifulSoup

print('Libraries imported.')

Libraries imported.


## Scraping wikipedia for Toronto data

In [3]:
# Using BeautfiulSoup to transfer data from wikipedia url to a readable format
url="https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"
html_data=requests.get(url).text 
soup=BeautifulSoup(html_data,"html5lib")

In [4]:
# Finding the required table to inspect what it contains
table=soup.find('table') 
print(table.prettify())

<table cellpadding="2" cellspacing="0" rules="all" style="width:100%; border-collapse:collapse; border:1px solid #ccc;">
 <tbody>
  <tr>
   <td style="width:11%; vertical-align:top; color:#ccc;">
    <p>
     <b>
      M1A
     </b>
     <br/>
     <span style="font-size:85%;">
      <i>
       Not assigned
      </i>
     </span>
    </p>
   </td>
   <td style="width:11%; vertical-align:top; color:#ccc;">
    <p>
     <b>
      M2A
     </b>
     <br/>
     <span style="font-size:85%;">
      <i>
       Not assigned
      </i>
     </span>
    </p>
   </td>
   <td style="width:11%; vertical-align:top;">
    <p>
     <b>
      M3A
     </b>
     <br/>
     <span style="font-size:85%;">
      <a href="/wiki/North_York" title="North York">
       North York
      </a>
      <br/>
      (
      <a href="/wiki/Parkwoods" title="Parkwoods">
       Parkwoods
      </a>
      )
     </span>
    </p>
   </td>
   <td style="width:11%; vertical-align:top;">
    <p>
     <b>
      M4A
     </b>
 

In [5]:
table_contents=[]  #A dictionary that stores the data that is processed in the for loop
for row in table.findAll('td'):  # Finds all td (Standard data cells in HTML)
    cell = {} #list to store the html data
    if row.span.text=='Not assigned':  #Skips all 'Not assigned td's'
        pass
    else:
        cell['PostalCode'] = row.p.text[:3] #in the HTML p stands for paragraph, in the td we look for p and get the first 3 characters to get the postal code
        cell['Borough'] = (row.span.text).split('(')[0] #Borough and Neighborhood are under the same <span>, so we get everything before '(' to receive only the Borough
        cell['Neighborhood'] = (((((row.span.text).split('(')[1]).strip(')')).replace(' /',',')).replace(')',' ')).strip(' ') #We get everything after '(' and clean the data. Take off ( ) and extra spaces and add ',' between neighborhoods in the same borough
        table_contents.append(cell) # But everything into table_contents dictionary

df=pd.DataFrame(table_contents)   
df['Borough']=df['Borough'].replace({'Downtown TorontoStn A PO Boxes25 The Esplanade':'Downtown Toronto Stn A',
                                             'East TorontoBusiness reply mail Processing Centre969 Eastern':'East Toronto Business',
                                             'EtobicokeNorthwest':'Etobicoke Northwest','East YorkEast Toronto':'East York/East Toronto',
                                             'MississaugaCanada Post Gateway Processing Centre':'Mississauga'})
# above line cleans the data, there are multiple formatting errors.

df.head()  # Get the first 5 rows

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Queen's Park,Ontario Provincial Government


In [6]:
print(df.shape)

(103, 3)


## Location data for Toronto neighborhoods

In [7]:
!pip install geocoder
import geocoder



In [8]:
# I Couldn't get the coordinates using the geocoder, I think I ran out of queries. 

postal_code=df['PostalCode']
postal_code[0] # The first one is M3A

g  = geocoder.google('M3A, Toronto ON')
g1 = geocoder.google('M3A, Toronto, Ontario')
g2 = geocoder.google('M3A, Toronto, Ontario, Canada')
g3 = geocoder.google('{}, Toronto, Ontario'.format(postal_code[0]))
lat_lng_coords = g.latlng
lat_lng_coords1 = g1.latlng
lat_lng_coords2 = g2.latlng
lat_lng_coords3 = g3.latlng

In [9]:
print("g=",lat_lng_coords) # the geocoder does not work even when I put the correct postal code and ,
print("g1=",lat_lng_coords1)
print("g2=",lat_lng_coords2)
print("g3=",lat_lng_coords3)

# I think I went over the query limit when doing trying to use the loop

g= None
g1= None
g2= None
g3= None


In [9]:
# NOT WORKING, USING THE CSV FILE INSTEAD 
#import geocoder # import geocoder

# initialize your variable to None
#lat_lng_coords = None

# loop until you get the coordinates
#while(lat_lng_coords is None):
#  g = geocoder.google('{}, Toronto, Ontario'.format(postal_code))
#  lat_lng_coords = g.latlng

#latitude = lat_lng_coords[0]
#longitude = lat_lng_coords[1]

In [10]:
# Import library to processes CSV files and downlaod the required data to the notebook
import csv
!wget -O Geospatial_Coordinates.csv https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/IBMDeveloperSkillsNetwork-DS0701EN-SkillsNetwork/labs_v1/Geospatial_Coordinates.csv

--2021-05-04 21:45:26--  https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/IBMDeveloperSkillsNetwork-DS0701EN-SkillsNetwork/labs_v1/Geospatial_Coordinates.csv
Resolving cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud (cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud)... 169.63.118.104
Connecting to cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud (cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud)|169.63.118.104|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 2788 (2,7K) [text/csv]
Saving to: ‘Geospatial_Coordinates.csv’


2021-05-04 21:45:27 (329 MB/s) - ‘Geospatial_Coordinates.csv’ saved [2788/2788]



In [11]:
# Read the data into a pandas dataframe
Geospatial_Coordinates_df = pd.read_csv("Geospatial_Coordinates.csv") 
Geospatial_Coordinates_df.head(20)

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476
5,M1J,43.744734,-79.239476
6,M1K,43.727929,-79.262029
7,M1L,43.711112,-79.284577
8,M1M,43.716316,-79.239476
9,M1N,43.692657,-79.264848


In [12]:
#Rename the 'PostalCode' to 'Postal Code' inorder to user merge with common column name
df.rename(columns = {'PostalCode' : 'Postal Code'}, inplace=True)
#Merge the two dataframes using the common column 'Postal Code'
df_merged=pd.merge(df, Geospatial_Coordinates_df, on="Postal Code") 

In [13]:
df_merged

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
4,M7A,Queen's Park,Ontario Provincial Government,43.662301,-79.389494
5,M9A,Etobicoke,Islington Avenue,43.667856,-79.532242
6,M1B,Scarborough,"Malvern, Rouge",43.806686,-79.194353
7,M3B,North York,Don Mills North,43.745906,-79.352188
8,M4B,East York,"Parkview Hill, Woodbine Gardens",43.706397,-79.309937
9,M5B,Downtown Toronto,"Garden District, Ryerson",43.657162,-79.378937


## Analysing the neighborhoods

In [13]:
# Lets get the starting point for the folium map
address = 'Toronto, ON'

geolocator = Nominatim(user_agent="to_explorer")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Toronto are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Toronto are 43.6534817, -79.3839347.


In [14]:
# Create the map with starting location in Toronto
toronto_map= folium.Map(location=[latitude, longitude], zoom_start=10)
# add markers to map
for lat, lng, borough, neighborhood in zip(df_merged['Latitude'], df_merged['Longitude'], df_merged['Borough'], df_merged['Neighborhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(toronto_map)  
# Display map of toronto with all the neighborhoods shown with markers 
toronto_map

In [15]:
# Lets create the foursquare credentials
CLIENT_ID = '' # your Foursquare ID
CLIENT_SECRET = '' # your Foursquare Secret
VERSION = '' # Foursquare API version
LIMIT = 100 # A default Foursquare API limit value


In [16]:
df_merged.loc[2,'Neighborhood']

'Regent Park, Harbourfront'

In [17]:
df_merged.loc[2, 'Latitude']

43.6542599

In [18]:
# Lets use foursquare to find out where are the local pubs are VS where the tourist are more likely to be, 
# determined by how many hotels are nearby.
# The code in foursquare for nightlife spots is 4d4b7105d754a06376d81259
# First lets try with one neighborhood
neighborhood_latitude = df_merged.loc[2, 'Latitude'] # neighborhood latitude value
neighborhood_longitude = df_merged.loc[2, 'Longitude'] # neighborhood longitude value
neighborhood_name = df_merged.loc[2, 'Neighborhood'] # neighborhood name
categoryId_nightlife="4d4b7105d754a06376d81259"
print('Latitude and longitude values of {} are {}, {}.'.format(neighborhood_name, 
                                                               neighborhood_latitude, 
                                                               neighborhood_longitude))

Latitude and longitude values of Regent Park, Harbourfront are 43.6542599, -79.3606359.


In [19]:
# Get the foursquare url with radius of 500
radius=500
url_nightlife='https://api.foursquare.com/v2/venues/explore?categoryId={}&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(categoryId_nightlife, CLIENT_ID, CLIENT_SECRET, VERSION, neighborhood_latitude, neighborhood_longitude, radius, LIMIT)
url_nightlife

'https://api.foursquare.com/v2/venues/explore?categoryId=4d4b7105d754a06376d81259&client_id=DHWB3UNEEBCUC20YP0UQV43YVYPSNWRZZYN33VJGHOWIR2C2&client_secret=U0QIWMLRMFAHTR0YNUQ3I1FOZ44UOM1C4O2UVIYXZVVZEZYI&v=20180605&ll=43.6542599,-79.3606359&radius=500&limit=100'

In [20]:
results_nightlife = requests.get(url_nightlife).json()
results_nightlife

{'meta': {'code': 200, 'requestId': '609187eb5788171d05de59fe'},
 'response': {'suggestedFilters': {'header': 'Tap to show:',
   'filters': [{'name': 'Open now', 'key': 'openNow'}]},
  'headerLocation': 'Corktown',
  'headerFullLocation': 'Corktown, Toronto',
  'headerLocationGranularity': 'neighborhood',
  'query': 'nightlife',
  'totalResults': 11,
  'suggestedBounds': {'ne': {'lat': 43.6587599045, 'lng': -79.3544279001486},
   'sw': {'lat': 43.6497598955, 'lng': -79.36684389985142}},
  'groups': [{'type': 'Recommended Places',
    'name': 'recommended',
    'items': [{'reasons': {'count': 0,
       'items': [{'summary': 'This spot is popular',
         'type': 'general',
         'reasonName': 'globalInteractionReason'}]},
      'venue': {'id': '566e1294498e3f6629006bc3',
       'name': 'Dominion Pub and Kitchen',
       'location': {'address': '500 Queen Street East',
        'lat': 43.65691857501867,
        'lng': -79.35896684476664,
        'labeledLatLngs': [{'label': 'display'

In [21]:
# Now that we tested that we can get the nightlife locations near our coordinates,
# we can repeat the process for all the coordinates using this function

def getNearbyNightlife(names, latitudes, longitudes, radius=500):
    
    venues_list=[] # Create venues_list that collects all the useful data from the json
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
        
        # create the API request URL like before. Here the lat and lng are changing in every revolution of the for loop
        url_nightlife = 'https://api.foursquare.com/v2/venues/explore?categoryId={}&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            categoryId_nightlife,
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        results = requests.get(url_nightlife).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        # First the name, lat, lng that we already had and then corresponding venues to that neighborhood using for loop for the results
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], # The name of the venue
            v['venue']['location']['lat'],  # The latitude of the venue 
            v['venue']['location']['lng'],  # The longitude of the venue
            v['venue']['categories'][0]['name']) for v in results]) # The category name of the venue
        
    # Make a dataframe from the newly gathered venues and give the columns representative columns
    nearby_nightlife = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_nightlife.columns = ['Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    #Remember to return the results out of the function, otherwise nothing happens
    return(nearby_nightlife)

In [22]:
# use the newly created function by giving the variables names, latitudes, and longitudes to it. 
nearby_nightlife=getNearbyNightlife(names=df_merged['Neighborhood'],
                                    latitudes=df_merged['Latitude'],
                                    longitudes=df_merged['Longitude']
                                   )

Parkwoods
Victoria Village
Regent Park, Harbourfront
Lawrence Manor, Lawrence Heights
Ontario Provincial Government
Islington Avenue
Malvern, Rouge
Don Mills North
Parkview Hill, Woodbine Gardens
Garden District, Ryerson
Glencairn
West Deane Park, Princess Gardens, Martin Grove, Islington, Cloverdale
Rouge Hill, Port Union, Highland Creek
Don Mills South
Woodbine Heights
St. James Town
Humewood-Cedarvale
Eringate, Bloordale Gardens, Old Burnhamthorpe, Markland Wood
Guildwood, Morningside, West Hill
The Beaches
Berczy Park
Caledonia-Fairbanks
Woburn
Leaside
Central Bay Street
Christie
Cedarbrae
Hillcrest Village
Bathurst Manor, Wilson Heights, Downsview North
Thorncliffe Park
Richmond, Adelaide, King
Dufferin, Dovercourt Village
Scarborough Village
Fairview, Henry Farm, Oriole
Northwood Park, York University
The Danforth  East
Harbourfront East, Union Station, Toronto Islands
Little Portugal, Trinity
Kennedy Park, Ionview, East Birchmount Park
Bayview Village
Downsview East
The Danforth

In [23]:
print(nearby_nightlife.shape)

(607, 7)


In [24]:
#Lets look for any categories that do not fit our research objectives
nearby_nightlife['Venue Category'].unique()


array(['Pub', 'Café', 'Brewery', 'Seafood Restaurant', 'Lounge',
       'Gastropub', 'Event Space', 'Sports Bar', 'Beer Bar',
       'Wings Joint', 'Nightclub', 'Burger Joint', 'Bar',
       'College Cafeteria', 'Hookah Bar', 'Winery', 'Japanese Restaurant',
       'Italian Restaurant', 'Restaurant', 'Wine Bar', 'Smoke Shop',
       'Speakeasy', 'Karaoke Bar', 'Cocktail Bar', 'Pizza Place',
       'BBQ Joint', 'Jazz Club', 'Gay Bar', 'Hotel', 'Diner', 'Irish Pub',
       'Bistro', 'Belgian Restaurant', 'Hotel Bar', 'Sushi Restaurant',
       'Steakhouse', 'Breakfast Spot', 'American Restaurant',
       'Music Venue', 'Liquor Store', 'New American Restaurant',
       'Whisky Bar', 'Beer Garden', 'Coffee Shop', 'Korean Restaurant',
       'Mediterranean Restaurant', 'Office', 'Other Nightlife',
       'Piano Bar', 'Dive Bar', 'Comfort Food Restaurant',
       'Fraternity House', 'Theme Restaurant', 'Sake Bar', 'Strip Club',
       'Social Club'], dtype=object)

In [25]:
# Lets get rid of all the Cafes and restaurants, since we want straight pubs and nightclubs
# This step might have been avoided by choosing different foursquare category id. 
nearby_nightlife=nearby_nightlife.replace(['Café','Burger Joint','Diner','Office','Coffee Shop','Wings Joint','BBQ Joint','Seafood Restaurant','College Cafeteria','Japanese Restaurant','Italian Restaurant','Restaurant','Pizza Place','Belgian Restaurant','Sushi Restaurant','Steakhouse','Breakfast Spot','American Restaurant','New American Restaurant','Korean Restaurant', 'Mediterranean Restaurant','Comfort Food Restaurant','Theme Restaurant'], 
                                          [np.nan,np.nan,np.nan,np.nan,np.nan,np.nan,np.nan,np.nan,np.nan,np.nan,np.nan,np.nan,np.nan,np.nan,np.nan,np.nan,np.nan,np.nan,np.nan,np.nan,np.nan,np.nan,np.nan])
nearby_nightlife=nearby_nightlife.dropna(axis=0).reset_index(drop=True)
print(nearby_nightlife.shape)

(499, 7)


In [26]:
nearby_nightlife.head(30)

Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,"Regent Park, Harbourfront",43.65426,-79.360636,Dominion Pub and Kitchen,43.656919,-79.358967,Pub
1,"Regent Park, Harbourfront",43.65426,-79.360636,The Aviary,43.653634,-79.354662,Pub
2,"Regent Park, Harbourfront",43.65426,-79.360636,Mill St. Brew Pub,43.650353,-79.358489,Pub
3,"Regent Park, Harbourfront",43.65426,-79.360636,Ontario Spring Water Sake Company,43.649922,-79.360073,Brewery
4,"Regent Park, Harbourfront",43.65426,-79.360636,Stirling Room,43.650644,-79.359829,Lounge
5,"Regent Park, Harbourfront",43.65426,-79.360636,The Corktown Kitchen,43.653316,-79.36214,Gastropub
6,"Regent Park, Harbourfront",43.65426,-79.360636,Banknote Bar Corktown,43.652655,-79.366071,Pub
7,"Regent Park, Harbourfront",43.65426,-79.360636,Martini Club,43.650597,-79.359789,Event Space
8,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763,Rinx Sports Bar,43.718682,-79.461708,Sports Bar
9,Ontario Provincial Government,43.662301,-79.389494,Bar Volo,43.665462,-79.385692,Beer Bar


In [27]:
# check that the results look as expected
nearby_nightlife_grouped_count=nearby_nightlife.groupby('Neighborhood').count()[['Venue Category']]
nearby_nightlife_grouped_count.reset_index(inplace=True)


In [28]:
nearby_nightlife_grouped_count

Unnamed: 0,Neighborhood,Venue Category
0,Agincourt,1
1,"Alderwood, Long Branch",3
2,"Bathurst Manor, Wilson Heights, Downsview North",1
3,"Bedford Park, Lawrence Manor East",1
4,Berczy Park,20
5,"Birch Cliff, Cliffside West",4
6,"Brockton, Parkdale Village, Exhibition Place",6
7,"CN Tower, King and Spadina, Railway Lands, Har...",1
8,Caledonia-Fairbanks,1
9,Cedarbrae,1


In [29]:
nearby_nightlife_grouped_count_T = nearby_nightlife_grouped_count.T.reset_index(drop=True).T
nearby_nightlife_grouped_count_T



Unnamed: 0,0,1
0,Agincourt,1
1,"Alderwood, Long Branch",3
2,"Bathurst Manor, Wilson Heights, Downsview North",1
3,"Bedford Park, Lawrence Manor East",1
4,Berczy Park,20
5,"Birch Cliff, Cliffside West",4
6,"Brockton, Parkdale Village, Exhibition Place",6
7,"CN Tower, King and Spadina, Railway Lands, Har...",1
8,Caledonia-Fairbanks,1
9,Cedarbrae,1


In [30]:
print('There are {} uniques categories.'.format(len(nearby_nightlife['Venue Category'].unique())))


There are 33 uniques categories.


In [31]:
# one hot encoding the venue categories inorder to use analytic methods on the data
# Only hot encode the venue category, as we want to preserve the neighborhood names
nighlife_onehot = pd.get_dummies(nearby_nightlife[['Venue Category']], prefix="", prefix_sep="")

# add neighborhood column back to dataframe
nighlife_onehot['Neighborhood'] = nearby_nightlife['Neighborhood'] 

# move neighborhood column to the first column
fixed_columns = [nighlife_onehot.columns[-1]] + list(nighlife_onehot.columns[:-1])
nighlife_onehot = nighlife_onehot[fixed_columns]

#Check that the results look as expected
nighlife_onehot.head()

Unnamed: 0,Neighborhood,Bar,Beer Bar,Beer Garden,Bistro,Brewery,Cocktail Bar,Dive Bar,Event Space,Fraternity House,Gastropub,Gay Bar,Hookah Bar,Hotel,Hotel Bar,Irish Pub,Jazz Club,Karaoke Bar,Liquor Store,Lounge,Music Venue,Nightclub,Other Nightlife,Piano Bar,Pub,Sake Bar,Smoke Shop,Social Club,Speakeasy,Sports Bar,Strip Club,Whisky Bar,Wine Bar,Winery
0,"Regent Park, Harbourfront",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0
1,"Regent Park, Harbourfront",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0
2,"Regent Park, Harbourfront",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0
3,"Regent Park, Harbourfront",0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,"Regent Park, Harbourfront",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [32]:
#Lets check the size of the dataframe
nighlife_onehot.shape

(499, 34)

In [33]:
#Lets group by neighbourhood and get the mean values for which type of nightlife venue is most frequent in the location
nightlife_grouped = nighlife_onehot.groupby('Neighborhood').mean().reset_index()
nightlife_grouped

Unnamed: 0,Neighborhood,Bar,Beer Bar,Beer Garden,Bistro,Brewery,Cocktail Bar,Dive Bar,Event Space,Fraternity House,Gastropub,Gay Bar,Hookah Bar,Hotel,Hotel Bar,Irish Pub,Jazz Club,Karaoke Bar,Liquor Store,Lounge,Music Venue,Nightclub,Other Nightlife,Piano Bar,Pub,Sake Bar,Smoke Shop,Social Club,Speakeasy,Sports Bar,Strip Club,Whisky Bar,Wine Bar,Winery
0,Agincourt,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,"Alderwood, Long Branch",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,"Bathurst Manor, Wilson Heights, Downsview North",1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,"Bedford Park, Lawrence Manor East",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,Berczy Park,0.2,0.1,0.0,0.05,0.0,0.2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.05,0.0,0.0,0.05,0.0,0.05,0.0,0.0,0.3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,"Birch Cliff, Cliffside West",0.25,0.0,0.0,0.0,0.25,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,"Brockton, Parkdale Village, Exhibition Place",0.166667,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.5,0.0,0.0,0.166667,0.0,0.0,0.0,0.0,0.166667,0.0,0.0,0.0,0.0
7,"CN Tower, King and Spadina, Railway Lands, Har...",1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,Caledonia-Fairbanks,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,Cedarbrae,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [34]:
#Check the shape again to confirm the new changes
nightlife_grouped.shape

(64, 34)

In [35]:
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

In [36]:
num_top_venues = 1

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighborhood']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighborhood'] = nightlife_grouped['Neighborhood']

for ind in np.arange(nightlife_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(nightlife_grouped.iloc[ind, :], num_top_venues)

neighborhoods_venues_sorted.head(10)

Unnamed: 0,Neighborhood,1st Most Common Venue
0,Agincourt,Lounge
1,"Alderwood, Long Branch",Pub
2,"Bathurst Manor, Wilson Heights, Downsview North",Bar
3,"Bedford Park, Lawrence Manor East",Pub
4,Berczy Park,Pub
5,"Birch Cliff, Cliffside West",Pub
6,"Brockton, Parkdale Village, Exhibition Place",Nightclub
7,"CN Tower, King and Spadina, Railway Lands, Har...",Bar
8,Caledonia-Fairbanks,Bar
9,Cedarbrae,Lounge


In [37]:
neighborhoods_venues_sorted.head(10)

Unnamed: 0,Neighborhood,1st Most Common Venue
0,Agincourt,Lounge
1,"Alderwood, Long Branch",Pub
2,"Bathurst Manor, Wilson Heights, Downsview North",Bar
3,"Bedford Park, Lawrence Manor East",Pub
4,Berczy Park,Pub
5,"Birch Cliff, Cliffside West",Pub
6,"Brockton, Parkdale Village, Exhibition Place",Nightclub
7,"CN Tower, King and Spadina, Railway Lands, Har...",Bar
8,Caledonia-Fairbanks,Bar
9,Cedarbrae,Lounge


## K-means clustering

In [38]:
# set number of clusters
kclusters = 5

#nightlife_grouped_clustering = nightlife_gaxis=1)rouped.drop('Neighborhood', 1)

# run k-means clustering
#kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(nightlife_grouped_clustering)

# check cluster labels generated for each row in the dataframe
#kmeans.labels_[0:10] 

In [39]:
neighborhoods_venues_sorted.head()

Unnamed: 0,Neighborhood,1st Most Common Venue
0,Agincourt,Lounge
1,"Alderwood, Long Branch",Pub
2,"Bathurst Manor, Wilson Heights, Downsview North",Bar
3,"Bedford Park, Lawrence Manor East",Pub
4,Berczy Park,Pub


In [40]:
# SECOND TRY WITH THE COUNT INSTEAD

In [41]:
# K-means clustering with count of nightlife locations
nightlife_grouped_count_clustering = nearby_nightlife_grouped_count.drop('Neighborhood', 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(nightlife_grouped_count_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10] 



array([0, 0, 0, 0, 2, 3, 3, 0, 0, 0], dtype=int32)

In [42]:
# add clustering labels
neighborhoods_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

nightlife_merged = df_merged

# merge manhattan_grouped with manhattan_data to add latitude/longitude for each neighborhood
nightlife_merged = nightlife_merged.join(neighborhoods_venues_sorted.set_index('Neighborhood'), on='Neighborhood')

nightlife_merged.head() # check the last columns!

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue
0,M3A,North York,Parkwoods,43.753259,-79.329656,,
1,M4A,North York,Victoria Village,43.725882,-79.315572,,
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636,3.0,Pub
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763,0.0,Sports Bar
4,M7A,Queen's Park,Ontario Provincial Government,43.662301,-79.389494,3.0,Bar


In [43]:
# We need to drop postal code and borough as they are not needed
nightlife_merged=nightlife_merged.drop(['Postal Code','Borough'], axis=1)


In [48]:
# Get rid of useless rows
nightlife_merged=nightlife_merged.dropna()

In [61]:
# Change the Cluster Labels to integral, they were floats before and cannot be used like that
convert_dict = {'Cluster Labels': int}
  
nightlife_merged = nightlife_merged.astype(convert_dict)
nightlife_merged

Unnamed: 0,Neighborhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue
2,"Regent Park, Harbourfront",43.65426,-79.360636,3,Pub
3,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763,0,Sports Bar
4,Ontario Provincial Government,43.662301,-79.389494,3,Bar
7,Don Mills North,43.745906,-79.352188,0,Hookah Bar
8,"Parkview Hill, Woodbine Gardens",43.706397,-79.309937,0,Winery
9,"Garden District, Ryerson",43.657162,-79.378937,2,Bar
10,Glencairn,43.709577,-79.445073,0,Sports Bar
11,"West Deane Park, Princess Gardens, Martin Grov...",43.650943,-79.554724,0,Brewery
12,"Rouge Hill, Port Union, Highland Creek",43.784535,-79.160497,0,Bar
14,Woodbine Heights,43.695344,-79.318389,0,Bar


In [53]:
# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(nightlife_merged['Latitude'], nightlife_merged['Longitude'], nightlife_merged['Neighborhood'], nightlife_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
      
map_clusters

## Examine the clusters 

In [55]:
nightlife_merged.loc[nightlife_merged['Cluster Labels'] == 0, nightlife_merged.columns[[0] + list(range(5, nightlife_merged.shape[1]))]]

Unnamed: 0,Neighborhood
3,"Lawrence Manor, Lawrence Heights"
7,Don Mills North
8,"Parkview Hill, Woodbine Gardens"
10,Glencairn
11,"West Deane Park, Princess Gardens, Martin Grov..."
12,"Rouge Hill, Port Union, Highland Creek"
14,Woodbine Heights
17,"Eringate, Bloordale Gardens, Old Burnhamthorpe..."
19,The Beaches
21,Caledonia-Fairbanks


In [56]:
nightlife_merged.loc[nightlife_merged['Cluster Labels'] == 1, nightlife_merged.columns[[0] + list(range(5, nightlife_merged.shape[1]))]]

Unnamed: 0,Neighborhood
15,St. James Town
30,"Richmond, Adelaide, King"
42,"Toronto Dominion Centre, Design Exchange"
92,Enclave of M5E
97,"First Canadian Place, Underground city"


In [57]:
nightlife_merged.loc[nightlife_merged['Cluster Labels'] == 2, nightlife_merged.columns[[0] + list(range(5, nightlife_merged.shape[1]))]]

Unnamed: 0,Neighborhood
9,"Garden District, Ryerson"
20,Berczy Park
36,"Harbourfront East, Union Station, Toronto Islands"
37,"Little Portugal, Trinity"
84,"Kensington Market, Chinatown, Grange Park"
99,Church and Wellesley


In [58]:
nightlife_merged.loc[nightlife_merged['Cluster Labels'] == 3, nightlife_merged.columns[[0] + list(range(5, nightlife_merged.shape[1]))]]

Unnamed: 0,Neighborhood
2,"Regent Park, Harbourfront"
4,Ontario Provincial Government
24,Central Bay Street
31,"Dufferin, Dovercourt Village"
41,"The Danforth West, Riverdale"
43,"Brockton, Parkdale Village, Exhibition Place"
47,"India Bazaar, The Beaches West"
54,Studio District
58,"Birch Cliff, Cliffside West"
69,"High Park, The Junction South"


In [59]:
nightlife_merged.loc[nightlife_merged['Cluster Labels'] == 4, nightlife_merged.columns[[0] + list(range(5, nightlife_merged.shape[1]))]]

Unnamed: 0,Neighborhood
48,"Commerce Court, Victoria Hotel"


# Results
## This map shows where the clusters of bars are located in Toronto. You can see theh Commerce Court is the most populous for bars. Tear 3 has some surprices such as "Birch Cliff, Cliffside West Cluster" which could be a nice party place outside of downtown Toronto. 

# MAPS DONT SHOW IN GITHUB SO I WILL GIVE ANOTHER LINK TO SEE MAP