# Explore, Segment, and Cluster the neighborhoods in the City of Toronto

In [1]:
pip install BeautifulSoup4

Note: you may need to restart the kernel to use updated packages.


### Step 1: Import libraries for Web Scrapping


In [2]:
from bs4 import BeautifulSoup
import urllib.request
import csv

### Step 2: Extract Data from the web


In [3]:
# Assign the url link where the data is located to a table
urlpage =  'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
#Using BeautifulSoup Library to extract data the HTML code of the page containing our data
page = urllib.request.urlopen(urlpage)
soup = BeautifulSoup(page, 'html.parser')
#print(soup)

In [4]:
#Our Data is located in a table with class: wikitable sortable. We will extract the table 

# Extract all data betweent the table tags using the soup.fing
table = soup.find('table', attrs={'class': 'wikitable sortable'})

# All the data in the table is in tr tags. We will extract all that data from our table data.
rows = table.find_all('tr')
print('Number of results', len(rows))

#print(table_rows[:5])

Number of results 181


In [5]:
# All the table headers are within th tag. Extract them:
# Method 1
#table_header = ["Postal", "Borough", "Neighbourhood"]
# Method 2
header = [head.text.replace('\n',"") for head in rows[0].find_all('th')]
print(header)

# Organize data into a table
table_rows = []

# Add the table headers
table_rows.append(header)

['Postal Code', 'Borough', 'Neighbourhood']


In [6]:
# Extract each row from "rows" variable and create a list for a DataFrame

# Use for loop to loop through each row of the table
for row in rows[1:]:
   
    # find all columns per row
    table_data = row.find_all('td')
    # check that columns have data 
    if len(table_data) == 0: 
        continue
    else:  
        
        # we will extract data for each coloumn in the row (result) and store it in the variable.    
        Postal = table_data[0].getText().replace("\n","")
        #print('Postal is', Postal)

        Borough = table_data[1].getText().replace("\n","")
        #print('Borough', Borough)  

        Neighbourhood = table_data[2].getText().replace("\n","")
        #print('Neighbourhood', Neighbourhood)


        # Now Append the data into our table_row variable. 
        table_rows.append([Postal, Borough, Neighbourhood])

In [7]:
print(table_rows[:20])

[['Postal Code', 'Borough', 'Neighbourhood'], ['M1A', 'Not assigned', 'Not assigned'], ['M2A', 'Not assigned', 'Not assigned'], ['M3A', 'North York', 'Parkwoods'], ['M4A', 'North York', 'Victoria Village'], ['M5A', 'Downtown Toronto', 'Regent Park, Harbourfront'], ['M6A', 'North York', 'Lawrence Manor, Lawrence Heights'], ['M7A', 'Downtown Toronto', "Queen's Park, Ontario Provincial Government"], ['M8A', 'Not assigned', 'Not assigned'], ['M9A', 'Etobicoke', 'Islington Avenue, Humber Valley Village'], ['M1B', 'Scarborough', 'Malvern, Rouge'], ['M2B', 'Not assigned', 'Not assigned'], ['M3B', 'North York', 'Don Mills'], ['M4B', 'East York', 'Parkview Hill, Woodbine Gardens'], ['M5B', 'Downtown Toronto', 'Garden District, Ryerson'], ['M6B', 'North York', 'Glencairn'], ['M7B', 'Not assigned', 'Not assigned'], ['M8B', 'Not assigned', 'Not assigned'], ['M9B', 'Etobicoke', 'West Deane Park, Princess Gardens, Martin Grove, Islington, Cloverdale'], ['M1C', 'Scarborough', 'Rouge Hill, Port Union,

### Step 3: Create a Pandas Dataframe

In [8]:
import pandas as pd
import numpy as np

In [9]:
df = pd.DataFrame(table_rows[1:] , columns=header)
df.head()

Unnamed: 0,Postal Code,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"


In [10]:
df.shape

(180, 3)

In [11]:
df = df[(df.Borough.notnull())]
df.head()

Unnamed: 0,Postal Code,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"


In [12]:
print('The dataframe has {} boroughs and {} neighborhoods.'.format(
        len(df['Borough'].unique()),
        df.shape[0]))

The dataframe has 11 boroughs and 180 neighborhoods.


### Preprocessing and Cleaning the data

We will remove all not assigned or empty spaces

In [13]:
df = df[(df.Borough != "Not assigned")]
df = df[(df.Borough.notnull())]

# Group by PostalCode/Borough
df = pd.DataFrame(df.groupby(['Postal Code', 'Borough'])['Neighbourhood'].apply(list)).reset_index()
df['Neighbourhood'] = df['Neighbourhood'].apply(lambda x: ', '.join(x))

#replacing any Not Assigned neighbourhood with Boruogh name if they exist.
df.loc[ (df.Neighbourhood.isnull() == True) |
               (df.Neighbourhood == "Not assigned")
               , 'Neighbourhood'] = df.Borough

df.shape

(103, 3)

In [14]:
df.head()

Unnamed: 0,Postal Code,Borough,Neighbourhood
0,M1B,Scarborough,"Malvern, Rouge"
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


### Use geopy library to get the latitude and longitude values of City of Toronto.

First import the geopy library

In [15]:
# uncomment this line if you haven't completed the Foursquare API lab
#!conda install -c conda-forge geopy --yes 

In [16]:
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

We will Visualize our Data as well therefore import important libraries

In [17]:
import folium 
import matplotlib.cm as cm
import matplotlib.colors as colors

In order to define an instance of the geocoder, we need to define a user_agent. 
I will name my agent to_explorer, as shown below.

In [18]:
address = 'Toronto, ON'

geolocator = Nominatim(user_agent="to_explorer")
location = geolocator.geocode(address)
toronto_latitude = location.latitude
toronto_longitude = location.longitude
print('The geograpical coordinate of Toronto are {}, {}.'.format(toronto_latitude, toronto_longitude))

The geograpical coordinate of Toronto are 43.6534817, -79.3839347.


### Create a new DataFrame to process data for Toronto neighbourhoods based on their Postal Code

In [19]:
# Copy our main dataframe to a new data frame to manipulate and still have the original data.
toronto_df = df
toronto_df.head()

Unnamed: 0,Postal Code,Borough,Neighbourhood
0,M1B,Scarborough,"Malvern, Rouge"
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


In [20]:
!pip install geocoder  #comment this if geocoder is not installed on your system
import geocoder # import geocoder



In [21]:
# create a function and use geocoder to obtain the latitude and longitude
# of each postal code in order to map them.
#lat_long = [(geocoder.arcgis('{}, Toronto, Ontario'.format(code))).latlng for code in toronto_df["Postal Code"]]

In [22]:
#Please note the preferred way would be to create a separate function and append each postal code at a time.
#toronto_df["Latitude"] = [lat[0] for lat in lat_long]
#toronto_df["Longitude"] = [long[1] for long in lat_long]

In [23]:
#toronto_df.head()

In [24]:
# This function returns the latitude and longitude of Toronto postal codes provided in argument
def getLongLatPostalCode(postal_code):
    # initialize to None
    lat_lng_coords = None

    # loop until we get the coordinates
    while(lat_lng_coords is None):
        g = geocoder.arcgis('{}, Toronto, Ontario'.format(postal_code))
        lat_lng_coords = g.latlng

    latitude = lat_lng_coords[0]
    longitude = lat_lng_coords[1]
    
    return latitude, longitude

In [25]:
lat_long_postal_code = []

# We will first loop through each postal code from our dataframe
for code in toronto_df['Postal Code']:
    
    latitude, longitude = getLongLatPostalCode(code)
    lat_long_postal_code.append([code, latitude, longitude])
    
# Convert our list of coordinates into a DataFrame for merging
df_lat_long = pd.DataFrame(lat_long_postal_code)
df_lat_long.columns = ['Postal Code', 'Latitude', 'Longitude']

Status code Unknown from https://geocode.arcgis.com/arcgis/rest/services/World/GeocodeServer/find: ERROR - HTTPSConnectionPool(host='geocode.arcgis.com', port=443): Read timed out. (read timeout=5.0)
Status code Unknown from https://geocode.arcgis.com/arcgis/rest/services/World/GeocodeServer/find: ERROR - HTTPSConnectionPool(host='geocode.arcgis.com', port=443): Read timed out. (read timeout=5.0)
Status code Unknown from https://geocode.arcgis.com/arcgis/rest/services/World/GeocodeServer/find: ERROR - HTTPSConnectionPool(host='geocode.arcgis.com', port=443): Read timed out. (read timeout=5.0)
Status code Unknown from https://geocode.arcgis.com/arcgis/rest/services/World/GeocodeServer/find: ERROR - HTTPSConnectionPool(host='geocode.arcgis.com', port=443): Read timed out. (read timeout=5.0)
Status code Unknown from https://geocode.arcgis.com/arcgis/rest/services/World/GeocodeServer/find: ERROR - HTTPSConnectionPool(host='geocode.arcgis.com', port=443): Read timed out. (read timeout=5.0)


In [26]:
# We will merge our lat_long dataframe with toronto dataframe.
toronto_df = pd.merge(toronto_df, df_lat_long, on='Postal Code')

toronto_df.head()

Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude
0,M1B,Scarborough,"Malvern, Rouge",43.81153,-79.19552
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek",43.78564,-79.15871
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.76575,-79.1752
3,M1G,Scarborough,Woburn,43.7682,-79.21761
4,M1H,Scarborough,Cedarbrae,43.76969,-79.23944


In [27]:
# create map of New York using latitude and longitude values
map_toronto = folium.Map(location=[toronto_latitude, toronto_longitude], zoom_start=10)

# add markers to map
for lat, lng, borough, neighborhood in zip(toronto_df['Latitude'], toronto_df['Longitude'], toronto_df['Borough'], toronto_df['Neighbourhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_toronto)  
    
map_toronto

We can see that the above graph have small circles not covering all of Toronto's neighbourhood. Therefore, radius size should be increased.

Next, we are going to start utilizing the Foursquare API to explore the neighborhoods and segment them.

#### Define Foursquare Credentials and Version

In [36]:
CLIENT_ID = 'xxxx' # your Foursquare ID
CLIENT_SECRET = 'xxxx' # your Foursquare Secret
VERSION = '20180605' # Foursquare API version

#print('Your credentails:')
#print('CLIENT_ID: ' + CLIENT_ID)
#print('CLIENT_SECRET:' + CLIENT_SECRET)

In [37]:
import requests

### We will explore venues based on our neighbourhood  in Toronto using the following function

In [38]:
#First define the limit and radius for our FourSquare API search
RADIUS = 900
LIMIT = 100

def getNearbyVenues(postal_Codes, boroughs, neighbourhoods, latitudes, longitudes, radius=500):

    venues_list=[]
    for code, borough, neighbourhood, lat, lng in zip(postal_Codes, boroughs, neighbourhoods, latitudes, longitudes):
        print("Neighbourhood: ", neighbourhood)

        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']

            
        # return only relevant information for each nearby venue  
        venues_list.append([(
            code, 
            borough, 
            neighbourhood, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

        

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])

    nearby_venues.columns = ['Postal Code',
                             'Borough',
                             'Neighbourhood', 
                             'Neighbourhood Latitude', 
                             'Neighbourhood Longitude', 
                             'Venue Name', 
                             'Venue Latitude', 
                             'Venue Longitude', 
                             'Venue Category']
    
    return(nearby_venues)

Now Call the getNearbyVenues function to explore venues in Toronto Neighbourhoods.

In [39]:
toronto_df.head()

Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude
0,M1B,Scarborough,"Malvern, Rouge",43.81153,-79.19552
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek",43.78564,-79.15871
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.76575,-79.1752
3,M1G,Scarborough,Woburn,43.7682,-79.21761
4,M1H,Scarborough,Cedarbrae,43.76969,-79.23944


In [40]:
toronto_venues = getNearbyVenues(   postal_Codes=toronto_df['Postal Code'],
                                    boroughs=toronto_df['Borough'],
                                    neighbourhoods=toronto_df['Neighbourhood'],
                                    latitudes=toronto_df['Latitude'],
                                    longitudes=toronto_df['Longitude'],
                                    radius = RADIUS)

Neighbourhood:  Malvern, Rouge
Neighbourhood:  Rouge Hill, Port Union, Highland Creek
Neighbourhood:  Guildwood, Morningside, West Hill
Neighbourhood:  Woburn
Neighbourhood:  Cedarbrae
Neighbourhood:  Scarborough Village
Neighbourhood:  Kennedy Park, Ionview, East Birchmount Park
Neighbourhood:  Golden Mile, Clairlea, Oakridge
Neighbourhood:  Cliffside, Cliffcrest, Scarborough Village West
Neighbourhood:  Birch Cliff, Cliffside West
Neighbourhood:  Dorset Park, Wexford Heights, Scarborough Town Centre
Neighbourhood:  Wexford, Maryvale
Neighbourhood:  Agincourt
Neighbourhood:  Clarks Corners, Tam O'Shanter, Sullivan
Neighbourhood:  Milliken, Agincourt North, Steeles East, L'Amoreaux East
Neighbourhood:  Steeles West, L'Amoreaux West
Neighbourhood:  Upper Rouge
Neighbourhood:  Hillcrest Village
Neighbourhood:  Fairview, Henry Farm, Oriole
Neighbourhood:  Bayview Village
Neighbourhood:  York Mills, Silver Hills
Neighbourhood:  Willowdale, Newtonbrook
Neighbourhood:  Willowdale, Willowdale

In [41]:
toronto_venues.head()

Unnamed: 0,Postal Code,Borough,Neighbourhood,Neighbourhood Latitude,Neighbourhood Longitude,Venue Name,Venue Latitude,Venue Longitude,Venue Category
0,M1B,Scarborough,"Malvern, Rouge",43.81153,-79.19552,Canadiana exhibit,43.817962,-79.193374,Zoo Exhibit
1,M1B,Scarborough,"Malvern, Rouge",43.81153,-79.19552,Wendy’s,43.807448,-79.199056,Fast Food Restaurant
2,M1B,Scarborough,"Malvern, Rouge",43.81153,-79.19552,Grizzly Bear Exhibit,43.817031,-79.193458,Zoo Exhibit
3,M1B,Scarborough,"Malvern, Rouge",43.81153,-79.19552,Upper Rouge Trail,43.809988,-79.186147,Trail
4,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek",43.78564,-79.15871,Shamrock Burgers,43.783823,-79.168406,Burger Joint


In [42]:
toronto_venues.shape

(4618, 9)

### Checking Venues per Neighbourhood

In [43]:
toronto_venues.groupby('Neighbourhood').count()

Unnamed: 0_level_0,Postal Code,Borough,Neighbourhood Latitude,Neighbourhood Longitude,Venue Name,Venue Latitude,Venue Longitude,Venue Category
Neighbourhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Agincourt,38,38,38,38,38,38,38,38
"Alderwood, Long Branch",25,25,25,25,25,25,25,25
"Bathurst Manor, Wilson Heights, Downsview North",17,17,17,17,17,17,17,17
Bayview Village,6,6,6,6,6,6,6,6
"Bedford Park, Lawrence Manor East",35,35,35,35,35,35,35,35
...,...,...,...,...,...,...,...,...
"Willowdale, Willowdale West",18,18,18,18,18,18,18,18
Woburn,13,13,13,13,13,13,13,13
Woodbine Heights,58,58,58,58,58,58,58,58
York Mills West,26,26,26,26,26,26,26,26


### Let's find out how many unique categories can be curated from all the returned venues

In [44]:
print('There are {} uniques categories.'.format(len(toronto_venues['Venue Category'].unique())))

There are 315 uniques categories.


## Now analyzing each Neighbourhood
use one hot encoding

In [45]:
# one hot encoding
toronto_onehot = pd.get_dummies(toronto_venues[['Venue Category']], prefix="", prefix_sep="")

# add Postal Code, Borough and Neighbourhood columns back to dataframe
toronto_onehot['Neighbourhood'] = toronto_venues['Neighbourhood']
toronto_onehot['Postal Code'] = toronto_venues['Postal Code']
toronto_onehot['Borough'] = toronto_venues['Borough']

toronto_onehot.head()

Unnamed: 0,Accessories Store,Afghan Restaurant,African Restaurant,Airport,American Restaurant,Animal Shelter,Antique Shop,Aquarium,Arcade,Argentinian Restaurant,...,Warehouse Store,Whisky Bar,Wine Bar,Wings Joint,Women's Store,Yoga Studio,Zoo Exhibit,Neighbourhood,Postal Code,Borough
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,"Malvern, Rouge",M1B,Scarborough
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,"Malvern, Rouge",M1B,Scarborough
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,"Malvern, Rouge",M1B,Scarborough
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,"Malvern, Rouge",M1B,Scarborough
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,"Rouge Hill, Port Union, Highland Creek",M1C,Scarborough


In [46]:
# move Postal Code, Borough and Neighbourhood columns to the first column
for i in range(0,3):
    fixed_columns = [toronto_onehot.columns[-1]] + list(toronto_onehot.columns[:-1])
    toronto_onehot = toronto_onehot[fixed_columns]

toronto_onehot.head()

Unnamed: 0,Neighbourhood,Postal Code,Borough,Accessories Store,Afghan Restaurant,African Restaurant,Airport,American Restaurant,Animal Shelter,Antique Shop,...,Video Game Store,Video Store,Vietnamese Restaurant,Warehouse Store,Whisky Bar,Wine Bar,Wings Joint,Women's Store,Yoga Studio,Zoo Exhibit
0,"Malvern, Rouge",M1B,Scarborough,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
1,"Malvern, Rouge",M1B,Scarborough,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,"Malvern, Rouge",M1B,Scarborough,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
3,"Malvern, Rouge",M1B,Scarborough,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,"Rouge Hill, Port Union, Highland Creek",M1C,Scarborough,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [47]:
toronto_onehot.shape

(4618, 318)

#### Next, let's group rows by neighborhood and by taking the mean of the frequency of occurrence of each category

In [48]:
toronto_grouped = toronto_onehot.groupby(['Postal Code','Borough', 'Neighbourhood']).mean().reset_index()
toronto_grouped.head()

Unnamed: 0,Postal Code,Borough,Neighbourhood,Accessories Store,Afghan Restaurant,African Restaurant,Airport,American Restaurant,Animal Shelter,Antique Shop,...,Video Game Store,Video Store,Vietnamese Restaurant,Warehouse Store,Whisky Bar,Wine Bar,Wings Joint,Women's Store,Yoga Studio,Zoo Exhibit
0,M1B,Scarborough,"Malvern, Rouge",0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.5
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek",0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,M1G,Scarborough,Woburn,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,M1H,Scarborough,Cedarbrae,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


#### Let's print each neighborhood along with the top 5 most common venues

In [59]:
num_top_venues = 5

for code, borough, hood in zip(toronto_grouped["Postal Code"], toronto_grouped["Borough"], toronto_grouped['Neighbourhood']):
    
    print("----"+code+" "+borough+" "+hood+"----")
    
    #temp = toronto_grouped[toronto_grouped['Neighbourhood'] == hood].T.reset_index()
    
    temp = toronto_grouped[(toronto_grouped["Postal Code"] == code) &   (toronto_grouped.Borough == borough) &  (toronto_grouped.Neighbourhood == hood)].T.reset_index()
                            
                            
                            
    temp.columns = ['venue','freq']
    temp = temp.iloc[3:]
    temp['freq'] = temp['freq'].astype(float)
    temp = temp.round({'freq': 2})
    
    print(temp.sort_values('freq', ascending=False).reset_index(drop=True).head(num_top_venues))
    
    print('\n')

----M1B Scarborough Malvern, Rouge----
                  venue  freq
0           Zoo Exhibit  0.50
1                 Trail  0.25
2  Fast Food Restaurant  0.25
3  Other Great Outdoors  0.00
4          Optical Shop  0.00


----M1C Scarborough Rouge Hill, Port Union, Highland Creek----
                     venue  freq
0           Breakfast Spot   0.4
1             Burger Joint   0.2
2                     Park   0.2
3                      Bar   0.2
4  New American Restaurant   0.0


----M1E Scarborough Guildwood, Morningside, West Hill----
                  venue  freq
0                  Park  0.18
1        Gymnastics Gym  0.09
2  Gym / Fitness Center  0.09
3     Convenience Store  0.09
4            Restaurant  0.09


----M1G Scarborough Woburn----
                  venue  freq
0     Indian Restaurant  0.15
1           Pizza Place  0.15
2       Supplement Shop  0.08
3      Department Store  0.08
4  Fast Food Restaurant  0.08


----M1H Scarborough Cedarbrae----
                 venue  freq


                venue  freq
0         Coffee Shop  0.10
1  Italian Restaurant  0.07
2        Skating Rink  0.07
3               Diner  0.05
4                Park  0.05


----M4S Central Toronto Davisville----
            venue  freq
0     Pizza Place  0.07
1             Gym  0.05
2            Café  0.05
3     Coffee Shop  0.05
4  Sandwich Place  0.05


----M4T Central Toronto Moore Park, Summerhill East----
           venue  freq
0  Grocery Store  0.12
1           Park  0.09
2    Coffee Shop  0.06
3            Gym  0.06
4           Café  0.06


----M4V Central Toronto Summerhill West, Rathnelly, South Hill, Forest Hill SE, Deer Park----
                venue  freq
0         Coffee Shop  0.08
1    Sushi Restaurant  0.07
2                Café  0.05
3  Italian Restaurant  0.05
4      Sandwich Place  0.04


----M4W Downtown Toronto Rosedale----
                venue  freq
0                Park   0.4
1        Neighborhood   0.1
2          Playground   0.1
3               Trail   0.1
4  Athl

               venue  freq
0     Baseball Field  0.09
1        Pizza Place  0.09
2  Convenience Store  0.09
3          Pet Store  0.09
4         Beer Store  0.05


----M9L North York Humber Summit----
                   venue  freq
0                 Bakery  0.25
1      Electronics Store  0.25
2           Skating Rink  0.25
3  General Entertainment  0.25
4        Organic Grocery  0.00


----M9M North York Humberlea, Emery----
                       venue  freq
0                Coffee Shop  0.25
1                       Park  0.12
2                       Café  0.12
3  Latin American Restaurant  0.12
4                  Nightclub  0.12


----M9N York Weston----
                 venue  freq
0                 Park  0.11
1          Pizza Place  0.11
2          Coffee Shop  0.11
3        Train Station  0.11
4  Fried Chicken Joint  0.06


----M9P Etobicoke Westmount----
            venue  freq
0     Pizza Place  0.12
1     Gas Station  0.12
2     Flea Market  0.06
3  Sandwich Place  0.06
4  Ice 

Let's put that into a pandas dataframe
First, let's write a function to sort the venues in descending order.

In [60]:
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[3:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

In [79]:
num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Postal Code', 'Borough', 'Neighbourhood']

for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))


neighborhoods_venues_sorted = pd.DataFrame(columns=columns)

# create a new dataframe
neighborhoods_venues_sorted['Postal Code'] = toronto_grouped['Postal Code']
neighborhoods_venues_sorted['Borough'] = toronto_grouped['Borough']
neighborhoods_venues_sorted['Neighbourhood'] = toronto_grouped['Neighbourhood']

# loop through each rows
for ind in np.arange(toronto_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 3:] = return_most_common_venues(toronto_grouped.iloc[ind, :], num_top_venues)

neighborhoods_venues_sorted.head()

Unnamed: 0,Postal Code,Borough,Neighbourhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,M1B,Scarborough,"Malvern, Rouge",Zoo Exhibit,Trail,Fast Food Restaurant,Food,Farm,Eastern European Restaurant,Electronics Store,Ethiopian Restaurant,Event Space,Falafel Restaurant
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek",Breakfast Spot,Burger Joint,Bar,Park,Zoo Exhibit,Fast Food Restaurant,Ethiopian Restaurant,Event Space,Falafel Restaurant,Farm
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",Park,Home Service,Convenience Store,Filipino Restaurant,Pizza Place,Restaurant,Grocery Store,Gym / Fitness Center,Gymnastics Gym,Athletics & Sports
3,M1G,Scarborough,Woburn,Indian Restaurant,Pizza Place,Sandwich Place,Chinese Restaurant,Supplement Shop,Supermarket,Thrift / Vintage Store,Coffee Shop,Department Store,Park
4,M1H,Scarborough,Cedarbrae,Indian Restaurant,Bakery,Bank,Gas Station,Caribbean Restaurant,Athletics & Sports,Hakka Restaurant,Coffee Shop,Thai Restaurant,Lounge


## Now Clustering Neighbourhood

In [80]:
from sklearn.cluster import KMeans

In [164]:
# set number of clusters
kclusters = 4

toronto_grouped_clustering = toronto_grouped.drop('Neighbourhood', 1)
toronto_grouped_clustering = toronto_grouped_clustering.drop('Postal Code', 1)
toronto_grouped_clustering = toronto_grouped_clustering.drop('Borough', 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(toronto_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10] 

array([0, 1, 1, 0, 2, 0, 0, 2, 0, 1], dtype=int32)

Let's create a new dataframe that includes the cluster as well as the top 10 venues for each neighborhood.

In [165]:
# add clustering labels
neighborhoods_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

ValueError: cannot insert Cluster Labels, already exists

In [166]:
toronto_merged = toronto_df

# merge toronto_grouped with toronto_data to add latitude/longitude for each neighborhood
toronto_merged = toronto_merged.join(neighborhoods_venues_sorted.set_index(['Postal Code','Borough','Neighbourhood']), on=['Postal Code','Borough', 'Neighbourhood'])

toronto_merged.head() # check the last columns!

Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,M1B,Scarborough,"Malvern, Rouge",43.81153,-79.19552,0.0,Zoo Exhibit,Trail,Fast Food Restaurant,Food,Farm,Eastern European Restaurant,Electronics Store,Ethiopian Restaurant,Event Space,Falafel Restaurant
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek",43.78564,-79.15871,1.0,Breakfast Spot,Burger Joint,Bar,Park,Zoo Exhibit,Fast Food Restaurant,Ethiopian Restaurant,Event Space,Falafel Restaurant,Farm
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.76575,-79.1752,1.0,Park,Home Service,Convenience Store,Filipino Restaurant,Pizza Place,Restaurant,Grocery Store,Gym / Fitness Center,Gymnastics Gym,Athletics & Sports
3,M1G,Scarborough,Woburn,43.7682,-79.21761,0.0,Indian Restaurant,Pizza Place,Sandwich Place,Chinese Restaurant,Supplement Shop,Supermarket,Thrift / Vintage Store,Coffee Shop,Department Store,Park
4,M1H,Scarborough,Cedarbrae,43.76969,-79.23944,2.0,Indian Restaurant,Bakery,Bank,Gas Station,Caribbean Restaurant,Athletics & Sports,Hakka Restaurant,Coffee Shop,Thai Restaurant,Lounge


In [176]:
t_merged = toronto_merged
t_merged["Cluster Labels"].dropna(axis=0, inplace=False)
t_merged.astype({"Cluster Labels":"int32"}).dtypes
t_merged["Cluster Labels"].unique()

array([0., 1., 2., 3.])

In [177]:
t_merged.shape

(102, 16)

### Visualize Clusters

In [119]:
# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, code, borough, neighborhood, cluster in zip(t_merged['Latitude'], t_merged['Longitude'], t_merged['Postal Code'], t_merged['Borough'], t_merged['Neighbourhood'], t_merged['Cluster Labels']):
    label = folium.Popup(str(code) + ' - Cluster ' + str(cluster), parse_html=True)
    cluster = int(cluster)
    folium.CircleMarker(
        [lat, lon],
        radius=10,
        popup=label,
        color=rainbow[(cluster)-1],
        fill=True,
        fill_color=rainbow[(cluster)-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters

## Examine Clusters

In [180]:
t_merged["Cluster Labels"].shape


(102,)

In [181]:
t_merged[t_merged['Cluster Labels'] == 0].head(10)


Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,M1B,Scarborough,"Malvern, Rouge",43.81153,-79.19552,0.0,Zoo Exhibit,Trail,Fast Food Restaurant,Food,Farm,Eastern European Restaurant,Electronics Store,Ethiopian Restaurant,Event Space,Falafel Restaurant
3,M1G,Scarborough,Woburn,43.7682,-79.21761,0.0,Indian Restaurant,Pizza Place,Sandwich Place,Chinese Restaurant,Supplement Shop,Supermarket,Thrift / Vintage Store,Coffee Shop,Department Store,Park
5,M1J,Scarborough,Scarborough Village,43.74309,-79.23526,0.0,Sandwich Place,Ice Cream Shop,Indian Restaurant,Fast Food Restaurant,Coffee Shop,Train Station,Restaurant,Convenience Store,Pizza Place,Curling Ice
6,M1K,Scarborough,"Kennedy Park, Ionview, East Birchmount Park",43.72861,-79.26367,0.0,Coffee Shop,Discount Store,Chinese Restaurant,Pizza Place,Rental Car Location,Sandwich Place,Bus Line,Grocery Store,Light Rail Station,Department Store
8,M1M,Scarborough,"Cliffside, Cliffcrest, Scarborough Village West",43.7236,-79.23496,0.0,Pharmacy,Ice Cream Shop,Bank,Pizza Place,Sandwich Place,Hardware Store,Liquor Store,Restaurant,Bistro,Coffee Shop
10,M1P,Scarborough,"Dorset Park, Wexford Heights, Scarborough Town...",43.75998,-79.26837,0.0,Indian Restaurant,Brewery,Thrift / Vintage Store,Pharmacy,Pet Store,Park,Restaurant,Coffee Shop,Chinese Restaurant,Electronics Store
11,M1R,Scarborough,"Wexford, Maryvale",43.75071,-79.30056,0.0,Pizza Place,Middle Eastern Restaurant,Burger Joint,Asian Restaurant,Grocery Store,Supermarket,Vietnamese Restaurant,Smoke Shop,Coffee Shop,Breakfast Spot
13,M1T,Scarborough,"Clarks Corners, Tam O'Shanter, Sullivan",43.78473,-79.29937,0.0,Coffee Shop,Fast Food Restaurant,Bank,Pharmacy,Pizza Place,Convenience Store,Gas Station,Vietnamese Restaurant,Chinese Restaurant,Bakery
15,M1W,Scarborough,"Steeles West, L'Amoreaux West",43.80052,-79.32074,0.0,Pizza Place,Chinese Restaurant,Fast Food Restaurant,Grocery Store,Bank,Other Great Outdoors,Sandwich Place,Electronics Store,Nail Salon,Bakery
24,M2R,North York,"Willowdale, Willowdale West",43.77991,-79.44678,0.0,Pizza Place,Coffee Shop,Grocery Store,Pharmacy,Park,Discount Store,Sandwich Place,Shopping Mall,Skating Rink,Butcher


In [182]:
t_merged[t_merged['Cluster Labels'] == 1].head(10)



Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek",43.78564,-79.15871,1.0,Breakfast Spot,Burger Joint,Bar,Park,Zoo Exhibit,Fast Food Restaurant,Ethiopian Restaurant,Event Space,Falafel Restaurant,Farm
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.76575,-79.1752,1.0,Park,Home Service,Convenience Store,Filipino Restaurant,Pizza Place,Restaurant,Grocery Store,Gym / Fitness Center,Gymnastics Gym,Athletics & Sports
9,M1N,Scarborough,"Birch Cliff, Cliffside West",43.69539,-79.26194,1.0,Park,General Entertainment,Gym,Gym Pool,College Stadium,Café,Skating Rink,Zoo Exhibit,Event Space,Eastern European Restaurant
14,M1V,Scarborough,"Milliken, Agincourt North, Steeles East, L'Amo...",43.81781,-79.28024,1.0,Park,Shop & Service,Shopping Mall,Caribbean Restaurant,Hobby Shop,Pizza Place,Discount Store,Pharmacy,Gym,Ethiopian Restaurant
17,M2H,North York,Hillcrest Village,43.80285,-79.35621,1.0,Park,Chinese Restaurant,Bakery,Residential Building (Apartment / Condo),Pharmacy,Fast Food Restaurant,Falafel Restaurant,Dry Cleaner,Dumpling Restaurant,Eastern European Restaurant
19,M2K,North York,Bayview Village,43.78102,-79.3806,1.0,Chinese Restaurant,Trail,Japanese Restaurant,Park,Café,Bank,Dumpling Restaurant,Eastern European Restaurant,Electronics Store,Ethiopian Restaurant
25,M3A,North York,Parkwoods,43.75188,-79.33036,1.0,Bus Stop,Park,Cosmetics Shop,Train Station,Food & Drink Shop,Shopping Mall,Shop & Service,Grocery Store,Road,Fish & Chips Shop
32,M3M,North York,Downsview,43.73355,-79.4968,1.0,Park,Insurance Office,Baseball Field,Moving Target,Zoo Exhibit,Electronics Store,Ethiopian Restaurant,Event Space,Falafel Restaurant,Farm
50,M4W,Downtown Toronto,Rosedale,43.6819,-79.3785,1.0,Park,Playground,Grocery Store,Neighborhood,Candy Store,Athletics & Sports,Trail,Zoo Exhibit,Event Space,Eastern European Restaurant
64,M5P,Central Toronto,"Forest Hill North & West, Forest Hill Road Park",43.69479,-79.4144,1.0,Park,Bank,Sushi Restaurant,Café,Trail,Coffee Shop,Jewelry Store,Middle Eastern Restaurant,Bagel Shop,Japanese Restaurant


In [183]:
t_merged[t_merged['Cluster Labels'] == 2].head(10)



Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
4,M1H,Scarborough,Cedarbrae,43.76969,-79.23944,2.0,Indian Restaurant,Bakery,Bank,Gas Station,Caribbean Restaurant,Athletics & Sports,Hakka Restaurant,Coffee Shop,Thai Restaurant,Lounge
7,M1L,Scarborough,"Golden Mile, Clairlea, Oakridge",43.71406,-79.28412,2.0,Intersection,Bakery,Bus Station,Diner,Bus Line,Convenience Store,Soccer Field,Ice Cream Shop,Beer Store,Coffee Shop
12,M1S,Scarborough,Agincourt,43.79394,-79.26711,2.0,Chinese Restaurant,Shopping Mall,Clothing Store,Latin American Restaurant,Bank,Restaurant,Sushi Restaurant,Supermarket,Motorcycle Shop,Sandwich Place
18,M2J,North York,"Fairview, Henry Farm, Oriole",43.78097,-79.34781,2.0,Clothing Store,Coffee Shop,Bakery,Fast Food Restaurant,Restaurant,Bank,Tea Room,Sandwich Place,Baseball Field,Food Court
21,M2M,North York,"Willowdale, Newtonbrook",43.79135,-79.41356,2.0,Korean Restaurant,Coffee Shop,Middle Eastern Restaurant,Café,Pizza Place,Shopping Mall,Restaurant,Park,Sandwich Place,Grocery Store
22,M2N,North York,"Willowdale, Willowdale East",43.76714,-79.40707,2.0,Coffee Shop,Pizza Place,Ramen Restaurant,Japanese Restaurant,Bank,Sandwich Place,Restaurant,Sushi Restaurant,Korean Restaurant,Fast Food Restaurant
23,M2P,North York,York Mills West,43.74787,-79.40029,2.0,Coffee Shop,Park,Intersection,Restaurant,Gym / Fitness Center,Metro Station,Golf Course,Gym,Food Court,French Restaurant
26,M3B,North York,Don Mills,43.74929,-79.36169,2.0,Coffee Shop,Bank,Athletics & Sports,Diner,Supermarket,Gas Station,Juice Bar,Liquor Store,Japanese Restaurant,Bagel Shop
27,M3C,North York,Don Mills,43.72184,-79.3434,2.0,Coffee Shop,Beer Store,Science Museum,Sporting Goods Shop,Japanese Restaurant,Restaurant,Gym,Bike Shop,Chinese Restaurant,Supermarket
30,M3K,North York,Downsview,43.73903,-79.46732,2.0,Athletics & Sports,Food Court,Sandwich Place,Airport,Park,Soccer Field,Coffee Shop,Falafel Restaurant,Eastern European Restaurant,Electronics Store


In [184]:
t_merged[t_merged['Cluster Labels'] == 3].head(10)


Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
20,M2L,North York,"York Mills, Silver Hills",43.75722,-79.37974,3.0,Intersection,Pool,Park,Farm,Dumpling Restaurant,Eastern European Restaurant,Electronics Store,Ethiopian Restaurant,Event Space,Falafel Restaurant
