# Part 1 : Segmenting and Clustering Neighborhoods in Toronto

# 1. Preparing Data

In [2]:
# importing necessary libraries
import pandas as pd
import numpy as np
import requests
import csv

!pip install lxml
!pip install html5lib
!pip install beautifulsoup4

get_ipython().system(u' pip install beautifulsoup4')

from bs4 import BeautifulSoup


Collecting lxml
[?25l  Downloading https://files.pythonhosted.org/packages/1f/1d/a4485412268b38043a6c0f873245b5d9315c6615bcf44776759a2605dca5/lxml-4.6.3-cp36-cp36m-manylinux1_x86_64.whl (5.5MB)
[K     |████████████████████████████████| 5.5MB 4.9MB/s eta 0:00:01
[?25hInstalling collected packages: lxml
Successfully installed lxml-4.6.3
Collecting beautifulsoup4
[?25l  Downloading https://files.pythonhosted.org/packages/d1/41/e6495bd7d3781cee623ce23ea6ac73282a373088fcd0ddc809a047b18eae/beautifulsoup4-4.9.3-py3-none-any.whl (115kB)
[K     |████████████████████████████████| 122kB 8.4MB/s eta 0:00:01
[?25hCollecting soupsieve>1.2; python_version >= "3.0" (from beautifulsoup4)
  Downloading https://files.pythonhosted.org/packages/36/69/d82d04022f02733bf9a72bc3b96332d360c0c5307096d76f6bb7489f7e57/soupsieve-2.2.1-py3-none-any.whl
Installing collected packages: soupsieve, beautifulsoup4
Successfully installed beautifulsoup4-4.9.3 soupsieve-2.2.1


In [34]:
# Step 1: Sending a HTTP request to a URL
url = "https://en.wikipedia.org/w/index.php?title=List_of_postal_codes_of_Canada:_M&oldid=900271985"
# Make a GET request to fetch the raw HTML content
html_content = requests.get(url).text

In [35]:
# Step 2: Parse the html content
soup = BeautifulSoup(html_content, "lxml")
# print(soup.prettify()) # print the parsed data of html

In [36]:
# extracting the raw table inside the webpage
table = soup.find('table')

Postcode =[]
Borough = []
Neighbourhood = []

# 2. Extracting Data

In [37]:
# extracting a clean form of the table
 for tr_cell in table.find_all('tr'):
        
        counter = 1
        Postcode_var = -1
        Borough_var = -1
        Neighbourhood_var = -1
        
        for td_cell in tr_cell.find_all('td'):
            if counter == 1:
                Postcode_var = td_cell.text
            if counter == 2:
                Borough_var = td_cell.text
                tag_a_Borough = td_cell.find('a')
            if counter == 3:
                Neighbourhood_var = str(td_cell.text).strip()
                tag_a_Neighbourhood = td_cell.find('a')
            
            counter +=1
            
            # Ignore cells with a borough that is Not assigned.
            if (Postcode_var == 'Not assigned' or Borough_var == 'Not assigned' or Neighbourhood_var =='Not assigned'):
                continue
            try:
                if ((tag_a_Borough is None) or (tag_a_Neighbourhood is None)):
                    continue
            except:
                pass
            if (Postcode_var == -1 or Borough_var == -1 or Neighbourhood_var == -1):
                continue
            Postcode.append(Postcode_var)
            Borough.append(Borough_var)
            Neighbourhood.append(Neighbourhood_var)

In [38]:
# integrating codes with more than one neighbour
unique_p = set(Postcode)
print('num of unique Postal codes:', len(unique_p))
Postcode_u = []
Borough_u = []
Neighbourhood_u = []

for postcode_unique_element in unique_p:
    p_var = ''; b_var = ''; n_var = '';
    for postcode_idx, postcode_element in enumerate(Postcode):
        if postcode_unique_element == postcode_element:
            p_var = postcode_element;
            b_var = Borough[postcode_idx]
            if n_var == '':
                n_var = Neighbourhood[postcode_idx]
            else:
                n_var = n_var + ', ' + Neighbourhood[postcode_idx]
                
    Postcode_u.append(p_var)
    Borough_u.append(n_var)
    Neighbourhood_u.append(n_var)

num of unique Postal codes: 77


# Part 2 : Segmenting and Clustering Neighborhoods in Toronto
# 3. Creating Pandas DataFrame

In [42]:
toronto_dict = {'Postcode':Postcode_u, 'Borough':Borough_u, 'Neighbourhood': Neighbourhood_u}


In [43]:
df_toronto = pd.DataFrame.from_dict(toronto_dict)
df_toronto.to_csv('toronto_part1.csv')
df_toronto.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M9A,Islington Avenue,Islington Avenue
1,M4H,Thorncliffe Park,Thorncliffe Park
2,M1B,"Rouge, Malvern","Rouge, Malvern"
3,M9L,Humber Summit,Humber Summit
4,M4Y,Church and Wellesley,Church and Wellesley


In [44]:
df_toronto.shape

(77, 3)

In [45]:
conda install -c conda-forge geocoder

Collecting package metadata (current_repodata.json): done
Solving environment: done

## Package Plan ##

  environment location: /home/jupyterlab/conda/envs/python

  added / updated specs:
    - geocoder


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    certifi-2020.12.5          |   py36h5fab9bb_1         143 KB  conda-forge
    geocoder-1.38.1            |             py_1          53 KB  conda-forge
    openssl-1.1.1j             |       h7f98852_0         2.1 MB  conda-forge
    ratelim-0.1.6              |             py_2           6 KB  conda-forge
    ------------------------------------------------------------
                                           Total:         2.3 MB

The following NEW packages will be INSTALLED:

  geocoder           conda-forge/noarch::geocoder-1.38.1-py_1
  ratelim            conda-forge/noarch::ratelim-0.1.6-py_2

The following packages will be UPDAT

In [46]:
import geocoder


# 4. Loading Data

In [47]:
df_lon_lat = pd.read_csv('Geospatial_Coordinates.csv')
df_lon_lat.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [48]:
df_lon_lat.columns=['Postcode','Latitude','Longitude']
df_lon_lat.head()

Unnamed: 0,Postcode,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [49]:
df = pd.read_csv('toronto_part1.csv',index_col=0)
df.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M9A,Islington Avenue,Islington Avenue
1,M4H,Thorncliffe Park,Thorncliffe Park
2,M1B,"Rouge, Malvern","Rouge, Malvern"
3,M9L,Humber Summit,Humber Summit
4,M4Y,Church and Wellesley,Church and Wellesley


# 5.Extracting Lattitudes and Longtitude

In [50]:
Toronto_df = pd.merge(df,
                 df_lon_lat[['Postcode','Latitude', 'Longitude']],
                 on='Postcode')
Toronto_df

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
0,M9A,Islington Avenue,Islington Avenue,43.667856,-79.532242
1,M4H,Thorncliffe Park,Thorncliffe Park,43.705369,-79.349372
2,M1B,"Rouge, Malvern","Rouge, Malvern",43.806686,-79.194353
3,M9L,Humber Summit,Humber Summit,43.756303,-79.565963
4,M4Y,Church and Wellesley,Church and Wellesley,43.665860,-79.383160
...,...,...,...,...,...
72,M5C,St. James Town,St. James Town,43.651494,-79.375418
73,M1S,Agincourt,Agincourt,43.794200,-79.262029
74,M1P,"Dorset Park, Scarborough Town Centre, Wexford ...","Dorset Park, Scarborough Town Centre, Wexford ...",43.757410,-79.273304
75,M4J,East Toronto,East Toronto,43.685347,-79.338106


- Importing Libraries

In [51]:
# import for plotting
import matplotlib.cm as cm
import matplotlib.colors as colors

# import K-means from clustering state
from sklearn.cluster import KMeans

# map rendering library
import folium

print('Libraries imported')

Libraries imported


In [52]:
!conda install -c conda-forge geopy --yes


Collecting package metadata (current_repodata.json): done
Solving environment: done

## Package Plan ##

  environment location: /home/jupyterlab/conda/envs/python

  added / updated specs:
    - geopy


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    geographiclib-1.50         |             py_0          34 KB  conda-forge
    geopy-2.1.0                |     pyhd3deb0d_0          64 KB  conda-forge
    ------------------------------------------------------------
                                           Total:          98 KB

The following NEW packages will be INSTALLED:

  geographiclib      conda-forge/noarch::geographiclib-1.50-py_0
  geopy              conda-forge/noarch::geopy-2.1.0-pyhd3deb0d_0



Downloading and Extracting Packages
geopy-2.1.0          | 64 KB     | ##################################### | 100% 
geographiclib-1.50   | 34 KB     | ################################

- Convert an address into latitude and longtitude values

In [53]:
from geopy.geocoders import Nominatim

In [55]:
address = 'Toronto, ON'

geolocator = Nominatim (user_agent = 'Toronto')
location = geolocator.geocode(address)
latitude_toronto = location.latitude
longitude_toronto = location.longitude
print('The geographical coordinate of Toronto are {},{}.'.format(latitude_toronto, longitude_toronto))

The geographical coordinate of Toronto are 43.6534817,-79.3839347.


In [57]:
Toronto_df = pd.merge(df,
                 df_lon_lat[['Postcode','Latitude', 'Longitude']],
                 on='Postcode')
Toronto_df.head()

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
0,M9A,Islington Avenue,Islington Avenue,43.667856,-79.532242
1,M4H,Thorncliffe Park,Thorncliffe Park,43.705369,-79.349372
2,M1B,"Rouge, Malvern","Rouge, Malvern",43.806686,-79.194353
3,M9L,Humber Summit,Humber Summit,43.756303,-79.565963
4,M4Y,Church and Wellesley,Church and Wellesley,43.66586,-79.38316


# 6. Map

In [58]:

map_toronto = folium.Map(location=[latitude_toronto, longitude_toronto], zoom_start = 10)

# add markers to map
for lat, lng, borough, Neighbourhood in zip(Toronto_df['Latitude'], Toronto_df['Longitude'], Toronto_df['Borough'], Toronto_df['Neighbourhood']):
    label = '{},{}'.format(Neighbourhood, borough)
    label = folium.Popup(label, parse_html = True)
    folium.CircleMarker(
        [lat, lng],
        radius = 5,
        popup=label,
        color = 'blue',
        fill = True,
        fill_color = '#3186cc',
        fill_opacity = 0.7,
        parse_html = False).add_to(map_toronto)
    
map_toronto

In [59]:
# Foursquare Credentials and version
CLIENT_ID = 'CAGW3IJCNXAVPNDNU2DIRHX50B2OGTMCTI0EPFCSEUHMT5BJ'
CLIENT_SECRET = '5VWK5Z0ACUFVBGOVILYTNVBJ1T4ZPQ4O5XTKVHGY3MVUU3B5'
VERSION = '20190706'

print('Your credentials: ')
print('CLIENT_ID: '+CLIENT_ID)
print('CLIENT_SECRET: '+CLIENT_SECRET)

Your credentials: 
CLIENT_ID: CAGW3IJCNXAVPNDNU2DIRHX50B2OGTMCTI0EPFCSEUHMT5BJ
CLIENT_SECRET: 5VWK5Z0ACUFVBGOVILYTNVBJ1T4ZPQ4O5XTKVHGY3MVUU3B5


In [60]:
# defining radius and limit of venues to get 
redius = 500
LIMIT = 100

In [65]:
def getNearbyVenues(names, latitudes, longitudes, radius = 500):
    venues_list =[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
        
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
        
        # make the GET request
        results = requests.get(url).json()['response']['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name,
            lat,
            lng,
            v['venue']['name'],
            v['venue']['location']['lat'],
            v['venue']['location']['lng'],
            v['venue']['categories'][0]['name']) for v in results])
        
    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighbourhood', 
                                 'Neighbourhood Latitude', 
                                 'Neighbourhood Longitude', 
                                 'Venue',
                                 'Venue Latitude',
                                 'Venue Longitude',
                                 'Venue Category']
    return(nearby_venues)
        
        

In [66]:
toronto_venues = getNearbyVenues(names=Toronto_df['Neighbourhood'],
                                latitudes = Toronto_df['Latitude'],
                                 longitudes = Toronto_df['Longitude']
                                )

Islington Avenue
Thorncliffe Park
Rouge, Malvern
Humber Summit
Church and Wellesley
Weston
Northwood Park, York University
Hillcrest Village
Henry Farm
University of Toronto
Tam O'Shanter
Downsview
Humber Bay, Mimico NE, Old Mill South, The Queensway East, Royal York South East, Sunnylea
Bathurst Manor, Wilson Heights
Markland Wood
CFB Toronto
Toronto Islands, Union Station
Design Exchange, Toronto Dominion Centre
Lawrence Park
Moore Park
Chinatown, Grange Park, Kensington Market
Lawrence Heights, Lawrence Manor
Riverdale
India Bazaar
Cabbagetown, St. James Town
First Canadian Place, Underground city
Runnymede, Swansea
Woodbine Gardens, Parkview Hill
Bayview Village
York Mills
Emery, Humberlea
Upper Rouge
Cliffcrest, Cliffside
Dovercourt Village
Rosedale
Kingsview Village
Downsview West
Exhibition Place, Parkdale Village
Highland Creek, Rouge Hill, Port Union
Beaumond Heights, Jamestown, Mount Olive, Silverstone, South Steeles, Thistletown
High Park
Maryvale, Wexford
Alderwood, Long Br

In [67]:
toronto_venues.head(10)


Unnamed: 0,Neighbourhood,Neighbourhood Latitude,Neighbourhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,Thorncliffe Park,43.705369,-79.349372,Costco,43.707051,-79.348093,Warehouse Store
1,Thorncliffe Park,43.705369,-79.349372,Iqbal Kebab & Sweet Centre,43.705923,-79.351521,Indian Restaurant
2,Thorncliffe Park,43.705369,-79.349372,Fit4Less,43.705689,-79.346018,Gym
3,Thorncliffe Park,43.705369,-79.349372,Bikram Yoga East York,43.70545,-79.351448,Yoga Studio
4,Thorncliffe Park,43.705369,-79.349372,Shoppers Drug Mart,43.70581,-79.347044,Pharmacy
5,Thorncliffe Park,43.705369,-79.349372,Hero Certified Burgers,43.705511,-79.347064,Burger Joint
6,Thorncliffe Park,43.705369,-79.349372,Subway,43.704596,-79.34967,Sandwich Place
7,Thorncliffe Park,43.705369,-79.349372,Iqbal foods,43.705751,-79.352054,Grocery Store
8,Thorncliffe Park,43.705369,-79.349372,Hakka Garden,43.704578,-79.34977,Indian Restaurant
9,Thorncliffe Park,43.705369,-79.349372,Petro-Canada,43.704058,-79.348094,Gas Station


In [68]:
toronto_venues.shape

(1475, 7)

In [70]:
# How many venues were returned for each neighbourhood
toronto_venues.groupby('Neighbourhood').count()

Unnamed: 0_level_0,Neighbourhood Latitude,Neighbourhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
Neighbourhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Agincourt,5,5,5,5,5,5
"Agincourt North, Milliken",4,4,4,4,4,4
"Alderwood, Long Branch",9,9,9,9,9,9
"Bathurst Manor, Wilson Heights",21,21,21,21,21,21
Bayview Village,4,4,4,4,4,4
...,...,...,...,...,...,...
Willowdale West,5,5,5,5,5,5
Woburn,3,3,3,3,3,3
"Woodbine Gardens, Parkview Hill",10,10,10,10,10,10
Woodbine Heights,5,5,5,5,5,5


# Part 3 : Segmenting and Clustering Neighborhoods in Toronto
# 7. Analysing each Neighbourhood

In [71]:
# one hot encoding
toronto_onehot = pd.get_dummies(toronto_venues[['Venue Category']], prefix = '', prefix_sep = '')

# add neighbourhood column back to dataframe
toronto_onehot['Neighbourhood'] = toronto_venues['Neighbourhood']

# move neighbourhood column to the first column
fixed_columns = [toronto_onehot.columns[-1]] + list(toronto_onehot.columns[:-1])
toronto_onehot.head()

Unnamed: 0,Accessories Store,Airport,Airport Food Court,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,Antique Shop,Aquarium,Art Gallery,...,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Vietnamese Restaurant,Warehouse Store,Wine Bar,Wings Joint,Women's Store,Yoga Studio,Neighbourhood
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,Thorncliffe Park
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Thorncliffe Park
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Thorncliffe Park
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,Thorncliffe Park
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Thorncliffe Park


In [72]:
toronto_onehot.shape

(1475, 243)

In [73]:
toronto_grouped = toronto_onehot.groupby('Neighbourhood').mean().reset_index()
toronto_grouped

Unnamed: 0,Neighbourhood,Accessories Store,Airport,Airport Food Court,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,Antique Shop,Aquarium,...,Trail,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Vietnamese Restaurant,Warehouse Store,Wine Bar,Wings Joint,Women's Store,Yoga Studio
0,Agincourt,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,"Agincourt North, Milliken",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,"Alderwood, Long Branch",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,"Bathurst Manor, Wilson Heights",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,Bayview Village,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
68,Willowdale West,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
69,Woburn,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
70,"Woodbine Gardens, Parkview Hill",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
71,Woodbine Heights,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


- Printing each neighbourhood along with the top 5 most common venues


In [74]:
num_top_venues = 5

for hood in toronto_grouped['Neighbourhood']:
    print('----'+hood+'----')
    temp = toronto_grouped[toronto_grouped['Neighbourhood'] == hood].T.reset_index()
    temp.columns = ['venue', 'freq']
    temp = temp.iloc[1:]
    temp['freq'] = temp['freq'].astype(float)
    temp = temp.round({'freq': 2})
    print(temp.sort_values('freq', ascending = False).reset_index(drop=True).head(num_top_venues))
    print('\n')

----Agincourt----
                       venue  freq
0             Breakfast Spot   0.2
1                     Lounge   0.2
2               Skating Rink   0.2
3             Clothing Store   0.2
4  Latin American Restaurant   0.2


----Agincourt North, Milliken----
          venue  freq
0    Playground  0.25
1          Park  0.25
2  Intersection  0.25
3        Bakery  0.25
4   Men's Store  0.00


----Alderwood, Long Branch----
                venue  freq
0         Pizza Place  0.22
1         Coffee Shop  0.11
2        Skating Rink  0.11
3      Sandwich Place  0.11
4  Athletics & Sports  0.11


----Bathurst Manor, Wilson Heights----
           venue  freq
0    Coffee Shop  0.10
1           Bank  0.10
2     Restaurant  0.05
3  Shopping Mall  0.05
4           Park  0.05


----Bayview Village----
                 venue  freq
0                 Café  0.25
1  Japanese Restaurant  0.25
2   Chinese Restaurant  0.25
3                 Bank  0.25
4    Accessories Store  0.00


----Beaumond Heights, 

In [75]:
# function to sort the venues in descending order
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending = False)
    
    return row_categories_sorted.index.values[0: num_top_venues]

In [76]:
num_top_venues = 10
indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighbourhood']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))
        
# create a new dataframe
neighbourhoods_venues_sorted = pd.DataFrame(columns = columns)
neighbourhoods_venues_sorted['Neighbourhood'] = toronto_grouped['Neighbourhood']

for ind in np.arange(toronto_grouped.shape[0]):
    neighbourhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(toronto_grouped.iloc[ind, :], num_top_venues)
    
neighbourhoods_venues_sorted.head()

Unnamed: 0,Neighbourhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Agincourt,Lounge,Skating Rink,Latin American Restaurant,Breakfast Spot,Clothing Store,Discount Store,Falafel Restaurant,Event Space,Escape Room,Electronics Store
1,"Agincourt North, Milliken",Bakery,Park,Playground,Intersection,Donut Shop,Diner,Discount Store,Distribution Center,Dog Run,Doner Restaurant
2,"Alderwood, Long Branch",Pizza Place,Pharmacy,Sandwich Place,Skating Rink,Gym,Coffee Shop,Pub,Athletics & Sports,Diner,Discount Store
3,"Bathurst Manor, Wilson Heights",Coffee Shop,Bank,Mobile Phone Shop,Bridal Shop,Sandwich Place,Diner,Restaurant,Supermarket,Sushi Restaurant,Deli / Bodega
4,Bayview Village,Café,Japanese Restaurant,Chinese Restaurant,Bank,Farmers Market,Falafel Restaurant,Event Space,Escape Room,Electronics Store,Eastern European Restaurant


# 8. Cluster Neighbourhoods

In [78]:
# set number of clusters
kclusters = 5

toronto_grouped_clustering = toronto_grouped.drop('Neighbourhood', 1)

# run k-means clustering 
kmeans = KMeans(n_clusters = kclusters, random_state=0).fit(toronto_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10]

array([4, 0, 4, 4, 4, 4, 4, 4, 4, 4], dtype=int32)

In [80]:
toronto_grouped_clustering.head()


Unnamed: 0,Accessories Store,Airport,Airport Food Court,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,Antique Shop,Aquarium,Art Gallery,...,Trail,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Vietnamese Restaurant,Warehouse Store,Wine Bar,Wings Joint,Women's Store,Yoga Studio
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [81]:
neighbourhoods_venues_sorted.head()


Unnamed: 0,Neighbourhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Agincourt,Lounge,Skating Rink,Latin American Restaurant,Breakfast Spot,Clothing Store,Discount Store,Falafel Restaurant,Event Space,Escape Room,Electronics Store
1,"Agincourt North, Milliken",Bakery,Park,Playground,Intersection,Donut Shop,Diner,Discount Store,Distribution Center,Dog Run,Doner Restaurant
2,"Alderwood, Long Branch",Pizza Place,Pharmacy,Sandwich Place,Skating Rink,Gym,Coffee Shop,Pub,Athletics & Sports,Diner,Discount Store
3,"Bathurst Manor, Wilson Heights",Coffee Shop,Bank,Mobile Phone Shop,Bridal Shop,Sandwich Place,Diner,Restaurant,Supermarket,Sushi Restaurant,Deli / Bodega
4,Bayview Village,Café,Japanese Restaurant,Chinese Restaurant,Bank,Farmers Market,Falafel Restaurant,Event Space,Escape Room,Electronics Store,Eastern European Restaurant


In [83]:
# add clustering labels
neighbourhoods_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

# merge toronto_grouped with toronto_data to add latitude/longitude for each neighborhood
toronto_merged = Toronto_df

# merge toronto_grouped with toronto_data to add latitude/longtitude for each neighborhod
toronto_merged = toronto_merged.join(neighbourhoods_venues_sorted.set_index('Neighbourhood'), on='Neighbourhood')

toronto_merged.head()

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,M9A,Islington Avenue,Islington Avenue,43.667856,-79.532242,,,,,,,,,,,
1,M4H,Thorncliffe Park,Thorncliffe Park,43.705369,-79.349372,4.0,Indian Restaurant,Pharmacy,Sandwich Place,Liquor Store,Burger Joint,Bus Line,Fast Food Restaurant,Restaurant,Discount Store,Supermarket
2,M1B,"Rouge, Malvern","Rouge, Malvern",43.806686,-79.194353,4.0,Fast Food Restaurant,Yoga Studio,Comfort Food Restaurant,Farmers Market,Falafel Restaurant,Event Space,Escape Room,Electronics Store,Eastern European Restaurant,Dumpling Restaurant
3,M9L,Humber Summit,Humber Summit,43.756303,-79.565963,0.0,Pizza Place,Intersection,Comic Shop,Diner,Farmers Market,Falafel Restaurant,Event Space,Escape Room,Electronics Store,Eastern European Restaurant
4,M4Y,Church and Wellesley,Church and Wellesley,43.66586,-79.38316,4.0,Coffee Shop,Japanese Restaurant,Sushi Restaurant,Gay Bar,Restaurant,Yoga Studio,Pub,Café,Men's Store,Mediterranean Restaurant


In [84]:
toronto_merged = toronto_merged.dropna()


In [86]:
# create map
map_clusters = folium.Map(location=[latitude_toronto, longitude_toronto], zoom_start = 11)

# Set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0,1,len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map 
markers_colors = []
for lat, lon, poi, cluster in zip(toronto_merged['Latitude'], toronto_merged['Longitude'], toronto_merged['Neighbourhood'],
                                 toronto_merged['Cluster Labels']):
    label = folium.Popup(str(poi)+'Cluster'+str(cluster), parse_html = True)
    folium.CircleMarker(
        [lat, lon],
        radius = 5,
        popup = label,
        color =rainbow[1],
        fill = True,
        fill_color = rainbow[1],
        fill_opacity=0.7).add_to(map_clusters)
    
map_clusters

In [87]:
toronto_merged.loc[toronto_merged['Cluster Labels'] == 0, toronto_merged.columns[[1] + list(range(5, toronto_merged.shape[1]))]]


Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
3,Humber Summit,0.0,Pizza Place,Intersection,Comic Shop,Diner,Farmers Market,Falafel Restaurant,Event Space,Escape Room,Electronics Store,Eastern European Restaurant
46,Scarborough Village,0.0,Pizza Place,Playground,Yoga Studio,Dim Sum Restaurant,Falafel Restaurant,Event Space,Escape Room,Electronics Store,Eastern European Restaurant,Dumpling Restaurant
48,"Agincourt North, Milliken",0.0,Bakery,Park,Playground,Intersection,Donut Shop,Diner,Discount Store,Distribution Center,Dog Run,Doner Restaurant
59,Victoria Village,0.0,Intersection,Pizza Place,Coffee Shop,Hockey Arena,Portuguese Restaurant,Donut Shop,Diner,Discount Store,Distribution Center,Dog Run
62,"Islington, Princess Gardens, West Deane Park",0.0,Brewery,Bakery,Yoga Studio,Distribution Center,Fast Food Restaurant,Farmers Market,Falafel Restaurant,Event Space,Escape Room,Electronics Store
75,East Toronto,0.0,Intersection,Pizza Place,Park,Convenience Store,Metro Station,Donut Shop,Discount Store,Distribution Center,Dog Run,Doner Restaurant


In [89]:
# cluster 2
toronto_merged.loc[toronto_merged['Cluster Labels'] == 1, toronto_merged.columns[[1] + list(range(5, toronto_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
29,York Mills,1.0,Park,Yoga Studio,Diner,Farmers Market,Falafel Restaurant,Event Space,Escape Room,Electronics Store,Eastern European Restaurant,Dumpling Restaurant
34,Rosedale,1.0,Park,Playground,Trail,Yoga Studio,Donut Shop,Diner,Discount Store,Distribution Center,Dog Run,Doner Restaurant
66,"Newtonbrook, Willowdale",1.0,Park,Yoga Studio,Diner,Farmers Market,Falafel Restaurant,Event Space,Escape Room,Electronics Store,Eastern European Restaurant,Dumpling Restaurant


In [90]:
# cluster 3
toronto_merged.loc[toronto_merged['Cluster Labels'] == 2, toronto_merged.columns[[1] + list(range(5, toronto_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
38,"Highland Creek, Rouge Hill, Port Union",2.0,Bar,Yoga Studio,Discount Store,Fast Food Restaurant,Farmers Market,Falafel Restaurant,Event Space,Escape Room,Electronics Store,Eastern European Restaurant


In [91]:
# cluster 4
toronto_merged.loc[toronto_merged['Cluster Labels'] == 3, toronto_merged.columns[[1] + list(range(5, toronto_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
30,"Emery, Humberlea",3.0,Baseball Field,Yoga Studio,Discount Store,Fast Food Restaurant,Farmers Market,Falafel Restaurant,Event Space,Escape Room,Electronics Store,Eastern European Restaurant


In [92]:
# cluster 5
toronto_merged.loc[toronto_merged['Cluster Labels'] == 4, toronto_merged.columns[[1] + list(range(5, toronto_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
1,Thorncliffe Park,4.0,Indian Restaurant,Pharmacy,Sandwich Place,Liquor Store,Burger Joint,Bus Line,Fast Food Restaurant,Restaurant,Discount Store,Supermarket
2,"Rouge, Malvern",4.0,Fast Food Restaurant,Yoga Studio,Comfort Food Restaurant,Farmers Market,Falafel Restaurant,Event Space,Escape Room,Electronics Store,Eastern European Restaurant,Dumpling Restaurant
4,Church and Wellesley,4.0,Coffee Shop,Japanese Restaurant,Sushi Restaurant,Gay Bar,Restaurant,Yoga Studio,Pub,Café,Men's Store,Mediterranean Restaurant
6,"Northwood Park, York University",4.0,Falafel Restaurant,Massage Studio,Coffee Shop,Miscellaneous Shop,Bar,Caribbean Restaurant,Yoga Studio,Distribution Center,Event Space,Escape Room
7,Hillcrest Village,4.0,Mediterranean Restaurant,Fast Food Restaurant,Dog Run,Golf Course,Pool,Dumpling Restaurant,Distribution Center,Doner Restaurant,Donut Shop,Yoga Studio
...,...,...,...,...,...,...,...,...,...,...,...,...
71,"Morningside, West Hill",4.0,Intersection,Medical Center,Restaurant,Rental Car Location,Mexican Restaurant,Bank,Electronics Store,Breakfast Spot,Eastern European Restaurant,Dumpling Restaurant
72,St. James Town,4.0,Coffee Shop,Café,Cosmetics Shop,Gastropub,Cocktail Bar,Restaurant,Clothing Store,Hotel,Moroccan Restaurant,Department Store
73,Agincourt,4.0,Lounge,Skating Rink,Latin American Restaurant,Breakfast Spot,Clothing Store,Discount Store,Falafel Restaurant,Event Space,Escape Room,Electronics Store
74,"Dorset Park, Scarborough Town Centre, Wexford ...",4.0,Indian Restaurant,Pet Store,Vietnamese Restaurant,Chinese Restaurant,Brewery,Gaming Cafe,Electronics Store,Eastern European Restaurant,Dumpling Restaurant,Dim Sum Restaurant
