### Segmenting and Clustering Neighborhoods in Toronto 

In [2]:
#The purpose of this project is to cluster, visualize, and analyze neighborhoods in Toronto,CA

import pandas as pd #load library for data analysis
import numpy as np #load library for vector data management
import requests #load http library to send requests with python
import matplotlib.pyplot as plt #load a 2D plotting library
import matplotlib.cm as cm #load for built in color maps
import matplotlib.colors as colors #load for color plotting
import time #load time library to delay geocoder requests

#set backend of matplotlib to inline backend
%matplotlib inline 

!conda install -c conda beautifulsoup4 --yes #install the beautifulsoup package for webscraping
from bs4 import BeautifulSoup as bs

!conda install -c conda lxml --yes #install to handle html files

!conda install -c conda-forge geopy --yes #install geocoder for location data
from geopy.geocoders import Nominatim as nm

!conda install -c conda-forge geopandas --yes #install to allow spatial operations on geometric types

!conda install -c conda-forge folium=0.5.0 --yes #install for map rendering
import folium 

from IPython.display import Image 
from IPython.core.display import HTML

from pandas.io.json import json_normalize #load to convert json file to pandas dataframe

from sklearn.cluster import KMeans #load for clustering analysis
from sklearn.datasets.samples_generator import make_blobs #generate gaussian blobs for clustering

print("Packages installed.")

Solving environment: done


  current version: 4.5.11
  latest version: 4.7.12

Please update conda by running

    $ conda update -n base -c defaults conda



## Package Plan ##

  environment location: /home/jupyterlab/conda/envs/python

  added / updated specs: 
    - beautifulsoup4


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    certifi-2019.11.28         |           py36_0         156 KB
    beautifulsoup4-4.8.1       |           py36_0         153 KB
    soupsieve-1.9.5            |           py36_0          61 KB
    openssl-1.1.1d             |       h7b6447c_3         3.7 MB
    ------------------------------------------------------------
                                           Total:         4.1 MB

The following NEW packages will be INSTALLED:

    soupsieve:      1.9.5-py36_0                 

The following packages will be UPDATED:

    beautifulsoup4: 4.6.3-py37_0     

### Acquire the data, clean the data, and explore it
Use the imported tools to acquire the data off of wikipedia, then manipulate and prepare the dataframes to be examined.  

In [3]:
#Assign the website to a variable and request it
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
rq = requests.get(url)

In [4]:
#Use beautifulsoup as bs to webscrape the table
soup = bs(rq.content,'html')
table = soup.find_all('table')[0] 
df = pd.read_html(str(table))
dfa = df[0]
dfa.head(15)

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M6A,North York,Lawrence Heights
6,M6A,North York,Lawrence Manor
7,M7A,Queen's Park,Not assigned
8,M8A,Not assigned,Not assigned
9,M9A,Downtown Toronto,Queen's Park


In [5]:
#Remove rows that don't have a borough assigned
dfb = dfa[dfa.Borough != 'Not assigned'].reset_index(drop=True)
dfb.head(15)

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Harbourfront
3,M6A,North York,Lawrence Heights
4,M6A,North York,Lawrence Manor
5,M7A,Queen's Park,Not assigned
6,M9A,Downtown Toronto,Queen's Park
7,M1B,Scarborough,Rouge
8,M1B,Scarborough,Malvern
9,M3B,North York,Don Mills North


In [6]:
#Assign the borough to the cells without a neighborhood
dfc = pd.DataFrame(dfb)
dfc['Neighbourhood'] = np.where(dfc['Neighbourhood'] == 'Not assigned', dfc['Borough'], dfc['Neighbourhood'])
dfc.head(15)

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Harbourfront
3,M6A,North York,Lawrence Heights
4,M6A,North York,Lawrence Manor
5,M7A,Queen's Park,Queen's Park
6,M9A,Downtown Toronto,Queen's Park
7,M1B,Scarborough,Rouge
8,M1B,Scarborough,Malvern
9,M3B,North York,Don Mills North


In [7]:
#Find the shape of the dataframe
dfc.shape

(210, 3)

In [8]:
#Use the geocoder to load location data into a dataframe
locator = nm(user_agent='Toronto_Geocoder')

dflat=[]
dflong=[]
BACKOFF_TIME=3

for i in dfc['Neighbourhood']:
    location = locator.geocode('{}, Toronto, Ontario'.format(i))
    if location == None:
        lat = pd.DataFrame({'Latitude': ['Nan']})  
        long = pd.DataFrame({'Longitude': ['Nan']})
        dflat.append(lat)
        dflong.append(long)
    else:
        lat = pd.DataFrame({'Latitude': [location.latitude]})  
        long = pd.DataFrame({'Longitude': [location.longitude]})
        dflat.append(lat)
        dflong.append(long)
        time.sleep(BACKOFF_TIME * 1)

dflat = pd.concat(dflat, axis=0)
dflong = pd.concat(dflong, axis=0)

In [9]:
dflong2 = dflong.reset_index(drop=True)
dflat2 = dflat.reset_index(drop=True)

In [10]:
#Join the dataframes along the column axis
dfd = pd.concat([dfc, dflat2, dflong2], axis=1)
dfd

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.7588,-79.3202
1,M4A,North York,Victoria Village,43.7327,-79.3112
2,M5A,Downtown Toronto,Harbourfront,43.6401,-79.3801
3,M6A,North York,Lawrence Heights,43.7228,-79.4509
4,M6A,North York,Lawrence Manor,43.7221,-79.4375
...,...,...,...,...,...
205,M8Z,Etobicoke,Kingsway Park South West,43.6504,-79.5
206,M8Z,Etobicoke,Mimico NW,43.6167,-79.4968
207,M8Z,Etobicoke,The Queensway West,43.6236,-79.5148
208,M8Z,Etobicoke,Royal York South West,43.6482,-79.5113


In [11]:
#Clean table by removing any rows with 'Nan' location
dfe = dfd[dfd.Latitude != 'Nan'].reset_index(drop=True)
dfe.shape

(200, 5)

In [12]:
#Print the number of boroughs and neighborhoods
print('The dataframe has {} boroughs and {} neighborhoods.'.format(
        len(dfe['Borough'].unique()),
        dfe.shape[0]))


The dataframe has 10 boroughs and 200 neighborhoods.


In [13]:
#Find the borough with the most neighborhoods for analysis
dff = pd.DataFrame(dfe.Borough.value_counts().reset_index())
dff.columns = ['Borough', 'Count']
print(dff)

            Borough  Count
0         Etobicoke     42
1        North York     37
2       Scarborough     37
3  Downtown Toronto     35
4   Central Toronto     17
5      West Toronto     13
6      East Toronto      6
7              York      6
8         East York      6
9      Queen's Park      1


In [14]:
#Create a dataframe for Etobicoke, Toronto
etobicoke_data = dfe[dfe['Borough'] == 'Etobicoke'].reset_index(drop=True)
etobicoke_data

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
0,M9B,Etobicoke,Cloverdale,43.6336,-79.5497
1,M9B,Etobicoke,Islington,43.6453,-79.5249
2,M9B,Etobicoke,Martin Grove,43.6449,-79.3818
3,M9B,Etobicoke,Princess Gardens,43.6405,-79.3912
4,M9B,Etobicoke,West Deane Park,43.6632,-79.5686
5,M9C,Etobicoke,Bloordale Gardens,43.6353,-79.5637
6,M9C,Etobicoke,Eringate,43.6623,-79.5765
7,M9C,Etobicoke,Markland Wood,43.6312,-79.5854
8,M9C,Etobicoke,Old Burnhamthorpe,43.6394,-79.5844
9,M9P,Etobicoke,Westmount,43.6936,-79.521


In [15]:
#Find the coordinates of Etobicoke
etobicoke_address = 'Etobicoke, Toronto, Ontario'

etobicoke_geo = nm(user_agent="etobicoke_explorer")
etobicoke_loc = etobicoke_geo.geocode(etobicoke_address)
etobicoke_lat = etobicoke_loc.latitude
etobicoke_long = etobicoke_loc.longitude
print('The geograpical coordinates of Etobicoke are {}, {}.'.format(etobicoke_lat, etobicoke_long))

The geograpical coordinates of Etobicoke are 43.67145915, -79.5524920661167.


In [16]:
# Create map of Etobicoke, Toronto using latitude and longitude values
map_etobicoke = folium.Map(location=[etobicoke_lat, etobicoke_long], zoom_start=10)

# Place neighborhood markers on the map
for et_lat, et_lng, label in zip(etobicoke_data['Latitude'], etobicoke_data['Longitude'], etobicoke_data['Neighbourhood']):
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [et_lat, et_lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_etobicoke)  
    
map_etobicoke

In [17]:
#Load Foursquare credentials to search nearby venues

In [18]:
#Hide this cell for security before posting
CLIENT_ID = '2PXCN5AAYZUFABTQUCEZMWDK3H4FF4KLPNKDOBPJIV3F4CNL' # your Foursquare ID
CLIENT_SECRET = '54GR42ZLZNHVREVCT3LKTGVQ3CL41TUWFVEEKDDP0TPSXBBB' # your Foursquare Secret
VERSION = '20180605' # Foursquare API version

print('Your credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET:' + CLIENT_SECRET)

Your credentails:
CLIENT_ID: 2PXCN5AAYZUFABTQUCEZMWDK3H4FF4KLPNKDOBPJIV3F4CNL
CLIENT_SECRET:54GR42ZLZNHVREVCT3LKTGVQ3CL41TUWFVEEKDDP0TPSXBBB


In [19]:
#Find the location of the first neighborhood in the etobicoke dataframe
neighbourhood_latitude = etobicoke_data.loc[0, 'Latitude'] # neighbourhood latitude value
neighbourhood_longitude = etobicoke_data.loc[0, 'Longitude'] # neighbourhood longitude value

neighbourhood_name = etobicoke_data.loc[0, 'Neighbourhood'] # neighbourhood name

print('Latitude and longitude values of {} are {}, {}.'.format(neighbourhood_name, 
                                                               neighbourhood_latitude, 
                                                               neighbourhood_longitude))

Latitude and longitude values of Cloverdale are 43.633637, -79.5497447.


In [20]:
#Find the top 100 venues in a 500m radius
radius = 500
LIMIT = 100

urlb = 'https://api.foursquare.com/v2/venues/search?client_id={}&client_secret={}&ll={},{}&v={}&radius={}&limit={}'.format(CLIENT_ID, CLIENT_SECRET, neighbourhood_latitude, neighbourhood_longitude, VERSION, radius, LIMIT)
urlb

'https://api.foursquare.com/v2/venues/search?client_id=2PXCN5AAYZUFABTQUCEZMWDK3H4FF4KLPNKDOBPJIV3F4CNL&client_secret=54GR42ZLZNHVREVCT3LKTGVQ3CL41TUWFVEEKDDP0TPSXBBB&ll=43.633637,-79.5497447&v=20180605&radius=500&limit=100'

In [21]:
#Send the get request and observe the results
results = requests.get(urlb).json()

In [22]:
venues = results['response']['venues']

# Use the json_normalize function to flatten the JSON file
nearby_venues = json_normalize(venues) 

# Filter the columns for the nearby venues
filtered_columns = ['name', 'categories', 'location.lat', 'location.lng']
nearby_venues = nearby_venues.loc[:, filtered_columns]

# This is a function that extracts the category of the venue
def get_category_type(row):
    try:
        categories_list = row['categories']
    except:
        categories_list = row['venue.categories']
        
    if len(categories_list) == 0:
        return None
    else:
        return categories_list[0]['name']

# Filter the category for each row
nearby_venues['categories'] = nearby_venues.apply(get_category_type, axis=1)

# Clean the columns and check the table for proper titles
nearby_venues.columns = [col.split(".")[-1] for col in nearby_venues.columns]

nearby_venues.head()

Unnamed: 0,name,categories,lat,lng
0,Silverhill Park,Baseball Field,43.63394,-79.550237
1,TTC Bus 111 East Mall,Bus Line,43.63738,-79.536241
2,Enterprise Rent-A-Car,Rental Car Location,43.630257,-79.548677
3,Yellow Cup Cafe,Café,43.631744,-79.55223
4,Etobicoke Mews,Shopping Mall,43.632578,-79.551434


In [23]:
#Find the number of venues returned for Cloverdale
print('{} venues were returned by Foursquare.'.format(nearby_venues.shape[0]))

67 venues were returned by Foursquare.


### Gather the venue data for each neighborhood in Etobicoke
As the borough with the most neighborhoods, Etobicoke is a good focus point for data gathering.

In [24]:
#Create a function to gather venue data for each neighborhood in Etobicoke into a dataframe
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    
    venues_neigh = []
    venues_neigh_lat = []
    venues_neigh_lng = []
    venues_ven = []
    venues_lat = []
    venues_lng = []
    venues_cat = []
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
            
        # Call using the Foursquare api
        url = 'https://api.foursquare.com/v2/venues/search?client_id={}&client_secret={}&ll={},{}&v={}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            lat, 
            lng, 
            VERSION,
            radius, 
            LIMIT)
         
        # Create the get request for the url
        results = requests.get(url).json()
        
        venues = results['response']['venues']

        # Flatten the json file
        close_venues = json_normalize(venues) 

        # Filter the coloumns
        filtered_columns = ['name', 'categories', 'location.lat', 'location.lng']
        close_venues = close_venues.loc[:, filtered_columns]

        # This function extracts the category of the venues
        def get_category_type(row):
            try:
                categories_list = row['categories']
            except:
                categories_list = row['venue.categories']

            if len(categories_list) == 0:
                return None
            else:
                return categories_list[0]['name']

        # Filter the categories of the rows
        close_venues['categories'] = close_venues.apply(get_category_type, axis=1)

        # Clean the columns for appending
        close_venues.columns = [col.split(".")[-1] for col in close_venues.columns]
        
        close_venues_pd = pd.DataFrame(close_venues)
        
        #This loop fills dataframes with venue name, location, and category
        for v in close_venues_pd:
            ven = pd.DataFrame({'Venue': close_venues_pd['name']})
            ven_lat = pd.DataFrame({'Venue Latitude': close_venues_pd['lat']})
            ven_lng = pd.DataFrame({'Venue Longitude': close_venues_pd['lng']})
            ven_cat = pd.DataFrame({'Venue Category': close_venues_pd['categories']})
        
        count_venues = len(ven) #Find count of venues, then fill name, lat, lng *count_venues times
        
        neigh = pd.DataFrame({'Neighbourhood': [name]*count_venues})
        neigh_lat = pd.DataFrame({'Neighbourhood Latitude': [lat]*count_venues})
        neigh_lng = pd.DataFrame({'Neighbourhood Longitude': [lng]*count_venues})

        # Return only the needed information for each nearby venue
        venues_neigh.append(neigh)
        venues_neigh_lat.append(neigh_lat)
        venues_neigh_lng.append(neigh_lng)
        venues_ven.append(ven)
        venues_lat.append(ven_lat)
        venues_lng.append(ven_lng)
        venues_cat.append(ven_cat)
    
    #Concatenate each dataframe, reset the indices, and join the tables along the column axis
    venues_neigh = pd.concat(venues_neigh, axis=0)
    venues_neigh_lat = pd.concat(venues_neigh_lat, axis=0)
    venues_neigh_lng = pd.concat(venues_neigh_lng, axis=0)
    venues_ven = pd.concat(venues_ven, axis=0)
    venues_lat = pd.concat(venues_lat, axis=0)
    venues_lng = pd.concat(venues_lng, axis=0)
    venues_cat = pd.concat(venues_cat, axis=0)

    venues_neigh = venues_neigh.reset_index(drop=True)
    venues_neigh_lat = venues_neigh_lat.reset_index(drop=True)
    venues_neigh_lng = venues_neigh_lng.reset_index(drop=True)
    venues_ven = venues_ven.reset_index(drop=True)
    venues_lat = venues_lat.reset_index(drop=True)
    venues_lng = venues_lng.reset_index(drop=True)
    venues_cat = venues_cat.reset_index(drop=True)
    
    venues_nearby = pd.concat((venues_neigh, venues_neigh_lat, venues_neigh_lng, venues_ven, 
                               venues_lat, venues_lng, venues_cat), axis=1)
    return(venues_nearby)
           

In [25]:
#Run the getNearbyVenues function for each neighborhood and create a new dataframe
etobicoke_venues = getNearbyVenues(names=etobicoke_data['Neighbourhood'],
                                   latitudes=etobicoke_data['Latitude'],
                                   longitudes=etobicoke_data['Longitude']
                                   )

Cloverdale
Islington
Martin Grove
Princess Gardens
West Deane Park
Bloordale Gardens
Eringate
Markland Wood
Old Burnhamthorpe
Westmount
Kingsview Village
Martin Grove Gardens
Richview Gardens
St. Phillips
Mimico South
New Toronto
Albion Gardens
Humbergate
Jamestown
Mount Olive
Silverstone
South Steeles
Thistletown
Alderwood
Long Branch
Northwest
The Kingsway
Montgomery Road
Old Mill North
Humber Bay
King's Mill Park
Kingsway Park South East
Mimico NE
Old Mill South
The Queensway East
Royal York South East
Sunnylea
Kingsway Park South West
Mimico NW
The Queensway West
Royal York South West
South of Bloor


In [26]:
etobicoke_venues.head()

Unnamed: 0,Neighbourhood,Neighbourhood Latitude,Neighbourhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,Cloverdale,43.633637,-79.549745,Silverhill Park,43.63394,-79.550237,Baseball Field
1,Cloverdale,43.633637,-79.549745,TTC Bus 111 East Mall,43.63738,-79.536241,Bus Line
2,Cloverdale,43.633637,-79.549745,Enterprise Rent-A-Car,43.630257,-79.548677,Rental Car Location
3,Cloverdale,43.633637,-79.549745,Yellow Cup Cafe,43.631744,-79.55223,Café
4,Cloverdale,43.633637,-79.549745,Etobicoke Mews,43.632578,-79.551434,Shopping Mall


In [27]:
print(etobicoke_venues.shape)

(4062, 7)


In [28]:
etobicoke_venues

Unnamed: 0,Neighbourhood,Neighbourhood Latitude,Neighbourhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,Cloverdale,43.633637,-79.549745,Silverhill Park,43.633940,-79.550237,Baseball Field
1,Cloverdale,43.633637,-79.549745,TTC Bus 111 East Mall,43.637380,-79.536241,Bus Line
2,Cloverdale,43.633637,-79.549745,Enterprise Rent-A-Car,43.630257,-79.548677,Rental Car Location
3,Cloverdale,43.633637,-79.549745,Yellow Cup Cafe,43.631744,-79.552230,Café
4,Cloverdale,43.633637,-79.549745,Etobicoke Mews,43.632578,-79.551434,Shopping Mall
...,...,...,...,...,...,...,...
4057,South of Bloor,43.667662,-79.394698,Alliance for Audited Media,43.668396,-79.393610,Non-Profit
4058,South of Bloor,43.667662,-79.394698,Canadian Scholars Press,43.668718,-79.394944,Office
4059,South of Bloor,43.667662,-79.394698,Cornerstone Academic College,43.668647,-79.394298,School
4060,South of Bloor,43.667662,-79.394698,TRIEC,43.668789,-79.394334,Office


In [29]:
#Group the table by neighborhood and find the count of venues for each
etobicoke_venues.groupby('Neighbourhood').count()

Unnamed: 0_level_0,Neighbourhood Latitude,Neighbourhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
Neighbourhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Albion Gardens,56,56,56,56,56,51
Alderwood,100,100,100,100,100,93
Bloordale Gardens,100,100,100,100,100,85
Cloverdale,67,67,67,67,67,64
Eringate,100,100,100,100,100,89
Humber Bay,100,100,100,100,100,87
Humbergate,100,100,100,100,100,97
Islington,79,79,79,79,79,75
Jamestown,100,100,100,100,100,89
King's Mill Park,83,83,83,83,83,82


In [30]:
print('There are {} unique categories.'.format(len(etobicoke_venues['Venue Category'].unique())))

There are 379 unique categories.


### Use Ecoding to Analyze the Etobicoke Neighborhoods
Onehot encoding allows for the analysis of categorical features by encoding them into numerical values.  This will generate information about venue frequency.  

In [31]:
# Encode categorical features into numerical with one hot encoding
etobicoke_onehot = pd.get_dummies(etobicoke_venues['Venue Category'], prefix="", prefix_sep="")

# Add the neighborhood column back onto dataframe
etobicoke_onehot['Neighbourhood'] = etobicoke_venues['Neighbourhood'] 

# Shift the neighborhood column to the first column
fixed_columns = [etobicoke_onehot.columns[-1]] + list(etobicoke_onehot.columns[:-1])
etobicoke_onehot = etobicoke_onehot[fixed_columns]

etobicoke_onehot.head()

Unnamed: 0,Neighbourhood,Accessories Store,Acupuncturist,Adult Boutique,Adult Education Center,Advertising Agency,Afghan Restaurant,African Restaurant,Airport,Airport Lounge,...,Volleyball Court,Warehouse,Warehouse Store,Water Park,Wedding Hall,Wine Bar,Winery,Wings Joint,Women's Store,Yoga Studio
0,Cloverdale,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,Cloverdale,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,Cloverdale,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,Cloverdale,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,Cloverdale,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [32]:
etobicoke_onehot.shape

(4062, 379)

In [33]:
etobicoke_grouped = etobicoke_onehot.groupby('Neighbourhood').mean().reset_index()
etobicoke_grouped

Unnamed: 0,Neighbourhood,Accessories Store,Acupuncturist,Adult Boutique,Adult Education Center,Advertising Agency,Afghan Restaurant,African Restaurant,Airport,Airport Lounge,...,Volleyball Court,Warehouse,Warehouse Store,Water Park,Wedding Hall,Wine Bar,Winery,Wings Joint,Women's Store,Yoga Studio
0,Albion Gardens,0.017857,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,Alderwood,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.01,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,Bloordale Gardens,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,Cloverdale,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.014925,0.0
4,Eringate,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.01
5,Humber Bay,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.01,0.0,0.0,0.0,0.0,0.0,0.0
6,Humbergate,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.01
7,Islington,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,Jamestown,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,King's Mill Park,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.012048,0.0,0.0,0.0,0.0,0.0


In [34]:
etobicoke_grouped.shape

(42, 379)

In [35]:
#Print the top 5 venues for each neighborhood
num_top_venues = 5

for hood in etobicoke_grouped['Neighbourhood']:
    print("----"+hood+"----")
    temp = etobicoke_grouped[etobicoke_grouped['Neighbourhood'] == hood].T.reset_index()
    temp.columns = ['venue','freq']
    temp = temp.iloc[1:]
    temp['freq'] = temp['freq'].astype(float)
    temp = temp.round({'freq': 2})
    print(temp.sort_values('freq', ascending=False).reset_index(drop=True).head(num_top_venues))
    print('\n')

----Albion Gardens----
                  venue  freq
0              Pharmacy  0.05
1     Convenience Store  0.04
2  Fast Food Restaurant  0.04
3  Caribbean Restaurant  0.04
4      Dentist's Office  0.04


----Alderwood----
                venue  freq
0              Office  0.07
1         Gas Station  0.04
2  Salon / Barbershop  0.03
3   Convenience Store  0.03
4          Playground  0.03


----Bloordale Gardens----
                                      venue  freq
0                         Convenience Store  0.06
1                                    Office  0.06
2                                      Park  0.04
3  Residential Building (Apartment / Condo)  0.04
4                          Dentist's Office  0.04


----Cloverdale----
                 venue  freq
0               Office  0.06
1             Bus Line  0.04
2  Arts & Crafts Store  0.03
3             Building  0.03
4   Miscellaneous Shop  0.03


----Eringate----
                                      venue  freq
0                

In [36]:
#This function sorts the venues in the dataframe returns them in descending order
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

In [37]:
#Create a new dataframe to display the top 10 venues in the Etobicoke neighborhoods 
num_top_venues = 10

indicators = ['st', 'nd', 'rd']

#Create columns by the number of top venues
columns = ['Neighbourhood']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

#Create a new dataframe for sorting
neighbourhoods_venues_sorted = pd.DataFrame(columns=columns)
neighbourhoods_venues_sorted['Neighbourhood'] = etobicoke_grouped['Neighbourhood']

for ind in np.arange(etobicoke_grouped.shape[0]):
    neighbourhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(etobicoke_grouped.iloc[ind, :], num_top_venues)

neighbourhoods_venues_sorted.head(10)

Unnamed: 0,Neighbourhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Albion Gardens,Pharmacy,Electronics Store,Library,Bus Line,Indian Restaurant,Caribbean Restaurant,Fast Food Restaurant,Dentist's Office,Convenience Store,Nightclub
1,Alderwood,Office,Gas Station,Convenience Store,Dentist's Office,Conference Room,Salon / Barbershop,Medical Center,Playground,Bank,Café
2,Bloordale Gardens,Convenience Store,Office,Park,Residential Building (Apartment / Condo),Dentist's Office,Coffee Shop,Church,School,Hospital,Bank
3,Cloverdale,Office,Bus Line,Massage Studio,Auto Dealership,Building,Doctor's Office,Miscellaneous Shop,Arts & Crafts Store,Gas Station,Dentist's Office
4,Eringate,Office,Residential Building (Apartment / Condo),Pizza Place,Convenience Store,Playground,Building,Athletics & Sports,Electronics Store,Bank,Park
5,Humber Bay,Residential Building (Apartment / Condo),Park,Bus Line,Office,Doctor's Office,Convenience Store,Harbor / Marina,Elementary School,School,Grocery Store
6,Humbergate,College Classroom,College Academic Building,College Administrative Building,College Communications Building,College Lab,Coffee Shop,General College & University,College Residence Hall,Fast Food Restaurant,Bus Station
7,Islington,Bus Line,Doctor's Office,Office,Parking,Bus Stop,Building,Bakery,Italian Restaurant,Bank,Salon / Barbershop
8,Jamestown,Office,Building,Government Building,Courthouse,Tech Startup,Paper / Office Supplies Store,Deli / Bodega,Pub,Coffee Shop,Residential Building (Apartment / Condo)
9,King's Mill Park,Residential Building (Apartment / Condo),Doctor's Office,Office,Building,Lounge,Salon / Barbershop,Park,Conference Room,Spa,Meeting Room


## Clustering the Neighborhoods
Use K-Means clustering to form the neighborhoods into 5 clusters.  The clusters will then be examined for any defining characteristics.  

In [38]:
#Enter the number of clusters
kclusters = 5

etobicoke_grouped_clustering = etobicoke_grouped.drop('Neighbourhood', 1)

#Execute clustering with Kmeans
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(etobicoke_grouped_clustering)

#Review the cluster labels for each row in the dataframe
kmeans.labels_[0:10] 

array([0, 0, 0, 0, 4, 0, 1, 3, 4, 2], dtype=int32)

In [39]:
#Add clustering labels to the dataframe for analysis
neighbourhoods_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

etobicoke_merged = etobicoke_data

#Merge the dataframes to add latitude/longitude for each neighbourhood
etobicoke_merged = etobicoke_merged.join(neighbourhoods_venues_sorted.set_index('Neighbourhood'), on='Neighbourhood')

etobicoke_merged.head() 

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,M9B,Etobicoke,Cloverdale,43.6336,-79.5497,0,Office,Bus Line,Massage Studio,Auto Dealership,Building,Doctor's Office,Miscellaneous Shop,Arts & Crafts Store,Gas Station,Dentist's Office
1,M9B,Etobicoke,Islington,43.6453,-79.5249,3,Bus Line,Doctor's Office,Office,Parking,Bus Stop,Building,Bakery,Italian Restaurant,Bank,Salon / Barbershop
2,M9B,Etobicoke,Martin Grove,43.6449,-79.3818,0,Platform,Coffee Shop,Office,Event Space,Fast Food Restaurant,Travel Lounge,Convenience Store,Cocktail Bar,Spa,Gift Shop
3,M9B,Etobicoke,Princess Gardens,43.6405,-79.3912,0,Residential Building (Apartment / Condo),Baseball Stadium,Office,Gym / Fitness Center,General Entertainment,Gym,Coffee Shop,Rental Car Location,Chinese Restaurant,Event Space
4,M9B,Etobicoke,West Deane Park,43.6632,-79.5686,0,Residential Building (Apartment / Condo),Office,Convenience Store,Church,Park,Salon / Barbershop,Building,Intersection,Pizza Place,Gym


In [40]:
#Create a cluster map of Etobicoke neighborhood venues
map_clusters = folium.Map(location=[etobicoke_lat, etobicoke_long], zoom_start=11)

#Assigne the color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

#Add markers onto the map
markers_colors = []
for lat, lon, poi, cluster in zip(etobicoke_merged['Latitude'], etobicoke_merged['Longitude'], etobicoke_merged['Neighbourhood'], etobicoke_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters

### Review of Clusters
Each of the 5 clusters are examined to assess the defining characteristics of the venue categories,
based on which an appropriate title will be given to the cluster.

#### Residential Cluster 1

In [50]:
#1st Cluster(0)
clust1 = etobicoke_merged.loc[etobicoke_merged['Cluster Labels'] == 0, etobicoke_merged.columns[[1] + list(range(5, etobicoke_merged.shape[1]))]]
clust1

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Etobicoke,0,Office,Bus Line,Massage Studio,Auto Dealership,Building,Doctor's Office,Miscellaneous Shop,Arts & Crafts Store,Gas Station,Dentist's Office
2,Etobicoke,0,Platform,Coffee Shop,Office,Event Space,Fast Food Restaurant,Travel Lounge,Convenience Store,Cocktail Bar,Spa,Gift Shop
3,Etobicoke,0,Residential Building (Apartment / Condo),Baseball Stadium,Office,Gym / Fitness Center,General Entertainment,Gym,Coffee Shop,Rental Car Location,Chinese Restaurant,Event Space
4,Etobicoke,0,Residential Building (Apartment / Condo),Office,Convenience Store,Church,Park,Salon / Barbershop,Building,Intersection,Pizza Place,Gym
5,Etobicoke,0,Convenience Store,Office,Park,Residential Building (Apartment / Condo),Dentist's Office,Coffee Shop,Church,School,Hospital,Bank
7,Etobicoke,0,Residential Building (Apartment / Condo),Office,Tennis Court,Park,Bank,Pharmacy,High School,Indian Restaurant,Bus Line,Art Gallery
8,Etobicoke,0,Park,Church,Residential Building (Apartment / Condo),Bus Line,Bank,Pool,Baseball Field,Art Gallery,Bus Stop,Dog Run
11,Etobicoke,0,Bank,Medical Center,Bus Stop,Building,Bus Line,Park,Pizza Place,Salon / Barbershop,Residential Building (Apartment / Condo),Office
12,Etobicoke,0,Bus Line,Bus Stop,Medical Center,Park,Building,Gym,High School,Residential Building (Apartment / Condo),Dentist's Office,Church
14,Etobicoke,0,Church,Diner,Salon / Barbershop,Automotive Shop,Office,Bar,Convenience Store,School,Beach,Building


In [42]:
#Examine cluster for most frequent 
clust1['1st Most Common Venue'].value_counts()

Residential Building (Apartment / Condo)    5
Church                                      3
Office                                      3
Platform                                    1
Medical Center                              1
Chinese Restaurant                          1
Pharmacy                                    1
Gas Station                                 1
Bus Line                                    1
Bank                                        1
Park                                        1
Exhibit                                     1
Salon / Barbershop                          1
Convenience Store                           1
Name: 1st Most Common Venue, dtype: int64

#### College Cluster 2

In [43]:
#2nd Cluster(1)
clust2 = etobicoke_merged.loc[etobicoke_merged['Cluster Labels'] == 1, etobicoke_merged.columns[[1] + list(range(5, etobicoke_merged.shape[1]))]]
clust2

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
17,Etobicoke,1,College Classroom,College Academic Building,College Administrative Building,College Communications Building,College Lab,Coffee Shop,General College & University,College Residence Hall,Fast Food Restaurant,Bus Station


In [44]:
#Examine cluster for most frequent, most common venues
clust2['1st Most Common Venue'].value_counts()

College Classroom    1
Name: 1st Most Common Venue, dtype: int64

#### Shopping and Residential Cluster 3

In [45]:
#3rd Cluster(2)
clust3 = etobicoke_merged.loc[etobicoke_merged['Cluster Labels'] == 2, etobicoke_merged.columns[[1] + list(range(5, etobicoke_merged.shape[1]))]]
clust3

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
9,Etobicoke,2,Pizza Place,Park,Bank,Salon / Barbershop,Gas Station,Caribbean Restaurant,Doctor's Office,Building,Pharmacy,Office
15,Etobicoke,2,Music Venue,Bank,Residential Building (Apartment / Condo),Hardware Store,Café,Salon / Barbershop,Convenience Store,Mexican Restaurant,Sushi Restaurant,Bar
22,Etobicoke,2,Indian Restaurant,Pizza Place,Caribbean Restaurant,Electronics Store,Doctor's Office,Shopping Mall,Jewelry Store,Salon / Barbershop,Spa,Bank
25,Etobicoke,2,Art Gallery,Bar,Boutique,Furniture / Home Store,Coffee Shop,New American Restaurant,Nail Salon,Office,Thai Restaurant,Ice Cream Shop
26,Etobicoke,2,Salon / Barbershop,Bank,Dentist's Office,Italian Restaurant,Office,Mobile Phone Shop,Pizza Place,Metro Station,Coffee Shop,Optical Shop
28,Etobicoke,2,Residential Building (Apartment / Condo),Salon / Barbershop,Park,Lounge,Conference Room,Other Great Outdoors,Dentist's Office,Doctor's Office,Office,Bus Line
30,Etobicoke,2,Residential Building (Apartment / Condo),Doctor's Office,Office,Building,Lounge,Salon / Barbershop,Park,Conference Room,Spa,Meeting Room
31,Etobicoke,2,Salon / Barbershop,Bakery,Doctor's Office,Bank,Dentist's Office,Miscellaneous Shop,Park,Medical Center,Other Great Outdoors,Residential Building (Apartment / Condo)
33,Etobicoke,2,Residential Building (Apartment / Condo),Park,Dentist's Office,Doctor's Office,Conference Room,Other Great Outdoors,Salon / Barbershop,Lounge,Optical Shop,Bank
35,Etobicoke,2,Salon / Barbershop,Bank,Italian Restaurant,Dentist's Office,Optical Shop,Shoe Store,Office,Pizza Place,Nail Salon,Grocery Store


In [51]:
clust3['1st Most Common Venue'].value_counts()

Salon / Barbershop                          5
Residential Building (Apartment / Condo)    3
Indian Restaurant                           1
Pizza Place                                 1
Art Gallery                                 1
Music Venue                                 1
Name: 1st Most Common Venue, dtype: int64

#### Business and Transport Cluster 4

In [46]:
#4th Cluster(3)
clust4 = etobicoke_merged.loc[etobicoke_merged['Cluster Labels'] == 3, etobicoke_merged.columns[[1] + list(range(5, etobicoke_merged.shape[1]))]]
clust4

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
1,Etobicoke,3,Bus Line,Doctor's Office,Office,Parking,Bus Stop,Building,Bakery,Italian Restaurant,Bank,Salon / Barbershop
13,Etobicoke,3,Bus Line,Office,Residential Building (Apartment / Condo),Doctor's Office,Bank,Dentist's Office,Building,Grocery Store,Convenience Store,Pub


#### Business Cluster 5

In [47]:
#5th Cluster(4)
clust5 = etobicoke_merged.loc[etobicoke_merged['Cluster Labels'] == 4, etobicoke_merged.columns[[1] + list(range(5, etobicoke_merged.shape[1]))]]
clust5

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
6,Etobicoke,4,Office,Residential Building (Apartment / Condo),Pizza Place,Convenience Store,Playground,Building,Athletics & Sports,Electronics Store,Bank,Park
10,Etobicoke,4,Office,Building,Bus Line,Gas Station,Doctor's Office,Residential Building (Apartment / Condo),Miscellaneous Shop,Café,Bus Stop,Factory
18,Etobicoke,4,Office,Building,Government Building,Courthouse,Tech Startup,Paper / Office Supplies Store,Deli / Bodega,Pub,Coffee Shop,Residential Building (Apartment / Condo)
19,Etobicoke,4,Office,Building,Government Building,Courthouse,Tech Startup,Paper / Office Supplies Store,Deli / Bodega,Pub,Coffee Shop,Residential Building (Apartment / Condo)
34,Etobicoke,4,Office,Medical Center,Clothing Store,Residential Building (Apartment / Condo),Shoe Store,Building,Automotive Shop,Pharmacy,Coffee Shop,Medical Lab


In [48]:
#Examine cluster for most frequent, most common venues
clust5['1st Most Common Venue'].value_counts()

Office    5
Name: 1st Most Common Venue, dtype: int64