#### import packages needed for this project

In [263]:
# data analysis
import pandas as pd
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
# handle data in a vectorized manner
import numpy as np 
# random number generation
import random 
# display images
from IPython.display import Image 
from IPython.core.display import HTML 
# tranform .json file into pandas dataframe
from pandas.io.json import json_normalize
# import json file handling
import json
# plotting and maps 
import folium
# webpage handling 
import requests
# my foursquare functions
from four2 import four2
# find latitude and longitude from address
import geocoder
# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors
# import k-means from clustering stage
from sklearn.cluster import KMeans

# PART 1

#### get neighbourhood info for every postal code/ borough combination

read wikipedia page of Toronto postal codes

In [264]:
# define webpage address to scrape
wiki = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
# get the above wikipedia page
page = requests.get(wiki)
# get text from this page
pageText = page.text
# read table into a pandas dataframe
dfT = pd.read_html(pageText, header=0)
print('number of dataframes read from website: ' + \
    str(np.shape(dfT)[0]) + '\n')
# select the tabular data index wanted
tabInd = 0
# show first five rows of dataframe
dfT[tabInd].head()

number of dataframes read from website: 3



Unnamed: 0,Postcode,Borough,Neighborhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront


filter for entries containing 'Borough' information

In [265]:
# check which Boroughs are in the dataframe
print(dfT[0].Borough.unique())
print()
# delete all entries "Not assigned"
dfTfilter = dfT[0][dfT[0].Borough != 'Not assigned']
# show first five entries in new dataframe
dfTfilter.head()

['Not assigned' 'North York' 'Downtown Toronto' "Queen's Park"
 'Scarborough' 'East York' 'Etobicoke' 'York' 'East Toronto'
 'West Toronto' 'Central Toronto' 'Mississauga']



Unnamed: 0,Postcode,Borough,Neighborhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M6A,North York,Lawrence Heights
6,M6A,North York,Lawrence Manor


show unique postal codes which will be the dataframes rows

In [266]:
unqPCs = dfTfilter.Postcode.unique()
unqPCs

array(['M3A', 'M4A', 'M5A', 'M6A', 'M7A', 'M9A', 'M1B', 'M3B', 'M4B',
       'M5B', 'M6B', 'M9B', 'M1C', 'M3C', 'M4C', 'M5C', 'M6C', 'M9C',
       'M1E', 'M4E', 'M5E', 'M6E', 'M1G', 'M4G', 'M5G', 'M6G', 'M1H',
       'M2H', 'M3H', 'M4H', 'M5H', 'M6H', 'M1J', 'M2J', 'M3J', 'M4J',
       'M5J', 'M6J', 'M1K', 'M2K', 'M3K', 'M4K', 'M5K', 'M6K', 'M1L',
       'M2L', 'M3L', 'M4L', 'M5L', 'M6L', 'M9L', 'M1M', 'M2M', 'M3M',
       'M4M', 'M5M', 'M6M', 'M9M', 'M1N', 'M2N', 'M3N', 'M4N', 'M5N',
       'M6N', 'M9N', 'M1P', 'M2P', 'M4P', 'M5P', 'M6P', 'M9P', 'M1R',
       'M2R', 'M4R', 'M5R', 'M6R', 'M7R', 'M9R', 'M1S', 'M4S', 'M5S',
       'M6S', 'M1T', 'M4T', 'M5T', 'M1V', 'M4V', 'M5V', 'M8V', 'M9V',
       'M1W', 'M4W', 'M5W', 'M8W', 'M9W', 'M1X', 'M4X', 'M5X', 'M8X',
       'M4Y', 'M7Y', 'M8Y', 'M8Z'], dtype=object)

go through all unique postal codes and extract the neighbourhoods 

In [267]:
# initialize dataframe with same column headers as before
dfColumns = [dfTfilter.columns[0], dfTfilter.columns[1], dfTfilter.columns[2]]
dfTfinal = pd.DataFrame(columns=dfTfilter.columns)
# one row per unique postcode, fill with the applicable borough and neighbourhoods
for pc in dfTfilter.Postcode.unique():
    # get borough of this postcode
    borTemp = dfTfilter[dfTfilter.Postcode == pc].Borough
    # take the first as it is the same for all neighbourhoods 
    borTemp = borTemp.iloc[0]
    # get a list of unique neighbourhoods in this postcode
    neighTemp = dfTfilter[dfTfilter.Postcode == pc].Neighborhood.unique()
    # formatting list into a string of neighbourhood1, neighbourhood2, ... shape
    neighTempStr = ''
    for i in range(np.shape(neighTemp)[0]):
        if str(neighTemp[i]) == 'Not assigned':
            neighTempStr += borTemp
        else:
            neighTempStr += str(neighTemp[i])
        neighTempStr += ', '
    # delete last comma and space character
    neighTempStr = neighTempStr[:-2]
    # join postcode, borough and string containing all neighbourhoods into one row of the dataframe
    dfTemp = pd.DataFrame({dfTfilter.columns[0]:pc, dfTfilter.columns[1]:borTemp, \
                           dfTfilter.columns[2]:neighTempStr}, index=[0])
    # append the new row to the dataframe
    dfTfinal = dfTfinal.append(dfTemp)
# get shape of the dataframe
print('The dataframe has {} rows.'.format(dfTfinal.shape[0]) + '\n')
# print resulting dataframe
dfTfinal

The dataframe has 103 rows.



Unnamed: 0,Postcode,Borough,Neighborhood
0,M3A,North York,Parkwoods
0,M4A,North York,Victoria Village
0,M5A,Downtown Toronto,Harbourfront
0,M6A,North York,"Lawrence Heights, Lawrence Manor"
0,M7A,Downtown Toronto,Queen's Park
0,M9A,Queen's Park,Queen's Park
0,M1B,Scarborough,"Rouge, Malvern"
0,M3B,North York,Don Mills North
0,M4B,East York,"Woodbine Gardens, Parkview Hill"
0,M5B,Downtown Toronto,"Ryerson, Garden District"


# PART 2

#### get latitude and longitude of these neighbourhoods

tried using geopy package (finds only a few locations)

In [268]:
#latLng = [four2.getLatLng('{}, postcode {}, Toronto, Ontario, Canada'.format(\
#        dfTfinal.iloc[n].Borough, dfTfinal.iloc[n].Postcode)) for n in range(dfTfinal.shape[0])]
#latLng

load data from supplied file instead

In [269]:
filePath = 'Geospatial_Coordinates.csv'
dfFile = pd.read_csv(filePath)

make sure the postal codes are sorted alphabetically

In [270]:
print(dfFile.shape)
dfFile.sort_values(by=['Postal Code'])

dfFile.head()

(103, 3)


Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


sort the neighbourhood data alphabetically as well and combine the two dataframes 

In [271]:
dfT = dfTfinal.copy()
dfT = dfT.reset_index(drop=True)
dfT = dfT.sort_values(by=['Postcode'])
dfT.insert(dfT.shape[1],'Latitude', dfFile['Latitude'])
dfT.insert(dfT.shape[1],'Longitude', dfFile['Longitude'])
print(dfT.columns)
print(dfT.shape)
dfT = dfT.reset_index(drop=True)
#dfT.head()
# show resulting combined dataframe
dfT

Index(['Postcode', 'Borough', 'Neighborhood', 'Latitude', 'Longitude'], dtype='object')
(103, 5)


Unnamed: 0,Postcode,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge, Malvern",43.727929,-79.262029
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union",43.7942,-79.262029
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.778517,-79.346556
3,M1G,Scarborough,Woburn,43.77012,-79.408493
4,M1H,Scarborough,Cedarbrae,43.745906,-79.352188
5,M1J,Scarborough,Scarborough Village,43.728496,-79.495697
6,M1K,Scarborough,"East Birchmount Park, Ionview, Kennedy Park",43.70906,-79.363452
7,M1L,Scarborough,"Clairlea, Golden Mile, Oakridge",43.72802,-79.38879
8,M1M,Scarborough,"Cliffcrest, Cliffside, Scarborough Village West",43.667967,-79.367675
9,M1N,Scarborough,"Birch Cliff, Cliffside West",43.650571,-79.384568


# PART 3

#### create a map of Toronto with the neighbourhoods superimposed

In [272]:
# get the general location of Toronto
latT, lngT = four2.getLatLng('Toronto, Canada')
# create the map of Toronto
mapT = folium.Map(location=[latT, lngT], zoom_start=10)

# add one marker per neighborhood to map
for lat, lng, borough, neighbourhood in \
zip(dfT['Latitude'], dfT['Longitude'], dfT['Borough'], dfT['Neighborhood']):
    label = '{}, {}'.format(neighbourhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(mapT) 
    
# display the map
mapT

#### explore the neighbourhoods using foursquare

get foursquare client info and define base path for the search URLs

# (see four2 folder in github for function definitions)

In [273]:
# get my client ID and secret from a file
client = four2.getClient()
# this is how every query in foursquare starts
baseURL = 'https://api.foursquare.com/v2/'

for simplicity take only neighborhoods containing the word "Toronto"

In [274]:
dfTT = dfT[dfT['Borough'].str.contains('Toronto', regex=False)]
dfTT.head()

Unnamed: 0,Postcode,Borough,Neighborhood,Latitude,Longitude
37,M4E,East Toronto,The Beaches,43.786947,-79.385975
41,M4K,East Toronto,"The Danforth West, Riverdale",43.679557,-79.352188
42,M4L,East Toronto,"The Beaches West, India Bazaar",43.704324,-79.38879
43,M4M,East Toronto,Studio District,43.657162,-79.378937
44,M4N,Central Toronto,Lawrence Park,43.648198,-79.379817


show the neighborhoods containing the word "Toronto"

In [275]:
# get the general location of Toronto
latT, lngT = four2.getLatLng('Toronto, Canada')
# create the map of Toronto
mapTT = folium.Map(location=[latT, lngT], zoom_start=11)

# add one marker per neighborhood to map
for lat, lng, borough, neighbourhood in \
zip(dfTT['Latitude'], dfTT['Longitude'], dfTT['Borough'], dfTT['Neighborhood']):
    label = '{}, {}'.format(neighbourhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(mapTT) 
    
# display the map
mapTT

get all venues in those neighborhoods

In [276]:
# define search parameters
# see four2 folder in github for function definitions
group = 'venues'
version = -1
limit = 100
meters = 500
query = -1
# define columns needed in order to keep a search result
colFilter = ['venue.name', 'venue.categories', 'venue.location.lat', 'venue.location.lng']
# initialize a list of pandas dataframes
venues = []
for lat, lng in zip (dfTT['Latitude'], dfTT['Longitude']):
    results = four2.infoLocation(baseURL, client, group, version, lat, lng, limit, meters, query)
    if results['response']['totalResults'] == 0:
        venues.append(pd.DataFrame())
    else:
        venues.append(four2.json2pdDF(results, colFilter))
print('venues found for {} unique postal codes.'.format(len(venues)))

venues found for 39 unique postal codes.


test the function return for a user-defined neigborhood/ postal code (select value for "k")

In [277]:
k=15 # this is the neighborhood/ postal code that we want to see the venues in
print('{} venues were returned for neighbourhood "{}".'.format(venues[k].shape[0], dfTT.iloc[k]['Neighborhood']))
venues[k]

14 venues were returned for neighbourhood "St. James Town".


Unnamed: 0,name,categories,lat,lng
0,Mr Congee Chinese Cuisine 龍粥記,Chinese Restaurant,43.798879,-79.318335
1,Phoenix Restaurant 金鳳餐廳,Chinese Restaurant,43.798198,-79.318432
2,Subway,Sandwich Place,43.798671,-79.318475
3,Price Chopper,Grocery Store,43.799445,-79.318563
4,KFC,Fast Food Restaurant,43.798938,-79.318854
5,Shoppers Drug Mart,Pharmacy,43.799966,-79.317985
6,Tim Hortons,Coffee Shop,43.799102,-79.318715
7,McDonald's,Fast Food Restaurant,43.798249,-79.318167
8,Pizza Pizza,Pizza Place,43.797909,-79.318113
9,Eggsmart,Breakfast Spot,43.796375,-79.318681


lets put all venues in one dataframe with the neighbourhood as an additional column

In [278]:
# create one data frame with all venues and the postal code as key for the individual dataframes
dfVenues = pd.concat(venues, keys=dfTT['Neighborhood'], sort=False)
dfVenues.head(n=10)

Unnamed: 0_level_0,Unnamed: 1_level_0,name,categories,lat,lng
Neighborhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
The Beaches,0,Sun Star Chinese Cuisine 翠景小炒,Chinese Restaurant,43.787914,-79.381234
The Beaches,1,TD Canada Trust,Bank,43.788074,-79.380367
The Beaches,2,Maxim's Cafe and Patisserie,Café,43.787863,-79.380751
The Beaches,3,Kaga Sushi,Japanese Restaurant,43.787758,-79.38109
"The Danforth West, Riverdale",0,Pantheon,Greek Restaurant,43.677621,-79.351434
"The Danforth West, Riverdale",1,MenEssentials,Cosmetics Shop,43.67782,-79.351265
"The Danforth West, Riverdale",2,Cafe Fiorentina,Italian Restaurant,43.677743,-79.350115
"The Danforth West, Riverdale",3,Dolce Gelato,Ice Cream Shop,43.677773,-79.351187
"The Danforth West, Riverdale",4,Mezes,Greek Restaurant,43.677962,-79.350196
"The Danforth West, Riverdale",5,Louis Cifer Brew Works,Brewery,43.677663,-79.351313


let's find out about the number of unique venue types in our data

In [279]:
print('There are {} uniques categories.'.format(len(dfVenues['categories'].unique())))

There are 215 uniques categories.


how many venues were returned for each neighborhood?

In [280]:
dfVenues.groupby('Neighborhood').count()

Unnamed: 0_level_0,name,categories,lat,lng
Neighborhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
The Beaches,4,4,4,4
"The Danforth West, Riverdale",43,43,43,43
"The Beaches West, India Bazaar",33,33,33,33
Studio District,100,100,100,100
Lawrence Park,100,100,100,100
Davisville North,94,94,94,94
North Toronto West,4,4,4,4
Davisville,4,4,4,4
"Moore Park, Summerhill East",14,14,14,14
"Deer Park, Forest Hill SE, Rathnelly, South Hill, Summerhill West",11,11,11,11


In [281]:
dfVenues.columns

Index(['name', 'categories', 'lat', 'lng'], dtype='object')

go to one hot encoding

In [282]:
# initialize dummy pandas dataframe
dfTT_onehot = pd.get_dummies(dfVenues[['categories']], prefix="", prefix_sep="")
# display shape and first ten entries in onehot encoded dataframe
print("onehot encoding shape: " + str(dfTT_onehot.shape))
dfTT_onehot.head(n=10)

onehot encoding shape: (792, 215)


Unnamed: 0_level_0,Unnamed: 1_level_0,Airport,Airport Food Court,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,Antique Shop,Art Gallery,Arts & Crafts Store,Asian Restaurant,Athletics & Sports,Auto Workshop,BBQ Joint,Baby Store,Bagel Shop,Bakery,Bank,Bar,Baseball Field,Basketball Court,Basketball Stadium,Bed & Breakfast,Beer Bar,Beer Store,Belgian Restaurant,Bistro,Board Shop,Boat or Ferry,Bookstore,Boutique,Breakfast Spot,Brewery,Bubble Tea Shop,Building,Burger Joint,Burrito Place,Bus Line,Bus Stop,Business Service,Butcher,Cafeteria,Café,Candy Store,Caribbean Restaurant,Check Cashing Service,Cheese Shop,Chinese Restaurant,Church,Clothing Store,Cocktail Bar,Coffee Shop,College Rec Center,College Stadium,Comfort Food Restaurant,Comic Shop,Concert Hall,Construction & Landscaping,Convenience Store,Cosmetics Shop,Costume Shop,Coworking Space,Creperie,Cuban Restaurant,Curling Ice,Dance Studio,Deli / Bodega,Department Store,Dessert Shop,Dim Sum Restaurant,Diner,Discount Store,Dog Run,Doner Restaurant,Donut Shop,Dumpling Restaurant,Eastern European Restaurant,Electronics Store,Empanada Restaurant,Ethiopian Restaurant,Falafel Restaurant,Farmers Market,Fast Food Restaurant,Field,Filipino Restaurant,Fireworks Store,Fish & Chips Shop,Fish Market,Food,Food & Drink Shop,Food Court,Food Truck,Fountain,French Restaurant,Fried Chicken Joint,Fruit & Vegetable Store,Furniture / Home Store,Gaming Cafe,Garden,Garden Center,Gas Station,Gastropub,Gay Bar,General Entertainment,General Travel,Gift Shop,Gluten-free Restaurant,Gourmet Shop,Greek Restaurant,Grocery Store,Gym,Gym / Fitness Center,Gym Pool,Hakka Restaurant,Harbor / Marina,Hardware Store,Health Food Store,Hockey Arena,Home Service,Hookah Bar,Hospital,Hostel,Hotel,Hotel Bar,Hotpot Restaurant,Ice Cream Shop,Indian Restaurant,Indie Movie Theater,Indoor Play Area,Intersection,Irish Pub,Italian Restaurant,Japanese Restaurant,Jazz Club,Juice Bar,Knitting Store,Lake,Latin American Restaurant,Light Rail Station,Lingerie Store,Liquor Store,Lounge,Market,Martial Arts Dojo,Medical Center,Mediterranean Restaurant,Mexican Restaurant,Middle Eastern Restaurant,Miscellaneous Shop,Mobile Phone Shop,Modern European Restaurant,Molecular Gastronomy Restaurant,Monument / Landmark,Movie Theater,Museum,Music Venue,Neighborhood,New American Restaurant,Nightclub,Noodle House,Office,Optical Shop,Organic Grocery,Other Great Outdoors,Park,Pet Store,Pharmacy,Pizza Place,Plane,Playground,Plaza,Poke Place,Portuguese Restaurant,Post Office,Poutine Place,Pub,Ramen Restaurant,Record Shop,Rental Car Location,Restaurant,Salad Place,Sandwich Place,Sculpture Garden,Seafood Restaurant,Shopping Mall,Skate Park,Skating Rink,Smoke Shop,Smoothie Shop,Snack Place,Social Club,South American Restaurant,Spa,Sporting Goods Shop,Stationery Store,Steakhouse,Supermarket,Supplement Shop,Sushi Restaurant,Taco Place,Tailor Shop,Tanning Salon,Tea Room,Thai Restaurant,Theater,Thrift / Vintage Store,Toy / Game Store,Trail,Vegetarian / Vegan Restaurant,Video Game Store,Video Store,Vietnamese Restaurant,Wine Bar,Wings Joint,Women's Store,Yoga Studio
Neighborhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1,Unnamed: 81_level_1,Unnamed: 82_level_1,Unnamed: 83_level_1,Unnamed: 84_level_1,Unnamed: 85_level_1,Unnamed: 86_level_1,Unnamed: 87_level_1,Unnamed: 88_level_1,Unnamed: 89_level_1,Unnamed: 90_level_1,Unnamed: 91_level_1,Unnamed: 92_level_1,Unnamed: 93_level_1,Unnamed: 94_level_1,Unnamed: 95_level_1,Unnamed: 96_level_1,Unnamed: 97_level_1,Unnamed: 98_level_1,Unnamed: 99_level_1,Unnamed: 100_level_1,Unnamed: 101_level_1,Unnamed: 102_level_1,Unnamed: 103_level_1,Unnamed: 104_level_1,Unnamed: 105_level_1,Unnamed: 106_level_1,Unnamed: 107_level_1,Unnamed: 108_level_1,Unnamed: 109_level_1,Unnamed: 110_level_1,Unnamed: 111_level_1,Unnamed: 112_level_1,Unnamed: 113_level_1,Unnamed: 114_level_1,Unnamed: 115_level_1,Unnamed: 116_level_1,Unnamed: 117_level_1,Unnamed: 118_level_1,Unnamed: 119_level_1,Unnamed: 120_level_1,Unnamed: 121_level_1,Unnamed: 122_level_1,Unnamed: 123_level_1,Unnamed: 124_level_1,Unnamed: 125_level_1,Unnamed: 126_level_1,Unnamed: 127_level_1,Unnamed: 128_level_1,Unnamed: 129_level_1,Unnamed: 130_level_1,Unnamed: 131_level_1,Unnamed: 132_level_1,Unnamed: 133_level_1,Unnamed: 134_level_1,Unnamed: 135_level_1,Unnamed: 136_level_1,Unnamed: 137_level_1,Unnamed: 138_level_1,Unnamed: 139_level_1,Unnamed: 140_level_1,Unnamed: 141_level_1,Unnamed: 142_level_1,Unnamed: 143_level_1,Unnamed: 144_level_1,Unnamed: 145_level_1,Unnamed: 146_level_1,Unnamed: 147_level_1,Unnamed: 148_level_1,Unnamed: 149_level_1,Unnamed: 150_level_1,Unnamed: 151_level_1,Unnamed: 152_level_1,Unnamed: 153_level_1,Unnamed: 154_level_1,Unnamed: 155_level_1,Unnamed: 156_level_1,Unnamed: 157_level_1,Unnamed: 158_level_1,Unnamed: 159_level_1,Unnamed: 160_level_1,Unnamed: 161_level_1,Unnamed: 162_level_1,Unnamed: 163_level_1,Unnamed: 164_level_1,Unnamed: 165_level_1,Unnamed: 166_level_1,Unnamed: 167_level_1,Unnamed: 168_level_1,Unnamed: 169_level_1,Unnamed: 170_level_1,Unnamed: 171_level_1,Unnamed: 172_level_1,Unnamed: 173_level_1,Unnamed: 174_level_1,Unnamed: 175_level_1,Unnamed: 176_level_1,Unnamed: 177_level_1,Unnamed: 178_level_1,Unnamed: 179_level_1,Unnamed: 180_level_1,Unnamed: 181_level_1,Unnamed: 182_level_1,Unnamed: 183_level_1,Unnamed: 184_level_1,Unnamed: 185_level_1,Unnamed: 186_level_1,Unnamed: 187_level_1,Unnamed: 188_level_1,Unnamed: 189_level_1,Unnamed: 190_level_1,Unnamed: 191_level_1,Unnamed: 192_level_1,Unnamed: 193_level_1,Unnamed: 194_level_1,Unnamed: 195_level_1,Unnamed: 196_level_1,Unnamed: 197_level_1,Unnamed: 198_level_1,Unnamed: 199_level_1,Unnamed: 200_level_1,Unnamed: 201_level_1,Unnamed: 202_level_1,Unnamed: 203_level_1,Unnamed: 204_level_1,Unnamed: 205_level_1,Unnamed: 206_level_1,Unnamed: 207_level_1,Unnamed: 208_level_1,Unnamed: 209_level_1,Unnamed: 210_level_1,Unnamed: 211_level_1,Unnamed: 212_level_1,Unnamed: 213_level_1,Unnamed: 214_level_1,Unnamed: 215_level_1,Unnamed: 216_level_1
The Beaches,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
The Beaches,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
The Beaches,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
The Beaches,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
"The Danforth West, Riverdale",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
"The Danforth West, Riverdale",1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
"The Danforth West, Riverdale",2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
"The Danforth West, Riverdale",3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
"The Danforth West, Riverdale",4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
"The Danforth West, Riverdale",5,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


take the mean of the frequency of occurrence of each category

In [285]:
# change index name to avoid "groupby" ambiguity
# (Neighborhood is both an index and a column)
dfTT_onehot.index.names = ['Neighbourhood', None]
# group data
dfTT_grouped = dfTT_onehot.groupby('Neighbourhood').mean().reset_index()
# display shape of grouped dataframe and the df itself
print("mean category df grouped by neighborhood shape: " + str(dfTT_grouped.shape))
dfTT_grouped.head()

mean category df grouped by neighborhood shape: (39, 216)


Unnamed: 0,Neighbourhood,Airport,Airport Food Court,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,Antique Shop,Art Gallery,Arts & Crafts Store,Asian Restaurant,Athletics & Sports,Auto Workshop,BBQ Joint,Baby Store,Bagel Shop,Bakery,Bank,Bar,Baseball Field,Basketball Court,Basketball Stadium,Bed & Breakfast,Beer Bar,Beer Store,Belgian Restaurant,Bistro,Board Shop,Boat or Ferry,Bookstore,Boutique,Breakfast Spot,Brewery,Bubble Tea Shop,Building,Burger Joint,Burrito Place,Bus Line,Bus Stop,Business Service,Butcher,Cafeteria,Café,Candy Store,Caribbean Restaurant,Check Cashing Service,Cheese Shop,Chinese Restaurant,Church,Clothing Store,Cocktail Bar,Coffee Shop,College Rec Center,College Stadium,Comfort Food Restaurant,Comic Shop,Concert Hall,Construction & Landscaping,Convenience Store,Cosmetics Shop,Costume Shop,Coworking Space,Creperie,Cuban Restaurant,Curling Ice,Dance Studio,Deli / Bodega,Department Store,Dessert Shop,Dim Sum Restaurant,Diner,Discount Store,Dog Run,Doner Restaurant,Donut Shop,Dumpling Restaurant,Eastern European Restaurant,Electronics Store,Empanada Restaurant,Ethiopian Restaurant,Falafel Restaurant,Farmers Market,Fast Food Restaurant,Field,Filipino Restaurant,Fireworks Store,Fish & Chips Shop,Fish Market,Food,Food & Drink Shop,Food Court,Food Truck,Fountain,French Restaurant,Fried Chicken Joint,Fruit & Vegetable Store,Furniture / Home Store,Gaming Cafe,Garden,Garden Center,Gas Station,Gastropub,Gay Bar,General Entertainment,General Travel,Gift Shop,Gluten-free Restaurant,Gourmet Shop,Greek Restaurant,Grocery Store,Gym,Gym / Fitness Center,Gym Pool,Hakka Restaurant,Harbor / Marina,Hardware Store,Health Food Store,Hockey Arena,Home Service,Hookah Bar,Hospital,Hostel,Hotel,Hotel Bar,Hotpot Restaurant,Ice Cream Shop,Indian Restaurant,Indie Movie Theater,Indoor Play Area,Intersection,Irish Pub,Italian Restaurant,Japanese Restaurant,Jazz Club,Juice Bar,Knitting Store,Lake,Latin American Restaurant,Light Rail Station,Lingerie Store,Liquor Store,Lounge,Market,Martial Arts Dojo,Medical Center,Mediterranean Restaurant,Mexican Restaurant,Middle Eastern Restaurant,Miscellaneous Shop,Mobile Phone Shop,Modern European Restaurant,Molecular Gastronomy Restaurant,Monument / Landmark,Movie Theater,Museum,Music Venue,Neighborhood,New American Restaurant,Nightclub,Noodle House,Office,Optical Shop,Organic Grocery,Other Great Outdoors,Park,Pet Store,Pharmacy,Pizza Place,Plane,Playground,Plaza,Poke Place,Portuguese Restaurant,Post Office,Poutine Place,Pub,Ramen Restaurant,Record Shop,Rental Car Location,Restaurant,Salad Place,Sandwich Place,Sculpture Garden,Seafood Restaurant,Shopping Mall,Skate Park,Skating Rink,Smoke Shop,Smoothie Shop,Snack Place,Social Club,South American Restaurant,Spa,Sporting Goods Shop,Stationery Store,Steakhouse,Supermarket,Supplement Shop,Sushi Restaurant,Taco Place,Tailor Shop,Tanning Salon,Tea Room,Thai Restaurant,Theater,Thrift / Vintage Store,Toy / Game Store,Trail,Vegetarian / Vegan Restaurant,Video Game Store,Video Store,Vietnamese Restaurant,Wine Bar,Wings Joint,Women's Store,Yoga Studio
0,The Beaches,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.25,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.25,0.0,0.0,0.0,0.0,0.25,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.25,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,"The Danforth West, Riverdale",0.0,0.0,0.0,0.0,0.0,0.023256,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.023256,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.046512,0.0,0.0,0.023256,0.023256,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.023256,0.0,0.0,0.0,0.0,0.0,0.0,0.093023,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.023256,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.023256,0.0,0.023256,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.023256,0.046512,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.209302,0.023256,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.069767,0.023256,0.0,0.0,0.0,0.0,0.069767,0.0,0.0,0.023256,0.0,0.0,0.0,0.0,0.0,0.023256,0.023256,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.023256,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.023256,0.0,0.0,0.0,0.023256,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.023256,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.023256,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.023256
2,"The Beaches West, India Bazaar",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.030303,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.060606,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.060606,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.030303,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.090909,0.0,0.030303,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.030303,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.030303,0.030303,0.0,0.060606,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.030303,0.0,0.030303,0.0,0.0,0.060606,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.030303,0.0,0.030303,0.060606,0.0,0.0,0.0,0.0,0.030303,0.0,0.0,0.0,0.0,0.0,0.0,0.030303,0.0,0.090909,0.0,0.030303,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.060606,0.0,0.0,0.0,0.0,0.030303,0.0,0.0,0.030303,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,Studio District,0.0,0.0,0.0,0.0,0.0,0.01,0.0,0.01,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.02,0.01,0.01,0.0,0.0,0.0,0.0,0.01,0.0,0.0,0.0,0.0,0.0,0.02,0.0,0.0,0.0,0.02,0.0,0.02,0.01,0.0,0.0,0.0,0.0,0.0,0.04,0.0,0.0,0.0,0.0,0.01,0.0,0.05,0.0,0.1,0.01,0.0,0.0,0.01,0.0,0.0,0.0,0.04,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.01,0.01,0.0,0.02,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.01,0.0,0.0,0.02,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.01,0.0,0.0,0.0,0.0,0.01,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.01,0.0,0.0,0.01,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.02,0.02,0.0,0.01,0.0,0.01,0.0,0.0,0.02,0.0,0.01,0.0,0.0,0.0,0.0,0.01,0.03,0.01,0.0,0.01,0.0,0.0,0.01,0.0,0.01,0.0,0.0,0.0,0.0,0.01,0.0,0.0,0.01,0.0,0.0,0.0,0.02,0.0,0.0,0.02,0.0,0.0,0.0,0.0,0.01,0.02,0.0,0.0,0.02,0.0,0.01,0.0,0.01,0.01,0.0,0.0,0.01,0.0,0.0,0.0,0.0,0.01,0.02,0.0,0.01,0.0,0.0,0.01,0.01,0.0,0.01,0.02,0.01,0.02,0.0,0.01,0.0,0.01,0.01,0.0,0.01,0.01,0.0,0.0,0.0
4,Lawrence Park,0.0,0.0,0.0,0.0,0.0,0.03,0.0,0.01,0.0,0.01,0.0,0.0,0.0,0.0,0.0,0.02,0.0,0.01,0.0,0.0,0.0,0.0,0.02,0.0,0.0,0.0,0.0,0.0,0.01,0.0,0.01,0.0,0.0,0.01,0.02,0.01,0.0,0.0,0.0,0.0,0.0,0.07,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.01,0.11,0.0,0.0,0.0,0.0,0.01,0.0,0.0,0.0,0.0,0.0,0.01,0.0,0.0,0.0,0.02,0.01,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.01,0.01,0.01,0.01,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.03,0.0,0.0,0.01,0.01,0.01,0.0,0.01,0.0,0.03,0.01,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.06,0.01,0.0,0.01,0.0,0.0,0.0,0.0,0.01,0.03,0.02,0.01,0.0,0.0,0.0,0.01,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.01,0.0,0.01,0.0,0.0,0.01,0.0,0.0,0.0,0.0,0.0,0.0,0.01,0.0,0.0,0.01,0.0,0.0,0.0,0.01,0.0,0.0,0.0,0.01,0.0,0.0,0.0,0.05,0.01,0.01,0.0,0.03,0.01,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.01,0.0,0.03,0.0,0.0,0.0,0.0,0.01,0.0,0.01,0.02,0.0,0.0,0.0,0.0,0.02,0.0,0.0,0.0,0.01,0.0,0.0,0.0


Let's print each neighborhood along with the top 5 most common venues

In [286]:
num_top_venues = 5
for hood in dfTT_grouped['Neighbourhood']:
    print("----"+hood+"----")
    temp = dfTT_grouped[dfTT_grouped['Neighbourhood'] == hood].T.reset_index()
    temp.columns = ['venue','freq']
    temp = temp.iloc[1:]
    temp['freq'] = temp['freq'].astype(float)
    temp = temp.round({'freq': 2})
    print(temp.sort_values('freq', ascending=False).reset_index(drop=True).head(num_top_venues))
    print('\n')

----The Beaches----
                 venue  freq
0                 Café  0.25
1  Japanese Restaurant  0.25
2   Chinese Restaurant  0.25
3                 Bank  0.25
4              Airport  0.00


----The Danforth West, Riverdale----
                    venue  freq
0        Greek Restaurant  0.21
1             Coffee Shop  0.09
2      Italian Restaurant  0.07
3          Ice Cream Shop  0.07
4  Furniture / Home Store  0.05


----The Beaches West, India Bazaar----
                venue  freq
0        Dessert Shop  0.09
1      Sandwich Place  0.09
2         Coffee Shop  0.06
3                Café  0.06
4  Italian Restaurant  0.06


----Studio District----
                       venue  freq
0                Coffee Shop  0.10
1             Clothing Store  0.05
2             Cosmetics Shop  0.04
3                       Café  0.04
4  Middle Eastern Restaurant  0.03


----Lawrence Park----
         venue  freq
0  Coffee Shop  0.11
1         Café  0.07
2        Hotel  0.06
3   Restaurant  0.05
4

#### Let's put that into a pandas dataframe

First, let's write a function to sort the venues in descending order

In [287]:
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

Now let's create the new dataframe and display the top 10 venues for each neighborhood

In [288]:
num_top_venues = 5

# define exceptions to "th" for 1st, 2nd and 3rd
indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighbourhood']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
hood_venues_sort = pd.DataFrame(columns=columns)
hood_venues_sort['Neighbourhood'] = dfTT_grouped['Neighbourhood']

for ind in np.arange(dfTT_grouped.shape[0]):
    hood_venues_sort.iloc[ind, 1:] = return_most_common_venues(dfTT_grouped.iloc[ind, :], num_top_venues)

hood_venues_sort.head(n=10)

Unnamed: 0,Neighbourhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue
0,The Beaches,Japanese Restaurant,Chinese Restaurant,Bank,Café,Yoga Studio
1,"The Danforth West, Riverdale",Greek Restaurant,Coffee Shop,Italian Restaurant,Ice Cream Shop,Furniture / Home Store
2,"The Beaches West, India Bazaar",Sandwich Place,Dessert Shop,Coffee Shop,Sushi Restaurant,Café
3,Studio District,Coffee Shop,Clothing Store,Cosmetics Shop,Café,Middle Eastern Restaurant
4,Lawrence Park,Coffee Shop,Café,Hotel,Restaurant,Gastropub
5,Davisville North,Café,Vietnamese Restaurant,Vegetarian / Vegan Restaurant,Dumpling Restaurant,Chinese Restaurant
6,North Toronto West,Park,Field,Trail,Hockey Arena,Yoga Studio
7,Davisville,Construction & Landscaping,Park,Basketball Court,Bakery,Yoga Studio
8,"Moore Park, Summerhill East",Gift Shop,Coffee Shop,Dessert Shop,Breakfast Spot,Italian Restaurant
9,"Deer Park, Forest Hill SE, Rathnelly, South Hi...",Hotel,Coffee Shop,American Restaurant,Fried Chicken Joint,Burrito Place


#### Cluster Neighborhoods

Run k-means to cluster the neighborhood into clusters

In [289]:
# create weights for the individual columns based on number of top venues considered for each neighborhood
weights = 1/(np.arange(0,num_top_venues,1)+1)
weights_norm = weights/np.sum(weights)
print('weights for k means (%):')
print(weights_norm*100)
#np.sum(weights_norm)

# set number of clusters
kclusters = 5

dfTT_grouped_cluster = dfTT_grouped.drop('Neighbourhood', 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=9).fit(dfTT_grouped_cluster, weights_norm)

# add clustering labels
hood_venues_sort_cluster = hood_venues_sort.copy()
hood_venues_sort_cluster.insert(0, 'Cluster Labels', kmeans.labels_)

# check cluster labels generated for each row in the dataframe
print()
print('cluster labels for each neighbourhood')
kmeans.labels_ 

weights for k means (%):
[43.79562044 21.89781022 14.59854015 10.94890511  8.75912409]

cluster labels for each neighbourhood


array([1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 3,
       1, 1, 2, 1, 1, 1, 1, 4, 2, 1, 1, 1, 1, 1, 1, 1, 0])

Let's create a new dataframe that includes the cluster label as well as the top venues for each neighborhood.

In [290]:
dfTT_merged = dfTT.copy()

# merge toronto_grouped with toronto_data to add latitude/longitude for each neighborhood
dfTT_merged = dfTT_merged.join(hood_venues_sort_cluster.set_index('Neighbourhood'), on='Neighborhood')

dfTT_merged.head(n=20) # check the last columns!

Unnamed: 0,Postcode,Borough,Neighborhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue
37,M4E,East Toronto,The Beaches,43.786947,-79.385975,1,Japanese Restaurant,Chinese Restaurant,Bank,Café,Yoga Studio
41,M4K,East Toronto,"The Danforth West, Riverdale",43.679557,-79.352188,1,Greek Restaurant,Coffee Shop,Italian Restaurant,Ice Cream Shop,Furniture / Home Store
42,M4L,East Toronto,"The Beaches West, India Bazaar",43.704324,-79.38879,1,Sandwich Place,Dessert Shop,Coffee Shop,Sushi Restaurant,Café
43,M4M,East Toronto,Studio District,43.657162,-79.378937,1,Coffee Shop,Clothing Store,Cosmetics Shop,Café,Middle Eastern Restaurant
44,M4N,Central Toronto,Lawrence Park,43.648198,-79.379817,1,Coffee Shop,Café,Hotel,Restaurant,Gastropub
45,M4P,Central Toronto,Davisville North,43.653206,-79.400049,1,Café,Vietnamese Restaurant,Vegetarian / Vegan Restaurant,Dumpling Restaurant,Chinese Restaurant
46,M4R,Central Toronto,North Toronto West,43.693781,-79.428191,2,Park,Field,Trail,Hockey Arena,Yoga Studio
47,M4S,Central Toronto,Davisville,43.713756,-79.490074,2,Construction & Landscaping,Park,Basketball Court,Bakery,Yoga Studio
48,M4T,Central Toronto,"Moore Park, Summerhill East",43.64896,-79.456325,1,Gift Shop,Coffee Shop,Dessert Shop,Breakfast Spot,Italian Restaurant
49,M4V,Central Toronto,"Deer Park, Forest Hill SE, Rathnelly, South Hi...",43.636966,-79.615819,1,Hotel,Coffee Shop,American Restaurant,Fried Chicken Joint,Burrito Place


#### Examine Clusters

examine each cluster and determine the discriminating venue categories that distinguish each cluster  
assign a name to each cluster based on the defining categories

In [291]:
cluster_names = []

Cluster 1

In [292]:
dfTT_merged.loc[dfTT_merged['Cluster Labels'] == 0, dfTT_merged.columns[[1] + list(range(5, dfTT_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue
87,East Toronto,0,Sandwich Place,Pizza Place,Mobile Phone Shop,Bus Line,Yoga Studio


In [293]:
cluster_names.append('misc')

Cluster 2

In [294]:
dfTT_merged.loc[dfTT_merged['Cluster Labels'] == 1, dfTT_merged.columns[[1] + list(range(5, dfTT_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue
37,East Toronto,1,Japanese Restaurant,Chinese Restaurant,Bank,Café,Yoga Studio
41,East Toronto,1,Greek Restaurant,Coffee Shop,Italian Restaurant,Ice Cream Shop,Furniture / Home Store
42,East Toronto,1,Sandwich Place,Dessert Shop,Coffee Shop,Sushi Restaurant,Café
43,East Toronto,1,Coffee Shop,Clothing Store,Cosmetics Shop,Café,Middle Eastern Restaurant
44,Central Toronto,1,Coffee Shop,Café,Hotel,Restaurant,Gastropub
45,Central Toronto,1,Café,Vietnamese Restaurant,Vegetarian / Vegan Restaurant,Dumpling Restaurant,Chinese Restaurant
48,Central Toronto,1,Gift Shop,Coffee Shop,Dessert Shop,Breakfast Spot,Italian Restaurant
49,Central Toronto,1,Hotel,Coffee Shop,American Restaurant,Fried Chicken Joint,Burrito Place
51,Downtown Toronto,1,Empanada Restaurant,Pizza Place,Home Service,Food Truck,Department Store
52,Downtown Toronto,1,Pizza Place,Middle Eastern Restaurant,Coffee Shop,Intersection,Chinese Restaurant


In [295]:
cluster_names.append('food')

Cluster 3

In [296]:
dfTT_merged.loc[dfTT_merged['Cluster Labels'] == 2, dfTT_merged.columns[[1] + list(range(5, dfTT_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue
46,Central Toronto,2,Park,Field,Trail,Hockey Arena,Yoga Studio
47,Central Toronto,2,Construction & Landscaping,Park,Basketball Court,Bakery,Yoga Studio
50,Downtown Toronto,2,Park,Baseball Field,Business Service,Yoga Studio,Dim Sum Restaurant
58,Downtown Toronto,2,Airport,Park,Department Store,Empanada Restaurant,Electronics Store
65,Central Toronto,2,Women's Store,Fast Food Restaurant,Park,Market,Yoga Studio
75,Downtown Toronto,2,Park,Fireworks Store,Food & Drink Shop,Bus Stop,Yoga Studio


In [297]:
cluster_names.append('recreation')

Cluster 4

In [298]:
dfTT_merged.loc[dfTT_merged['Cluster Labels'] == 3, dfTT_merged.columns[[1] + list(range(5, dfTT_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue
61,Downtown Toronto,3,Playground,Yoga Studio,Department Store,Empanada Restaurant,Electronics Store


In [299]:
cluster_names.append('misc')

Cluster 5

In [300]:
dfTT_merged.loc[dfTT_merged['Cluster Labels'] == 4, dfTT_merged.columns[[1] + list(range(5, dfTT_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue
70,Downtown Toronto,4,Baseball Field,Yoga Studio,Dessert Shop,Empanada Restaurant,Electronics Store


In [301]:
cluster_names.append('baseball')

Finally, let's visualize the resulting clusters

In [302]:
# create map
map_clusters = folium.Map(location=[latT, lngT], zoom_start=11)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(dfTT_merged['Latitude'], dfTT_merged['Longitude'], \
                                  dfTT_merged['Neighborhood'], dfTT_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster: ' + cluster_names[cluster], parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters