# Clustering and Segmenting Neighborhoods in Toronto
***

##### Import all necessary libraries and functions

In [1]:
#Import libraries

import numpy as np  #library for taking care of data vectors and arrays

import pandas as pd  #data analysis library
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

import json #json library

#!pip install geopy
from geopy.geocoders import Nominatim #used to convert an address into latitude and longitude

import requests #request handling
from pandas.io.json import json_normalize # make json file into a pandas dataframe


import matplotlib.cm as cm #plotting libraries
import matplotlib.colors as colors

#!pip install sklearn
from sklearn.cluster import KMeans #k-means for clustering

import folium #map making library

#!pip install bs4
import urllib.request #used for web request handling
from bs4 import BeautifulSoup #used for scraping the webpage
import html5lib #used for reading html

#!pip install geopy
import geopy 

print('All done!')

All done!


## Web Scrape

##### Scrape the Wikipedia page for the table

In [2]:
#Raw web scrape

url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
response = requests.get(url)
soup = BeautifulSoup(response.content, 'lxml')

In [3]:
#Arrange the table text into lists

info_table = soup.findAll('table')[0] #Get 1st table on page

A=[] #Empty lists to fill
B=[]
C=[]

for row in info_table.findAll('tr'): #Look in BS table elements
    cells = row.findAll('td') #Look in each element of BS table element
    if len(cells)==3:
        A.append(cells[0].find(text=True)) #Assign first element to list A
        B.append(cells[1].find(text=True)) #Assign second element to list B
        C.append(cells[2].find(text=True)) #Assign third element to list C
        
C = [item.strip() for item in C if str(item)] #Remove /n from the end of words in C

In [4]:
#Form and clean the data frame

df = pd.DataFrame({'Postal Code':A, 'Borough':B, 'Neighborhood':C})  #Make dataframe
df = df[df.Borough !='Not assigned'] #Delete all Not assigned values

In [5]:
#Group dataframe by Postal Code

df = df.groupby(["Postal Code",'Borough'])['Neighborhood'].apply(lambda tags: ','.join(tags)) #Group all neighborhoods with the same Postal Code
df = pd.DataFrame(data=df)  #Reformat as data frame
df = df.reset_index() #Add index column back in
df.head(10)

Unnamed: 0,Postal Code,Borough,Neighborhood
0,M1B,Scarborough,"Rouge,Malvern"
1,M1C,Scarborough,"Highland Creek,Rouge Hill,Port Union"
2,M1E,Scarborough,"Guildwood,Morningside,West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae
5,M1J,Scarborough,Scarborough Village
6,M1K,Scarborough,"East Birchmount Park,Ionview,Kennedy Park"
7,M1L,Scarborough,"Clairlea,Golden Mile,Oakridge"
8,M1M,Scarborough,"Cliffcrest,Cliffside,Scarborough Village West"
9,M1N,Scarborough,"Birch Cliff,Cliffside West"


In [6]:
#Get shape of dataframe
df.shape

(103, 3)

## Add Geographic Data

In [7]:
#Read csv file
geodata = 'http://cocl.us/Geospatial_data' #variable name for csv file containing geographic data
geodf = pd.read_csv(geodata) #create data frame from csv
geodf = geodf.sort_values(by='Postal Code', ascending=True)  #insure geographic data frame is sorted by postal code
geodf.head(10) #view top of dataframe

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476
5,M1J,43.744734,-79.239476
6,M1K,43.727929,-79.262029
7,M1L,43.711112,-79.284577
8,M1M,43.716316,-79.239476
9,M1N,43.692657,-79.264848


In [8]:
#Add latitdue and longitude to data frame for corresponding postal codes

df = pd.merge(df, geodf, on='Postal Code') #Merge df and geodf, matching up rows with the same Postal Code
df.head(10) #view dataframe

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge,Malvern",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek,Rouge Hill,Port Union",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood,Morningside,West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476
5,M1J,Scarborough,Scarborough Village,43.744734,-79.239476
6,M1K,Scarborough,"East Birchmount Park,Ionview,Kennedy Park",43.727929,-79.262029
7,M1L,Scarborough,"Clairlea,Golden Mile,Oakridge",43.711112,-79.284577
8,M1M,Scarborough,"Cliffcrest,Cliffside,Scarborough Village West",43.716316,-79.239476
9,M1N,Scarborough,"Birch Cliff,Cliffside West",43.692657,-79.264848


In [9]:
df

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge,Malvern",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek,Rouge Hill,Port Union",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood,Morningside,West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476
5,M1J,Scarborough,Scarborough Village,43.744734,-79.239476
6,M1K,Scarborough,"East Birchmount Park,Ionview,Kennedy Park",43.727929,-79.262029
7,M1L,Scarborough,"Clairlea,Golden Mile,Oakridge",43.711112,-79.284577
8,M1M,Scarborough,"Cliffcrest,Cliffside,Scarborough Village West",43.716316,-79.239476
9,M1N,Scarborough,"Birch Cliff,Cliffside West",43.692657,-79.264848


## Explore Toronto Neighborhoods

In [10]:
#Slice dataframe for Toronto boroughs only

dfT = df.set_index('Borough') #set index column to Borough to make slicing easier
dfT = dfT.loc[['East Toronto','West Toronto','Central Toronto','Downtown Toronto']] #Grab only boroughs that pertain to Toronto
dfT = dfT.sort_values(by='Borough').reset_index(drop=False) #Clean up the dataframe
dfT #View dataframe

Unnamed: 0,Borough,Postal Code,Neighborhood,Latitude,Longitude
0,Central Toronto,M5R,"The Annex,North Midtown,Yorkville",43.67271,-79.405678
1,Central Toronto,M5P,"Forest Hill North,Forest Hill West",43.696948,-79.411307
2,Central Toronto,M5N,Roselawn,43.711695,-79.416936
3,Central Toronto,M4V,"Deer Park,Forest Hill SE,Rathnelly,South Hill,...",43.686412,-79.400049
4,Central Toronto,M4T,"Moore Park,Summerhill East",43.689574,-79.38316
5,Central Toronto,M4S,Davisville,43.704324,-79.38879
6,Central Toronto,M4R,North Toronto West,43.715383,-79.405678
7,Central Toronto,M4P,Davisville North,43.712751,-79.390197
8,Central Toronto,M4N,Lawrence Park,43.72802,-79.38879
9,Downtown Toronto,M5H,"Adelaide,King,Richmond",43.650571,-79.384568


In [11]:
#Make a map of Toronto

latitude = dfT.iloc[0]['Latitude']  #Use neighborhood coordinates from Central Toronto for long and lat
longitude = dfT.iloc[0]['Longitude']
map_Toronto = folium.Map(location=[latitude, longitude], zoom_start=12)

# add markers to map
for lat, lng, label in zip(dfT['Latitude'], dfT['Longitude'], dfT['Neighborhood']):
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_Toronto)  
    
map_Toronto

In [12]:
#Define Foursquare credentials

CLIENT_ID = '4GM1A03YOPGDEIR0CVZDIDKOQTPGJCSAOVGUW5WH122BKJPF' # your Foursquare ID
CLIENT_SECRET = 'MBYZDG2IIVPUYMCFKCJ2O52ZZT4VCDCRGMVLK3L3NY5O5GIP' # your Foursquare Secret
VERSION = '20180605' # Foursquare API version
LIMIT = 100

print('Your credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET:' + CLIENT_SECRET)

Your credentails:
CLIENT_ID: 4GM1A03YOPGDEIR0CVZDIDKOQTPGJCSAOVGUW5WH122BKJPF
CLIENT_SECRET:MBYZDG2IIVPUYMCFKCJ2O52ZZT4VCDCRGMVLK3L3NY5O5GIP


In [13]:
#Define a function to find all venues in Toronto Neighborhoods

def getNearbyVenues(names, latitudes, longitudes, radius=500):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

In [14]:
#Run function to find venues in neighborhoods

toronto_venues = getNearbyVenues(names=dfT['Neighborhood'],
                                   latitudes=dfT['Latitude'],
                                   longitudes=dfT['Longitude']
                                  )


The Annex,North Midtown,Yorkville
Forest Hill North,Forest Hill West
Roselawn
Deer Park,Forest Hill SE,Rathnelly,South Hill,Summerhill West
Moore Park,Summerhill East
Davisville
North Toronto West
Davisville North
Lawrence Park
Adelaide,King,Richmond
Central Bay Street
Ryerson,Garden District
Design Exchange,Toronto Dominion Centre
Commerce Court,Victoria Hotel
Harbord,University of Toronto
Chinatown,Grange Park,Kensington Market
CN Tower,Bathurst Quay,Island airport,Harbourfront West,King and Spadina,Railway Lands,South Niagara
Stn A PO Boxes 25 The Esplanade
First Canadian Place,Underground city
Berczy Park
St. James Town
Harbourfront East,Toronto Islands,Union Station
Harbourfront
Queen's Park
Cabbagetown,St. James Town
Rosedale
Christie
Church and Wellesley
Business Reply Mail Processing Centre 969 Eastern
Studio District
The Beaches West,India Bazaar
The Danforth West,Riverdale
The Beaches
Runnymede,Swansea
Parkdale,Roncesvalles
High Park,The Junction South
Brockton,Exhibition Pla

In [15]:
#Some exploratory analysis on the venues dataframe

print(toronto_venues.shape)
toronto_venues.head()

(1714, 7)


Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,"The Annex,North Midtown,Yorkville",43.67271,-79.405678,Ezra's Pound,43.675153,-79.405858,Café
1,"The Annex,North Midtown,Yorkville",43.67271,-79.405678,Roti Cuisine of India,43.674618,-79.408249,Indian Restaurant
2,"The Annex,North Midtown,Yorkville",43.67271,-79.405678,Rose & Sons,43.675668,-79.403617,American Restaurant
3,"The Annex,North Midtown,Yorkville",43.67271,-79.405678,Live Organic Food Bar,43.675053,-79.406715,Vegetarian / Vegan Restaurant
4,"The Annex,North Midtown,Yorkville",43.67271,-79.405678,Fet Zun,43.675147,-79.406346,Middle Eastern Restaurant


In [16]:
#Count number of venues in the neighborhoods

toronto_venues.groupby('Neighborhood').count()

Unnamed: 0_level_0,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
Neighborhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
"Adelaide,King,Richmond",100,100,100,100,100,100
Berczy Park,57,57,57,57,57,57
"Brockton,Exhibition Place,Parkdale Village",25,25,25,25,25,25
Business Reply Mail Processing Centre 969 Eastern,15,15,15,15,15,15
"CN Tower,Bathurst Quay,Island airport,Harbourfront West,King and Spadina,Railway Lands,South Niagara",15,15,15,15,15,15
"Cabbagetown,St. James Town",44,44,44,44,44,44
Central Bay Street,79,79,79,79,79,79
"Chinatown,Grange Park,Kensington Market",88,88,88,88,88,88
Christie,18,18,18,18,18,18
Church and Wellesley,86,86,86,86,86,86


## Analyze the Neighborhoods

In [17]:
# One hot encode the Venue Categories

toronto_onehot = pd.get_dummies(toronto_venues[['Venue Category']], prefix="", prefix_sep="")
del toronto_onehot['Neighborhood'] #remove existing Neighborhood column

# add neighborhood column back to dataframe
toronto_onehot['Neighborhood'] = toronto_venues['Neighborhood'] 

# move neighborhood column to the first column
fixed_columns = [toronto_onehot.columns[-1]] + list(toronto_onehot.columns[:-1])
toronto_onehot = toronto_onehot[fixed_columns]

toronto_onehot.head()

Unnamed: 0,Neighborhood,Afghan Restaurant,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,Antique Shop,Aquarium,Art Gallery,Arts & Crafts Store,Asian Restaurant,Athletics & Sports,Auto Workshop,BBQ Joint,Baby Store,Bagel Shop,Bakery,Bank,Bar,Baseball Stadium,Basketball Stadium,Beach,Bed & Breakfast,Beer Bar,Beer Store,Belgian Restaurant,Bistro,Boat or Ferry,Bookstore,Boutique,Brazilian Restaurant,Breakfast Spot,Brewery,Bubble Tea Shop,Building,Burger Joint,Burrito Place,Bus Line,Butcher,Café,Cajun / Creole Restaurant,Camera Store,Candy Store,Caribbean Restaurant,Cheese Shop,Chinese Restaurant,Chocolate Shop,Church,Climbing Gym,Clothing Store,Cocktail Bar,Coffee Shop,College Arts Building,College Auditorium,College Gym,College Rec Center,Colombian Restaurant,Comfort Food Restaurant,Comic Shop,Concert Hall,Convenience Store,Cosmetics Shop,Costume Shop,Coworking Space,Creperie,Cuban Restaurant,Cupcake Shop,Dance Studio,Deli / Bodega,Department Store,Dessert Shop,Dim Sum Restaurant,Diner,Discount Store,Distribution Center,Dog Run,Doner Restaurant,Donut Shop,Dumpling Restaurant,Eastern European Restaurant,Electronics Store,Empanada Restaurant,Ethiopian Restaurant,Event Space,Falafel Restaurant,Farmers Market,Fast Food Restaurant,Filipino Restaurant,Fish & Chips Shop,Fish Market,Flea Market,Flower Shop,Food,Food & Drink Shop,Food Court,Food Truck,Fountain,French Restaurant,Fried Chicken Joint,Frozen Yogurt Shop,Fruit & Vegetable Store,Furniture / Home Store,Gaming Cafe,Garden,Garden Center,Gas Station,Gastropub,Gay Bar,General Entertainment,General Travel,German Restaurant,Gift Shop,Gluten-free Restaurant,Gourmet Shop,Greek Restaurant,Grocery Store,Gym,Gym / Fitness Center,Harbor / Marina,Health & Beauty Service,Health Food Store,Historic Site,History Museum,Hospital,Hostel,Hotel,Hotel Bar,Hotpot Restaurant,IT Services,Ice Cream Shop,Indian Restaurant,Indie Movie Theater,Indoor Play Area,Intersection,Irish Pub,Italian Restaurant,Japanese Restaurant,Jazz Club,Jewelry Store,Juice Bar,Korean Restaurant,Lake,Latin American Restaurant,Light Rail Station,Lingerie Store,Liquor Store,Lounge,Mac & Cheese Joint,Market,Massage Studio,Mediterranean Restaurant,Men's Store,Mexican Restaurant,Middle Eastern Restaurant,Miscellaneous Shop,Modern European Restaurant,Molecular Gastronomy Restaurant,Monument / Landmark,Movie Theater,Museum,Music Venue,New American Restaurant,Nightclub,Noodle House,Office,Opera House,Optical Shop,Organic Grocery,Other Great Outdoors,Park,Performing Arts Venue,Pet Store,Pharmacy,Pizza Place,Plane,Playground,Plaza,Poke Place,Pool,Portuguese Restaurant,Poutine Place,Pub,Ramen Restaurant,Record Shop,Recording Studio,Rental Car Location,Restaurant,Roof Deck,Sake Bar,Salad Place,Salon / Barbershop,Sandwich Place,Scenic Lookout,Sculpture Garden,Seafood Restaurant,Shoe Store,Shopping Mall,Skate Park,Skating Rink,Smoke Shop,Snack Place,Spa,Speakeasy,Sporting Goods Shop,Sports Bar,Stadium,Stationery Store,Steakhouse,Strip Club,Supermarket,Sushi Restaurant,Swim School,Tailor Shop,Taiwanese Restaurant,Tanning Salon,Tea Room,Tennis Court,Thai Restaurant,Theater,Theme Restaurant,Thrift / Vintage Store,Toy / Game Store,Trail,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Vietnamese Restaurant,Wine Bar,Wine Shop,Wings Joint,Women's Store,Yoga Studio
0,"The Annex,North Midtown,Yorkville",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,"The Annex,North Midtown,Yorkville",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,"The Annex,North Midtown,Yorkville",0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,"The Annex,North Midtown,Yorkville",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0
4,"The Annex,North Midtown,Yorkville",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [18]:
#Group rows by neighborhood, take mean of occurance of each category
toronto_grouped = toronto_onehot.groupby('Neighborhood').mean().reset_index()

In [19]:
#Print Each Neighborhood along with top 5 venue categories

num_top_venues = 5

for hood in toronto_grouped['Neighborhood']:
    print("----"+hood+"----")
    temp = toronto_grouped[toronto_grouped['Neighborhood'] == hood].T.reset_index()
    temp.columns = ['venue','freq']
    temp = temp.iloc[1:]
    temp['freq'] = temp['freq'].astype(float)
    temp = temp.round({'freq': 2})
    print(temp.sort_values('freq', ascending=False).reset_index(drop=True).head(num_top_venues))
    print('\n')

----Adelaide,King,Richmond----
             venue  freq
0      Coffee Shop  0.07
1       Restaurant  0.05
2             Café  0.04
3  Thai Restaurant  0.04
4       Steakhouse  0.03


----Berczy Park----
                venue  freq
0         Coffee Shop  0.09
1        Cocktail Bar  0.05
2            Beer Bar  0.04
3      Farmers Market  0.04
4  Seafood Restaurant  0.04


----Brockton,Exhibition Place,Parkdale Village----
                   venue  freq
0                   Café  0.12
1         Breakfast Spot  0.08
2  Performing Arts Venue  0.08
3            Coffee Shop  0.08
4              Nightclub  0.08


----Business Reply Mail Processing Centre 969 Eastern----
                venue  freq
0         Yoga Studio  0.07
1       Auto Workshop  0.07
2       Garden Center  0.07
3              Garden  0.07
4  Light Rail Station  0.07


----CN Tower,Bathurst Quay,Island airport,Harbourfront West,King and Spadina,Railway Lands,South Niagara----
                venue  freq
0      Airport Lounge  

In [20]:
#Define function to sort venues in descending order
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

In [21]:
#Create dataframe with top 10 venues in each neighborhood
num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighborhood']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighborhood'] = toronto_grouped['Neighborhood']

for ind in np.arange(toronto_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(toronto_grouped.iloc[ind, :], num_top_venues)

neighborhoods_venues_sorted.head()

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,"Adelaide,King,Richmond",Coffee Shop,Restaurant,Thai Restaurant,Café,Bar,Steakhouse,Sushi Restaurant,Concert Hall,Seafood Restaurant,Bakery
1,Berczy Park,Coffee Shop,Cocktail Bar,Bakery,Beer Bar,Seafood Restaurant,Restaurant,Farmers Market,Café,Cheese Shop,Comfort Food Restaurant
2,"Brockton,Exhibition Place,Parkdale Village",Café,Performing Arts Venue,Nightclub,Coffee Shop,Breakfast Spot,Bakery,Pet Store,Climbing Gym,Restaurant,Burrito Place
3,Business Reply Mail Processing Centre 969 Eastern,Yoga Studio,Auto Workshop,Skate Park,Brewery,Spa,Burrito Place,Farmers Market,Fast Food Restaurant,Restaurant,Light Rail Station
4,"CN Tower,Bathurst Quay,Island airport,Harbourf...",Airport Lounge,Airport Service,Boutique,Harbor / Marina,Boat or Ferry,Bar,Coffee Shop,Plane,Sculpture Garden,Airport Terminal


## Cluster Neighborhoods

In [22]:
# set number of clusters
kclusters = 5

toronto_grouped_clustering = toronto_grouped.drop('Neighborhood', 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(toronto_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10]

array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1], dtype=int32)

In [23]:
# add clustering labels
neighborhoods_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

toronto_merged = dfT

# merge toronto_grouped with toronto_data to add latitude/longitude for each neighborhood
toronto_merged = toronto_merged.join(neighborhoods_venues_sorted.set_index('Neighborhood'), on='Neighborhood')

toronto_merged.head() # check the last columns!

Unnamed: 0,Borough,Postal Code,Neighborhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Central Toronto,M5R,"The Annex,North Midtown,Yorkville",43.67271,-79.405678,1,Sandwich Place,Café,Coffee Shop,Donut Shop,Flower Shop,Pub,Middle Eastern Restaurant,BBQ Joint,History Museum,Indian Restaurant
1,Central Toronto,M5P,"Forest Hill North,Forest Hill West",43.696948,-79.411307,4,Jewelry Store,Trail,Sushi Restaurant,Mexican Restaurant,Yoga Studio,Dim Sum Restaurant,Empanada Restaurant,Electronics Store,Eastern European Restaurant,Dumpling Restaurant
2,Central Toronto,M5N,Roselawn,43.711695,-79.416936,2,Pool,Garden,Yoga Studio,Department Store,Empanada Restaurant,Electronics Store,Eastern European Restaurant,Dumpling Restaurant,Donut Shop,Doner Restaurant
3,Central Toronto,M4V,"Deer Park,Forest Hill SE,Rathnelly,South Hill,...",43.686412,-79.400049,1,Pub,Coffee Shop,Supermarket,Restaurant,Light Rail Station,Vietnamese Restaurant,Liquor Store,Pizza Place,American Restaurant,Sushi Restaurant
4,Central Toronto,M4T,"Moore Park,Summerhill East",43.689574,-79.38316,3,Park,Playground,Restaurant,Tennis Court,Comic Shop,Department Store,Electronics Store,Eastern European Restaurant,Dumpling Restaurant,Donut Shop


In [24]:
#Visualize the clusters

# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=12.1)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(toronto_merged['Latitude'], toronto_merged['Longitude'], toronto_merged['Neighborhood'], toronto_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters

## Examine Clusters

### Cluster 1

In [25]:
toronto_merged.loc[toronto_merged['Cluster Labels'] == 0, toronto_merged.columns[[1] + list(range(5, toronto_merged.shape[1]))]]

Unnamed: 0,Postal Code,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
32,M4E,0,Trail,Pub,Health Food Store,Yoga Studio,Dim Sum Restaurant,Diner,Discount Store,Distribution Center,Dog Run,Doner Restaurant


### Cluster 2

In [26]:
toronto_merged.loc[toronto_merged['Cluster Labels'] == 1, toronto_merged.columns[[1] + list(range(5, toronto_merged.shape[1]))]]

Unnamed: 0,Postal Code,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,M5R,1,Sandwich Place,Café,Coffee Shop,Donut Shop,Flower Shop,Pub,Middle Eastern Restaurant,BBQ Joint,History Museum,Indian Restaurant
3,M4V,1,Pub,Coffee Shop,Supermarket,Restaurant,Light Rail Station,Vietnamese Restaurant,Liquor Store,Pizza Place,American Restaurant,Sushi Restaurant
5,M4S,1,Pizza Place,Sandwich Place,Dessert Shop,Gym,Italian Restaurant,Café,Sushi Restaurant,Coffee Shop,Greek Restaurant,Seafood Restaurant
6,M4R,1,Coffee Shop,Clothing Store,Seafood Restaurant,Salon / Barbershop,Restaurant,Rental Car Location,Café,Chinese Restaurant,Park,Sporting Goods Shop
7,M4P,1,Park,Hotel,Breakfast Spot,Sandwich Place,Dog Run,Food & Drink Shop,Department Store,Gym,Costume Shop,Coworking Space
8,M4N,1,Dim Sum Restaurant,Park,Swim School,Bus Line,Yoga Studio,Empanada Restaurant,Electronics Store,Eastern European Restaurant,Dumpling Restaurant,Donut Shop
9,M5H,1,Coffee Shop,Restaurant,Thai Restaurant,Café,Bar,Steakhouse,Sushi Restaurant,Concert Hall,Seafood Restaurant,Bakery
10,M5G,1,Coffee Shop,Italian Restaurant,Sandwich Place,Juice Bar,Burger Joint,Japanese Restaurant,Ice Cream Shop,Department Store,Dessert Shop,Thai Restaurant
11,M5B,1,Coffee Shop,Clothing Store,Middle Eastern Restaurant,Café,Japanese Restaurant,Burger Joint,Electronics Store,Bubble Tea Shop,Plaza,Cosmetics Shop
12,M5K,1,Coffee Shop,Café,Restaurant,Hotel,Italian Restaurant,Gastropub,Bar,American Restaurant,Japanese Restaurant,Seafood Restaurant


### Cluster 3

In [27]:
toronto_merged.loc[toronto_merged['Cluster Labels'] == 2, toronto_merged.columns[[1] + list(range(5, toronto_merged.shape[1]))]]

Unnamed: 0,Postal Code,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
2,M5N,2,Pool,Garden,Yoga Studio,Department Store,Empanada Restaurant,Electronics Store,Eastern European Restaurant,Dumpling Restaurant,Donut Shop,Doner Restaurant


### Cluster 4

In [28]:
toronto_merged.loc[toronto_merged['Cluster Labels'] == 3, toronto_merged.columns[[1] + list(range(5, toronto_merged.shape[1]))]]

Unnamed: 0,Postal Code,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
4,M4T,3,Park,Playground,Restaurant,Tennis Court,Comic Shop,Department Store,Electronics Store,Eastern European Restaurant,Dumpling Restaurant,Donut Shop
25,M4W,3,Park,Playground,Trail,Yoga Studio,Department Store,Electronics Store,Eastern European Restaurant,Dumpling Restaurant,Donut Shop,Doner Restaurant


### Cluster 5

In [29]:
toronto_merged.loc[toronto_merged['Cluster Labels'] == 4, toronto_merged.columns[[1] + list(range(5, toronto_merged.shape[1]))]]

Unnamed: 0,Postal Code,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
1,M5P,4,Jewelry Store,Trail,Sushi Restaurant,Mexican Restaurant,Yoga Studio,Dim Sum Restaurant,Empanada Restaurant,Electronics Store,Eastern European Restaurant,Dumpling Restaurant
