## Final assignment project - clustering North York

In [1]:
# @hidden_cell
# The project token is an authorization token that is used to access project resources like data sources, connections, and used by platform APIs.

In [2]:
#import libraries
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
pd.set_option("display.max_columns", None)
pd.set_option("display.max_rows", None)

import json # library to handle JSON files
from pandas.io.json import json_normalize

import geopy
from geopy.geocoders import Nominatim

import requests
from bs4 import BeautifulSoup

import matplotlib.cm as cm
import matplotlib.colors as colors

!conda install -c conda-forge folium=0.5.0 --yes
import folium

Solving environment: done

# All requested packages already installed.



In [3]:
#Get request and parse data
data= requests.get('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M').text
soup= BeautifulSoup(data, 'html.parser')

PostalCodeList= []
BoroughList= []
NeighborhoodList= []

for row in soup.find('table').find_all('tr'):
    cells= row.find_all('td')
    if(len(cells) > 0):
        PostalCodeList.append(cells[0].text)
        BoroughList.append(cells[1].text)
        NeighborhoodList.append(cells[2].text.rstrip('\n'))
        
tor_df= pd.DataFrame({"PostalCode": PostalCodeList,
                           "Borough": BoroughList,
                           "Neighborhood": NeighborhoodList})

tor_df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront


In [4]:
# clean data and group Neighborhoods
tor_df_dropna= tor_df[tor_df.Borough != "Not assigned"].reset_index(drop= True)
tor_df_grouped= tor_df_dropna.groupby(["PostalCode", "Borough"], as_index= False).agg(lambda x: ", ".join(x))

for index, row in tor_df_grouped.iterrows():
    if row["Neighborhood"] == "Not assigned":
        row["Neighborhood"] = row["Borough"]

print(tor_df_grouped.shape)
tor_df_grouped.head(3)

(103, 3)


Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"


In [5]:
#get postal coodes with geo coordinates linked
coordinates= pd.read_csv('https://cocl.us/Geospatial_data')
coordinates.rename(columns= {'Postal Code': 'PostalCode'}, inplace= True)
coordinates.head()

Unnamed: 0,PostalCode,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [6]:
# merge tables by PostalCode
tor_df_new= tor_df_grouped.merge(coordinates, on= "PostalCode", how= "left")
tor_df_new.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge, Malvern",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476


In [7]:
#map of all Toronto's postal codes
address= 'Toronto'

geolocator= Nominatim(user_agent= "my-application")
location= geolocator.geocode(address)
latitude= location.latitude
longitude= location.longitude

map_tor = folium.Map(location= [latitude, longitude], zoom_start= 10)

#with markers
iter= 1
for lat, lng, borough, neighborhood in zip(tor_df_new['Latitude'], tor_df_new['Longitude'], tor_df_new['Borough'], tor_df_new['Neighborhood']):
    label= '{}, {}'.format(neighborhood, borough)
    label= folium.Popup(label, parse_html= True)
    folium.Marker(
        [lat, lng],
        popup= label).add_to(map_tor) 
       
map_tor

In [8]:
# filter York boroughs in the City of Toronto
borough_names = list(tor_df_new.Borough.unique())

borough_york = []

for x in borough_names:
    if "york" in x.lower():
        borough_york.append(x)
        
borough_york

['North York', 'East York', 'York']

In [9]:
# Exploe neighborhood with Foursquare API
CLIENT_ID = 'QZYQ0D1RVTROGQMQWME0QZS2Z3VRKEF0CKT0ENNLCYJQX3EC'
CLIENT_SECRET = '4SIARAYL1IKFO2PDCALVF1RAMNCYHOIWPNQP0MLPTYR025S2'
VERSION = '20180604'

radius = 500
LIMIT = 100

venues = []

for lat, long, post, borough, neighborhood in zip(tor_df_new['Latitude'], tor_df_new['Longitude'], tor_df_new['PostalCode'], tor_df_new['Borough'], tor_df_new['Neighborhood']):
    url = "https://api.foursquare.com/v2/venues/explore?client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}".format(
        CLIENT_ID,
        CLIENT_SECRET,
        VERSION,
        lat,
        long,
        radius, 
        LIMIT)
    print(url)
    results = requests.get(url).json()["response"]['groups'][0]['items']
    
    for venue in results:
        venues.append((
            post, 
            borough,
            neighborhood,
            lat, 
            long, 
            venue['venue']['name'], 
            venue['venue']['location']['lat'], 
            venue['venue']['location']['lng'],  
            venue['venue']['categories'][0]['name']))

https://api.foursquare.com/v2/venues/explore?client_id=QZYQ0D1RVTROGQMQWME0QZS2Z3VRKEF0CKT0ENNLCYJQX3EC&client_secret=4SIARAYL1IKFO2PDCALVF1RAMNCYHOIWPNQP0MLPTYR025S2&v=20180604&ll=43.806686299999996,-79.19435340000001&radius=500&limit=100
https://api.foursquare.com/v2/venues/explore?client_id=QZYQ0D1RVTROGQMQWME0QZS2Z3VRKEF0CKT0ENNLCYJQX3EC&client_secret=4SIARAYL1IKFO2PDCALVF1RAMNCYHOIWPNQP0MLPTYR025S2&v=20180604&ll=43.7845351,-79.16049709999999&radius=500&limit=100
https://api.foursquare.com/v2/venues/explore?client_id=QZYQ0D1RVTROGQMQWME0QZS2Z3VRKEF0CKT0ENNLCYJQX3EC&client_secret=4SIARAYL1IKFO2PDCALVF1RAMNCYHOIWPNQP0MLPTYR025S2&v=20180604&ll=43.7635726,-79.1887115&radius=500&limit=100
https://api.foursquare.com/v2/venues/explore?client_id=QZYQ0D1RVTROGQMQWME0QZS2Z3VRKEF0CKT0ENNLCYJQX3EC&client_secret=4SIARAYL1IKFO2PDCALVF1RAMNCYHOIWPNQP0MLPTYR025S2&v=20180604&ll=43.7709921,-79.21691740000001&radius=500&limit=100
https://api.foursquare.com/v2/venues/explore?client_id=QZYQ0D1RVTROGQMQ

In [10]:
#convert venues into a dataframe
venues_df= pd.DataFrame(venues)
venues_df.columns= ['PostalCode', 'Borough', 'Neighborhood', 'Lat', 'Lon', 'VenueName', 'VenueLat', 'VenueLon', 'VenueCategory']

print(venues_df.shape)
venues_df.head()

(2221, 9)


Unnamed: 0,PostalCode,Borough,Neighborhood,Lat,Lon,VenueName,VenueLat,VenueLon,VenueCategory
0,M1B,Scarborough,"Rouge, Malvern",43.806686,-79.194353,Wendy's,43.807448,-79.199056,Fast Food Restaurant
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union",43.784535,-79.160497,Royal Canadian Legion,43.782533,-79.163085,Bar
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711,Swiss Chalet Rotisserie & Grill,43.767697,-79.189914,Pizza Place
3,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711,G & G Electronics,43.765309,-79.191537,Electronics Store
4,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711,Big Bite Burrito,43.766299,-79.19072,Mexican Restaurant


In [11]:
#filter out North York locations
NY_df= venues_df.loc[venues_df['Borough'] == "North York"]

print(NY_df.shape)
NY_df.head()

(243, 9)


Unnamed: 0,PostalCode,Borough,Neighborhood,Lat,Lon,VenueName,VenueLat,VenueLon,VenueCategory
84,M2H,North York,Hillcrest Village,43.803762,-79.363452,Eagle's Nest Golf Club,43.805455,-79.364186,Golf Course
85,M2H,North York,Hillcrest Village,43.803762,-79.363452,New York Fries,43.803664,-79.363905,Fast Food Restaurant
86,M2H,North York,Hillcrest Village,43.803762,-79.363452,AY Jackson Pool,43.804515,-79.366138,Pool
87,M2H,North York,Hillcrest Village,43.803762,-79.363452,Villa Madina,43.801685,-79.363938,Mediterranean Restaurant
88,M2H,North York,Hillcrest Village,43.803762,-79.363452,Duncan Creek Park,43.805539,-79.360695,Dog Run


In [12]:
#map of North York
address= 'North York'

geolocator= Nominatim(user_agent= "my-application")
location= geolocator.geocode(address)
latitude= location.latitude
longitude= location.longitude

map_NY = folium.Map(location= [latitude, longitude], zoom_start= 12)

#with markers
iter= 1
for lat, lng, borough, neighborhood in zip(NY_df['Lat'], NY_df['Lon'], NY_df['Borough'], NY_df['Neighborhood']):
    label= '{}, {}'.format(neighborhood, borough)
    label= folium.Popup(label, parse_html= True)
    folium.Marker(
        [lat, lng],
        popup= label).add_to(map_NY) 
       
map_NY

In [13]:
NY_df.groupby(["PostalCode", "Borough", "Neighborhood"]).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Lat,Lon,VenueName,VenueLat,VenueLon,VenueCategory
PostalCode,Borough,Neighborhood,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
M2H,North York,Hillcrest Village,5,5,5,5,5,5
M2J,North York,"Fairview, Henry Farm, Oriole",67,67,67,67,67,67
M2K,North York,Bayview Village,4,4,4,4,4,4
M2L,North York,"Silver Hills, York Mills",1,1,1,1,1,1
M2N,North York,Willowdale South,35,35,35,35,35,35
M2P,North York,York Mills West,4,4,4,4,4,4
M2R,North York,Willowdale West,6,6,6,6,6,6
M3A,North York,Parkwoods,3,3,3,3,3,3
M3B,North York,Don Mills North,5,5,5,5,5,5
M3C,North York,"Flemingdon Park, Don Mills South",22,22,22,22,22,22


In [14]:
NY_df['VenueCategory'].unique()[:100]

array(['Golf Course', 'Fast Food Restaurant', 'Pool',
       'Mediterranean Restaurant', 'Dog Run', 'Toy / Game Store',
       'Burger Joint', 'Movie Theater', 'Shopping Mall', 'Bakery',
       'Candy Store', 'Tea Room', 'Electronics Store',
       'American Restaurant', 'Pharmacy', 'Clothing Store',
       'Department Store', 'Coffee Shop', 'Salon / Barbershop',
       'Smoothie Shop', 'Theater', 'Bank', 'Food Court',
       'Japanese Restaurant', 'Juice Bar', 'Liquor Store', 'Restaurant',
       'Cosmetics Shop', 'Video Game Store', 'Wings Joint',
       'Sporting Goods Shop', 'Asian Restaurant', 'Burrito Place',
       "Women's Store", 'Deli / Bodega', 'Gift Shop', 'Shoe Store',
       'Boutique', 'Luggage Store', 'Home Service', 'Chinese Restaurant',
       'Sandwich Place', 'Dessert Shop', 'Spa', 'Bus Station',
       'Bookstore', 'Baseball Field', 'Café', 'Cafeteria',
       'Grocery Store', 'Ramen Restaurant', 'Steakhouse',
       'Indonesian Restaurant', 'Plaza', 'Arts & Crafts

In [15]:
#number of categories
print('{} venue categories'.format(len(NY_df['VenueCategory'].unique())))

103 venue categories


In [16]:
#North York array by category
NY= pd.get_dummies(NY_df[['VenueCategory']], prefix="", prefix_sep="")

NY['PostalCode']= NY_df['PostalCode'] 
NY['Borough']= NY_df['Borough'] 
NY['Neighborhoods']= NY_df['Neighborhood'] 

fixed_columns = list(NY.columns[-3:]) + list(NY.columns[:-3])
NY= NY[fixed_columns]

print(NY.shape)
NY.head()

(243, 106)


Unnamed: 0,PostalCode,Borough,Neighborhoods,Accessories Store,Airport,American Restaurant,Arts & Crafts Store,Asian Restaurant,Bakery,Bank,Bar,Baseball Field,Beer Store,Bike Shop,Bookstore,Boutique,Bridal Shop,Bubble Tea Shop,Burger Joint,Burrito Place,Bus Station,Butcher,Cafeteria,Café,Candy Store,Caribbean Restaurant,Chinese Restaurant,Clothing Store,Coffee Shop,Comfort Food Restaurant,Concert Hall,Construction & Landscaping,Convenience Store,Cosmetics Shop,Deli / Bodega,Department Store,Dessert Shop,Dim Sum Restaurant,Diner,Discount Store,Dog Run,Electronics Store,Empanada Restaurant,Falafel Restaurant,Fast Food Restaurant,Financial or Legal Service,Food & Drink Shop,Food Court,Food Truck,Fried Chicken Joint,Frozen Yogurt Shop,Furniture / Home Store,Gas Station,Gift Shop,Golf Course,Greek Restaurant,Grocery Store,Gym,Gym / Fitness Center,Hockey Arena,Home Service,Hotel,Ice Cream Shop,Indian Restaurant,Indonesian Restaurant,Intersection,Italian Restaurant,Japanese Restaurant,Juice Bar,Liquor Store,Lounge,Luggage Store,Massage Studio,Mediterranean Restaurant,Middle Eastern Restaurant,Miscellaneous Shop,Movie Theater,Park,Pet Store,Pharmacy,Pizza Place,Plaza,Pool,Portuguese Restaurant,Pub,Ramen Restaurant,Restaurant,Salon / Barbershop,Sandwich Place,Shoe Store,Shopping Mall,Smoothie Shop,Spa,Sporting Goods Shop,Steakhouse,Supermarket,Sushi Restaurant,Tea Room,Thai Restaurant,Theater,Toy / Game Store,Video Game Store,Video Store,Vietnamese Restaurant,Wings Joint,Women's Store
84,M2H,North York,Hillcrest Village,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
85,M2H,North York,Hillcrest Village,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
86,M2H,North York,Hillcrest Village,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
87,M2H,North York,Hillcrest Village,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
88,M2H,North York,Hillcrest Village,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [17]:
#groupping by Neighborhood
NY_grouped= NY.groupby(["PostalCode", "Borough", "Neighborhoods"]).mean().reset_index()

print(NY_grouped.shape)
NY_grouped.head()

(23, 106)


Unnamed: 0,PostalCode,Borough,Neighborhoods,Accessories Store,Airport,American Restaurant,Arts & Crafts Store,Asian Restaurant,Bakery,Bank,Bar,Baseball Field,Beer Store,Bike Shop,Bookstore,Boutique,Bridal Shop,Bubble Tea Shop,Burger Joint,Burrito Place,Bus Station,Butcher,Cafeteria,Café,Candy Store,Caribbean Restaurant,Chinese Restaurant,Clothing Store,Coffee Shop,Comfort Food Restaurant,Concert Hall,Construction & Landscaping,Convenience Store,Cosmetics Shop,Deli / Bodega,Department Store,Dessert Shop,Dim Sum Restaurant,Diner,Discount Store,Dog Run,Electronics Store,Empanada Restaurant,Falafel Restaurant,Fast Food Restaurant,Financial or Legal Service,Food & Drink Shop,Food Court,Food Truck,Fried Chicken Joint,Frozen Yogurt Shop,Furniture / Home Store,Gas Station,Gift Shop,Golf Course,Greek Restaurant,Grocery Store,Gym,Gym / Fitness Center,Hockey Arena,Home Service,Hotel,Ice Cream Shop,Indian Restaurant,Indonesian Restaurant,Intersection,Italian Restaurant,Japanese Restaurant,Juice Bar,Liquor Store,Lounge,Luggage Store,Massage Studio,Mediterranean Restaurant,Middle Eastern Restaurant,Miscellaneous Shop,Movie Theater,Park,Pet Store,Pharmacy,Pizza Place,Plaza,Pool,Portuguese Restaurant,Pub,Ramen Restaurant,Restaurant,Salon / Barbershop,Sandwich Place,Shoe Store,Shopping Mall,Smoothie Shop,Spa,Sporting Goods Shop,Steakhouse,Supermarket,Sushi Restaurant,Tea Room,Thai Restaurant,Theater,Toy / Game Store,Video Game Store,Video Store,Vietnamese Restaurant,Wings Joint,Women's Store
0,M2H,North York,Hillcrest Village,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.2,0.0,0.0,0.0,0.2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,M2J,North York,"Fairview, Henry Farm, Oriole",0.0,0.0,0.014925,0.0,0.014925,0.029851,0.014925,0.0,0.014925,0.0,0.0,0.014925,0.014925,0.0,0.0,0.014925,0.014925,0.029851,0.0,0.0,0.0,0.014925,0.0,0.014925,0.134328,0.074627,0.0,0.0,0.0,0.0,0.014925,0.014925,0.014925,0.014925,0.0,0.0,0.0,0.0,0.014925,0.0,0.0,0.074627,0.0,0.0,0.029851,0.0,0.0,0.0,0.0,0.0,0.014925,0.0,0.0,0.0,0.0,0.0,0.0,0.014925,0.0,0.0,0.0,0.0,0.0,0.0,0.044776,0.014925,0.014925,0.0,0.014925,0.0,0.0,0.0,0.0,0.014925,0.0,0.0,0.014925,0.0,0.0,0.0,0.0,0.0,0.0,0.014925,0.014925,0.014925,0.014925,0.014925,0.014925,0.014925,0.014925,0.0,0.0,0.0,0.029851,0.0,0.014925,0.029851,0.014925,0.0,0.0,0.014925,0.029851
2,M2K,North York,Bayview Village,0.0,0.0,0.0,0.0,0.0,0.0,0.25,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.25,0.0,0.0,0.25,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.25,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,M2L,North York,"Silver Hills, York Mills",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,M2N,North York,Willowdale South,0.0,0.0,0.0,0.028571,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.028571,0.0,0.0,0.0,0.0,0.0,0.057143,0.0,0.0,0.0,0.0,0.085714,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.028571,0.0,0.028571,0.0,0.0,0.028571,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.028571,0.0,0.0,0.0,0.0,0.028571,0.028571,0.0,0.028571,0.0,0.0,0.028571,0.028571,0.0,0.028571,0.0,0.0,0.0,0.028571,0.0,0.028571,0.0,0.028571,0.0,0.057143,0.028571,0.0,0.0,0.0,0.085714,0.028571,0.0,0.057143,0.0,0.028571,0.0,0.0,0.0,0.028571,0.0,0.085714,0.0,0.0,0.0,0.0,0.0,0.0,0.028571,0.0,0.0


In [18]:
#top 10 venues by postal code
num_top_venues= 10
indicators= ['st', 'nd', 'rd']

areaColumns= ['PostalCode', 'Borough', 'Neighborhoods']
freqColumns= []
for ind in np.arange(num_top_venues):
    try:
        freqColumns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        freqColumns.append('{}th Most Common Venue'.format(ind+1))
columns= areaColumns+freqColumns

neighborhood_venue= pd.DataFrame(columns=columns)
neighborhood_venue['PostalCode']= NY_grouped['PostalCode']
neighborhood_venue['Borough']= NY_grouped['Borough']
neighborhood_venue['Neighborhoods']= NY_grouped['Neighborhoods']

for ind in np.arange(NY_grouped.shape[0]):
    row_categories= NY_grouped.iloc[ind, :].iloc[3:]
    row_categories_sorted= row_categories.sort_values(ascending= False)
    neighborhood_venue.iloc[ind, 3:]= row_categories_sorted.index.values[0:num_top_venues]

print(neighborhood_venue.shape)
neighborhood_venue.head()

(23, 13)


Unnamed: 0,PostalCode,Borough,Neighborhoods,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,M2H,North York,Hillcrest Village,Golf Course,Pool,Mediterranean Restaurant,Fast Food Restaurant,Dog Run,Diner,Concert Hall,Construction & Landscaping,Convenience Store,Cosmetics Shop
1,M2J,North York,"Fairview, Henry Farm, Oriole",Clothing Store,Fast Food Restaurant,Coffee Shop,Japanese Restaurant,Women's Store,Food Court,Toy / Game Store,Bus Station,Tea Room,Bakery
2,M2K,North York,Bayview Village,Chinese Restaurant,Café,Bank,Japanese Restaurant,Women's Store,Dog Run,Convenience Store,Cosmetics Shop,Deli / Bodega,Department Store
3,M2L,North York,"Silver Hills, York Mills",Cafeteria,Women's Store,Coffee Shop,Concert Hall,Construction & Landscaping,Convenience Store,Cosmetics Shop,Deli / Bodega,Department Store,Dessert Shop
4,M2N,North York,Willowdale South,Ramen Restaurant,Coffee Shop,Sushi Restaurant,Café,Pizza Place,Sandwich Place,Juice Bar,Japanese Restaurant,Lounge,Ice Cream Shop


In [19]:
#cluster the Area
kclusters= 5

NY_clustering= NY_grouped.drop(["PostalCode", "Borough", "Neighborhoods"], 1)
kmeans= KMeans(n_clusters= kclusters, random_state= 0).fit(NY_clustering)

kmeans.labels_[0:24] 

array([0, 4, 4, 2, 4, 4, 4, 3, 4, 4, 4, 4, 3, 4, 4, 4, 4, 4, 4, 4, 3, 4,
       1], dtype=int32)

In [20]:
#new dataframe with clusters and top 5 venues
NY_new= neighborhood_venue.copy()

NY_new['Cluster Labels']= kmeans.labels_

print(NY_new.shape)
NY_new.head(3)

(23, 14)


Unnamed: 0,PostalCode,Borough,Neighborhoods,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue,Cluster Labels
0,M2H,North York,Hillcrest Village,Golf Course,Pool,Mediterranean Restaurant,Fast Food Restaurant,Dog Run,Diner,Concert Hall,Construction & Landscaping,Convenience Store,Cosmetics Shop,0
1,M2J,North York,"Fairview, Henry Farm, Oriole",Clothing Store,Fast Food Restaurant,Coffee Shop,Japanese Restaurant,Women's Store,Food Court,Toy / Game Store,Bus Station,Tea Room,Bakery,4
2,M2K,North York,Bayview Village,Chinese Restaurant,Café,Bank,Japanese Restaurant,Women's Store,Dog Run,Convenience Store,Cosmetics Shop,Deli / Bodega,Department Store,4


In [21]:
#clustered area with coordinates
NY_new= NY_new.merge(coordinates, on= "PostalCode", how= "left")
NY_new.head()

Unnamed: 0,PostalCode,Borough,Neighborhoods,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue,Cluster Labels,Latitude,Longitude
0,M2H,North York,Hillcrest Village,Golf Course,Pool,Mediterranean Restaurant,Fast Food Restaurant,Dog Run,Diner,Concert Hall,Construction & Landscaping,Convenience Store,Cosmetics Shop,0,43.803762,-79.363452
1,M2J,North York,"Fairview, Henry Farm, Oriole",Clothing Store,Fast Food Restaurant,Coffee Shop,Japanese Restaurant,Women's Store,Food Court,Toy / Game Store,Bus Station,Tea Room,Bakery,4,43.778517,-79.346556
2,M2K,North York,Bayview Village,Chinese Restaurant,Café,Bank,Japanese Restaurant,Women's Store,Dog Run,Convenience Store,Cosmetics Shop,Deli / Bodega,Department Store,4,43.786947,-79.385975
3,M2L,North York,"Silver Hills, York Mills",Cafeteria,Women's Store,Coffee Shop,Concert Hall,Construction & Landscaping,Convenience Store,Cosmetics Shop,Deli / Bodega,Department Store,Dessert Shop,2,43.75749,-79.374714
4,M2N,North York,Willowdale South,Ramen Restaurant,Coffee Shop,Sushi Restaurant,Café,Pizza Place,Sandwich Place,Juice Bar,Japanese Restaurant,Lounge,Ice Cream Shop,4,43.77012,-79.408493


In [22]:
#sort by clusters
print(NY_new.shape)
NY_new.sort_values(["Cluster Labels"], inplace= True)
NY_new.head(3)

(23, 16)


Unnamed: 0,PostalCode,Borough,Neighborhoods,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue,Cluster Labels,Latitude,Longitude
0,M2H,North York,Hillcrest Village,Golf Course,Pool,Mediterranean Restaurant,Fast Food Restaurant,Dog Run,Diner,Concert Hall,Construction & Landscaping,Convenience Store,Cosmetics Shop,0,43.803762,-79.363452
22,M9M,North York,"Emery, Humberlea",Furniture / Home Store,Baseball Field,Women's Store,Dog Run,Concert Hall,Construction & Landscaping,Convenience Store,Cosmetics Shop,Deli / Bodega,Department Store,1,43.724766,-79.532242
3,M2L,North York,"Silver Hills, York Mills",Cafeteria,Women's Store,Coffee Shop,Concert Hall,Construction & Landscaping,Convenience Store,Cosmetics Shop,Deli / Bodega,Department Store,Dessert Shop,2,43.75749,-79.374714


In [23]:
#visualize area with Clusters
NYmap_clusters= folium.Map(location= [latitude, longitude], zoom_start= 11)

#color scheme
x= np.arange(kclusters)
ys= [i+x+(i*x)**2 for i in range(kclusters)]
colors_array= cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow= [colors.rgb2hex(i) for i in colors_array]

#add markers
markers_colors= []
for lat, lon, post, bor, nei, cluster in zip(NY_new['Latitude'], NY_new['Longitude'], NY_new['PostalCode'], NY_new['Borough'], NY_new['Neighborhoods'], NY_new['Cluster Labels']):
    label= folium.Popup('{} ({}): {} - Cluster {}'.format(bor, post, nei, cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius= 5,
        popup= label,
        color= rainbow[cluster-1],
        fill= True,
        fill_color= rainbow[cluster-1],
        fill_opacity= 0.7).add_to(NYmap_clusters)
       
NYmap_clusters

In [24]:
#examine Cluster 1
NY_new.loc[NY_new['Cluster Labels'] == 0, NY_new.columns[[0] + list(range(2, NY_new.shape[1]))]]

Unnamed: 0,PostalCode,Neighborhoods,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue,Cluster Labels,Latitude,Longitude
0,M2H,Hillcrest Village,Golf Course,Pool,Mediterranean Restaurant,Fast Food Restaurant,Dog Run,Diner,Concert Hall,Construction & Landscaping,Convenience Store,Cosmetics Shop,0,43.803762,-79.363452


In [25]:
#Cluster 2
NY_new.loc[NY_new['Cluster Labels'] == 1, NY_new.columns[[0] + list(range(2, NY_new.shape[1]))]]

Unnamed: 0,PostalCode,Neighborhoods,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue,Cluster Labels,Latitude,Longitude
22,M9M,"Emery, Humberlea",Furniture / Home Store,Baseball Field,Women's Store,Dog Run,Concert Hall,Construction & Landscaping,Convenience Store,Cosmetics Shop,Deli / Bodega,Department Store,1,43.724766,-79.532242


In [26]:
#Cluster 3
NY_new.loc[NY_new['Cluster Labels'] == 2, NY_new.columns[[0] + list(range(2, NY_new.shape[1]))]]

Unnamed: 0,PostalCode,Neighborhoods,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue,Cluster Labels,Latitude,Longitude
3,M2L,"Silver Hills, York Mills",Cafeteria,Women's Store,Coffee Shop,Concert Hall,Construction & Landscaping,Convenience Store,Cosmetics Shop,Deli / Bodega,Department Store,Dessert Shop,2,43.75749,-79.374714


In [27]:
#Cluster 4
NY_new.loc[NY_new['Cluster Labels'] == 3, NY_new.columns[[0] + list(range(2, NY_new.shape[1]))]]

Unnamed: 0,PostalCode,Neighborhoods,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue,Cluster Labels,Latitude,Longitude
12,M3K,"CFB Toronto, Downsview East",Park,Airport,Dog Run,Concert Hall,Construction & Landscaping,Convenience Store,Cosmetics Shop,Deli / Bodega,Department Store,Dessert Shop,3,43.737473,-79.464763
20,M6L,"Downsview, North Park, Upwood Park",Park,Construction & Landscaping,Bakery,Dog Run,Concert Hall,Convenience Store,Cosmetics Shop,Deli / Bodega,Department Store,Dessert Shop,3,43.713756,-79.490074
7,M3A,Parkwoods,Park,Construction & Landscaping,Food & Drink Shop,Women's Store,Discount Store,Concert Hall,Convenience Store,Cosmetics Shop,Deli / Bodega,Department Store,3,43.753259,-79.329656


In [28]:
#Cluster 5
NY_new.loc[NY_new['Cluster Labels'] == 4, NY_new.columns[[0] + list(range(2, NY_new.shape[1]))]]

Unnamed: 0,PostalCode,Neighborhoods,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue,Cluster Labels,Latitude,Longitude
19,M6B,Glencairn,Pizza Place,Bakery,Pub,Japanese Restaurant,Women's Store,Diner,Concert Hall,Construction & Landscaping,Convenience Store,Cosmetics Shop,4,43.709577,-79.445073
18,M6A,"Lawrence Heights, Lawrence Manor",Clothing Store,Accessories Store,Sporting Goods Shop,Miscellaneous Shop,Coffee Shop,Furniture / Home Store,Boutique,Gift Shop,Vietnamese Restaurant,Concert Hall,4,43.718518,-79.464763
17,M5M,"Bedford Park, Lawrence Manor East",Coffee Shop,Fast Food Restaurant,Italian Restaurant,Sandwich Place,Restaurant,Liquor Store,Juice Bar,Café,Pharmacy,Pizza Place,4,43.733283,-79.41975
16,M4A,Victoria Village,Intersection,Pizza Place,Portuguese Restaurant,Financial or Legal Service,Hockey Arena,Coffee Shop,Diner,Concert Hall,Construction & Landscaping,Convenience Store,4,43.725882,-79.315572
15,M3N,Downsview Northwest,Grocery Store,Gym,Gym / Fitness Center,Discount Store,Women's Store,Dog Run,Construction & Landscaping,Convenience Store,Cosmetics Shop,Deli / Bodega,4,43.761631,-79.520999
14,M3M,Downsview Central,Home Service,Food Truck,Baseball Field,Women's Store,Dog Run,Construction & Landscaping,Convenience Store,Cosmetics Shop,Deli / Bodega,Department Store,4,43.728496,-79.495697
13,M3L,Downsview West,Grocery Store,Bank,Hotel,Park,Shopping Mall,Women's Store,Discount Store,Concert Hall,Construction & Landscaping,Convenience Store,4,43.739015,-79.506944
2,M2K,Bayview Village,Chinese Restaurant,Café,Bank,Japanese Restaurant,Women's Store,Dog Run,Convenience Store,Cosmetics Shop,Deli / Bodega,Department Store,4,43.786947,-79.385975
21,M9L,Humber Summit,Pharmacy,Pizza Place,Empanada Restaurant,Shopping Mall,Diner,Comfort Food Restaurant,Concert Hall,Construction & Landscaping,Convenience Store,Cosmetics Shop,4,43.756303,-79.565963
10,M3H,"Bathurst Manor, Downsview North, Wilson Heights",Coffee Shop,Supermarket,Middle Eastern Restaurant,Pharmacy,Pizza Place,Deli / Bodega,Bridal Shop,Restaurant,Diner,Shopping Mall,4,43.754328,-79.442259


### Observations
###### The most developed neighborhood is located in Cluster 5 which has lots of restaurans and cafees as well as small shops and department or grocery stores.
###### There are different sport facilities available - gyms, Baseball Field and a Hockey Arena. Also the area has a Concert Hall and a number of parks.

In [29]:
#Cluster 5 with geo coordinates
NY_2= NY_new.loc[NY_new['Cluster Labels'] == 4, NY_new.columns[[0] + list(range(2, NY_new.shape[1]))]]
NY_2= NY_2.filter(["PostalCode", "Neighborhoods", "Cluster Labels"], axis= 1)

print(NY_2.shape)
NY_2.head()

(17, 3)


Unnamed: 0,PostalCode,Neighborhoods,Cluster Labels
19,M6B,Glencairn,4
18,M6A,"Lawrence Heights, Lawrence Manor",4
17,M5M,"Bedford Park, Lawrence Manor East",4
16,M4A,Victoria Village,4
15,M3N,Downsview Northwest,4


In [30]:
#parse school data
data= requests.get('https://www.tcdsb.org/school/Alphabeticalschooldirectory/Pages/default.aspx').text
soup1= BeautifulSoup(data, 'html.parser')

SchoolNameList= []
PanelList= []
AddressList= []

for row in soup1.find('table').find_all('tr'):
    cells= row.find_all('td')
    if(len(cells) > 0):
        SchoolNameList.append(cells[1].text)
        PanelList.append(cells[2].text)
        AddressList.append(cells[3].text.rstrip('\n'))
        
tor_school= pd.DataFrame({"SchoolName": SchoolNameList,
                          "Panel": PanelList,
                          "Address": AddressList})

print(tor_school.shape)
tor_school.head()

(198, 3)


Unnamed: 0,SchoolName,Panel,Address
0,\r\n\t\t\tElementary schools serve students fr...,A,All Saints
1,A,All Saints,Elementary
2,All Saints,Elementary,1435 Royal York Road Weston ON M9P 3A7
3,Annunciation,Elementary,65 Avonwick Gate Don Mills ON M3A 2M8
4,St Agatha,Elementary,49 Cathedral Bluffs Drive Scarborough ON M1M 2T6


In [31]:
#clean school data
tor_school.drop(tor_school.index[[0,1]], inplace= True)
tor_school= tor_school[(tor_school['Panel'] == "Secondary")]

tor_school['Address'].astype(str)
tor_school['Address']= tor_school['Address'].str.slice(-7, -4)    
tor_school.rename(columns = {'Address':'PostalCode'}, inplace = True)

print(tor_school.shape)
tor_school.head()

(31, 3)


Unnamed: 0,SchoolName,Panel,PostalCode
17,Bishop Allen Academy,Secondary,M8Y
19,Bishop Marrocco/Thomas Merton,Secondary,M6P
24,Brebeuf,Secondary,M2M
28,St Basil-The-Great,Secondary,M9M
38,Cardinal Carter Academy,Secondary,M2N


In [32]:
#merge cluster 1 data with schools
NY_2= NY_2.merge(tor_school, on= "PostalCode", how= "left")

NY_2final= NY_2.dropna()
NY_2final

Unnamed: 0,PostalCode,Neighborhoods,Cluster Labels,SchoolName,Panel
0,M6B,Glencairn,4,Dante Alighieri,Secondary
2,M5M,"Bedford Park, Lawrence Manor East",4,Loretto Abbey,Secondary
14,M2N,Willowdale South,4,Cardinal Carter Academy,Secondary
16,M3J,"Northwood Park, York University",4,James Cardinal McGuigan,Secondary


In [33]:
#visualize the final choice
NY_2final= NY_2final.merge(coordinates, on= "PostalCode", how= "left")
NY_2final

Unnamed: 0,PostalCode,Neighborhoods,Cluster Labels,SchoolName,Panel,Latitude,Longitude
0,M6B,Glencairn,4,Dante Alighieri,Secondary,43.709577,-79.445073
1,M5M,"Bedford Park, Lawrence Manor East",4,Loretto Abbey,Secondary,43.733283,-79.41975
2,M2N,Willowdale South,4,Cardinal Carter Academy,Secondary,43.77012,-79.408493
3,M3J,"Northwood Park, York University",4,James Cardinal McGuigan,Secondary,43.76798,-79.487262


In [34]:
NY2map= folium.Map(location= [latitude, longitude], zoom_start= 12)

#add markers
markers_colors= []
for lat, lon, post, nei in zip(NY_2final['Latitude'], NY_2final['Longitude'], NY_2final['PostalCode'], NY_2final['Neighborhoods']):
    label= folium.Popup('{} ({}): {} - Cluster {}'.format(lat, lon, post, nei), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius= 8,
        popup= label,
        color=  '#12c92b',
        fill= True,
        fill_color= '#12c92b',
        fill_opacity= 0.7).add_to(NY2map)
       
NY2map

In [35]:
#Selected schools with venues in the area
NY5= NY_new.loc[NY_new['Cluster Labels'] == 4, NY_new.columns[[0] + list(range(2, NY_new.shape[1]))]]
NY_5final= NY_2final.merge(NY5, on= "PostalCode", how= "left")

NY_5final

Unnamed: 0,PostalCode,Neighborhoods_x,Cluster Labels_x,SchoolName,Panel,Latitude_x,Longitude_x,Neighborhoods_y,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue,Cluster Labels_y,Latitude_y,Longitude_y
0,M6B,Glencairn,4,Dante Alighieri,Secondary,43.709577,-79.445073,Glencairn,Pizza Place,Bakery,Pub,Japanese Restaurant,Women's Store,Diner,Concert Hall,Construction & Landscaping,Convenience Store,Cosmetics Shop,4,43.709577,-79.445073
1,M5M,"Bedford Park, Lawrence Manor East",4,Loretto Abbey,Secondary,43.733283,-79.41975,"Bedford Park, Lawrence Manor East",Coffee Shop,Fast Food Restaurant,Italian Restaurant,Sandwich Place,Restaurant,Liquor Store,Juice Bar,Café,Pharmacy,Pizza Place,4,43.733283,-79.41975
2,M2N,Willowdale South,4,Cardinal Carter Academy,Secondary,43.77012,-79.408493,Willowdale South,Ramen Restaurant,Coffee Shop,Sushi Restaurant,Café,Pizza Place,Sandwich Place,Juice Bar,Japanese Restaurant,Lounge,Ice Cream Shop,4,43.77012,-79.408493
3,M3J,"Northwood Park, York University",4,James Cardinal McGuigan,Secondary,43.76798,-79.487262,"Northwood Park, York University",Coffee Shop,Furniture / Home Store,Caribbean Restaurant,Bar,Massage Studio,Falafel Restaurant,Discount Store,Construction & Landscaping,Convenience Store,Cosmetics Shop,4,43.76798,-79.487262


In [36]:
NY_5final.drop(['Latitude_x', 'Longitude_x', 'Cluster Labels_x', 'Latitude_y', 'Longitude_y', 'Cluster Labels_y', 'Panel', 'Neighborhoods_y'], axis= 1, inplace= True)
NY_5final

Unnamed: 0,PostalCode,Neighborhoods_x,SchoolName,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,M6B,Glencairn,Dante Alighieri,Pizza Place,Bakery,Pub,Japanese Restaurant,Women's Store,Diner,Concert Hall,Construction & Landscaping,Convenience Store,Cosmetics Shop
1,M5M,"Bedford Park, Lawrence Manor East",Loretto Abbey,Coffee Shop,Fast Food Restaurant,Italian Restaurant,Sandwich Place,Restaurant,Liquor Store,Juice Bar,Café,Pharmacy,Pizza Place
2,M2N,Willowdale South,Cardinal Carter Academy,Ramen Restaurant,Coffee Shop,Sushi Restaurant,Café,Pizza Place,Sandwich Place,Juice Bar,Japanese Restaurant,Lounge,Ice Cream Shop
3,M3J,"Northwood Park, York University",James Cardinal McGuigan,Coffee Shop,Furniture / Home Store,Caribbean Restaurant,Bar,Massage Studio,Falafel Restaurant,Discount Store,Construction & Landscaping,Convenience Store,Cosmetics Shop
