In [1]:
import pandas as pd
import folium # map rendering library
import requests # library to handle requests
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe
import numpy as np
# import k-means from clustering stage
from sklearn.cluster import KMeans
# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

In [2]:
df = pd.read_csv('combination.csv')

In [3]:
df.drop('Unnamed: 0', axis=1, inplace=True)

In [4]:
df.head()

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494


### Add points on the map

In [5]:
latitude = df['Latitude'][0]
longitude = df['Longitude'][0]
map_newyork = folium.Map(location=[latitude, longitude], zoom_start=10)

# add markers to map
for lat, lng, borough, neighborhood in zip(df['Latitude'], df['Longitude'], df['Borough'], df['Neighborhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_newyork)  
    
map_newyork

## Toronto

In [6]:
toronto_data = df[df['Borough'].str.contains('Toronto')].reset_index(drop=True)
toronto_data.head()

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude
0,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
1,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494
2,M5B,Downtown Toronto,"Garden District, Ryerson",43.657162,-79.378937
3,M5C,Downtown Toronto,St. James Town,43.651494,-79.375418
4,M4E,East Toronto,The Beaches,43.676357,-79.293031


### Define Foursquare Credentials and Version

In [7]:
CLIENT_ID = 'U05DZSMYRSDFZMX4WSLDRYNVPKLZVOCNOBUFR5QRZ1YD2VHN' # your Foursquare ID
CLIENT_SECRET = '52P5CHTWWTQCLPOGTQY0DL1W2MLY3XOFVDVCKPN3YMIHP4QZ' # your Foursquare Secret
VERSION = '20180605' # Foursquare API version

print('Your credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET:' + CLIENT_SECRET)

Your credentails:
CLIENT_ID: U05DZSMYRSDFZMX4WSLDRYNVPKLZVOCNOBUFR5QRZ1YD2VHN
CLIENT_SECRET:52P5CHTWWTQCLPOGTQY0DL1W2MLY3XOFVDVCKPN3YMIHP4QZ


In [8]:
neighborhood_latitude = df.loc[0, 'Latitude'] # neighborhood latitude value
neighborhood_longitude = df.loc[0, 'Longitude'] # neighborhood longitude value

neighborhood_name = df.loc[0, 'Neighborhood'] # neighborhood name

print('Latitude and longitude values of {} are {}, {}.'.format(neighborhood_name, 
                                                               neighborhood_latitude, 
                                                               neighborhood_longitude))

Latitude and longitude values of Dorset Park, Wexford Heights, Scarborough Town Centre are 43.7574096, -79.27330400000002.


### Get the top 100 venues that are in Parkwoods within a radius of 500 meters.

In [9]:
LIMIT = 100
radius=500
url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
    CLIENT_ID, 
    CLIENT_SECRET, 
    VERSION, 
    neighborhood_latitude, 
    neighborhood_longitude, 
    radius, 
    LIMIT)
url


'https://api.foursquare.com/v2/venues/explore?&client_id=U05DZSMYRSDFZMX4WSLDRYNVPKLZVOCNOBUFR5QRZ1YD2VHN&client_secret=52P5CHTWWTQCLPOGTQY0DL1W2MLY3XOFVDVCKPN3YMIHP4QZ&v=20180605&ll=43.7574096,-79.27330400000002&radius=500&limit=100'

In [10]:
results = requests.get(url).json()

### From the Foursquare lab in the previous module, we know that all the information is in the *items* key. Before we proceed, let's borrow the **get_category_type** function from the Foursquare lab.

In [11]:
# function that extracts the category of the venue
def get_category_type(row):
    try:
        categories_list = row['categories']
    except:
        categories_list = row['venue.categories']
        
    if len(categories_list) == 0:
        return None
    else:
        return categories_list[0]['name']

### Clean the json and structure it into a *pandas* dataframe.

In [12]:
venues = results['response']['groups'][0]['items']
    
nearby_venues = json_normalize(venues) # flatten JSON

# filter columns
filtered_columns = ['venue.name', 'venue.categories', 'venue.location.lat', 'venue.location.lng']
nearby_venues =nearby_venues.loc[:, filtered_columns]

# filter the category for each row
nearby_venues['venue.categories'] = nearby_venues.apply(get_category_type, axis=1)

# clean columns
nearby_venues.columns = [col.split(".")[-1] for col in nearby_venues.columns]

nearby_venues.head()

  This is separate from the ipykernel package so we can avoid doing imports until


Unnamed: 0,name,categories,lat,lng
0,Kairali,Indian Restaurant,43.754915,-79.276945
1,Kim Kim restaurant,Chinese Restaurant,43.753833,-79.276611
2,Karaikudi Chettinad South Indian Restaurant,Indian Restaurant,43.756042,-79.276276
3,Pho Vietnam,Vietnamese Restaurant,43.75777,-79.278572
4,Big Al's Pet Supercentre,Pet Store,43.759279,-79.278325


In [13]:
print('{} venues were returned by Foursquare.'.format(nearby_venues.shape[0]))

6 venues were returned by Foursquare.


### Create a function to repeat the same process to all the neighborhoods in Toronto

In [87]:
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

### Get Toronto nearby venues

In [15]:
toronto_venues = getNearbyVenues(names=df['Neighborhood'],
                                   latitudes=df['Latitude'],
                                   longitudes=df['Longitude']
                                  )

Parkwoods
Victoria Village
Regent Park, Harbourfront
Lawrence Manor, Lawrence Heights
Queen's Park, Ontario Provincial Government
Islington Avenue, Humber Valley Village
Malvern, Rouge
Don Mills
Parkview Hill, Woodbine Gardens
Garden District, Ryerson
Glencairn
West Deane Park, Princess Gardens, Martin Grove, Islington, Cloverdale
Rouge Hill, Port Union, Highland Creek
Don Mills
Woodbine Heights
St. James Town
Humewood-Cedarvale
Eringate, Bloordale Gardens, Old Burnhamthorpe, Markland Wood
Guildwood, Morningside, West Hill
The Beaches
Berczy Park
Caledonia-Fairbanks
Woburn
Leaside
Central Bay Street
Christie
Cedarbrae
Hillcrest Village
Bathurst Manor, Wilson Heights, Downsview North
Thorncliffe Park
Richmond, Adelaide, King
Dufferin, Dovercourt Village
Scarborough Village
Fairview, Henry Farm, Oriole
Northwood Park, York University
East Toronto, Broadview North (Old East York)
Harbourfront East, Union Station, Toronto Islands
Little Portugal, Trinity
Kennedy Park, Ionview, East Birchmo

In [16]:
toronto_venues.head()

Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,Parkwoods,43.753259,-79.329656,Brookbanks Park,43.751976,-79.33214,Park
1,Parkwoods,43.753259,-79.329656,KFC,43.754387,-79.333021,Fast Food Restaurant
2,Parkwoods,43.753259,-79.329656,Variety Store,43.751974,-79.333114,Food & Drink Shop
3,Victoria Village,43.725882,-79.315572,Victoria Village Arena,43.723481,-79.315635,Hockey Arena
4,Victoria Village,43.725882,-79.315572,Tim Hortons,43.725517,-79.313103,Coffee Shop


### Toronto neighborhood venues count

In [17]:
toronto_venues.groupby('Neighborhood').count()

Unnamed: 0_level_0,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
Neighborhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Agincourt,5,5,5,5,5,5
"Alderwood, Long Branch",9,9,9,9,9,9
"Bathurst Manor, Wilson Heights, Downsview North",21,21,21,21,21,21
Bayview Village,4,4,4,4,4,4
"Bedford Park, Lawrence Manor East",23,23,23,23,23,23
...,...,...,...,...,...,...
"Willowdale, Willowdale East",32,32,32,32,32,32
"Willowdale, Willowdale West",5,5,5,5,5,5
Woburn,3,3,3,3,3,3
Woodbine Heights,8,8,8,8,8,8


In [18]:
print('There are {} uniques categories.'.format(len(toronto_venues['Venue Category'].unique())))

There are 269 uniques categories.


### Transfer format of Toronto neighborhood venues category to onehot encoding

In [19]:
# onehot encoding
toronto_onehot = pd.get_dummies(toronto_venues[['Venue Category']], prefix="", prefix_sep="")

In [20]:
toronto_onehot.head()

Unnamed: 0,Accessories Store,Afghan Restaurant,Airport,Airport Food Court,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,Antique Shop,Aquarium,...,Train Station,Turkish Restaurant,Vegetarian / Vegan Restaurant,Video Game Store,Vietnamese Restaurant,Warehouse Store,Wine Bar,Wings Joint,Women's Store,Yoga Studio
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [21]:
# add neighborhood column back to dataframe
toronto_onehot['Neigh'] = toronto_venues['Neighborhood'] 


In [22]:
# move neighborhood column to the first column
fixed_columns = [toronto_onehot.columns[-1]] + list(toronto_onehot.columns[:-1])
toronto_onehot = toronto_onehot[fixed_columns]

toronto_onehot.head()

Unnamed: 0,Neigh,Accessories Store,Afghan Restaurant,Airport,Airport Food Court,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,Antique Shop,...,Train Station,Turkish Restaurant,Vegetarian / Vegan Restaurant,Video Game Store,Vietnamese Restaurant,Warehouse Store,Wine Bar,Wings Joint,Women's Store,Yoga Studio
0,Parkwoods,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,Parkwoods,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,Parkwoods,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,Victoria Village,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,Victoria Village,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [23]:
toronto_grouped = toronto_onehot.groupby('Neigh').mean().reset_index()
toronto_grouped.head()

Unnamed: 0,Neigh,Accessories Store,Afghan Restaurant,Airport,Airport Food Court,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,Antique Shop,...,Train Station,Turkish Restaurant,Vegetarian / Vegan Restaurant,Video Game Store,Vietnamese Restaurant,Warehouse Store,Wine Bar,Wings Joint,Women's Store,Yoga Studio
0,Agincourt,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,"Alderwood, Long Branch",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,"Bathurst Manor, Wilson Heights, Downsview North",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,Bayview Village,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,"Bedford Park, Lawrence Manor East",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.043478,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Get top5 venues of every neighborhood

In [26]:
num_top_venues = 5

for hood in toronto_grouped['Neigh']:
    print("----"+hood+"----")
    temp = toronto_grouped[toronto_grouped['Neigh'] == hood].T.reset_index()
    
    temp.columns = ['venue','freq']
    temp = temp.iloc[1:]
    temp['freq'] = temp['freq'].astype(float)
    temp = temp.round({'freq': 2})
    print(temp.sort_values('freq', ascending=False).reset_index(drop=True).head(num_top_venues))
    print('\n')

----Agincourt----
                       venue  freq
0             Clothing Store   0.2
1             Breakfast Spot   0.2
2                     Lounge   0.2
3               Skating Rink   0.2
4  Latin American Restaurant   0.2


----Alderwood, Long Branch----
            venue  freq
0     Pizza Place  0.22
1        Pharmacy  0.11
2             Pub  0.11
3  Sandwich Place  0.11
4    Dance Studio  0.11


----Bathurst Manor, Wilson Heights, Downsview North----
         venue  freq
0         Bank  0.10
1  Coffee Shop  0.10
2  Gas Station  0.05
3        Diner  0.05
4         Park  0.05


----Bayview Village----
                 venue  freq
0                 Café  0.25
1  Japanese Restaurant  0.25
2                 Bank  0.25
3   Chinese Restaurant  0.25
4                Motel  0.00


----Bedford Park, Lawrence Manor East----
                venue  freq
0      Sandwich Place  0.09
1          Restaurant  0.09
2  Italian Restaurant  0.09
3         Coffee Shop  0.09
4                Café  0.04

                venue  freq
0                Park  0.50
1          Playground  0.25
2               Trail  0.25
3   Accessories Store  0.00
4  Mexican Restaurant  0.00


----Roselawn----
                             venue  freq
0                           Garden   0.5
1                     Home Service   0.5
2               Mexican Restaurant   0.0
3              Monument / Landmark   0.0
4  Molecular Gastronomy Restaurant   0.0


----Rouge Hill, Port Union, Highland Creek----
                             venue  freq
0                              Bar   1.0
1                Accessories Store   0.0
2        Middle Eastern Restaurant   0.0
3              Monument / Landmark   0.0
4  Molecular Gastronomy Restaurant   0.0


----Runnymede, Swansea----
              venue  freq
0              Café  0.08
1       Coffee Shop  0.08
2       Pizza Place  0.05
3               Pub  0.05
4  Sushi Restaurant  0.05


----Runnymede, The Junction North----
               venue  freq
0  Convenience Store

### Function of returning most common venues

In [34]:
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

### Get top10 common venues in neighborhood of Toronto

In [35]:
num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neigh']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neigh'] = toronto_grouped['Neigh']

for ind in np.arange(toronto_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(toronto_grouped.iloc[ind, :], num_top_venues)

neighborhoods_venues_sorted.head()

Unnamed: 0,Neigh,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Agincourt,Lounge,Skating Rink,Latin American Restaurant,Breakfast Spot,Clothing Store,Drugstore,Discount Store,Distribution Center,Dog Run,Doner Restaurant
1,"Alderwood, Long Branch",Pizza Place,Gym,Coffee Shop,Sandwich Place,Athletics & Sports,Pub,Dance Studio,Pharmacy,Drugstore,Donut Shop
2,"Bathurst Manor, Wilson Heights, Downsview North",Coffee Shop,Bank,Park,Frozen Yogurt Shop,Bridal Shop,Sandwich Place,Diner,Restaurant,Deli / Bodega,Middle Eastern Restaurant
3,Bayview Village,Café,Bank,Chinese Restaurant,Japanese Restaurant,Yoga Studio,Diner,Discount Store,Distribution Center,Dog Run,Doner Restaurant
4,"Bedford Park, Lawrence Manor East",Coffee Shop,Restaurant,Sandwich Place,Italian Restaurant,Greek Restaurant,Thai Restaurant,Pharmacy,Pizza Place,Pub,Café


### Use kmean to clustering toronto_grouped

In [39]:
# set number of clusters
kclusters = 5

toronto_grouped_clustering = toronto_grouped.drop('Neigh', 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(toronto_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10] 

array([2, 2, 2, 2, 2, 2, 2, 2, 2, 2], dtype=int32)

### Insert cluster labels to neighborhoods_venues_sorted and merge with the toronto_data

In [40]:
# add clustering labels
neighborhoods_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

toronto_merged = toronto_data

# merge toronto_grouped with toronto_data to add latitude/longitude for each neighborhood
toronto_merged = toronto_merged.join(neighborhoods_venues_sorted.set_index('Neigh'), on='Neighborhood')

toronto_merged.head() # check the last columns!

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636,2,Coffee Shop,Park,Bakery,Pub,Breakfast Spot,Café,Theater,Beer Store,Shoe Store,Restaurant
1,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494,2,Coffee Shop,Sushi Restaurant,Yoga Studio,Bar,Beer Bar,Smoothie Shop,Sandwich Place,Burrito Place,Café,College Auditorium
2,M5B,Downtown Toronto,"Garden District, Ryerson",43.657162,-79.378937,2,Clothing Store,Coffee Shop,Italian Restaurant,Café,Japanese Restaurant,Bubble Tea Shop,Middle Eastern Restaurant,Cosmetics Shop,Theater,Fast Food Restaurant
3,M5C,Downtown Toronto,St. James Town,43.651494,-79.375418,2,Café,Coffee Shop,Restaurant,Cocktail Bar,Gastropub,American Restaurant,Clothing Store,Gym,Italian Restaurant,Hotel
4,M4E,East Toronto,The Beaches,43.676357,-79.293031,2,Trail,Health Food Store,Pub,Neighborhood,Yoga Studio,Dessert Shop,Dim Sum Restaurant,Diner,Discount Store,Distribution Center


In [216]:
latitude = df['Latitude'][0]
longitude = df['Longitude'][0]
# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(toronto_merged['Latitude'], toronto_merged['Longitude'], toronto_merged['Neighborhood'], toronto_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters

In [43]:
toronto_merged.loc[toronto_merged['Cluster Labels'] == 0, toronto_merged.columns[[1] + list(range(5, toronto_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
18,Central Toronto,0,Park,Swim School,Bus Line,Dog Run,Dim Sum Restaurant,Diner,Discount Store,Distribution Center,Doner Restaurant,Farmers Market
21,Central Toronto,0,Park,Trail,Sushi Restaurant,Jewelry Store,Dim Sum Restaurant,Diner,Discount Store,Distribution Center,Dog Run,Doner Restaurant
29,Central Toronto,0,Park,Trail,Restaurant,Colombian Restaurant,Comfort Food Restaurant,Event Space,Ethiopian Restaurant,Electronics Store,Eastern European Restaurant,Drugstore
33,Downtown Toronto,0,Park,Trail,Playground,Dessert Shop,Dim Sum Restaurant,Diner,Discount Store,Distribution Center,Dog Run,Doner Restaurant


In [44]:
toronto_merged.loc[toronto_merged['Cluster Labels'] == 2, toronto_merged.columns[[1] + list(range(5, toronto_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Downtown Toronto,2,Coffee Shop,Park,Bakery,Pub,Breakfast Spot,Café,Theater,Beer Store,Shoe Store,Restaurant
1,Downtown Toronto,2,Coffee Shop,Sushi Restaurant,Yoga Studio,Bar,Beer Bar,Smoothie Shop,Sandwich Place,Burrito Place,Café,College Auditorium
2,Downtown Toronto,2,Clothing Store,Coffee Shop,Italian Restaurant,Café,Japanese Restaurant,Bubble Tea Shop,Middle Eastern Restaurant,Cosmetics Shop,Theater,Fast Food Restaurant
3,Downtown Toronto,2,Café,Coffee Shop,Restaurant,Cocktail Bar,Gastropub,American Restaurant,Clothing Store,Gym,Italian Restaurant,Hotel
4,East Toronto,2,Trail,Health Food Store,Pub,Neighborhood,Yoga Studio,Dessert Shop,Dim Sum Restaurant,Diner,Discount Store,Distribution Center
5,Downtown Toronto,2,Coffee Shop,Cocktail Bar,Bakery,Café,Restaurant,Beer Bar,Seafood Restaurant,Cheese Shop,Diner,Park
6,Downtown Toronto,2,Coffee Shop,Italian Restaurant,Café,Sandwich Place,Japanese Restaurant,Burger Joint,Department Store,Salad Place,Bubble Tea Shop,Spa
7,Downtown Toronto,2,Grocery Store,Café,Park,Athletics & Sports,Diner,Italian Restaurant,Restaurant,Baby Store,Candy Store,Coffee Shop
8,Downtown Toronto,2,Coffee Shop,Café,Restaurant,Gym,Deli / Bodega,Thai Restaurant,Clothing Store,Hotel,Salad Place,American Restaurant
9,West Toronto,2,Bakery,Pharmacy,Music Venue,Middle Eastern Restaurant,Bar,Supermarket,Café,Portuguese Restaurant,Bank,Grocery Store


### New York

In [51]:
import json

In [49]:
!wget -q -O 'newyork_data.json' https://cocl.us/new_york_dataset
print('Data downloaded!')

Data downloaded!


In [52]:
with open('newyork_data.json') as json_data:
    newyork_data = json.load(json_data)

In [53]:
neighborhoods_data = newyork_data['features']

In [54]:
# define the dataframe columns
column_names = ['Borough', 'Neighborhood', 'Latitude', 'Longitude'] 

# instantiate the dataframe
neighborhoods = pd.DataFrame(columns=column_names)

In [55]:
for data in neighborhoods_data:
    borough = neighborhood_name = data['properties']['borough'] 
    neighborhood_name = data['properties']['name']
        
    neighborhood_latlon = data['geometry']['coordinates']
    neighborhood_lat = neighborhood_latlon[1]
    neighborhood_lon = neighborhood_latlon[0]
    
    neighborhoods = neighborhoods.append({'Borough': borough,
                                          'Neighborhood': neighborhood_name,
                                          'Latitude': neighborhood_lat,
                                          'Longitude': neighborhood_lon}, ignore_index=True)

In [60]:
print('The dataframe has {} boroughs and {} neighborhoods.'.format(
        len(neighborhoods['Borough'].unique()),
        neighborhoods.shape[0]
    )
)

The dataframe has 5 boroughs and 306 neighborhoods.


In [64]:
# New York city location
latitude = 40.7127281
longitude = -74.0060152

In [65]:
# create map of New York using latitude and longitude values
map_newyork = folium.Map(location=[latitude, longitude], zoom_start=10)

# add markers to map
for lat, lng, borough, neighborhood in zip(neighborhoods['Latitude'], neighborhoods['Longitude'], neighborhoods['Borough'], neighborhoods['Neighborhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_newyork)  
    
map_newyork

In [89]:
CLIENT_ID = 'U05DZSMYRSDFZMX4WSLDRYNVPKLZVOCNOBUFR5QRZ1YD2VHN' # your Foursquare ID
CLIENT_SECRET = '12WI4L42EUIEFLKOJF3UJOHEMBSEIY3GLEBVHIUNVZL1S23G' # your Foursquare Secret
VERSION = '20180605' # Foursquare API version

print('Your credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET:' + CLIENT_SECRET)

Your credentails:
CLIENT_ID: U05DZSMYRSDFZMX4WSLDRYNVPKLZVOCNOBUFR5QRZ1YD2VHN
CLIENT_SECRET:12WI4L42EUIEFLKOJF3UJOHEMBSEIY3GLEBVHIUNVZL1S23G


In [90]:
neighborhood_latitude = neighborhoods.loc[0, 'Latitude'] # neighborhood latitude value
neighborhood_longitude = neighborhoods.loc[0, 'Longitude'] # neighborhood longitude value

neighborhood_name = neighborhoods.loc[0, 'Neighborhood'] # neighborhood name

print('Latitude and longitude values of {} are {}, {}.'.format(neighborhood_name, 
                                                               neighborhood_latitude, 
                                                               neighborhood_longitude))

Latitude and longitude values of Wakefield are 40.89470517661, -73.84720052054902.


In [91]:
# type your answer here

LIMIT = 100
radius=500
url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
    CLIENT_ID, 
    CLIENT_SECRET, 
    VERSION, 
    neighborhood_latitude, 
    neighborhood_longitude, 
    radius, 
    LIMIT)
url


'https://api.foursquare.com/v2/venues/explore?&client_id=U05DZSMYRSDFZMX4WSLDRYNVPKLZVOCNOBUFR5QRZ1YD2VHN&client_secret=12WI4L42EUIEFLKOJF3UJOHEMBSEIY3GLEBVHIUNVZL1S23G&v=20180605&ll=40.89470517661,-73.84720052054902&radius=500&limit=100'

In [92]:
results = requests.get(url).json()

In [93]:
venues = results['response']['groups'][0]['items']
    
nearby_venues = json_normalize(venues) # flatten JSON

# filter columns
filtered_columns = ['venue.name', 'venue.categories', 'venue.location.lat', 'venue.location.lng']
nearby_venues =nearby_venues.loc[:, filtered_columns]

# filter the category for each row
nearby_venues['venue.categories'] = nearby_venues.apply(get_category_type, axis=1)

# clean columns
nearby_venues.columns = [col.split(".")[-1] for col in nearby_venues.columns]

nearby_venues.head()

  This is separate from the ipykernel package so we can avoid doing imports until


Unnamed: 0,name,categories,lat,lng
0,Lollipops Gelato,Dessert Shop,40.894123,-73.845892
1,Carvel Ice Cream,Ice Cream Shop,40.890487,-73.848568
2,Walgreens,Pharmacy,40.896528,-73.8447
3,Rite Aid,Pharmacy,40.896649,-73.844846
4,Dunkin',Donut Shop,40.890459,-73.849089


In [94]:
print('{} venues were returned by Foursquare.'.format(nearby_venues.shape[0]))

9 venues were returned by Foursquare.


In [95]:
newyork_venues = getNearbyVenues(names=neighborhoods['Neighborhood'],
                                   latitudes=neighborhoods['Latitude'],
                                   longitudes=neighborhoods['Longitude']
                                  )

Wakefield
Co-op City
Eastchester
Fieldston
Riverdale
Kingsbridge
Marble Hill
Woodlawn
Norwood
Williamsbridge
Baychester
Pelham Parkway
City Island
Bedford Park
University Heights
Morris Heights
Fordham
East Tremont
West Farms
High  Bridge
Melrose
Mott Haven
Port Morris
Longwood
Hunts Point
Morrisania
Soundview
Clason Point
Throgs Neck
Country Club
Parkchester
Westchester Square
Van Nest
Morris Park
Belmont
Spuyten Duyvil
North Riverdale
Pelham Bay
Schuylerville
Edgewater Park
Castle Hill
Olinville
Pelham Gardens
Concourse
Unionport
Edenwald
Bay Ridge
Bensonhurst
Sunset Park
Greenpoint
Gravesend
Brighton Beach
Sheepshead Bay
Manhattan Terrace
Flatbush
Crown Heights
East Flatbush
Kensington
Windsor Terrace
Prospect Heights
Brownsville
Williamsburg
Bushwick
Bedford Stuyvesant
Brooklyn Heights
Cobble Hill
Carroll Gardens
Red Hook
Gowanus
Fort Greene
Park Slope
Cypress Hills
East New York
Starrett City
Canarsie
Flatlands
Mill Island
Manhattan Beach
Coney Island
Bath Beach
Borough Park
Dyker

In [96]:
print(newyork_venues.shape)
newyork_venues.head()

(9972, 7)


Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,Wakefield,40.894705,-73.847201,Lollipops Gelato,40.894123,-73.845892,Dessert Shop
1,Wakefield,40.894705,-73.847201,Carvel Ice Cream,40.890487,-73.848568,Ice Cream Shop
2,Wakefield,40.894705,-73.847201,Walgreens,40.896528,-73.8447,Pharmacy
3,Wakefield,40.894705,-73.847201,Rite Aid,40.896649,-73.844846,Pharmacy
4,Wakefield,40.894705,-73.847201,Dunkin',40.890459,-73.849089,Donut Shop


In [97]:
 newyork_venues.groupby('Neighborhood').count()

Unnamed: 0_level_0,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
Neighborhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Allerton,28,28,28,28,28,28
Annadale,11,11,11,11,11,11
Arden Heights,5,5,5,5,5,5
Arlington,5,5,5,5,5,5
Arrochar,21,21,21,21,21,21
...,...,...,...,...,...,...
Woodhaven,24,24,24,24,24,24
Woodlawn,25,25,25,25,25,25
Woodrow,19,19,19,19,19,19
Woodside,73,73,73,73,73,73


In [98]:
print('There are {} uniques categories.'.format(len(toronto_venues['Venue Category'].unique())))

There are 269 uniques categories.


In [99]:
# onehot encoding
newyork_onehot = pd.get_dummies(newyork_venues[['Venue Category']], prefix="", prefix_sep="")

In [100]:
newyork_onehot.head()

Unnamed: 0,Accessories Store,Adult Boutique,Afghan Restaurant,African Restaurant,Airport Terminal,American Restaurant,Animal Shelter,Antique Shop,Arcade,Arepa Restaurant,...,Warehouse Store,Waste Facility,Waterfront,Weight Loss Center,Whisky Bar,Wine Bar,Wine Shop,Wings Joint,Women's Store,Yoga Studio
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [102]:
newyork_onehot['Neigh'] = newyork_venues['Neighborhood']

In [103]:
# move neighborhood column to the first column
fixed_columns = [newyork_onehot.columns[-1]] + list(newyork_onehot.columns[:-1])
newyork_onehot = newyork_onehot[fixed_columns]

In [104]:
newyork_onehot.head()

Unnamed: 0,Neigh,Accessories Store,Adult Boutique,Afghan Restaurant,African Restaurant,Airport Terminal,American Restaurant,Animal Shelter,Antique Shop,Arcade,...,Warehouse Store,Waste Facility,Waterfront,Weight Loss Center,Whisky Bar,Wine Bar,Wine Shop,Wings Joint,Women's Store,Yoga Studio
0,Wakefield,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,Wakefield,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,Wakefield,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,Wakefield,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,Wakefield,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [105]:
newyork_grouped = newyork_onehot.groupby('Neigh').mean().reset_index()
newyork_grouped.head()

Unnamed: 0,Neigh,Accessories Store,Adult Boutique,Afghan Restaurant,African Restaurant,Airport Terminal,American Restaurant,Animal Shelter,Antique Shop,Arcade,...,Warehouse Store,Waste Facility,Waterfront,Weight Loss Center,Whisky Bar,Wine Bar,Wine Shop,Wings Joint,Women's Store,Yoga Studio
0,Allerton,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,Annadale,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,Arden Heights,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,Arlington,0.0,0.0,0.0,0.0,0.0,0.2,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,Arrochar,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [106]:
num_top_venues = 5

for hood in newyork_grouped['Neigh']:
    print("----"+hood+"----")
    temp = newyork_grouped[newyork_grouped['Neigh'] == hood].T.reset_index()
    
    temp.columns = ['venue','freq']
    temp = temp.iloc[1:]
    temp['freq'] = temp['freq'].astype(float)
    temp = temp.round({'freq': 2})
    print(temp.sort_values('freq', ascending=False).reset_index(drop=True).head(num_top_venues))
    print('\n')

----Allerton----
                venue  freq
0         Pizza Place  0.14
1       Deli / Bodega  0.07
2         Supermarket  0.07
3  Chinese Restaurant  0.07
4        Dessert Shop  0.04


----Annadale----
           venue  freq
0    Pizza Place  0.45
1          Diner  0.09
2     Restaurant  0.09
3         Bakery  0.09
4  Train Station  0.09


----Arden Heights----
           venue  freq
0       Pharmacy   0.2
1  Deli / Bodega   0.2
2    Pizza Place   0.2
3       Bus Stop   0.2
4    Coffee Shop   0.2


----Arlington----
                 venue  freq
0        Deli / Bodega   0.2
1  American Restaurant   0.2
2          Coffee Shop   0.2
3             Bus Stop   0.2
4        Grocery Store   0.2


----Arrochar----
                   venue  freq
0          Deli / Bodega  0.10
1               Bus Stop  0.10
2            Pizza Place  0.10
3     Italian Restaurant  0.10
4  Outdoors & Recreation  0.05


----Arverne----
            venue  freq
0       Surf Spot  0.22
1   Metro Station  0.11
2  Sand

                  venue  freq
0         Deli / Bodega  0.22
1        Cosmetics Shop  0.11
2    Italian Restaurant  0.11
3  Gym / Fitness Center  0.11
4           Coffee Shop  0.11


----Concourse----
               venue  freq
0      Grocery Store  0.19
1        Pizza Place  0.08
2  Convenience Store  0.04
3        Video Store  0.04
4           Pharmacy  0.04


----Concourse Village----
                  venue  freq
0    Mexican Restaurant  0.08
1         Deli / Bodega  0.08
2        Sandwich Place  0.08
3  Fast Food Restaurant  0.08
4   Fried Chicken Joint  0.05


----Coney Island----
                  venue  freq
0      Baseball Stadium  0.12
1  Caribbean Restaurant  0.12
2              Pharmacy  0.12
3           Music Venue  0.06
4          Skating Rink  0.06


----Corona----
                venue  freq
0  Mexican Restaurant  0.11
1              Bakery  0.11
2       Deli / Bodega  0.05
3          Donut Shop  0.05
4    Basketball Court  0.05


----Country Club----
                ven

            venue  freq
0      Donut Shop  0.11
1          Market  0.06
2      Food Court  0.06
3     Pizza Place  0.06
4  Ice Cream Shop  0.06


----Hollis----
                 venue  freq
0       Baseball Field  0.12
1        Shopping Mall  0.12
2  Fried Chicken Joint  0.12
3                 Park  0.12
4     Asian Restaurant  0.06


----Holliswood----
               venue  freq
0     Sandwich Place   0.2
1        Supermarket   0.2
2  Mobile Phone Shop   0.2
3         Playground   0.2
4         Donut Shop   0.2


----Homecrest----
                venue  freq
0                Bank  0.11
1       Grocery Store  0.08
2          Donut Shop  0.08
3  Mexican Restaurant  0.05
4      Sandwich Place  0.05


----Howard Beach----
                venue  freq
0  Italian Restaurant  0.11
1          Bagel Shop  0.08
2            Pharmacy  0.08
3  Chinese Restaurant  0.05
4      Sandwich Place  0.05


----Hudson Yards----
                  venue  freq
0    Italian Restaurant  0.07
1                 Ho

         venue  freq
0  Pizza Place  0.11
1        Diner  0.06
2         Bank  0.06
3     Pharmacy  0.06
4          Gym  0.06


----Paerdegat Basin----
                venue  freq
0                Food  0.29
1                 Gym  0.14
2  Child Care Service  0.14
3         Auto Garage  0.14
4       Moving Target  0.14


----Park Hill----
                  venue  freq
0              Bus Stop  0.29
1  Gym / Fitness Center  0.14
2           Coffee Shop  0.14
3                 Hotel  0.14
4                  Park  0.14


----Park Slope----
                 venue  freq
0          Coffee Shop  0.08
1         Burger Joint  0.07
2  American Restaurant  0.05
3                  Pub  0.03
4               Bakery  0.03


----Parkchester----
                 venue  freq
0          Supermarket  0.16
1          Pizza Place  0.09
2  American Restaurant  0.06
3           Kids Store  0.06
4       Discount Store  0.03


----Pelham Bay----
                  venue  freq
0    Italian Restaurant  0.07
1  Fast 

           venue  freq
0          Hotel  0.20
1  Deli / Bodega  0.13
2    Comedy Club  0.07
3    Sports Club  0.07
4  Bowling Alley  0.07


----Tribeca----
                 venue  freq
0                 Park  0.07
1  American Restaurant  0.07
2   Italian Restaurant  0.07
3             Wine Bar  0.04
4     Greek Restaurant  0.04


----Tudor City----
                venue  freq
0                Park  0.07
1                Café  0.07
2  Mexican Restaurant  0.05
3       Deli / Bodega  0.04
4         Pizza Place  0.04


----Turtle Bay----
                venue  freq
0  Italian Restaurant  0.06
1    Sushi Restaurant  0.04
2                Café  0.04
3         Coffee Shop  0.04
4  Seafood Restaurant  0.03


----Unionport----
                       venue  freq
0             Ice Cream Shop  0.10
1                 Donut Shop  0.10
2  Latin American Restaurant  0.10
3          Mobile Phone Shop  0.05
4       Fast Food Restaurant  0.05


----University Heights----
            venue  freq
0     Piz

In [107]:
num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neigh']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neigh'] = newyork_grouped['Neigh']

for ind in np.arange(newyork_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(newyork_grouped.iloc[ind, :], num_top_venues)

neighborhoods_venues_sorted.head()

Unnamed: 0,Neigh,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Allerton,Pizza Place,Chinese Restaurant,Deli / Bodega,Supermarket,Donut Shop,Playground,Fast Food Restaurant,Martial Arts Dojo,Electronics Store,Bakery
1,Annadale,Pizza Place,Sports Bar,Train Station,Diner,Liquor Store,Bakery,Restaurant,Event Space,Exhibit,Fast Food Restaurant
2,Arden Heights,Pizza Place,Deli / Bodega,Pharmacy,Bus Stop,Coffee Shop,Yoga Studio,Event Service,Event Space,Exhibit,Factory
3,Arlington,Deli / Bodega,American Restaurant,Bus Stop,Coffee Shop,Grocery Store,Yoga Studio,Financial or Legal Service,Event Space,Exhibit,Factory
4,Arrochar,Italian Restaurant,Deli / Bodega,Bus Stop,Pizza Place,Mediterranean Restaurant,Supermarket,Liquor Store,Bagel Shop,Athletics & Sports,Outdoors & Recreation


In [108]:
# set number of clusters
kclusters = 5

newyork_grouped_clustering = newyork_grouped.drop('Neigh', 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(newyork_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10] 

array([1, 1, 2, 2, 2, 0, 0, 1, 0, 0], dtype=int32)

In [131]:
# add clustering labels
neighborhoods_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

newyork_merged = neighborhoods

# merge toronto_grouped with toronto_data to add latitude/longitude for each neighborhood
newyork_merged = newyork_merged.join(neighborhoods_venues_sorted.set_index('Neigh'), on='Neighborhood')

newyork_merged.head() # check the last columns!


Unnamed: 0,Borough,Neighborhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Bronx,Wakefield,40.894705,-73.847201,1.0,Pharmacy,Ice Cream Shop,Gas Station,Donut Shop,Laundromat,Deli / Bodega,Sandwich Place,Dessert Shop,Fast Food Restaurant,Farmers Market
1,Bronx,Co-op City,40.874294,-73.829939,1.0,Grocery Store,Fried Chicken Joint,Mattress Store,Baseball Field,Fast Food Restaurant,Bus Station,Park,Ice Cream Shop,Restaurant,Pizza Place
2,Bronx,Eastchester,40.887556,-73.827806,1.0,Caribbean Restaurant,Deli / Bodega,Bus Stop,Diner,Bus Station,Bowling Alley,Seafood Restaurant,Donut Shop,Pizza Place,Platform
3,Bronx,Fieldston,40.895437,-73.905643,1.0,Cosmetics Shop,High School,Bus Station,Plaza,Yoga Studio,Filipino Restaurant,Ethiopian Restaurant,Event Service,Event Space,Exhibit
4,Bronx,Riverdale,40.890834,-73.912585,1.0,Bus Station,Park,Bank,Food Truck,Medical Supply Store,Gym,Home Service,Plaza,Baseball Field,Farmers Market


In [132]:
newyork_merged = newyork_merged.dropna(axis=0)
newyork_merged['Cluster Labels'] = newyork_merged['Cluster Labels'].astype(int)

In [218]:
# New York city location
latitude = 40.7127281
longitude = -74.0060152

# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=10)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(newyork_merged['Latitude'], newyork_merged['Longitude'], newyork_merged['Neighborhood'], newyork_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[int(cluster)-1],
        fill=True,
        fill_color=rainbow[int(cluster)-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters

In [141]:
label1 = newyork_merged[newyork_merged['Cluster Labels'] == 0].iloc[:, [0]+list(range(5, newyork_merged.shape[1]))]
label1['1st Most Common Venue'].value_counts()[:5]

Italian Restaurant    19
Coffee Shop           16
Bar                   12
Park                   7
Café                   5
Name: 1st Most Common Venue, dtype: int64

In [212]:
import matplotlib.pyplot as plt

%matplotlib inline

def summaryLabels(merged, order, plot=False):
    orders = ['st', 'nd', 'rd']
    labels = list(set(merged['Cluster Labels']))
    #print(labels)
    columns = [list(merged.columns).index('Borough')] + list(range(list(merged.columns).index('1st Most Common Venue'), merged.shape[1]))
    for i in labels:
        print('Label '+ str(i) + ':')
        label = merged.loc[merged['Cluster Labels'] == i].iloc[:, columns]
        value = label['Borough'].value_counts()
        if plot:
            value.plot(kind='bar')
            plt.title('Borough distribution')
            plt.xlabel('Borough')
            plt.ylabel('Count')
            for x, y in enumerate(list(value)):
                plt.text(x=x, y=y, s=str(y), horizontalalignment='center')
            plt.show()
        print('Borough distribution: ',dict(value))
        
        for j in range(1, order+1):
            try:
                if plot:
                    value = label[str(j)+orders[j-1] +' Most Common Venue'].value_counts()
                    value[:5].plot(kind='bar')
                    plt.title(str(j)+orders[j-1] + ' common venue distribution')
                    plt.xlabel('Venue')
                    plt.ylabel('Count')
                    for x, y in enumerate(list(value)[:5]):
                        plt.text(x=x, y=y, s=str(y), horizontalalignment='center')
                    plt.show()
                print(str(j)+orders[j-1] +' Most Common Venue:', dict(label[str(j)+orders[j-1] +' Most Common Venue'].value_counts()[:5]))
            except:
                if plot:
                    value = label[str(j)+'th' +' Most Common Venue'].value_counts()
                    value[:5].plot(kind='bar')
                    plt.title(str(j)+'th' + ' common venue distribution')
                    plt.xlabel('Venue')
                    plt.ylabel('Count')
                    for x, y in enumerate(list(value)[:5]):
                        plt.text(x=x, y=y, s=str(y), horizontalalignment='center')
                    plt.show()
                print(str(j)+'th' +' Most Common Venue:', dict(label[str(j)+'th' +' Most Common Venue'].value_counts()[:5]))


In [213]:
summaryLabels(newyork_merged, 3)


Label 0:
Borough distribution:  {'Manhattan': 40, 'Brooklyn': 37, 'Queens': 29, 'Staten Island': 18, 'Bronx': 7}
1st Most Common Venue: {'Italian Restaurant': 19, 'Coffee Shop': 16, 'Bar': 12, 'Park': 7, 'Café': 5}
2nd Most Common Venue: {'Coffee Shop': 13, 'Pizza Place': 8, 'Bakery': 8, 'Italian Restaurant': 7, 'Hotel': 5}
3rd Most Common Venue: {'Pizza Place': 13, 'Italian Restaurant': 9, 'Coffee Shop': 7, 'Café': 7, 'Sandwich Place': 6}
Label 1:
Borough distribution:  {'Bronx': 40, 'Queens': 37, 'Brooklyn': 32, 'Staten Island': 20}
1st Most Common Venue: {'Pizza Place': 25, 'Chinese Restaurant': 12, 'Caribbean Restaurant': 10, 'Pharmacy': 9, 'Bank': 9}
2nd Most Common Venue: {'Pizza Place': 10, 'Grocery Store': 8, 'Sandwich Place': 7, 'Bus Station': 7, 'Donut Shop': 7}
3rd Most Common Venue: {'Pizza Place': 10, 'Deli / Bodega': 8, 'Donut Shop': 8, 'Pharmacy': 6, 'Grocery Store': 6}
Label 2:
Borough distribution:  {'Staten Island': 20, 'Queens': 13, 'Bronx': 4, 'Brooklyn': 1}
1st Mos

In [214]:
summaryLabels(toronto_merged, 3)


Label 0:
Borough distribution:  {'Central Toronto': 3, 'Downtown Toronto': 1}
1st Most Common Venue: {'Park': 4}
2nd Most Common Venue: {'Trail': 3, 'Swim School': 1}
3rd Most Common Venue: {'Bus Line': 1, 'Restaurant': 1, 'Playground': 1, 'Sushi Restaurant': 1}
Label 2:
Borough distribution:  {'Downtown Toronto': 18, 'Central Toronto': 6, 'West Toronto': 6, 'East Toronto': 5}
1st Most Common Venue: {'Coffee Shop': 14, 'Café': 5, 'Sandwich Place': 3, 'Clothing Store': 2, 'Park': 1}
2nd Most Common Venue: {'Café': 7, 'Coffee Shop': 7, 'Sushi Restaurant': 2, 'Airport Lounge': 1, 'Performing Arts Venue': 1}
3rd Most Common Venue: {'Café': 4, 'Restaurant': 4, 'Bakery': 3, 'Park': 2, 'Sushi Restaurant': 2}


### Toronto:
We can see from the above result, most area are labeled as cluster 2, including Downtown Toronto, Central Toronto, West Toronto and East Toronto, most of venues are coffee shop, cafe and restaurant, which indicates coffee shop have some market in these areas.

### New York
Italian Restaurant, Coffee Shop and Bar are the most common venues in cluster 0 areas, which are mainly Manhattan, Brooklyn, Queens, Staten Island.
While cluster 1 areas including Bronx, Queens, Brooklyn, Staten Island has a lot of Pizza Place, Chinese Restaurant, Caribbean Restaurant, Pharmacy, Bank and Grocery Store.
Cluster 2 areas are mainly Staten Island and Queens, where many Deli and Italian restaurants are built there.