## Step 0: Download all the libraries needed in this project

In [91]:
import numpy as np 
import pandas as pd 
import json # library to handle JSON files

#!conda install -c conda-forge geopy --yes
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

#!conda install -c conda-forge folium=0.5.0 --yes # uncomment this line if you haven't completed the Foursquare API lab
import folium # map rendering library
from   folium.plugins import MarkerCluster

import requests # library to handle requests
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe

import matplotlib.cm as cm
import matplotlib.colors as colors

#import k-means from clustering stage
from sklearn.cluster import KMeans


## Step 1: Let's download and explore [NYC neighbourhood geography data]("https://geo.nyu.edu/catalog/nyu_2451_34572") 

In [92]:
with open('ny-geojson.json') as json_data:
    ny_geometry = json.load(json_data)
#ny_geometry

##### It looks we the "features" data is what we wanted.

In [93]:
nbh_geometry = ny_geometry['features']
#nbh_geometry[0]

##### Transform nbh_geometry into a pandas dataframe

In [94]:
# Instantiate the dataframe
ny_nbhs = pd.DataFrame(columns=['Borough', 'Neighborhood', 'Latitude', 'Longitude'])

# Go through the data and fill the dataframe one row at a time.
for data in nbh_geometry:
    borough = data['properties']['borough'] 
    nbh_name = data['properties']['name']
        
    nbh_coordinates = data['geometry']['coordinates']
    nbh_latitude = nbh_coordinates[1]
    nbh_lontitude = nbh_coordinates[0]
    
    ny_nbhs = ny_nbhs.append({'Borough': borough,
                              'Neighborhood': nbh_name,
                              'Latitude': nbh_latitude,
                              'Longitude': nbh_lontitude}, ignore_index=True)
print(ny_nbhs.shape)
ny_nbhs.head(10)

(306, 4)


Unnamed: 0,Borough,Neighborhood,Latitude,Longitude
0,Bronx,Wakefield,40.894705,-73.847201
1,Bronx,Co-op City,40.874294,-73.829939
2,Bronx,Eastchester,40.887556,-73.827806
3,Bronx,Fieldston,40.895437,-73.905643
4,Bronx,Riverdale,40.890834,-73.912585
5,Bronx,Kingsbridge,40.881687,-73.902818
6,Manhattan,Marble Hill,40.876551,-73.91066
7,Bronx,Woodlawn,40.898273,-73.867315
8,Bronx,Norwood,40.877224,-73.879391
9,Bronx,Williamsbridge,40.881039,-73.857446


In [95]:
# Let's see how many boroughs and neighborhoods in our data
print(" There are {} Boroughs and {} Neighborhoods in New York city. ".format(len(ny_nbhs['Borough'].unique()), ny_nbhs.shape[0]))


 There are 5 Boroughs and 306 Neighborhoods in New York city. 


#####  Here we choose Manhattan to explore in this project as an example

In [96]:
borough='Manhattan'
Manhattan_df = ny_nbhs[ny_nbhs['Borough']==borough].reset_index(drop=True)
print("There are {} neighborhoods in borough Manhattan".format(Manhattan_df.shape[0]))
Manhattan_df.head(6)

There are 40 neighborhoods in borough Manhattan


Unnamed: 0,Borough,Neighborhood,Latitude,Longitude
0,Manhattan,Marble Hill,40.876551,-73.91066
1,Manhattan,Chinatown,40.715618,-73.994279
2,Manhattan,Washington Heights,40.851903,-73.9369
3,Manhattan,Inwood,40.867684,-73.92121
4,Manhattan,Hamilton Heights,40.823604,-73.949688
5,Manhattan,Manhattanville,40.816934,-73.957385


In [97]:
borough='Manhattan'
Brgh_df = ny_nbhs[ny_nbhs['Borough']==borough].reset_index(drop=True)
Brgh_df.drop(['Borough'], axis=1, inplace = True)
print(Brgh_df.shape)
Brgh_df.head(10)

(40, 3)


Unnamed: 0,Neighborhood,Latitude,Longitude
0,Marble Hill,40.876551,-73.91066
1,Chinatown,40.715618,-73.994279
2,Washington Heights,40.851903,-73.9369
3,Inwood,40.867684,-73.92121
4,Hamilton Heights,40.823604,-73.949688
5,Manhattanville,40.816934,-73.957385
6,Central Harlem,40.815976,-73.943211
7,East Harlem,40.792249,-73.944182
8,Upper East Side,40.775639,-73.960508
9,Yorkville,40.77593,-73.947118


In [98]:
print(" There are {} Neighborhoods in {}. ".format(Brgh_df.shape[0],borough))

 There are 40 Neighborhoods in Manhattan. 


In [99]:
geolocator = Nominatim(user_agent="nyc_agent")
Manh_loc = geolocator.geocode('Manhattan, NY')
Manh_lat = Manh_loc.latitude
Manh_lng = Manh_loc.longitude
print('The coordinate of Manhattan is {}, {}.'.format(Manh_lat, Manh_lng))

The coordinate of Manhattan is 40.7896239, -73.9598939.


In [100]:
# create a map of NYC
nyc_map = folium.Map(location=[Manh_lat, Manh_lng], zoom_start=11)

# add markers to map
for lat, lng, neighborhood in zip(Brgh_df['Latitude'], Brgh_df['Longitude'], Brgh_df['Neighborhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(nyc_map)  
    
nyc_map

## Step 2:  Let's leverage the Foursquare API to explore the selected neighborhoods and segment them.

##### Step 2.1  Set up the URL for Foursquare request

In [101]:
LIMIT = 500
radius = 1000

USER_ID = 'VBJRAO5IMFVHJ15014BHKY1WXSLLA1JVQ0L0HQGMAIF4PQLH' 
USER_SECRET = 'GFNHX055SNPUJUVPMUENX3JQKZ5MGXQ5TWQKBHEROMXTR5L3' 
VERSION = '20180605' 


##### Step 2.2  Define functions to extract the category of the venue and venues

In [102]:
def get_category_type(row):
    try:
        categories_list = row['categories']
    except:
        categories_list = row['venue.categories']
        
    if len(categories_list) == 0:
        return None
    else:
        return categories_list[0]['name']

In [103]:
def get_Nbh_Venues(names, latitudes, longitudes, radius=500):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):        
        
        #create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            USER_ID, 
            USER_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)            
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood', 'Venue', 'Latitude', 'Longitude', 'Category']
    
    return(nearby_venues)

##### Step 2.3  Explore venues in burough Manhattan 

In [106]:
nbh_venues = get_Nbh_Venues(names=Brgh_df['Neighborhood'],
                            latitudes=Brgh_df['Latitude'],
                            longitudes=Brgh_df['Longitude'])

print(nbh_venues.shape)
nbh_venues.head(6)

(3151, 5)


Unnamed: 0,Neighborhood,Venue,Latitude,Longitude,Category
0,Marble Hill,Arturo's,40.874412,-73.910271,Pizza Place
1,Marble Hill,Bikram Yoga,40.876844,-73.906204,Yoga Studio
2,Marble Hill,Tibbett Diner,40.880404,-73.908937,Diner
3,Marble Hill,Starbucks,40.877531,-73.905582,Coffee Shop
4,Marble Hill,Dunkin',40.877136,-73.906666,Donut Shop
5,Marble Hill,Rite Aid,40.875467,-73.908906,Pharmacy


In [107]:
#print(nbh_venues['Category'].unique())

In [108]:
# Extract all of hotel venues as hotel dataframe
htl_df = nbh_venues.loc[nbh_venues["Category"]=="Hotel"].reset_index(drop=True)
print(htl_df.shape)
htl_df.head(6)

(60, 5)


Unnamed: 0,Neighborhood,Venue,Latitude,Longitude,Category
0,Chinatown,Hotel 50 Bowery NYC,40.715936,-73.996789,Hotel
1,Hamilton Heights,Hotel San Fermin B&B,40.822566,-73.944624,Hotel
2,Upper East Side,The Carlyle,40.774413,-73.963301,Hotel
3,Upper East Side,The Surrey,40.774415,-73.963889,Hotel
4,Upper West Side,The Lucerne Hotel,40.783427,-73.978495,Hotel
5,Lincoln Square,The Empire Hotel,40.771545,-73.98263,Hotel


In [109]:
#nbh_venues.groupby('Neighborhood').count().head(6)

In [110]:
#extract all restaurant venues and saved as rst_df
rst_df = nbh_venues[nbh_venues['Category'].str.contains('Restaurant')].reset_index(drop=True)
rst_df['Regional_Style'] = rst_df['Category']
print(rst_df.shape)
rst_df.head(6)

(869, 6)


Unnamed: 0,Neighborhood,Venue,Latitude,Longitude,Category,Regional_Style
0,Marble Hill,Land & Sea Restaurant,40.877885,-73.905873,Seafood Restaurant,Seafood Restaurant
1,Marble Hill,Boston Market,40.87743,-73.905412,American Restaurant,American Restaurant
2,Chinatown,Kiki's,40.714476,-73.992036,Greek Restaurant,Greek Restaurant
3,Chinatown,Spicy Village,40.71701,-73.99353,Chinese Restaurant,Chinese Restaurant
4,Chinatown,The Fat Radish,40.715323,-73.99195,English Restaurant,English Restaurant
5,Chinatown,Wah Fung Number 1 Fast Food 華豐快飯店,40.717278,-73.994177,Chinese Restaurant,Chinese Restaurant


In [111]:
# Let's see how many restaurant categories in this dataframe.
print('There are {} uniques categories.'.format(len(rst_df['Category'].unique())))
#print(rst_df['Category'].unique())

There are 77 uniques categories.


In [112]:
#re ogrizate some restaurant categories
rst_df.loc[(rst_df.Regional_Style == 'New American Restaurant'),'Regional_Style']='American Restaurant'
rst_df.loc[(rst_df.Regional_Style == 'Sushi Restaurant'),'Regional_Style']='Japanese Restaurant'
rst_df.loc[(rst_df.Regional_Style == 'Japanese Curry Restaurant'),'Regional_Style']='Japanese Restaurant'
rst_df.loc[(rst_df.Regional_Style == 'Hotpot Restaurant'),'Regional_Style']='Chinese Restaurant'
rst_df.loc[(rst_df.Regional_Style == 'Dim Sum Restaurant'),'Regional_Style']='Chinese Restaurant'
rst_df.loc[(rst_df.Regional_Style == 'Dumpling Restaurant'),'Regional_Style']='Chinese Restaurant'
rst_df.loc[(rst_df.Regional_Style == 'Shanghai Restaurant'),'Regional_Style']='Chinese Restaurant'
rst_df.loc[(rst_df.Regional_Style == 'Taiwanese Restaurant'),'Regional_Style']='Chinese Restaurant'
rst_df.loc[(rst_df.Regional_Style == 'Cantonese Restaurant'),'Regional_Style']='Chinese Restaurant'
rst_df.loc[(rst_df.Regional_Style == 'Ramen Restaurant'),'Regional_Style']='Chinese Restaurant'
rst_df.loc[(rst_df.Regional_Style == 'Tapas Restaurant'),'Regional_Style']='Chinese Restaurant'
rst_df.loc[(rst_df.Regional_Style == 'Szechuan Restaurant'),'Regional_Style']='Chinese Restaurant'
rst_df.head(6)

Unnamed: 0,Neighborhood,Venue,Latitude,Longitude,Category,Regional_Style
0,Marble Hill,Land & Sea Restaurant,40.877885,-73.905873,Seafood Restaurant,Seafood Restaurant
1,Marble Hill,Boston Market,40.87743,-73.905412,American Restaurant,American Restaurant
2,Chinatown,Kiki's,40.714476,-73.992036,Greek Restaurant,Greek Restaurant
3,Chinatown,Spicy Village,40.71701,-73.99353,Chinese Restaurant,Chinese Restaurant
4,Chinatown,The Fat Radish,40.715323,-73.99195,English Restaurant,English Restaurant
5,Chinatown,Wah Fung Number 1 Fast Food 華豐快飯店,40.717278,-73.994177,Chinese Restaurant,Chinese Restaurant


In [113]:
print('There are {} uniques categories.'.format(len(rst_df['Regional_Style'].unique())))
print(rst_df['Regional_Style'].unique())

There are 65 uniques categories.
['Seafood Restaurant' 'American Restaurant' 'Greek Restaurant'
 'Chinese Restaurant' 'English Restaurant' 'Asian Restaurant'
 'Spanish Restaurant' 'Thai Restaurant' 'Korean Restaurant'
 'Malay Restaurant' 'Vietnamese Restaurant' 'Mexican Restaurant'
 'Austrian Restaurant' 'Vegetarian / Vegan Restaurant' 'Restaurant'
 'Italian Restaurant' 'Caribbean Restaurant' 'Indian Restaurant'
 'Latin American Restaurant' 'Japanese Restaurant' 'Arepa Restaurant'
 'Fast Food Restaurant' 'Empanada Restaurant' 'Mediterranean Restaurant'
 'Falafel Restaurant' 'Cuban Restaurant' 'French Restaurant'
 'Ethiopian Restaurant' 'African Restaurant'
 'Southern / Soul Food Restaurant' 'German Restaurant'
 'Peruvian Restaurant' 'Turkish Restaurant' 'Czech Restaurant'
 'Afghan Restaurant' 'Middle Eastern Restaurant' 'Israeli Restaurant'
 'Caucasian Restaurant' 'South American Restaurant'
 'Scandinavian Restaurant' 'Hawaiian Restaurant' 'Jewish Restaurant'
 'Paella Restaurant' 'Udon

### Step 3, Sort the neighborhoods and the most common restaurant avenues

##### Step 3.1  Count and sort the total restaurant venues by descending order

**Usually visitor would like a neighborhood including the more restaurants the better, So we group venues by neighborhood and then count the total restaurant venues and sort the count number by descending order.**

In [114]:
nbh_group = rst_df.groupby(['Neighborhood']).count()
nbh_group.reset_index(inplace=True)
nbh_group.sort_values(by=['Venue'], inplace=True, ascending=False)
nbh_group = nbh_group[['Neighborhood','Venue']]
nbh_group.columns = ['Neighborhood','Venues Sum']
print(nbh_group.shape)
nbh_group.head(8)

(39, 2)


Unnamed: 0,Neighborhood,Venues Sum
12,Greenwich Village,44
4,Chinatown,37
27,Noho,36
35,Upper West Side,35
8,East Village,35
18,Little Italy,32
26,Murray Hill,31
24,Midtown South,31


##### Step 3.2 Remove those neighborhoons has less 10 restaurants

##### Step 3.3 To analyze the categories of restaurants in each neiboughhoon. we use pandas's get_dummies function to one hot encode the restaurant categories into new dataframe as venues_onehot , in which you can see every neighborhood's count on every restaurant category

In [115]:
# one hot encoding
venues_onehot = pd.get_dummies(rst_df[['Regional_Style']], prefix="",prefix_sep="")
# add neighborhood column back to dataframe
venues_onehot['Neighborhood'] = rst_df['Neighborhood']
# move neighborhood column to the first column
fixed_columns = [venues_onehot.columns[-1]] + list(venues_onehot.columns[:-1])
# check intermidiate variable
# print(fixed_columns)
venues_onehot = venues_onehot[fixed_columns]
venues_cata_group = venues_onehot.groupby('Neighborhood').sum().reset_index()
print(venues_cata_group.shape)
venues_cata_group.head(6)

(39, 66)


Unnamed: 0,Neighborhood,Afghan Restaurant,African Restaurant,American Restaurant,Arepa Restaurant,Argentinian Restaurant,Asian Restaurant,Australian Restaurant,Austrian Restaurant,Brazilian Restaurant,...,Southern / Soul Food Restaurant,Spanish Restaurant,Swiss Restaurant,Thai Restaurant,Theme Restaurant,Turkish Restaurant,Udon Restaurant,Vegetarian / Vegan Restaurant,Venezuelan Restaurant,Vietnamese Restaurant
0,Battery Park City,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,Carnegie Hill,0,0,2,0,1,0,0,0,0,...,0,0,0,0,0,0,0,1,0,2
2,Central Harlem,0,3,2,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
3,Chelsea,0,0,5,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
4,Chinatown,0,0,4,0,0,2,0,1,0,...,0,1,0,1,0,0,0,1,0,3
5,Civic Center,0,0,4,0,0,1,1,0,0,...,0,0,0,0,0,0,0,1,0,0


In [116]:
nbh_venues_group = venues_onehot.groupby('Neighborhood').mean().reset_index()
nbh_venues_group = nbh_venues_group.round(3)
nbh_venues_group.head(6)

Unnamed: 0,Neighborhood,Afghan Restaurant,African Restaurant,American Restaurant,Arepa Restaurant,Argentinian Restaurant,Asian Restaurant,Australian Restaurant,Austrian Restaurant,Brazilian Restaurant,...,Southern / Soul Food Restaurant,Spanish Restaurant,Swiss Restaurant,Thai Restaurant,Theme Restaurant,Turkish Restaurant,Udon Restaurant,Vegetarian / Vegan Restaurant,Venezuelan Restaurant,Vietnamese Restaurant
0,Battery Park City,0.0,0.0,0.167,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,Carnegie Hill,0.0,0.0,0.087,0.0,0.043,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.043,0.0,0.087
2,Central Harlem,0.0,0.2,0.133,0.0,0.0,0.0,0.0,0.0,0.0,...,0.067,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,Chelsea,0.0,0.0,0.25,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.05,0.0,0.0,0.0,0.0,0.0,0.0
4,Chinatown,0.0,0.0,0.108,0.0,0.0,0.054,0.0,0.027,0.0,...,0.0,0.027,0.0,0.027,0.0,0.0,0.0,0.027,0.0,0.081
5,Civic Center,0.0,0.0,0.19,0.0,0.0,0.048,0.048,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.048,0.0,0.0


In [117]:
def create_columns_(num_top_venues):
    
    indicators = ['st', 'nd', 'rd']    
    # create columns according to number of top venues
    cols = ['Venues Sum','Neighborhood']
    for ind in np.arange(num_top_venues):
        try:
            cols.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
        except:
            cols.append('{}th Most Common Venue'.format(ind+1)) 
    return cols

In [118]:

#Sort every neighborhood ordered by the most common venues
def sort_nbhs_venues(pre_sort_group, nbhs_group_sum):    
    
    num_top_venues = 8
    
    # create a new dataframe
    cols = create_columns_(num_top_venues = num_top_venues)
    nbhs_sorted = pd.DataFrame(columns=cols)
    nbhs_sorted['Neighborhood'] = pre_sort_group['Neighborhood']
    
    # Go through every neighborhood to sort the most common restaurant venues, then update them to nbhs_sorted dataframe
    for ind in range(pre_sort_group.shape[0]):
        
        #get the venues sum value of current neighboorhood
        sum_row = nbhs_group_sum.loc[nbhs_group_sum['Neighborhood'] == pre_sort_group.loc[ind,'Neighborhood']]
        nbhs_sorted.iloc[ind, 0] = sum_row.iloc[0,1] 
        
        # sort the most common restaurant venues
        temp_in = pre_sort_group.loc[ind].T.reset_index()       
        temp_in = temp_in.loc[1:]
        temp_in.columns = ['Venue','Num']      
        temp_in['Num'] = temp_in['Num'].astype(float)
        
        temp_out = temp_in.sort_values('Num', ascending=False).reset_index(drop=True).head(num_top_venues) 
        temp_out['Num'] = temp_out['Num'].astype(str)
        temp_out['Combined'] = temp_out['Venue']+"_"+temp_out['Num']
        temp_out['Combined'].astype(str)   
        temp_out = temp_out.T.reset_index()
        
        #Update to the sorted row        
        nbhs_sorted.iloc[ind, 2:] = list(temp_out.iloc[2,1:])
    
    nbhs_sorted.set_index('Venues Sum',inplace = True)
    nbhs_sorted.sort_values('Venues Sum', ascending=False,inplace = True)
    nbhs_sorted.reset_index(inplace = True)
    return(nbhs_sorted)

##### Step 3.4 Sort every neighborhood by the most common venues

In [119]:
nbhs_sorted =sort_nbhs_venues(pre_sort_group = venues_cata_group, nbhs_group_sum = nbh_group)
print(nbhs_sorted.shape)
nbhs_sorted.head(6)

(39, 10)


Unnamed: 0,Venues Sum,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue
0,44,Greenwich Village,Italian Restaurant_10.0,Chinese Restaurant_5.0,Japanese Restaurant_5.0,American Restaurant_3.0,Indian Restaurant_3.0,Seafood Restaurant_2.0,French Restaurant_2.0,Vietnamese Restaurant_2.0
1,37,Chinatown,Chinese Restaurant_16.0,American Restaurant_4.0,Vietnamese Restaurant_3.0,Malay Restaurant_2.0,Greek Restaurant_2.0,Mexican Restaurant_2.0,Asian Restaurant_2.0,Austrian Restaurant_1.0
2,36,Noho,Italian Restaurant_6.0,American Restaurant_5.0,Japanese Restaurant_5.0,Mexican Restaurant_3.0,French Restaurant_2.0,Southern / Soul Food Restaurant_2.0,Venezuelan Restaurant_1.0,Vegetarian / Vegan Restaurant_1.0
3,35,Upper West Side,Italian Restaurant_5.0,Indian Restaurant_3.0,Japanese Restaurant_3.0,Chinese Restaurant_2.0,American Restaurant_2.0,Vegetarian / Vegan Restaurant_2.0,Thai Restaurant_2.0,Seafood Restaurant_2.0
4,35,East Village,Chinese Restaurant_5.0,Mexican Restaurant_5.0,Korean Restaurant_4.0,Japanese Restaurant_3.0,Italian Restaurant_2.0,Filipino Restaurant_2.0,Vietnamese Restaurant_2.0,Arepa Restaurant_1.0
5,32,Little Italy,Chinese Restaurant_8.0,Italian Restaurant_4.0,Thai Restaurant_3.0,Mediterranean Restaurant_3.0,Seafood Restaurant_2.0,Japanese Restaurant_2.0,Cuban Restaurant_1.0,Vegetarian / Vegan Restaurant_1.0


In [120]:
nbhs_sorted =sort_nbhs_venues(pre_sort_group = nbh_venues_group, nbhs_group_sum = nbh_group)
print(nbhs_sorted.shape)
nbhs_sorted.head(6)

(39, 10)


Unnamed: 0,Venues Sum,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue
0,44,Greenwich Village,Italian Restaurant_0.227,Chinese Restaurant_0.114,Japanese Restaurant_0.114,American Restaurant_0.068,Indian Restaurant_0.068,Seafood Restaurant_0.045,French Restaurant_0.045,Vietnamese Restaurant_0.045
1,37,Chinatown,Chinese Restaurant_0.432,American Restaurant_0.108,Vietnamese Restaurant_0.081,Malay Restaurant_0.054,Greek Restaurant_0.054,Mexican Restaurant_0.054,Asian Restaurant_0.054,Austrian Restaurant_0.027
2,36,Noho,Italian Restaurant_0.167,American Restaurant_0.139,Japanese Restaurant_0.139,Mexican Restaurant_0.083,French Restaurant_0.056,Southern / Soul Food Restaurant_0.056,Venezuelan Restaurant_0.028,Vegetarian / Vegan Restaurant_0.028
3,35,Upper West Side,Italian Restaurant_0.143,Indian Restaurant_0.086,Japanese Restaurant_0.086,Chinese Restaurant_0.057,American Restaurant_0.057,Vegetarian / Vegan Restaurant_0.057,Thai Restaurant_0.057,Seafood Restaurant_0.057
4,35,East Village,Chinese Restaurant_0.143,Mexican Restaurant_0.143,Korean Restaurant_0.114,Japanese Restaurant_0.086,Italian Restaurant_0.057,Filipino Restaurant_0.057,Vietnamese Restaurant_0.057,Arepa Restaurant_0.029
5,32,Little Italy,Chinese Restaurant_0.25,Italian Restaurant_0.125,Thai Restaurant_0.094,Mediterranean Restaurant_0.094,Seafood Restaurant_0.062,Japanese Restaurant_0.062,Cuban Restaurant_0.031,Vegetarian / Vegan Restaurant_0.031


### Step 4,  Cluster neighborhoods

In [121]:
num_top_neighborhood = 10
# Interested restaurant catagories
seleted_cols = ["Chinese Restaurant","Seafood Restaurant","Japanese Restaurant"]
venues_cata_group.set_index("Neighborhood", inplace=True)
seleted_cates = venues_cata_group.loc[:,seleted_cols]
seleted_cates['Sub Total'] = seleted_cates.sum(axis=1)
seleted_cates.sort_values('Sub Total', ascending=False, inplace = True)
seleted_cates = seleted_cates.reset_index()
seleted_cates.head(num_top_neighborhood)

Unnamed: 0,Neighborhood,Chinese Restaurant,Seafood Restaurant,Japanese Restaurant,Sub Total
0,Chinatown,16,0,0,16
1,Greenwich Village,5,2,5,12
2,Little Italy,8,2,2,12
3,Turtle Bay,0,3,7,10
4,Murray Hill,4,1,5,10
5,Yorkville,1,0,7,8
6,East Village,5,0,3,8
7,Manhattanville,4,2,2,8
8,Upper West Side,2,2,3,7
9,Tudor City,1,2,4,7


In [122]:
#selected neighborhoods (sltd_nbhs) list
sltd_nbhs = list(seleted_cates.loc[0:num_top_neighborhood-1, "Neighborhood"])
print(sltd_nbhs)

sltd_rst = rst_df.loc[rst_df['Neighborhood'].isin(sltd_nbhs)]
sltd_rst = sltd_rst.loc[sltd_rst['Regional_Style'].isin(seleted_cols)].reset_index(drop='True')
print(sltd_rst.shape)
sltd_rst.head(6)

['Chinatown', 'Greenwich Village', 'Little Italy', 'Turtle Bay', 'Murray Hill', 'Yorkville', 'East Village', 'Manhattanville', 'Upper West Side', 'Tudor City']
(98, 6)


Unnamed: 0,Neighborhood,Venue,Latitude,Longitude,Category,Regional_Style
0,Chinatown,Spicy Village,40.71701,-73.99353,Chinese Restaurant,Chinese Restaurant
1,Chinatown,Wah Fung Number 1 Fast Food 華豐快飯店,40.717278,-73.994177,Chinese Restaurant,Chinese Restaurant
2,Chinatown,Da Yu Hot Pot 大渝火锅,40.716735,-73.995752,Hotpot Restaurant,Chinese Restaurant
3,Chinatown,Xi'an Famous Foods,40.715232,-73.997263,Chinese Restaurant,Chinese Restaurant
4,Chinatown,"Happy Lamb Hot Pot, Manhattan",40.717639,-73.995187,Hotpot Restaurant,Chinese Restaurant
5,Chinatown,Great N.Y. Noodletown,40.715039,-73.996941,Chinese Restaurant,Chinese Restaurant


In [123]:
#selected neighborhoods geography dataframe(sltd_nbhs_df)
sltd_df = Brgh_df.loc[Brgh_df["Neighborhood"].isin(sltd_nbhs)].reset_index(drop='True')
sltd_df

Unnamed: 0,Neighborhood,Latitude,Longitude
0,Chinatown,40.715618,-73.994279
1,Manhattanville,40.816934,-73.957385
2,Yorkville,40.77593,-73.947118
3,Upper West Side,40.787658,-73.977059
4,Murray Hill,40.748303,-73.978332
5,Greenwich Village,40.726933,-73.999914
6,East Village,40.727847,-73.982226
7,Little Italy,40.719324,-73.997305
8,Turtle Bay,40.752042,-73.967708
9,Tudor City,40.746917,-73.971219


In [124]:
sltd_htl = nbh_venues.loc[nbh_venues["Category"]=="Hotel"].reset_index(drop=True)
sltd_htl = sltd_htl.loc[sltd_htl["Neighborhood"].isin(sltd_nbhs)].reset_index(drop=True)
print(sltd_htl.shape)
sltd_htl.head(6)

(11, 5)


Unnamed: 0,Neighborhood,Venue,Latitude,Longitude,Category
0,Chinatown,Hotel 50 Bowery NYC,40.715936,-73.996789,Hotel
1,Upper West Side,The Lucerne Hotel,40.783427,-73.978495,Hotel
2,Murray Hill,"The Renwick Hotel, Curio Collection by Hilton",40.750184,-73.977604,Hotel
3,Murray Hill,Shelburne Hotel & Suites by Affinia,40.748419,-73.97794,Hotel
4,Murray Hill,The William,40.750673,-73.980077,Hotel
5,Murray Hill,"The Langham, New York, Fifth Avenue",40.750144,-73.983532,Hotel


### Cast selected restaurants  and neighborhoods on the map

In [125]:
from geopy.distance import distance
distance_m = 350

dtypes = np.dtype([
          ('htl_venue', str),
          ('count', int),
          ('rst_venues', str),
          ('Latitude', float),
          ('Longitude', float)
          ])
data = np.empty(0, dtype=dtypes)
sltd_htl = pd.DataFrame(data)
sltd_htl['htl_venue'] = htl_df['Venue']
sltd_htl.set_index('htl_venue', inplace=True)


for htl_lat, htl_lng, htl_venue in zip(htl_df['Latitude'], htl_df['Longitude'],htl_df['Venue']):
       
    rst_venue_str = ""
    num = 0
    for rst_lat, rst_lng, rst_venue in zip(sltd_rst['Latitude'], sltd_rst['Longitude'],sltd_rst['Venue']):
        d = distance((htl_lat, htl_lng), (rst_lat, rst_lng)).m
        if (d < distance_m):
            rst_venue_str = rst_venue_str +','+rst_venue
            num = num + 1
    
    sltd_htl.loc[htl_venue] = [num, rst_venue_str, htl_lat, htl_lng]
    
sltd_htl.sort_values('count', ascending=False, inplace = True)
sltd_htl.drop_duplicates(inplace = True)
sltd_htl.reset_index(inplace = True)
rcmd_htl = sltd_htl[sltd_htl['count']>0] 
rcmd_htl.reset_index(inplace = True)

##### Run k-means to cluster the Borough into 5 clusters.

In [126]:
print(rcmd_htl.shape)
rcmd_htl.head(6)

(14, 6)


Unnamed: 0,index,htl_venue,count,rst_venues,Latitude,Longitude
0,0,Hotel 50 Bowery NYC,23.0,",Spicy Village,Wah Fung Number 1 Fast Food 華豐快...",40.715936,-73.996789
1,1,"The Renwick Hotel, Curio Collection by Hilton",13.0,",Kajitsu,Sushi Ryusei,Little Alley,Tempura Mat...",40.750184,-73.977604
2,2,Shelburne Hotel & Suites by Affinia,12.0,",Kajitsu,Sushi Ryusei,Little Alley,Tempura Mat...",40.748419,-73.97794
3,3,citizenM Bowery,5.0,",99 Favor Taste 99號餐廳,Yi Ji Shi Mo Noodle Corp...",40.720599,-73.993574
4,4,The William,5.0,",Kajitsu,Momosan Ramen & Sake,Omusubi Gonbei,C...",40.750673,-73.980077
5,5,Millennium Hilton New York One UN Plaza,4.0,",KaoruMC,Pescatore Seafood,Curry-Ya,Pescatore ...",40.750399,-73.96905


In [151]:
# create a map of NYC centered based on Manhantten's latitude and longitude.  
nyc_map = folium.Map(location=[Manh_lat, Manh_lng], zoom_start=11)

# add selected restaurants as orange circle markers to map
for lat, lng, venue in zip(sltd_rst['Latitude'], sltd_rst['Longitude'],sltd_rst['Venue']): 
    label = folium.Popup(venue, parse_html=True)
    icon = folium.Icon(color='pink',icon='utensils-alt', prefix ='fa')
    folium.Marker([lat, lng], popup=label,  icon= icon ).add_to(nyc_map) 
 

In [152]:

# add selected hotels as green circle markers to map
for cnt, lat, lng, venue in zip(sltd_htl['count'], sltd_htl['Latitude'], sltd_htl['Longitude'],sltd_htl['htl_venue']):
    
    if(cnt >0):
        icon = folium.Icon(color='green',icon='bed',prefix ='fa')
    else:
        icon = folium.Icon(color='lightgray',icon='bed',prefix='fa')                         
    
    folium.Marker([lat, lng], popup= "@" + venue,  icon= icon).add_to(nyc_map)

nyc_map