This notebook contains the code for the Capstone Project assignment.

Import Libraries:

In [None]:
import json # library to handle JSON files
import os # access to environment variables (API credentials)

import numpy as np # library to handle data in a vectorized manner

import pandas as pd # library for data analsysis
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

import requests # library to handle requests

import matplotlib.pyplot as plt

from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

from sklearn.cluster import KMeans # import k-means from clustering stage

import folium # map rendering library

Load the spatial dataset for New York City (taken from the 'NYU Spatial Data Repository'), extract the relevant spatial information, and create a pandas dataframe.

In [None]:
#define path to dataset location
path01 = 'Raw_Data/newyork_data.json'

#open json file into variable (explicit encoding so the result does not depend on the platform default)
with open(path01, encoding='utf-8') as json_data:
    newyork_data = json.load(json_data)

#extract 'features' data into new variable
neighborhoods_data = newyork_data['features']

#generate a dataframe containing information on Borough, Neighborhood, Lat, Lng
#define the dataframe columns
col_names = ['Borough', 'Neighborhood', 'Latitude', 'Longitude']

#collect one record per neighborhood and build the dataframe in a single step
#(DataFrame.append was removed in pandas 2.0 and copied the whole frame on every call)
#the coordinate pair is read as [longitude, latitude]
neighborhood_records = [
    {
        'Borough': feature['properties']['borough'],
        'Neighborhood': feature['properties']['name'],
        'Latitude': feature['geometry']['coordinates'][1],
        'Longitude': feature['geometry']['coordinates'][0],
    }
    for feature in neighborhoods_data
]

#create the dataframe (passing the columns keeps the layout even if there are no records)
df_NY = pd.DataFrame(neighborhood_records, columns=col_names)

#sort dataframe by Boroughs
df_NY = df_NY.sort_values(by=['Borough'])

#display df
df_NY

Explore how many neighborhoods are in each borough

In [None]:
#tally how many neighborhoods belong to each borough
df_exp_01 = df_NY['Borough'].value_counts()

#bar chart of the neighborhood count per borough
df_exp_01.plot.bar(ylabel='# Neighborhoods')

Display the different neighborhoods on a folium map

In [None]:
#get the coordinates of NY
address = 'New York City, NY'
geolocator = Nominatim(user_agent="ny_explorer")
location = geolocator.geocode(address)

#geocode() yields None when the address cannot be resolved -> stop with a clear message
if location is None:
    raise ValueError('Could not geocode address: {}'.format(address))

latitude_NY = location.latitude
longitude_NY = location.longitude

#generate a map of NY
map_newyork = folium.Map(location=[latitude_NY, longitude_NY], zoom_start=10)

#color scheme for each borough
cd = {'Queens':'Red', 'Brooklyn':'Blue', 'Staten Island':'Green', 'Bronx':'Black', 'Manhattan':'Yellow'}

#add neighborhood markers to the map
for lat, lng, borough, neighborhood in zip(df_NY['Latitude'], df_NY['Longitude'], df_NY['Borough'], df_NY['Neighborhood']):
    #popup text: '<neighborhood>, <borough>'
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)

    #a borough missing from the color scheme raises a KeyError here
    marker_color = cd[str(borough)]

    #parse_html is an option of folium.Popup (set above), so it is not passed to the marker
    folium.CircleMarker(
        [lat, lng],
        radius=3,
        popup=label,
        color=marker_color,
        fill=True,
        fill_color=marker_color,
        fill_opacity=0.7).add_to(map_newyork)

map_newyork

Let's focus on only one of the boroughs: Queens.

In [None]:
#keep only the rows of the borough 'Queens' and renumber the index starting at 0
df_queens = df_NY.loc[df_NY['Borough'].eq('Queens')].reset_index(drop=True)

#(rows, columns) of the Queens subset
df_queens.shape

In [None]:
#get the coordinates of Queens
address = 'Queens, NY'
geolocator = Nominatim(user_agent="queens_explorer")
location = geolocator.geocode(address)

#geocode() yields None when the address cannot be resolved -> stop with a clear message
if location is None:
    raise ValueError('Could not geocode address: {}'.format(address))

latitude_Queens = location.latitude
longitude_Queens = location.longitude

#generate a map of Queens
map_queens = folium.Map(location=[latitude_Queens, longitude_Queens], zoom_start=11)

#add neighborhood markers to the map
for lat, lng, borough, neighborhood in zip(df_queens['Latitude'], df_queens['Longitude'], df_queens['Borough'], df_queens['Neighborhood']):
    #popup text: '<neighborhood>, <borough>'
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)

    #parse_html is an option of folium.Popup (set above), so it is not passed to the marker
    folium.CircleMarker(
        [lat, lng],
        radius=3,
        popup=label,
        color='Green',
        fill=True,
        fill_color='Green',
        fill_opacity=0.7).add_to(map_queens)

map_queens

Initiate Foursquare credentials & settings

In [None]:
#Foursquare ID
#read from the environment variable FOURSQUARE_CLIENT_ID when it is set, so the real
#key does not have to be stored in the notebook; otherwise the placeholder is used
CLIENT_ID = os.environ.get('FOURSQUARE_CLIENT_ID', 'ENTER YOUR CLIENT ID HERE')

#Foursquare Secret (environment variable FOURSQUARE_CLIENT_SECRET, same fallback logic)
CLIENT_SECRET = os.environ.get('FOURSQUARE_CLIENT_SECRET', 'ENTER YOUR CLIENT SECRET HERE')

#Foursquare Version
VERSION = '20180605'

#category ID -> here 'food'
CATEGORY_ID = '4d4b7105d754a06374d81259'

#maximum number of venues requested per API call
#NOTE(review): an earlier remark on this line said the limit "was reduced to 10" because of
#the reduced amount of free calls per day, but the value in use is 50 - confirm which one is intended
LIMIT = 50

Explore venues in all neighborhoods of Queens

In [None]:
#create function
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    """Query the Foursquare 'explore' endpoint once per neighborhood and collect the venues.

    Parameters
    ----------
    names, latitudes, longitudes : iterables of equal length
        Neighborhood name and coordinates; they are consumed in parallel with zip().
    radius : int, default 500
        Value sent as the 'radius' query parameter.

    Returns
    -------
    pandas.DataFrame
        One row per returned venue with the columns 'Neighborhood',
        'Neighborhood Latitude', 'Neighborhood Longitude', 'Venue',
        'Venue Latitude', 'Venue Longitude' and 'Venue Category'.
        The frame is empty (but keeps these columns) when no venue was returned.

    Raises
    ------
    requests.HTTPError
        If the API answers with an error status code.

    Notes
    -----
    Uses the module-level settings CATEGORY_ID, CLIENT_ID, CLIENT_SECRET,
    VERSION and LIMIT.
    """
    venue_columns = ['Neighborhood',
                     'Neighborhood Latitude',
                     'Neighborhood Longitude',
                     'Venue',
                     'Venue Latitude',
                     'Venue Longitude',
                     'Venue Category']
    endpoint = 'https://api.foursquare.com/v2/venues/explore'

    venue_rows = []
    for name, lat, lng in zip(names, latitudes, longitudes):
        #progress output: one line per neighborhood
        print(name)

        #let requests build (and escape) the query string instead of formatting it by hand
        params = {
            'categoryId': CATEGORY_ID,
            'client_id': CLIENT_ID,
            'client_secret': CLIENT_SECRET,
            'v': VERSION,
            'll': '{},{}'.format(lat, lng),
            'radius': radius,
            'limit': LIMIT,
        }

        #make the GET request; the timeout prevents a stalled connection from blocking forever
        response = requests.get(endpoint, params=params, timeout=30)
        #fail with the HTTP error (e.g. invalid credentials, exceeded quota) instead of a KeyError below
        response.raise_for_status()
        results = response.json()["response"]['groups'][0]['items']

        #keep only the relevant information for each nearby venue
        for v in results:
            venue = v['venue']
            #a venue without any category gets None instead of raising an IndexError
            categories = venue.get('categories') or []
            venue_rows.append((
                name,
                lat,
                lng,
                venue['name'],
                venue['location']['lat'],
                venue['location']['lng'],
                categories[0]['name'] if categories else None))

    #passing the column names here also works when no venue was found at all
    nearby_venues = pd.DataFrame(venue_rows, columns=venue_columns)

    return(nearby_venues)

In [None]:
#fetch the food venues around every Queens neighborhood (one API request per neighborhood)
df_queens_venues = getNearbyVenues(
    df_queens['Neighborhood'],
    df_queens['Latitude'],
    df_queens['Longitude'],
)

In [None]:
#dimensions (rows, columns) of the venue dataframe, followed by its first five rows
print(df_queens_venues.shape)
df_queens_venues.head(5)

In [None]:
#number of venues found per neighborhood, largest count first
df_queens_venues_grouped = (
    df_queens_venues
    .groupby('Neighborhood')
    .count()
    .sort_values(by=['Venue Category'], ascending=False)
)
df_queens_venues_grouped

In [None]:
#bar chart: number of food venues found in each neighborhood
df_queens_venues_grouped['Venue Category'].plot.bar(
    ylabel='# Food-Venues',
    xlabel='Neighborhoods',
    figsize=(20,10),
    fontsize=15,
)

In [None]:
#number of distinct sub-categories among the venues returned for the food category
print('There are {} uniques categories.'.format(len(pd.unique(df_queens_venues['Venue Category']))))

In [None]:
#one hot encoding: one indicator column per venue category
df_queens_onehot = pd.get_dummies(df_queens_venues[['Venue Category']], prefix="", prefix_sep="")

#put the neighborhood into the first column
#insert() raises a ValueError if a venue category is itself named 'Neighborhood',
#whereas a plain column assignment would silently overwrite that category column
df_queens_onehot.insert(0, 'Neighborhood', df_queens_venues['Neighborhood'])

df_queens_onehot.head()

In [None]:
#total number of venues for each category over all neighborhoods (ranked, top 10)
#the 'Neighborhood' column is dropped by name before summing, so only the indicator
#columns are added up and the result does not depend on the column order
df_queens_onehot_sum = (
    df_queens_onehot
    .drop(columns='Neighborhood')
    .sum()
    .sort_values(ascending=False)
    .head(10)
    .to_frame(name='sum')
)

#plot the dataframe (the x-axis shows venue categories, not neighborhoods)
df_queens_onehot_sum['sum'].plot(kind='bar', ylabel='# Food-Venues', xlabel='Venue Category')

In [None]:
#per neighborhood: mean of every indicator column, i.e. the relative frequency of each category
df_queens_grouped = (
    df_queens_onehot
    .groupby('Neighborhood')
    .mean()
    .reset_index()
)
df_queens_grouped

In [None]:
#function to rank the venue categories of one neighborhood in descending order.
def return_most_common_venues(row, num_top_venues):
    """Return the labels of the highest-valued entries of a row.

    The first entry of ``row`` is skipped (in this notebook it holds the
    neighborhood name); the remaining entries are sorted from largest to
    smallest value and the index labels of the first ``num_top_venues``
    entries are returned as an array.
    """
    ranked = row.iloc[1:].sort_values(ascending=False)
    return ranked.index.values[:num_top_venues]

In [None]:
#call function to display the top 10 venues for each neighborhood in a df
#no. of venues to display
num_top_venues = 10

#indicators for the ranking (e.g. 1st, 2nd, 3rd etc.)
indicators = ['st', 'nd', 'rd']

#create columns according to number of top venues
columns = ['Neighborhood']
for ind in np.arange(num_top_venues):
    #ranks beyond the listed indicators get the generic 'th' suffix
    #(explicit check instead of a bare except that would also hide unrelated errors)
    suffix = indicators[ind] if ind < len(indicators) else 'th'
    columns.append('{}{} Most Common Venue'.format(ind+1, suffix))

#rank the categories of every neighborhood (one array of category names per row)
top_venue_rows = [
    return_most_common_venues(df_queens_grouped.iloc[ind, :], num_top_venues)
    for ind in range(df_queens_grouped.shape[0])
]

#create the new dataframe in one step and put the neighborhood name in front
df_queens_venues_sorted = pd.DataFrame(top_venue_rows, columns=columns[1:], index=df_queens_grouped.index)
df_queens_venues_sorted.insert(0, 'Neighborhood', df_queens_grouped['Neighborhood'])

#display dataframe
df_queens_venues_sorted.head()

Cluster neighborhoods

In [None]:
#find optimal k-value (code taken from 'https://predictivehacks.com/k-means-elbow-method-code-for-python/#:~:text=K%2DMeans%20is%20an%20unsupervised,optimal%20for%20the%20specific%20case.')
#Method 1
#elbow method
#data to fit (the axis is given by keyword; the positional form drop('Neighborhood', 1) was removed in pandas 2.0)
a = df_queens_grouped.drop(columns='Neighborhood')

#run different k's (k cannot be larger than the number of samples)
distortions = []
K = range(1, min(10, len(a) + 1))
for k in K:
    #fixed random_state and n_init so that the curve is identical on every run
    kmeanModel = KMeans(n_clusters=k, n_init=10, random_state=0)
    kmeanModel.fit(a)
    #inertia_ of the fitted model is used as the distortion measure
    distortions.append(kmeanModel.inertia_)

#plot result
plt.figure(figsize=(16,8))
plt.plot(K, distortions, 'bx-')
plt.xlabel('k')
plt.ylabel('Distortion')
plt.title('The Elbow Method showing the optimal k')
plt.show()

#-> look for the 'elbow' in the graph below -> looks like 5

In [None]:
#cluster with k-means
#set number of clusters
kclusters = 5

#features only (the axis is given by keyword; the positional form drop('Neighborhood', 1) was removed in pandas 2.0)
df_queens_grouped_clustering = df_queens_grouped.drop(columns='Neighborhood')

#run k-means clustering (n_init is set explicitly so the result does not depend on the library default)
kmeans = KMeans(n_clusters=kclusters, n_init=10, random_state=0).fit(df_queens_grouped_clustering)

#check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10]

In [None]:
#create a new dataframe that includes the cluster + the top 10 venues for each neighborhood
#add clustering labels as the first column
#a 'Cluster Labels' column left over from a previous run is removed first, so the cell can be re-run
df_queens_venues_sorted = df_queens_venues_sorted.drop(columns='Cluster Labels', errors='ignore')
df_queens_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

#join the sorted venues onto df_queens to add latitude/longitude for each neighborhood
df_queens_merged = df_queens.join(df_queens_venues_sorted.set_index('Neighborhood'), on='Neighborhood')

#drop NaN rows from neighborhoods with no results from the initial search
df_queens_merged = df_queens_merged.dropna()

#convert float to int in column 'Cluster Labels' -> important for folium visualization in the next cell
df_queens_merged = df_queens_merged.astype({"Cluster Labels":'int'})

#display dataframe
df_queens_merged.head()

In [None]:
#visualize result on a folium map
# create map
map_clusters = folium.Map(location=[latitude_Queens, longitude_Queens], zoom_start=11)

# set color scheme for the clusters: one evenly spaced rainbow color per cluster
colors_array = cm.rainbow(np.linspace(0, 1, kclusters))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
for lat, lon, poi, cluster in zip(df_queens_merged['Latitude'], df_queens_merged['Longitude'], df_queens_merged['Neighborhood'], df_queens_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)

    # the cluster label indexes the palette directly
    # (index cluster-1 only worked for label 0 through negative-index wraparound)
    cluster_color = rainbow[cluster]

    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=cluster_color,
        fill=True,
        fill_color=cluster_color,
        fill_opacity=0.7).add_to(map_clusters)

map_clusters

In [None]:
#cluster 1 (label 0): neighborhood name plus the ranked venue columns, ordered by the three most common venues
(
    df_queens_merged
    .loc[
        df_queens_merged['Cluster Labels'].eq(0),
        df_queens_merged.columns[[1, *range(5, df_queens_merged.shape[1])]],
    ]
    .sort_values(by=['1st Most Common Venue', '2nd Most Common Venue', '3rd Most Common Venue'])
)

In [None]:
#cluster 2 (label 1): neighborhood name plus the ranked venue columns, ordered by the three most common venues
(
    df_queens_merged
    .loc[
        df_queens_merged['Cluster Labels'].eq(1),
        df_queens_merged.columns[[1, *range(5, df_queens_merged.shape[1])]],
    ]
    .sort_values(by=['1st Most Common Venue', '2nd Most Common Venue', '3rd Most Common Venue'])
)

In [None]:
#cluster 3 (label 2): neighborhood name plus the ranked venue columns, ordered by the three most common venues
(
    df_queens_merged
    .loc[
        df_queens_merged['Cluster Labels'].eq(2),
        df_queens_merged.columns[[1, *range(5, df_queens_merged.shape[1])]],
    ]
    .sort_values(by=['1st Most Common Venue', '2nd Most Common Venue', '3rd Most Common Venue'])
)

In [None]:
#cluster 4 (label 3): neighborhood name plus the ranked venue columns, ordered by the three most common venues
(
    df_queens_merged
    .loc[
        df_queens_merged['Cluster Labels'].eq(3),
        df_queens_merged.columns[[1, *range(5, df_queens_merged.shape[1])]],
    ]
    .sort_values(by=['1st Most Common Venue', '2nd Most Common Venue', '3rd Most Common Venue'])
)

In [None]:
#cluster 5 (label 4): neighborhood name plus the ranked venue columns, ordered by the three most common venues
(
    df_queens_merged
    .loc[
        df_queens_merged['Cluster Labels'].eq(4),
        df_queens_merged.columns[[1, *range(5, df_queens_merged.shape[1])]],
    ]
    .sort_values(by=['1st Most Common Venue', '2nd Most Common Venue', '3rd Most Common Venue'])
)