In [2]:
# import required packages

import numpy as np # numpy library
import pandas as pd # pandas library
import json # library to handle JSON files
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values
import requests # library to handle requests
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe

# Matplotlib and associated plotting modules
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import matplotlib.colors as colors

# import clustering libraries
from kmodes.kmodes import KModes

import folium # map rendering library

In [8]:
# read data obtained from the UK ONS
london_data= pd.read_excel('london_data_v2.xlsx')

In [9]:
london_data.head()

Unnamed: 0,MSOA code,MSOA name,Local authority code,Borough,Net annual income
0,E02000166,Camden 001,E09000007,Camden,37300
1,E02000167,Camden 002,E09000007,Camden,44400
2,E02000168,Camden 003,E09000007,Camden,34500
3,E02000169,Camden 004,E09000007,Camden,40900
4,E02000170,Camden 005,E09000007,Camden,37900


In [10]:
#group data by Borough name to manage a lower number of observations on the map
df = london_data.groupby('Borough').mean().reset_index()
df

Unnamed: 0,Borough,Net annual income
0,Barking and Dagenham,25722.727273
1,Barnet,32931.707317
2,Bexley,32378.571429
3,Brent,27582.352941
4,Bromley,36620.512821
5,Camden,32492.857143
6,City of London,41800.0
7,Croydon,31211.363636
8,Ealing,31084.615385
9,Enfield,28469.444444


In [11]:
# describe data to explore differences between london areas
df.describe()

Unnamed: 0,Net annual income
count,33.0
mean,32069.384795
std,3748.531588
min,25722.727273
25%,30000.0
50%,31211.363636
75%,33486.666667
max,41800.0


In [12]:
# add more information about the location to get more accurate coordinates
df['Borough'] = df['Borough']+', London, UK'
df.head()

Unnamed: 0,Borough,Net annual income
0,"Barking and Dagenham, London, UK",25722.727273
1,"Barnet, London, UK",32931.707317
2,"Bexley, London, UK",32378.571429
3,"Brent, London, UK",27582.352941
4,"Bromley, London, UK",36620.512821


In [13]:
# use geocode to obtain coordinates
geolocator = Nominatim(user_agent="t_explorer")

from geopy.extra.rate_limiter import RateLimiter
geocode = RateLimiter(geolocator.geocode, min_delay_seconds=1)
df ['location'] = df['Borough'].apply(geocode)
df['point'] = df['location'].apply(lambda loc: tuple(loc.point) if loc else None)
df.head()

Unnamed: 0,Borough,Net annual income,location,point
0,"Barking and Dagenham, London, UK",25722.727273,"(London Borough of Barking and Dagenham, Great...","(51.5541171, 0.15050434261994267, 0.0)"
1,"Barnet, London, UK",32931.707317,"(Chipping Barnet, London Borough of Barnet, Lo...","(51.65309, -0.2002261, 0.0)"
2,"Bexley, London, UK",32378.571429,"(Bexley, London Borough of Bexley, London, Gre...","(51.4416793, 0.150488, 0.0)"
3,"Brent, London, UK",27582.352941,"(London Borough of Brent, Greater London, Engl...","(51.563825800000004, -0.2757596561855699, 0.0)"
4,"Bromley, London, UK",36620.512821,"(Bromley, London, Greater London, England, BR1...","(51.4028046, 0.0148142, 0.0)"


In [16]:
# split point column into latitude, longitude and altitude columns
df[['latitude', 'longitude', 'altitude']] = pd.DataFrame(df['point'].tolist(), index=df.index)

In [17]:
# let's explore the data
df

Unnamed: 0,Borough,Net annual income,location,point,latitude,longitude,altitude
0,"Barking and Dagenham, London, UK",25722.727273,"(London Borough of Barking and Dagenham, Great...","(51.5541171, 0.15050434261994267, 0.0)",51.554117,0.150504,0.0
1,"Barnet, London, UK",32931.707317,"(Chipping Barnet, London Borough of Barnet, Lo...","(51.65309, -0.2002261, 0.0)",51.65309,-0.200226,0.0
2,"Bexley, London, UK",32378.571429,"(Bexley, London Borough of Bexley, London, Gre...","(51.4416793, 0.150488, 0.0)",51.441679,0.150488,0.0
3,"Brent, London, UK",27582.352941,"(London Borough of Brent, Greater London, Engl...","(51.563825800000004, -0.2757596561855699, 0.0)",51.563826,-0.27576,0.0
4,"Bromley, London, UK",36620.512821,"(Bromley, London, Greater London, England, BR1...","(51.4028046, 0.0148142, 0.0)",51.402805,0.014814,0.0
5,"Camden, London, UK",32492.857143,"(Camden Town, London, Greater London, England,...","(51.5423045, -0.1395604, 0.0)",51.542305,-0.13956,0.0
6,"City of London, London, UK",41800.0,"(City of London, Greater London, England, EC2V...","(51.5156177, -0.0919983, 0.0)",51.515618,-0.091998,0.0
7,"Croydon, London, UK",31211.363636,"(Croydon, London, Greater London, England, CR0...","(51.3713049, -0.101957, 0.0)",51.371305,-0.101957,0.0
8,"Ealing, London, UK",31084.615385,"(Ealing, London Borough of Ealing, London, Gre...","(51.5126553, -0.3051952, 0.0)",51.512655,-0.305195,0.0
9,"Enfield, London, UK",28469.444444,"(Enfield, London, Greater London, England, EN2...","(51.6520851, -0.0810175, 0.0)",51.652085,-0.081018,0.0


In [18]:
# get geographical coordinates of London
address = 'London, UK'

geolocator = Nominatim(user_agent="t_explorer")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinates of London are {}, {}.'.format(latitude, longitude))

The geograpical coordinates of London are 51.5073219, -0.1276474.


In [20]:
# create map of London using latitude and longitude values
map_london = folium.Map(location=[latitude, longitude], zoom_start=11)

# add markers to map
for lat, lng, label in zip(df['latitude'], df['longitude'], df['Borough']):
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_london)  

    
map_london

In [21]:
CLIENT_ID = '' # your Foursquare ID
CLIENT_SECRET = '' # your Foursquare Secret
VERSION = '' # Foursquare API version

In [22]:
# get the neighbourhoods (areas) latitude and longitude values
area_latitude = df.loc[0, 'latitude'] # area latitude value
area_longitude = df.loc[0, 'longitude'] # area longitude value

area_name = df.loc[0, 'Borough'] # area name

print('Latitude and longitude values of {} are {}, {}.'.format(area_name, 
                                                               area_latitude, 
                                                               area_longitude))

Latitude and longitude values of Barking and Dagenham, London, UK are 51.5541171, 0.15050434261994267.


In [23]:
# get top 100 venue data within a radius of 500 meters

LIMIT = 100 # limit of number of venues returned by Foursquare API

radius = 500 # define radius

def getNearbyVenues(names, latitudes, longitudes, radius=500):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Borough', 
                  'Borough Latitude', 
                  'Borough Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)


In [24]:
#create a dataframe of London venues
london_venues = getNearbyVenues(names=df['Borough'],
                                   latitudes=df['latitude'],
                                   longitudes=df['longitude']
                                  )

Barking and Dagenham, London, UK
Barnet, London, UK
Bexley, London, UK
Brent, London, UK
Bromley, London, UK
Camden, London, UK
City of London, London, UK
Croydon, London, UK
Ealing, London, UK
Enfield, London, UK
Greenwich, London, UK
Hackney, London, UK
Hammersmith and Fulham, London, UK
Haringey, London, UK
Harrow, London, UK
Havering, London, UK
Hillingdon, London, UK
Hounslow, London, UK
Islington, London, UK
Kensington and Chelsea, London, UK
Kingston upon Thames, London, UK
Lambeth, London, UK
Lewisham, London, UK
Merton, London, UK
Newham, London, UK
Redbridge, London, UK
Richmond upon Thames, London, UK
Southwark, London, UK
Sutton, London, UK
Tower Hamlets, London, UK
Waltham Forest, London, UK
Wandsworth, London, UK
Westminster, London, UK


In [25]:
# Let's check how many venues were returned for each area
london_venues.groupby('Borough').count()

Unnamed: 0_level_0,Borough Latitude,Borough Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
Borough,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
"Barking and Dagenham, London, UK",5,5,5,5,5,5
"Barnet, London, UK",30,30,30,30,30,30
"Bexley, London, UK",9,9,9,9,9,9
"Brent, London, UK",17,17,17,17,17,17
"Bromley, London, UK",43,43,43,43,43,43
"Camden, London, UK",86,86,86,86,86,86
"City of London, London, UK",100,100,100,100,100,100
"Croydon, London, UK",25,25,25,25,25,25
"Ealing, London, UK",93,93,93,93,93,93
"Enfield, London, UK",61,61,61,61,61,61


In [29]:
# one hot encoding
london_onehot = pd.get_dummies(london_venues[['Venue Category']], prefix="", prefix_sep="")

# add neighborhood column back to dataframe
london_onehot['Borough'] = london_venues['Borough'] 

# move neighborhood column to the first column
fixed_columns = [london_onehot.columns[-1]] + list(london_onehot.columns[:-1])
london_onehot = london_onehot[fixed_columns]

london_onehot.head()

Unnamed: 0,Borough,Afghan Restaurant,African Restaurant,Arcade,Argentinian Restaurant,Art Gallery,Arts & Crafts Store,Asian Restaurant,Austrian Restaurant,Auto Garage,...,Udon Restaurant,Vegetarian / Vegan Restaurant,Video Game Store,Vietnamese Restaurant,Warehouse Store,Wine Bar,Wine Shop,Wings Joint,Women's Store,Yoga Studio
0,"Barking and Dagenham, London, UK",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,"Barking and Dagenham, London, UK",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,"Barking and Dagenham, London, UK",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,"Barking and Dagenham, London, UK",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,"Barking and Dagenham, London, UK",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [30]:
#Let's group rows by area and by taking the mean of the frequency of occurrence of each category
london_grouped = london_onehot.groupby('Borough').mean().reset_index()
london_grouped

Unnamed: 0,Borough,Afghan Restaurant,African Restaurant,Arcade,Argentinian Restaurant,Art Gallery,Arts & Crafts Store,Asian Restaurant,Austrian Restaurant,Auto Garage,...,Udon Restaurant,Vegetarian / Vegan Restaurant,Video Game Store,Vietnamese Restaurant,Warehouse Store,Wine Bar,Wine Shop,Wings Joint,Women's Store,Yoga Studio
0,"Barking and Dagenham, London, UK",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,"Barnet, London, UK",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,"Bexley, London, UK",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,"Brent, London, UK",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,"Bromley, London, UK",0.0,0.0,0.0,0.0,0.0,0.0,0.023256,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,"Camden, London, UK",0.0,0.011628,0.0,0.0,0.0,0.0,0.023256,0.0,0.0,...,0.0,0.034884,0.0,0.023256,0.0,0.0,0.0,0.0,0.0,0.0
6,"City of London, London, UK",0.0,0.0,0.0,0.0,0.03,0.0,0.01,0.0,0.0,...,0.01,0.0,0.0,0.02,0.0,0.03,0.0,0.0,0.0,0.02
7,"Croydon, London, UK",0.0,0.0,0.0,0.0,0.0,0.0,0.04,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,"Ealing, London, UK",0.0,0.0,0.0,0.0,0.010753,0.0,0.010753,0.0,0.0,...,0.0,0.0,0.010753,0.021505,0.0,0.010753,0.0,0.0,0.0,0.0
9,"Enfield, London, UK",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.032787,0.0,0.0,0.0,0.016393,0.0,0.016393,0.0


In [59]:
# create a pandas dataframe with top 5 venues in descending order
num_top_venues = 5

# write a function to sort the venues in descending order
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Borough']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
area_venues_sorted = pd.DataFrame(columns=columns)
area_venues_sorted['Borough'] = london_grouped['Borough']

for ind in np.arange(london_grouped.shape[0]):
    area_venues_sorted.iloc[ind, 1:] = return_most_common_venues(london_grouped.iloc[ind, :], num_top_venues)

area_venues_sorted

Unnamed: 0,Borough,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue
0,"Barking and Dagenham, London, UK",Bus Stop,Convenience Store,Liquor Store,Grocery Store,English Restaurant
1,"Barnet, London, UK",Coffee Shop,Pub,Pharmacy,Convenience Store,Restaurant
2,"Bexley, London, UK",Pub,Greek Restaurant,Breakfast Spot,Toy / Game Store,Train Station
3,"Brent, London, UK",Coffee Shop,Hotel,Supermarket,Platform,Pedestrian Plaza
4,"Bromley, London, UK",Clothing Store,Coffee Shop,Burger Joint,Gym / Fitness Center,Pub
5,"Camden, London, UK",Pub,Coffee Shop,Burger Joint,Italian Restaurant,Café
6,"City of London, London, UK",Coffee Shop,Gym / Fitness Center,French Restaurant,Seafood Restaurant,Clothing Store
7,"Croydon, London, UK",Pub,Coffee Shop,Spanish Restaurant,Malay Restaurant,Gaming Cafe
8,"Ealing, London, UK",Coffee Shop,Pub,Platform,Clothing Store,Bakery
9,"Enfield, London, UK",Clothing Store,Coffee Shop,Optical Shop,Supermarket,Pub


In [60]:
# run k-modes clustering (for categorical data) to obtain 4 groups

london_grouped_clustering = london_grouped.drop('Borough', 1)

km = KModes(n_clusters=4, init='Huang', n_init=5, verbose=0)
clusters = km.fit_predict(london_grouped_clustering)
df_dummy = clusters

In [61]:
# create a new dataframe that includes the cluster as well as the top 5 venues for each area
area_venues_sorted.insert(0, 'Cluster Labels', clusters)

In [62]:
area_venues_sorted

Unnamed: 0,Cluster Labels,Borough,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue
0,1,"Barking and Dagenham, London, UK",Bus Stop,Convenience Store,Liquor Store,Grocery Store,English Restaurant
1,0,"Barnet, London, UK",Coffee Shop,Pub,Pharmacy,Convenience Store,Restaurant
2,0,"Bexley, London, UK",Pub,Greek Restaurant,Breakfast Spot,Toy / Game Store,Train Station
3,0,"Brent, London, UK",Coffee Shop,Hotel,Supermarket,Platform,Pedestrian Plaza
4,0,"Bromley, London, UK",Clothing Store,Coffee Shop,Burger Joint,Gym / Fitness Center,Pub
5,0,"Camden, London, UK",Pub,Coffee Shop,Burger Joint,Italian Restaurant,Café
6,0,"City of London, London, UK",Coffee Shop,Gym / Fitness Center,French Restaurant,Seafood Restaurant,Clothing Store
7,0,"Croydon, London, UK",Pub,Coffee Shop,Spanish Restaurant,Malay Restaurant,Gaming Cafe
8,0,"Ealing, London, UK",Coffee Shop,Pub,Platform,Clothing Store,Bakery
9,2,"Enfield, London, UK",Clothing Store,Coffee Shop,Optical Shop,Supermarket,Pub


In [63]:
# merge venues data frame with df to add latitude/longitude and Net annual income for each area

london_merged = df
london_merged = london_merged.join(area_venues_sorted.set_index('Borough'), on='Borough')
london_merged.head()

Unnamed: 0,Borough,Net annual income,location,point,latitude,longitude,altitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue
0,"Barking and Dagenham, London, UK",25722.727273,"(London Borough of Barking and Dagenham, Great...","(51.5541171, 0.15050434261994267, 0.0)",51.554117,0.150504,0.0,1,Bus Stop,Convenience Store,Liquor Store,Grocery Store,English Restaurant
1,"Barnet, London, UK",32931.707317,"(Chipping Barnet, London Borough of Barnet, Lo...","(51.65309, -0.2002261, 0.0)",51.65309,-0.200226,0.0,0,Coffee Shop,Pub,Pharmacy,Convenience Store,Restaurant
2,"Bexley, London, UK",32378.571429,"(Bexley, London Borough of Bexley, London, Gre...","(51.4416793, 0.150488, 0.0)",51.441679,0.150488,0.0,0,Pub,Greek Restaurant,Breakfast Spot,Toy / Game Store,Train Station
3,"Brent, London, UK",27582.352941,"(London Borough of Brent, Greater London, Engl...","(51.563825800000004, -0.2757596561855699, 0.0)",51.563826,-0.27576,0.0,0,Coffee Shop,Hotel,Supermarket,Platform,Pedestrian Plaza
4,"Bromley, London, UK",36620.512821,"(Bromley, London, Greater London, England, BR1...","(51.4028046, 0.0148142, 0.0)",51.402805,0.014814,0.0,0,Clothing Store,Coffee Shop,Burger Joint,Gym / Fitness Center,Pub


In [64]:
# create a sorted dataframe by net income 

df2 = london_merged.sort_values(['Net annual income'])
df2

Unnamed: 0,Borough,Net annual income,location,point,latitude,longitude,altitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue
0,"Barking and Dagenham, London, UK",25722.727273,"(London Borough of Barking and Dagenham, Great...","(51.5541171, 0.15050434261994267, 0.0)",51.554117,0.150504,0.0,1,Bus Stop,Convenience Store,Liquor Store,Grocery Store,English Restaurant
24,"Newham, London, UK",26275.675676,"(London Borough of Newham, Greater London, Eng...","(51.52999955, 0.02931796029382208, 0.0)",51.53,0.029318,0.0,0,Pub,Bus Station,Café,Electronics Store,Flea Market
11,"Hackney, London, UK",27000.0,"(Hackney, London, Greater London, England, E9 ...","(51.5432402, -0.0493621, 0.0)",51.54324,-0.049362,0.0,0,Pub,Coffee Shop,Café,Brewery,Supermarket
3,"Brent, London, UK",27582.352941,"(London Borough of Brent, Greater London, Engl...","(51.563825800000004, -0.2757596561855699, 0.0)",51.563826,-0.27576,0.0,0,Coffee Shop,Hotel,Supermarket,Platform,Pedestrian Plaza
9,"Enfield, London, UK",28469.444444,"(Enfield, London, Greater London, England, EN2...","(51.6520851, -0.0810175, 0.0)",51.652085,-0.081018,0.0,2,Clothing Store,Coffee Shop,Optical Shop,Supermarket,Pub
29,"Tower Hamlets, London, UK",28475.0,(Air Training Corps 31 (Tower Hamlets) Squadro...,"(51.5256294, -0.0335853, 0.0)",51.525629,-0.033585,0.0,2,Pub,Coffee Shop,Pizza Place,Bus Stop,Burger Joint
18,"Islington, London, UK",29604.347826,"(Islington, London, Greater London, England, N...","(51.5384287, -0.0999051, 0.0)",51.538429,-0.099905,0.0,0,Pub,Mediterranean Restaurant,Burger Joint,Bakery,French Restaurant
30,"Waltham Forest, London, UK",29785.714286,"(London Borough of Waltham Forest, Greater Lon...","(51.59816935, -0.01783667461048707, 0.0)",51.598169,-0.017837,0.0,0,Pub,Bus Stop,Art Gallery,Auto Garage,Lounge
13,"Haringey, London, UK",30000.0,"(London Borough of Haringey, Greater London, E...","(51.587929849999995, -0.10541010599099046, 0.0)",51.58793,-0.10541,0.0,0,Café,Park,Fast Food Restaurant,Coffee Shop,Bistro
10,"Greenwich, London, UK",30403.030303,"(Greenwich, London, Greater London, England, S...","(51.4820845, -0.0045417, 0.0)",51.482084,-0.004542,0.0,0,Pub,Boat or Ferry,Burger Joint,Pizza Place,History Museum


In [65]:
# Let's check whether average incomes differ between clusters
df2.groupby('Cluster Labels').mean().reset_index()

Unnamed: 0,Cluster Labels,Net annual income,latitude,longitude,altitude
0,0,32483.268996,51.503977,-0.152211,0.0
1,1,30484.280303,51.455814,-0.011568,0.0
2,2,28472.222222,51.588857,-0.057301,0.0
3,3,30845.16129,51.57632,0.04541,0.0


In [66]:
# Let's obtain the scipy.stats library to conduct inferential analysis
from scipy.stats import f_oneway

In [83]:
# For simplicity, let's create a dataframe only containing 'Net annual income' and 'Cluster Labels'
df3 = df2[['Net annual income', 'Cluster Labels']] 
df3

Unnamed: 0,Net annual income,Cluster Labels
0,25722.727273,1
24,26275.675676,0
11,27000.0,0
3,27582.352941,0
9,28469.444444,2
29,28475.0,2
18,29604.347826,0
30,29785.714286,0
13,30000.0,0
10,30403.030303,0


In [99]:
# Let's create numpy arrays of each cluster
cluster0=df3[df3['Cluster Labels']== 0]
cluster0.drop(['Cluster Labels'], axis=1, inplace=True)
cluster0
arr0 = cluster0.to_numpy()
cluster1=df3[df3['Cluster Labels']== 1]
cluster1.drop(['Cluster Labels'], axis=1, inplace=True)
cluster1
arr1 = cluster1.to_numpy()
cluster2=df3[df3['Cluster Labels']== 2]
cluster2.drop(['Cluster Labels'], axis=1, inplace=True)
cluster2
arr2 = cluster2.to_numpy()
cluster3=df3[df3['Cluster Labels']== 3]
cluster3.drop(['Cluster Labels'], axis=1, inplace=True)
cluster3
arr3 = cluster3.to_numpy()

In [101]:
# Let's compute analysis of variance analysis
f_oneway(cluster0, cluster1, cluster2, cluster3)

F_onewayResult(statistic=array([2.29061318]), pvalue=array([0.14136551]))

In [102]:
# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters
kclusters=4
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(df2['latitude'], df2['longitude'], df2['Borough'], df2['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster Labels' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters