## JUPYTER NOTEBOOK FOR APPLIED DATA SCIENCE CAPSTONE PROJECT

In [1]:
import requests # library to handle requests
import pandas as pd # library for data analsysis
import numpy as np # library to handle data in a vectorized manner
import random # library for random number generation

from geopy.geocoders import Nominatim # module to convert an address into latitude and longitude values

# libraries for displaying images
from IPython.display import Image 
from IPython.core.display import HTML 

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans
    
# tranforming json file into a pandas dataframe library
from pandas.io.json import json_normalize
from bs4 import BeautifulSoup # this module helps in web scrapping.
import folium # plotting library

**Obtaining Paris museums and monuments site data from wikipedia**

In [2]:
url = "https://en.wikipedia.org/wiki/Tourism_in_Paris#:~:text=Top%20sights%20include%20Notre%20Dame,'Orsay%20(3%20million)."

_Create a BeautifulSoup object_

In [3]:
html_data = requests.get(url).text
soup = BeautifulSoup(html_data, 'html.parser')

*Get the table from the url and create a pandas data frame*

In [4]:
table=soup.find('table')
df = pd.read_html(str(table), flavor='bs4')[0]
df.head()

Unnamed: 0,Rank,Change 07/06,Museums and Monuments,2007,2006,Variation 07/06
0,1,=,Notre Dame de Paris,13650000,13650000,—
1,2,=,Basilique du Sacré-Cœur,10500000,10500000,—
2,3,=,The Louvre,8260000,8348000,-1.1%
3,4,=,Eiffel Tower,6797410,6695135,1.5%
4,5,=,Pompidou Centre,5509425,5133506,7.3%


**Note:** The data frame shows the list of top 20 Paris museums and monuments with the number of visitors for the years 2006 and 2007, highlighting the variation in numbers of visitors and change in the ranking from 2006 to 2007. The = sign means there was no change in ranking, while __ means there was no variation in the number of visitors.

In [5]:
df.shape

(20, 6)

In [6]:
df.rename(columns={'2007':'Visitors in 2007','2006':'Visitors in 2006'},inplace=True)

In [7]:
df.head()

Unnamed: 0,Rank,Change 07/06,Museums and Monuments,Visitors in 2007,Visitors in 2006,Variation 07/06
0,1,=,Notre Dame de Paris,13650000,13650000,—
1,2,=,Basilique du Sacré-Cœur,10500000,10500000,—
2,3,=,The Louvre,8260000,8348000,-1.1%
3,4,=,Eiffel Tower,6797410,6695135,1.5%
4,5,=,Pompidou Centre,5509425,5133506,7.3%


**Geospatial data of Paris museums and monuments from Foursquare**

In [8]:
geolocator = Nominatim(user_agent="touristic_site")

df['Major_Dist_Coord']= df['Museums and Monuments'].apply(geolocator.geocode).apply(lambda x: (x.latitude, x.longitude))
df[['Latitude', 'Longitude']] = df['Major_Dist_Coord'].apply(pd.Series)
df.drop(['Major_Dist_Coord'], axis=1, inplace=True)
df.head()

Unnamed: 0,Rank,Change 07/06,Museums and Monuments,Visitors in 2007,Visitors in 2006,Variation 07/06,Latitude,Longitude
0,1,=,Notre Dame de Paris,13650000,13650000,—,48.852937,2.35005
1,2,=,Basilique du Sacré-Cœur,10500000,10500000,—,48.886806,2.343015
2,3,=,The Louvre,8260000,8348000,-1.1%,48.861147,2.338028
3,4,=,Eiffel Tower,6797410,6695135,1.5%,48.85826,2.294499
4,5,=,Pompidou Centre,5509425,5133506,7.3%,48.860592,2.352474


In [62]:
df

Unnamed: 0,Rank,Change 07/06,Museums and Monuments,Visitors in 2007,Visitors in 2006,Variation 07/06,Latitude,Longitude
0,1,=,Notre Dame de Paris,13650000,13650000,—,48.852937,2.35005
1,2,=,Basilique du Sacré-Cœur,10500000,10500000,—,48.886806,2.343015
2,3,=,The Louvre,8260000,8348000,-1.1%,48.861147,2.338028
3,4,=,Eiffel Tower,6797410,6695135,1.5%,48.85826,2.294499
4,5,=,Pompidou Centre,5509425,5133506,7.3%,48.860592,2.352474
5,6,+1,Musée d'Orsay,3166509,3009203,5.2%,48.859918,2.326583
6,7,-1,Cité des Sciences et de l'Industrie,3030628,3055000,-0.8%,48.895752,2.387976
7,8,=,Chapel of Our Lady of the Miraculous Medal,2000000,2000000,—,48.850844,2.322925
8,9,+1,Arc de Triomphe,1543295,1330738,16.0%,48.873779,2.295037
9,10,+2,Musée du Quai Branly,1379623,952770,44.8%,48.860994,2.29797


In [9]:
address = 'Paris'

geolocator = Nominatim(user_agent="touristic_site")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Paris is {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Paris is 48.8566969, 2.3514616.


In [61]:
# create map of Paris using latitude and longitude. 
#You can click on the markers on the map to see which location each represents.
map_Paris = folium.Map(location=[latitude, longitude], zoom_start=13)

# add markers to the map
for lat, lng, label in zip(df['Latitude'], df['Longitude'], df['Museums and Monuments']):
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_Paris) 
    
map_Paris

*Using Foursquare API, I will explore the museums and monuments of Paris and segment them.*

Define Foursquare Credentials and Version

In [11]:
CLIENT_ID = 'GKKKQWGBKPHFNXAREUXWOXL2XFUWGW52ZW00O21BE5T5OZMZ' # your Foursquare ID
CLIENT_SECRET = 'MQ1MDMWQFHXLKQJVQ1MJVKDC341UWZXW2DWJNUVX1ZZN23PZ' # your Foursquare Secret
VERSION = '20180605' # Foursquare API version
LIMIT = 100 # A default Foursquare API limit value

print('Your credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET:' + CLIENT_SECRET)

Your credentails:
CLIENT_ID: GKKKQWGBKPHFNXAREUXWOXL2XFUWGW52ZW00O21BE5T5OZMZ
CLIENT_SECRET:MQ1MDMWQFHXLKQJVQ1MJVKDC341UWZXW2DWJNUVX1ZZN23PZ


In [12]:
df.loc[0, 'Museums and Monuments']

'Notre Dame de Paris'

*Explore the venues around Notre Dame de Paris*

In [13]:
site_latitude = df.loc[0, 'Latitude'] # site latitude value
site_longitude = df.loc[0, 'Longitude'] # site longitude value

site_name = df.loc[0, 'Museums and Monuments'] # site name

print('Latitude and longitude values of {} are {}, {}.'.format(site_name, 
                                                               site_latitude, 
                                                               site_longitude))

Latitude and longitude values of Notre Dame de Paris are 48.85293705, 2.3500501225000026.


In [14]:
LIMIT = 25

radius = 500

url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
    CLIENT_ID, 
    CLIENT_SECRET, 
    VERSION, 
    site_latitude, 
    site_longitude, 
    radius, 
    LIMIT)
url # display URL

'https://api.foursquare.com/v2/venues/explore?&client_id=GKKKQWGBKPHFNXAREUXWOXL2XFUWGW52ZW00O21BE5T5OZMZ&client_secret=MQ1MDMWQFHXLKQJVQ1MJVKDC341UWZXW2DWJNUVX1ZZN23PZ&v=20180605&ll=48.85293705,2.3500501225000026&radius=500&limit=25'

In [15]:
results = requests.get(url).json()
#results

In [16]:
# function that extracts the category of a single venue
def get_category_type(row):
    try:
        categories_list = row['categories']
    except:
        categories_list = row['venue.categories']
        
    if len(categories_list) == 0:
        return None
    else:
        return categories_list[0]['name']

In [17]:
venues = results['response']['groups'][0]['items']
    
nearby_venues = json_normalize(venues) # flatten JSON

# filter columns
filtered_columns = ['venue.name', 'venue.categories', 'venue.location.lat', 'venue.location.lng']
nearby_venues =nearby_venues.loc[:, filtered_columns]

# filter the category for each row
nearby_venues['venue.categories'] = nearby_venues.apply(get_category_type, axis=1)

# clean columns
nearby_venues.columns = [col.split(".")[-1] for col in nearby_venues.columns]

nearby_venues.head()

Unnamed: 0,name,categories,lat,lng
0,Shakespeare & Company,Bookstore,48.852568,2.347096
1,Comme chai Toi,French Restaurant,48.851749,2.349319
2,Tours de la Cathédrale Notre-Dame de Paris,Scenic Lookout,48.85323,2.349207
3,Sola,Japanese Restaurant,48.851569,2.348391
4,Au Vieux Paris d'Arcole,French Restaurant,48.854196,2.350312


In [18]:
nearby_venues.shape

(25, 4)

In [19]:
# function that extracts the category of all the venues
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Site', 
                  'Site Latitude', 
                  'Site Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

In [20]:
Touristic_site_venues = getNearbyVenues(names=df['Museums and Monuments'], latitudes=df['Latitude'], longitudes=df['Longitude'], radius=500)

Notre Dame de Paris
Basilique du Sacré-Cœur
The Louvre
Eiffel Tower
Pompidou Centre
Musée d'Orsay
Cité des Sciences et de l'Industrie
Chapel of Our Lady of the Miraculous Medal
Arc de Triomphe
Musée du Quai Branly
Muséum d'Histoire Naturelle
Musée de l'Armée
Sainte Chapelle
Musée Grévin
Institut du Monde Arabe
Musée Rodin
Musée de l'Orangerie
Petit Palais
Tour Montparnasse
Panthéon


In [21]:
print(f'Total number of sites is {Touristic_site_venues.shape[0]+1}')
Touristic_site_venues.head()

Total number of sites is 485


Unnamed: 0,Site,Site Latitude,Site Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,Notre Dame de Paris,48.852937,2.35005,Shakespeare & Company,48.852568,2.347096,Bookstore
1,Notre Dame de Paris,48.852937,2.35005,Comme chai Toi,48.851749,2.349319,French Restaurant
2,Notre Dame de Paris,48.852937,2.35005,Tours de la Cathédrale Notre-Dame de Paris,48.85323,2.349207,Scenic Lookout
3,Notre Dame de Paris,48.852937,2.35005,Sola,48.851569,2.348391,Japanese Restaurant
4,Notre Dame de Paris,48.852937,2.35005,Au Vieux Paris d'Arcole,48.854196,2.350312,French Restaurant


Let's check how many venues were returned for each neighborhood

In [22]:
Touristic_site_venues.groupby('Site').count()

Unnamed: 0_level_0,Site Latitude,Site Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
Site,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Arc de Triomphe,25,25,25,25,25,25
Basilique du Sacré-Cœur,25,25,25,25,25,25
Chapel of Our Lady of the Miraculous Medal,25,25,25,25,25,25
Cité des Sciences et de l'Industrie,25,25,25,25,25,25
Eiffel Tower,25,25,25,25,25,25
Institut du Monde Arabe,25,25,25,25,25,25
Musée Grévin,25,25,25,25,25,25
Musée Rodin,25,25,25,25,25,25
Musée d'Orsay,25,25,25,25,25,25
Musée de l'Armée,25,25,25,25,25,25


Let's find out how many unique categories can be curated from all the returned venues

In [23]:
print('There are {} uniques categories.'.format(len(Touristic_site_venues['Venue Category'].unique())))

There are 120 uniques categories.


Analyze each site

In [24]:
# one hot encoding
site_onehot = pd.get_dummies(Touristic_site_venues[['Venue Category']], prefix="", prefix_sep="")

# add neighborhood column back to dataframe
site_onehot['Site'] = Touristic_site_venues['Site'] 

# move neighborhood column to the first column
fixed_columns = [site_onehot.columns[-1]] + list(site_onehot.columns[:-1])
site_onehot = site_onehot[fixed_columns]

site_onehot.head()

Unnamed: 0,Site,American Restaurant,Antique Shop,Argentinian Restaurant,Art Gallery,Art Museum,Asian Restaurant,Auvergne Restaurant,Bagel Shop,Bakery,...,Tourist Information Center,Toy / Game Store,Track,Tram Station,Trattoria/Osteria,Vegetarian / Vegan Restaurant,Wine Bar,Wine Shop,Women's Store,Yoga Studio
0,Notre Dame de Paris,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,Notre Dame de Paris,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,Notre Dame de Paris,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,Notre Dame de Paris,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,Notre Dame de Paris,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [25]:
site_onehot.shape

(484, 121)

_Next, let's group rows by site and by taking the mean of the frequency of occurrence of each category_

In [26]:
site_grouped = site_onehot.groupby('Site').mean().reset_index()
site_grouped

Unnamed: 0,Site,American Restaurant,Antique Shop,Argentinian Restaurant,Art Gallery,Art Museum,Asian Restaurant,Auvergne Restaurant,Bagel Shop,Bakery,...,Tourist Information Center,Toy / Game Store,Track,Tram Station,Trattoria/Osteria,Vegetarian / Vegan Restaurant,Wine Bar,Wine Shop,Women's Store,Yoga Studio
0,Arc de Triomphe,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.04,0.0,...,0.04,0.0,0.0,0.0,0.0,0.04,0.0,0.0,0.0,0.0
1,Basilique du Sacré-Cœur,0.0,0.0,0.0,0.04,0.04,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,Chapel of Our Lady of the Miraculous Medal,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.04,0.0,0.0,0.0
3,Cité des Sciences et de l'Industrie,0.04,0.0,0.0,0.0,0.0,0.04,0.0,0.0,0.0,...,0.0,0.0,0.0,0.04,0.0,0.0,0.0,0.0,0.0,0.0
4,Eiffel Tower,0.0,0.0,0.0,0.0,0.04,0.0,0.0,0.0,0.04,...,0.0,0.0,0.04,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,Institut du Monde Arabe,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.08,0.0,0.0,0.0
6,Musée Grévin,0.0,0.04,0.04,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.08,0.0,0.04,0.0
7,Musée Rodin,0.0,0.0,0.0,0.0,0.04,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.04,0.0,0.0,0.0,0.0
8,Musée d'Orsay,0.0,0.0,0.0,0.04,0.08,0.04,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,Musée de l'Armée,0.0,0.0,0.0,0.0,0.04,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


*Let's print each site along with the top 5 most common venues*

In [27]:
num_top_venues = 5

for venue in site_grouped['Site']:
    print("----"+venue+"----")
    temp = site_grouped[site_grouped['Site'] == venue].T.reset_index()
    temp.columns = ['venue','freq']
    temp = temp.iloc[1:]
    temp['freq'] = temp['freq'].astype(float)
    temp = temp.round({'freq': 2})
    print(temp.sort_values('freq', ascending=False).reset_index(drop=True).head(num_top_venues))
    print('\n')

----Arc de Triomphe----
                venue  freq
0   French Restaurant  0.20
1               Hotel  0.20
2  Italian Restaurant  0.12
3       Jewelry Store  0.08
4           Roof Deck  0.04


----Basilique du Sacré-Cœur----
                       venue  freq
0          French Restaurant  0.24
1  Middle Eastern Restaurant  0.08
2                      Plaza  0.08
3         Italian Restaurant  0.04
4             Breakfast Spot  0.04


----Chapel of Our Lady of the Miraculous Medal----
                venue  freq
0   French Restaurant  0.20
1  Italian Restaurant  0.12
2         Coffee Shop  0.08
3              Garden  0.08
4               Hotel  0.08


----Cité des Sciences et de l'Industrie----
                 venue  freq
0            Rock Club  0.08
1          Music Venue  0.08
2  American Restaurant  0.04
3             Gym Pool  0.04
4          Pizza Place  0.04


----Eiffel Tower----
                          venue  freq
0             French Restaurant  0.24
1            Italian Res

**Let's put that into a pandas dataframe**

*First, let's write a function to sort the venues in descending order.*

In [28]:
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

*Now let's create the new dataframe and display the top 10 venues for each neighborhood.*

In [29]:
num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Site']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
site_venues_sorted = pd.DataFrame(columns=columns)
site_venues_sorted['Site'] = site_grouped['Site']

for ind in np.arange(site_grouped.shape[0]):
    site_venues_sorted.iloc[ind, 1:] = return_most_common_venues(site_grouped.iloc[ind, :], num_top_venues)

site_venues_sorted.head()

Unnamed: 0,Site,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Arc de Triomphe,Hotel,French Restaurant,Italian Restaurant,Jewelry Store,Steakhouse,Molecular Gastronomy Restaurant,Moroccan Restaurant,Movie Theater,Cocktail Bar,Roof Deck
1,Basilique du Sacré-Cœur,French Restaurant,Plaza,Middle Eastern Restaurant,Pizza Place,Bar,Chocolate Shop,History Museum,Bistro,Scenic Lookout,Gift Shop
2,Chapel of Our Lady of the Miraculous Medal,French Restaurant,Italian Restaurant,Coffee Shop,Hotel,Garden,Chocolate Shop,Cupcake Shop,Peruvian Restaurant,Bistro,Tailor Shop
3,Cité des Sciences et de l'Industrie,Music Venue,Rock Club,American Restaurant,Steakhouse,Performing Arts Venue,Multiplex,Movie Theater,Pizza Place,Plaza,Concert Hall
4,Eiffel Tower,French Restaurant,Hotel,Italian Restaurant,Garden,Beach Bar,Library,Falafel Restaurant,Monument / Landmark,Plaza,Restaurant


**Cluster Neighborhoods**

*Run k-means to cluster the neighborhood into 10 clusters.*

In [30]:
# set number of clusters
kclusters = 4

site_grouped_clustering = site_grouped.drop('Site', 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(site_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10] 

array([2, 2, 2, 1, 2, 3, 3, 2, 2, 2])

Let's create a new dataframe that includes the cluster as well as the top 10 venues for each neighborhood.

In [31]:
# add clustering labels
site_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)
site_merged = df.copy()
site_merged.rename(columns={'Museums and Monuments':'Site'}, inplace=True)

In [32]:
# merge site_grouped with df to add latitude/longitude for each neighborhood
site_merged = site_merged.join(site_venues_sorted.set_index('Site'), on='Site')

site_merged.head() # check the last columns!

Unnamed: 0,Rank,Change 07/06,Site,Visitors in 2007,Visitors in 2006,Variation 07/06,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,1,=,Notre Dame de Paris,13650000,13650000,—,48.852937,2.35005,3,French Restaurant,Bookstore,Bakery,Yoga Studio,Ice Cream Shop,Japanese Restaurant,Lebanese Restaurant,Falafel Restaurant,Park,Pastry Shop
1,2,=,Basilique du Sacré-Cœur,10500000,10500000,—,48.886806,2.343015,2,French Restaurant,Plaza,Middle Eastern Restaurant,Pizza Place,Bar,Chocolate Shop,History Museum,Bistro,Scenic Lookout,Gift Shop
2,3,=,The Louvre,8260000,8348000,-1.1%,48.861147,2.338028,1,Plaza,French Restaurant,Historic Site,Hotel,Cocktail Bar,Church,Chinese Restaurant,Restaurant,Cheese Shop,Shoe Store
3,4,=,Eiffel Tower,6797410,6695135,1.5%,48.85826,2.294499,2,French Restaurant,Hotel,Italian Restaurant,Garden,Beach Bar,Library,Falafel Restaurant,Monument / Landmark,Plaza,Restaurant
4,5,=,Pompidou Centre,5509425,5133506,7.3%,48.860592,2.352474,1,Coffee Shop,Restaurant,Lebanese Restaurant,French Restaurant,Liquor Store,Clothing Store,Cosmetics Shop,Chocolate Shop,Burger Joint,Bubble Tea Shop


**Let's visualize the resulting clusters**

In [64]:
# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=13)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(site_merged['Latitude'], site_merged['Longitude'], site_merged['Site'], site_merged['Cluster Labels']):
    label = folium.Popup(' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-2],
        fill=True,
        fill_color=rainbow[cluster-2],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters

*Now, I can examine each cluster and determine the discriminating venue categories that distinguish each cluster.*

In [50]:
#cluster 0
site_merged.loc[site_merged['Cluster Labels'] == 0, site_merged.columns[[2] + list(range(5, site_merged.shape[1]))]].drop(columns=['Variation 07/06',"Latitude","Longitude"], axis=1)

Unnamed: 0,Site,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
10,Muséum d'Histoire Naturelle,0,Garden,Science Museum,Playground,Military Base,French Restaurant,Plaza,Chinese Restaurant,Concert Hall,Corsican Restaurant,Comedy Club


In [51]:
#cluster 1
site_merged.loc[site_merged['Cluster Labels'] == 1, site_merged.columns[[2] + list(range(5, site_merged.shape[1]))]].drop(columns=['Variation 07/06',"Latitude","Longitude"], axis=1)

Unnamed: 0,Site,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
2,The Louvre,1,Plaza,French Restaurant,Historic Site,Hotel,Cocktail Bar,Church,Chinese Restaurant,Restaurant,Cheese Shop,Shoe Store
4,Pompidou Centre,1,Coffee Shop,Restaurant,Lebanese Restaurant,French Restaurant,Liquor Store,Clothing Store,Cosmetics Shop,Chocolate Shop,Burger Joint,Bubble Tea Shop
6,Cité des Sciences et de l'Industrie,1,Music Venue,Rock Club,American Restaurant,Steakhouse,Performing Arts Venue,Multiplex,Movie Theater,Pizza Place,Plaza,Concert Hall
12,Sainte Chapelle,1,Plaza,Falafel Restaurant,Wine Bar,Hotel,Restaurant,Indie Movie Theater,Burger Joint,Moroccan Restaurant,Bookstore,Lebanese Restaurant
16,Musée de l'Orangerie,1,Hotel,Art Museum,Bookstore,Hotel Bar,Plaza,Fountain,Tea Room,Dessert Shop,Scandinavian Restaurant,Perfume Shop
19,Panthéon,1,Italian Restaurant,Plaza,Hotel,Sandwich Place,Fountain,Sicilian Restaurant,Monument / Landmark,Café,Restaurant,Pizza Place


In [52]:
#cluster 2
site_merged.loc[site_merged['Cluster Labels'] == 2, site_merged.columns[[2] + list(range(5, site_merged.shape[1]))]].drop(columns=['Variation 07/06',"Latitude","Longitude"], axis=1)

Unnamed: 0,Site,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
1,Basilique du Sacré-Cœur,2,French Restaurant,Plaza,Middle Eastern Restaurant,Pizza Place,Bar,Chocolate Shop,History Museum,Bistro,Scenic Lookout,Gift Shop
3,Eiffel Tower,2,French Restaurant,Hotel,Italian Restaurant,Garden,Beach Bar,Library,Falafel Restaurant,Monument / Landmark,Plaza,Restaurant
5,Musée d'Orsay,2,French Restaurant,Hotel,Garden,Art Museum,Bookstore,Historic Site,Fountain,Food Truck,Exhibit,Pizza Place
7,Chapel of Our Lady of the Miraculous Medal,2,French Restaurant,Italian Restaurant,Coffee Shop,Hotel,Garden,Chocolate Shop,Cupcake Shop,Peruvian Restaurant,Bistro,Tailor Shop
8,Arc de Triomphe,2,Hotel,French Restaurant,Italian Restaurant,Jewelry Store,Steakhouse,Molecular Gastronomy Restaurant,Moroccan Restaurant,Movie Theater,Cocktail Bar,Roof Deck
9,Musée du Quai Branly,2,French Restaurant,Italian Restaurant,Art Museum,Chinese Restaurant,Hotel,Market,Hotel Bar,Monument / Landmark,Bookstore,Coffee Shop
11,Musée de l'Armée,2,French Restaurant,History Museum,Plaza,Cocktail Bar,Hotel,Historic Site,Garden,Greek Restaurant,Dessert Shop,Korean Restaurant
15,Musée Rodin,2,French Restaurant,Plaza,History Museum,Hotel,Garden,Café,Historic Site,Burger Joint,Coffee Shop,Italian Restaurant
17,Petit Palais,2,Garden,French Restaurant,Hotel,Historic Site,Plaza,Café,Boutique,Lounge,Tailor Shop,Beer Garden


In [53]:
#cluster 3
site_merged.loc[site_merged['Cluster Labels'] == 3, site_merged.columns[[1] + list(range(5, site_merged.shape[1]))]].drop(columns=['Variation 07/06',"Latitude","Longitude"], axis=1)

Unnamed: 0,Change 07/06,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,=,3,French Restaurant,Bookstore,Bakery,Yoga Studio,Ice Cream Shop,Japanese Restaurant,Lebanese Restaurant,Falafel Restaurant,Park,Pastry Shop
13,+3,3,French Restaurant,Hotel,Wine Bar,Steakhouse,Ice Cream Shop,Women's Store,Fish & Chips Shop,Corsican Restaurant,New American Restaurant,Concert Hall
14,-1,3,French Restaurant,Ice Cream Shop,Wine Bar,Museum,Indie Movie Theater,Boat or Ferry,Historic Site,Japanese Restaurant,Garden,Escape Room
18,+3,3,Japanese Restaurant,Creperie,French Restaurant,Chinese Restaurant,Caribbean Restaurant,Museum,Argentinian Restaurant,Scenic Lookout,Italian Restaurant,Auvergne Restaurant


**Conclusion:**

From the cluster values, it can be seen the clusters have been grouped according to the frequency of the most common venues.