# **Exploring European Restaurant Market**
## **Peer-graded Assignment: Capstone Project - The Battle of Neighborhoods**

**Simon G.**

**June 7, 2020**

---

## **Variables you might want to change**

In [1]:
# how many entries of the scraped address list are kept (the list is sliced to this length)
number_of_cities = 500 #up to 500 possible at the moment
# value sent as the `radius` parameter of every Foursquare explore request (presumably metres - TODO confirm against the API docs)
radius = 5000 #radius of search request for venues
# value sent as the `limit` parameter of every Foursquare explore request
# NOTE(review): the API may cap this at a lower value - confirm
LIMIT = 1000 #number of venues pulled for each city
# number of "Nth Most Common Venue" columns built per city; the ranking is by how
# often a restaurant category occurs in the city, not by any rating
num_top_venues = 3 #number of most common restaurant categories listed per city
# passed to KMeans as n_clusters
kclusters = 3 #number of clusters the algorithm should find

## **Let's start coding!**
### *Importing libraries*

In [2]:
#import libraries and packages
import json 
import os

import numpy as np 
import pandas as pd 
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

#!conda install -c conda-forge geopy --yes 
from geopy.geocoders import Nominatim

import requests # library to handle requests
from pandas import json_normalize

from sklearn.cluster import KMeans

#!conda install -c conda-forge folium=0.5.0 --yes
import folium

#!conda install -c conda-forge lxml --yes
import lxml

print('Libraries imported.')

Libraries imported.


### *Get the largest cities in Europe*

In [3]:
#webpage
# The five ranking pages share one URL pattern and differ only in the page number.
url_list = ['http://www.citymayors.com/features/euro_cities{}.html'.format(page) for page in range(1, 6)]

# Collect one cleaned table per page and concatenate once after the loop,
# instead of re-concatenating a growing frame behind a manual counter.
city_tables = []
cities_downloaded = 0
for url in url_list:
    df_raw = pd.read_html(url)
    # the table at position 1 of each page is the one holding the city list
    df_cities = df_raw[1]

    #fix header: the first data row carries the real column names
    new_header = df_cities.iloc[0]
    df_cities = df_cities[1:].copy()
    df_cities.columns = new_header

    city_tables.append(df_cities)
    cities_downloaded += df_cities.shape[0]
    print ('{} cities in list downloaded'.format(cities_downloaded))

# one frame holding the rows of every page, in page order
df_all_cities = pd.concat(city_tables)

#get list of address
def get_address(city,country):
    """Combine parallel city and country sequences into address strings.

    Args:
        city: iterable of city names.
        country: iterable of country names, paired position by position with ``city``.

    Returns:
        list of ``"<city>, <country>"`` strings, one per pair; pairing stops at
        the shorter of the two inputs.
    """
    return [str(city_name + ", " + country_name) for city_name, country_name in zip(city, country)]

# Build one address string per scraped city, then keep only the first
# `number_of_cities` of them.
all_addresses = get_address(city=df_all_cities['City'], country=df_all_cities['Country'])
list_address = all_addresses[:number_of_cities]

100 cities in list downloaded
200 cities in list downloaded
300 cities in list downloaded
400 cities in list downloaded
500 cities in list downloaded


### *Get the coordinates for each city in the list*

In [4]:
#function to extract coordinates
def get_latlon(address):
    """Geocode ``address`` with Nominatim.

    Args:
        address: free-text address string, e.g. ``"BERLIN, Germany"``.

    Returns:
        tuple ``(address, latitude, longitude)`` where ``address`` is the
        unchanged input and the coordinates come from the geocoder result.

    Raises:
        ValueError: if the geocoder returns no result for ``address``.
    """
    geolocator = Nominatim(user_agent="ny_explorer")
    location = geolocator.geocode(address)
    # fail with a clear message instead of an AttributeError on None
    if location is None:
        raise ValueError('No geocoding result for {!r}'.format(address))
    return address, location.latitude, location.longitude

#empty lists that are getting filled 
list_city = []
list_country = []
list_lat = []
list_lon = []

#looping through the address list
for address in list_address:
    try:
        _, lat, lon = get_latlon(address)
    except Exception:
        # best effort: a city that cannot be geocoded is reported and skipped;
        # KeyboardInterrupt / SystemExit are no longer swallowed
        print('No coordinates found for {}. Skipping!'.format(address))
        continue

    # Addresses are built as "<city>, <country>". Split on the LAST comma so a
    # comma inside the city name is preserved, and strip the blank that follows
    # the separator so countries are stored without a leading space.
    parts = address.rsplit(',', 1)
    list_city.append(parts[0].strip())
    list_country.append(parts[1].strip() if len(parts) > 1 else '')
    list_lat.append(lat)
    list_lon.append(lon)
    #print('The geograpical coordinate of {} are {}, {}.'.format(address, lat, lon))

#putting all list into a dataframe
df1 = pd.DataFrame({'City':list_city, 'Country':list_country, 'Latitude':list_lat, 'Longitude':list_lon})
print (df1.shape)
df1.head()

No coordinates found for Ekaterinoburg, Russia. Skipping!
No coordinates found for Yaroslave, Russia. Skipping!
No coordinates found for Makeyevka, Ukraine. Skipping!
No coordinates found for Vinnutsya, Ukraine. Skipping!
No coordinates found for Dneprodzerzhinsk, Ukraine. Skipping!
No coordinates found for Chernovtsy, Ukraine. Skipping!
No coordinates found for Ioshkap-Ola, Russia. Skipping!
No coordinates found for Rhondda with Cynon & Taff, UK. Skipping!
No coordinates found for Krementchug, Ukraine. Skipping!
No coordinates found for Nizhenvartovsk, Russia. Skipping!
No coordinates found for Ternopol, Ukraine. Skipping!
No coordinates found for Syktivkar, Russia. Skipping!
No coordinates found for Belaya Tserkov, Ukraine. Skipping!
No coordinates found for Starsy Oskol, Russia. Skipping!
No coordinates found for Piraiévs, Greece. Skipping!
No coordinates found for Uzno-Sakhalinsk, Russia. Skipping!
No coordinates found for Podgorica, Serbia. Skipping!
(483, 4)


Unnamed: 0,City,Country,Latitude,Longitude
0,MOSKVA (Moscow),Russia,55.750446,37.617494
1,LONDON,UK,51.507322,-0.127647
2,St Petersburg,Russia,59.960674,30.158655
3,BERLIN,Germany,52.517037,13.38886
4,MADRID,Spain,40.416705,-3.703582


## *Using Foursquare to get json file with venues from each city*

In [5]:
#credentials
# Secrets must not be stored in the notebook: they are read from the environment.
# Export FOURSQUARE_CLIENT_ID and FOURSQUARE_CLIENT_SECRET before starting Jupyter.
# NOTE(review): keys that were previously committed in this cell should be
# treated as leaked and rotated.
CLIENT_ID = os.environ.get('FOURSQUARE_CLIENT_ID', '') # your Foursquare ID
CLIENT_SECRET = os.environ.get('FOURSQUARE_CLIENT_SECRET', '') # your Foursquare Secret
VERSION = '20180605' # Foursquare API version

if not CLIENT_ID or not CLIENT_SECRET:
    # warn early; without credentials every venue request below will be rejected
    print('Foursquare credentials missing: set FOURSQUARE_CLIENT_ID and FOURSQUARE_CLIENT_SECRET.')



In [6]:
#function to extract the information
def getNearbyVenues(city, latitudes, longitudes, radius=radius):
    """Query the Foursquare explore endpoint once per city and collect the venues.

    Args:
        city: iterable of city names.
        latitudes: iterable of latitudes, paired position by position with ``city``.
        longitudes: iterable of longitudes, paired position by position with ``city``.
        radius: value sent as the ``radius`` request parameter.

    Returns:
        pandas.DataFrame with one row per venue and the columns ``City``,
        ``City Latitude``, ``City Longitude``, ``Venue Name`` and
        ``Venue Category``. The frame is empty (but keeps these columns) when
        no venue could be retrieved.

    A city whose request fails or whose response lacks the expected structure
    is reported on stdout and left out. A single venue without a name or a
    category is skipped on its own instead of discarding its whole city.
    """
    column_names = ['City',
                  'City Latitude',
                  'City Longitude',
                  'Venue Name',
                  'Venue Category']

    venue_rows = []
    for city_name, lat, lng in zip(city, latitudes, longitudes):

        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID,
            CLIENT_SECRET,
            VERSION,
            lat,
            lng,
            radius,
            LIMIT)

        # make the GET request; the timeout keeps one stalled request from blocking the run
        try:
            results = requests.get(url, timeout=30).json()["response"]['groups'][0]['items']
        except (requests.RequestException, ValueError, KeyError, IndexError, TypeError):
            print ('No results can be acquired for {}, dropping city'.format(city_name))
            continue

        for item in results:
            venue = item.get('venue') or {}
            categories = venue.get('categories') or []
            # a venue without a name or a category cannot be classified: skip only that venue
            if 'name' not in venue or not categories or 'name' not in categories[0]:
                continue
            venue_rows.append((city_name, lat, lng, venue['name'], categories[0]['name']))

    # passing the column names here keeps the frame well-formed even with zero rows
    nearby_venues = pd.DataFrame(venue_rows, columns=column_names)

    return(nearby_venues)

In [7]:
# Download the venues around every geocoded city with getNearbyVenues;
# the three columns of df1 are handed over as parallel sequences.
all_city_venues = getNearbyVenues(df1['City'], df1['Latitude'], df1['Longitude'])

No results can be acquired for LONDON, dropping city
No results can be acquired for SKOPLJE, dropping city
No results can be acquired for Bolton, dropping city
No results can be acquired for Padova, dropping city
No results can be acquired for Solihull, dropping city


### *Cleaning the received data, as not all of it is relevant for us*

In [8]:
# Keep only venues whose category mentions "Restaurant", and leave out the
# generic category that is exactly "Restaurant" (it names no cuisine).
venue_category = all_city_venues['Venue Category']
mentions_restaurant = venue_category.str.contains(pat = 'Restaurant')
is_generic = venue_category.isin(['Restaurant'])
df2 = all_city_venues[mentions_restaurant & ~is_generic].reset_index(drop=True)

# summary of the cleaning step
print ('All venues of all cities: {}'.format(all_city_venues.shape))
print ('Only relevant venues of all cities: {}'.format(df2.shape))
num_city = df2['City'].nunique(dropna=False)
print('{} cities found'.format(num_city))
num_venue = df2['Venue Category'].nunique(dropna=False)
print('{} unique kind of restaurants found'.format(num_venue))
df2.head()

All venues of all cities: (38446, 5)
Only relevant venues of all cities: (5916, 5)
450 cities found
108 unique kind of restaurants found


Unnamed: 0,City,City Latitude,City Longitude,Venue Name,Venue Category
0,MOSKVA (Moscow),55.750446,37.617494,Beluga (Белуга),Russian Restaurant
1,MOSKVA (Moscow),55.750446,37.617494,Cafe Pushkin (Кафе Пушкинъ),Russian Restaurant
2,MOSKVA (Moscow),55.750446,37.617494,Lao Lee,Vietnamese Restaurant
3,MOSKVA (Moscow),55.750446,37.617494,Mission,Vegetarian / Vegan Restaurant
4,MOSKVA (Moscow),55.750446,37.617494,Bô,Vietnamese Restaurant


## **Preparing data for clustering**
### *Starting with one hot encoding*

In [9]:
# One-hot encode the restaurant category (one indicator column per category,
# named after the category itself), then attach the city of each venue.
df3 = pd.get_dummies(df2[['Venue Category']], prefix="", prefix_sep="")
df3['City'] = df2['City']

# rotate the column order by one so the freshly appended City column comes first
column_order = df3.columns.tolist()
fixed_columns = column_order[-1:] + column_order[:-1]
df3 = df3[fixed_columns]

# sanity check against the counts printed by the cleaning step
n_rows, n_columns = df3.shape
print ('{} columns found, is it matching with the unique values of restaurants?'.format(n_columns))
print ('{} rows found, is it matching with the relevant values of venues?'.format(n_rows))
df3.head()

109 columns found, is it matching with the unique values of restaurants?
5916 rows found, is it matching with the relevant values of venues?


Unnamed: 0,City,Afghan Restaurant,African Restaurant,Alsatian Restaurant,American Restaurant,Arepa Restaurant,Argentinian Restaurant,Asian Restaurant,Australian Restaurant,Austrian Restaurant,Bavarian Restaurant,Belarusian Restaurant,Belgian Restaurant,Brazilian Restaurant,Bulgarian Restaurant,Cajun / Creole Restaurant,Cambodian Restaurant,Caribbean Restaurant,Caucasian Restaurant,Chinese Restaurant,Comfort Food Restaurant,Cretan Restaurant,Cuban Restaurant,Czech Restaurant,Dim Sum Restaurant,Doner Restaurant,Dumpling Restaurant,Dutch Restaurant,Eastern European Restaurant,Empanada Restaurant,English Restaurant,Ethiopian Restaurant,Falafel Restaurant,Fast Food Restaurant,Filipino Restaurant,French Restaurant,German Restaurant,Gluten-free Restaurant,Greek Restaurant,Grilled Meat Restaurant,Halal Restaurant,Hawaiian Restaurant,Himalayan Restaurant,Hungarian Restaurant,Indian Chinese Restaurant,Indian Restaurant,Indonesian Restaurant,Israeli Restaurant,Italian Restaurant,Japanese Restaurant,Jewish Restaurant,Kebab Restaurant,Korean Restaurant,Kosher Restaurant,Kurdish Restaurant,Latin American Restaurant,Lebanese Restaurant,Ligurian Restaurant,Malay Restaurant,Mediterranean Restaurant,Mexican Restaurant,Meze Restaurant,Middle Eastern Restaurant,Modern European Restaurant,Modern Greek Restaurant,Molecular Gastronomy Restaurant,Mongolian Restaurant,Moroccan Restaurant,New American Restaurant,North Indian Restaurant,Paella Restaurant,Pakistani Restaurant,Persian Restaurant,Peruvian Restaurant,Piedmontese Restaurant,Polish Restaurant,Portuguese Restaurant,Provençal Restaurant,Puglia Restaurant,Ramen Restaurant,Rhenisch Restaurant,Romanian Restaurant,Russian Restaurant,Scandinavian Restaurant,Scottish Restaurant,Seafood Restaurant,Sicilian Restaurant,South American Restaurant,South Indian Restaurant,Southern / Soul Food Restaurant,Southwestern French Restaurant,Spanish Restaurant,Sri Lankan Restaurant,Sushi Restaurant,Swiss Restaurant,Syrian Restaurant,Tapas 
Restaurant,Tatar Restaurant,Thai Restaurant,Theme Restaurant,Tibetan Restaurant,Turkish Restaurant,Udon Restaurant,Ukrainian Restaurant,Vegetarian / Vegan Restaurant,Veneto Restaurant,Vietnamese Restaurant,West-Ukrainian Restaurant,Yakitori Restaurant
0,MOSKVA (Moscow),0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,MOSKVA (Moscow),0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,MOSKVA (Moscow),0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
3,MOSKVA (Moscow),0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0
4,MOSKVA (Moscow),0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0


### *Rating the restaurants based on how often they come up*
Group by city and calculate the mean value of each restaurant kind, i.e. the share of each kind among the city's restaurants.

In [10]:
# The mean of the one-hot columns per city is the share of each restaurant
# category among that city's restaurants; City stays a regular column.
df4 = df3.groupby('City', as_index=False).mean()
print ('{} rows found, this is equal to the number of cities that will be clustered'.format(len(df4.index)))
df4.head()

450 rows found, this is equal to the number of cities that will be clustered


Unnamed: 0,City,Afghan Restaurant,African Restaurant,Alsatian Restaurant,American Restaurant,Arepa Restaurant,Argentinian Restaurant,Asian Restaurant,Australian Restaurant,Austrian Restaurant,Bavarian Restaurant,Belarusian Restaurant,Belgian Restaurant,Brazilian Restaurant,Bulgarian Restaurant,Cajun / Creole Restaurant,Cambodian Restaurant,Caribbean Restaurant,Caucasian Restaurant,Chinese Restaurant,Comfort Food Restaurant,Cretan Restaurant,Cuban Restaurant,Czech Restaurant,Dim Sum Restaurant,Doner Restaurant,Dumpling Restaurant,Dutch Restaurant,Eastern European Restaurant,Empanada Restaurant,English Restaurant,Ethiopian Restaurant,Falafel Restaurant,Fast Food Restaurant,Filipino Restaurant,French Restaurant,German Restaurant,Gluten-free Restaurant,Greek Restaurant,Grilled Meat Restaurant,Halal Restaurant,Hawaiian Restaurant,Himalayan Restaurant,Hungarian Restaurant,Indian Chinese Restaurant,Indian Restaurant,Indonesian Restaurant,Israeli Restaurant,Italian Restaurant,Japanese Restaurant,Jewish Restaurant,Kebab Restaurant,Korean Restaurant,Kosher Restaurant,Kurdish Restaurant,Latin American Restaurant,Lebanese Restaurant,Ligurian Restaurant,Malay Restaurant,Mediterranean Restaurant,Mexican Restaurant,Meze Restaurant,Middle Eastern Restaurant,Modern European Restaurant,Modern Greek Restaurant,Molecular Gastronomy Restaurant,Mongolian Restaurant,Moroccan Restaurant,New American Restaurant,North Indian Restaurant,Paella Restaurant,Pakistani Restaurant,Persian Restaurant,Peruvian Restaurant,Piedmontese Restaurant,Polish Restaurant,Portuguese Restaurant,Provençal Restaurant,Puglia Restaurant,Ramen Restaurant,Rhenisch Restaurant,Romanian Restaurant,Russian Restaurant,Scandinavian Restaurant,Scottish Restaurant,Seafood Restaurant,Sicilian Restaurant,South American Restaurant,South Indian Restaurant,Southern / Soul Food Restaurant,Southwestern French Restaurant,Spanish Restaurant,Sri Lankan Restaurant,Sushi Restaurant,Swiss Restaurant,Syrian Restaurant,Tapas 
Restaurant,Tatar Restaurant,Thai Restaurant,Theme Restaurant,Tibetan Restaurant,Turkish Restaurant,Udon Restaurant,Ukrainian Restaurant,Vegetarian / Vegan Restaurant,Veneto Restaurant,Vietnamese Restaurant,West-Ukrainian Restaurant,Yakitori Restaurant
0,AMSTERDAM,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.076923,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.076923,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.230769,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.076923,0.0,0.0,0.0,0.076923,0.0,0.0,0.076923,0.076923,0.0,0.0,0.0,0.076923,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.076923,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.076923,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.076923,0.0,0.0,0.0,0.0
1,ATHINAI (Athens),0.0,0.0,0.0,0.0,0.0,0.0,0.066667,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.066667,0.0,0.0,0.066667,0.0,0.0,0.2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.133333,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.266667,0.0,0.133333,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.066667,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,Aachen,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.043478,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.043478,0.0,0.0,0.043478,0.26087,0.0,0.043478,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.217391,0.0,0.0,0.0,0.043478,0.0,0.0,0.0,0.0,0.0,0.0,0.086957,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.043478,0.0,0.0,0.0,0.0,0.0,0.086957,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.043478,0.0,0.043478,0.0,0.0
3,Abakan,0.0,0.0,0.0,0.0,0.0,0.0,0.142857,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.142857,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.285714,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.285714,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.142857,0.0,0.0
4,Aberdeen,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.083333,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.083333,0.0,0.0,0.166667,0.083333,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.083333,0.0,0.0,0.083333,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.25,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.083333,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.083333,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### *Find and highlight the kind of restaurants that are most common for each city*
### *Furthermore, run k-means clustering and add the cluster labels as a column to the dataframe*

In [11]:
#function to get the highest values
def return_most_common_venues(row, num_top_venues):
    """Return the labels of the largest entries of ``row``.

    Args:
        row: pandas Series; its first entry is ignored (callers pass rows
            whose first field is the city name), the rest are ranked by value.
        num_top_venues: maximum number of labels to return.

    Returns:
        array of index labels, ordered from the largest value downwards and
        cut off after ``num_top_venues`` entries.
    """
    ranked = row.iloc[1:].sort_values(ascending=False)
    return ranked.index.values[:num_top_venues]

#string extensions
indicators = ['st', 'nd', 'rd']

#create columns according to number of top venues
columns = ['City']
for ind in np.arange(num_top_venues):
    # ranks beyond the listed suffixes get the generic "th"
    suffix = indicators[ind] if ind < len(indicators) else 'th'
    columns.append('{}{} Most Common Venue'.format(ind+1, suffix))

#create a new dataframe
df5 = pd.DataFrame(columns=columns)
df5['City'] = df4['City']

#loop through function: fill the ranking columns city by city
for ind in np.arange(df4.shape[0]):
    df5.iloc[ind, 1:] = return_most_common_venues(df4.iloc[ind, :], num_top_venues)

#drop the City column so only the numeric category shares are clustered
# (keyword form: the positional axis argument is rejected by pandas 2.x)
df6 = df4.drop(columns='City')

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(df6)

# add clustering labels
df5.insert(0, 'Cluster Labels', kmeans.labels_)
df5.head()

Unnamed: 0,Cluster Labels,City,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue
0,1,AMSTERDAM,French Restaurant,Korean Restaurant,Mexican Restaurant
1,1,ATHINAI (Athens),Meze Restaurant,Greek Restaurant,Italian Restaurant
2,1,Aachen,German Restaurant,Italian Restaurant,Sushi Restaurant
3,0,Abakan,Italian Restaurant,Sushi Restaurant,Dumpling Restaurant
4,1,Aberdeen,Seafood Restaurant,Italian Restaurant,Mexican Restaurant


### *Cleaning up and display proper dataframe*

In [12]:
# merging the former df with the sorted one: look up country and coordinates by city name
df7 = df5.join(df1.set_index('City'), on='City')
# take however many "... Most Common Venue" columns were generated, so the cell
# keeps working when num_top_venues is changed from 3
venue_columns = [col for col in df5.columns if str(col).endswith('Most Common Venue')]
df7 = df7[['City', 'Country', 'Latitude','Longitude', 'Cluster Labels'] + venue_columns]
df7.head() 

Unnamed: 0,City,Country,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue
0,AMSTERDAM,Netherlands,52.37276,4.893604,1,French Restaurant,Korean Restaurant,Mexican Restaurant
1,ATHINAI (Athens),Greece,37.987228,23.764359,1,Meze Restaurant,Greek Restaurant,Italian Restaurant
2,Aachen,Germany,50.776351,6.083862,1,German Restaurant,Italian Restaurant,Sushi Restaurant
3,Abakan,Russia,53.720902,91.442435,0,Italian Restaurant,Sushi Restaurant,Dumpling Restaurant
4,Aberdeen,UK,57.148243,-2.092809,1,Seafood Restaurant,Italian Restaurant,Mexican Restaurant


## **Finally create the map**

In [13]:
#create map, coordinates on Karlsruhe, Germany
latitude = 49.006889
longitude = 8.403653
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=3)

color = ['red', 'green', 'orange']

#add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(df7['Latitude'], df7['Longitude'], df7['City'], df7['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    # cycle through the palette so a kclusters larger than the palette cannot raise IndexError
    marker_color = color[cluster % len(color)]
    folium.Marker(location=[lat, lon],popup=label,icon=folium.Icon(color=marker_color, icon='circle')).add_to(map_clusters)

### *Identify cluster names*

In [14]:
# Cities in cluster 0: keep the first column (City) plus every column from
# position 4 onwards, i.e. skip the three columns in between.
label_columns = df7.columns[[0, *range(4, df7.shape[1])]]
df_label1 = df7.loc[df7['Cluster Labels'].eq(0), label_columns]
df_label1.head(5)

Unnamed: 0,City,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue
3,Abakan,0,Italian Restaurant,Sushi Restaurant,Dumpling Restaurant
11,Arad,0,Italian Restaurant,Eastern European Restaurant,Fast Food Restaurant
14,Astrakhan,0,Italian Restaurant,Dumpling Restaurant,Fast Food Restaurant
15,Augsburg,0,Italian Restaurant,German Restaurant,Turkish Restaurant
18,BRATISLAVA,0,Vegetarian / Vegan Restaurant,Italian Restaurant,Vietnamese Restaurant


In [15]:
# Cities in cluster 1: keep the first column (City) plus every column from
# position 4 onwards, i.e. skip the three columns in between.
label_columns = df7.columns[[0, *range(4, df7.shape[1])]]
df_label2 = df7.loc[df7['Cluster Labels'].eq(1), label_columns]
df_label2.head()

Unnamed: 0,City,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue
0,AMSTERDAM,1,French Restaurant,Korean Restaurant,Mexican Restaurant
1,ATHINAI (Athens),1,Meze Restaurant,Greek Restaurant,Italian Restaurant
2,Aachen,1,German Restaurant,Italian Restaurant,Sushi Restaurant
4,Aberdeen,1,Seafood Restaurant,Italian Restaurant,Mexican Restaurant
5,Alcalá de Henares,1,Mediterranean Restaurant,Italian Restaurant,Fast Food Restaurant


In [16]:
# Cities in cluster 2: keep the first column (City) plus every column from
# position 4 onwards, i.e. skip the three columns in between.
label_columns = df7.columns[[0, *range(4, df7.shape[1])]]
df_label3 = df7.loc[df7['Cluster Labels'].eq(2), label_columns]
df_label3.head()

Unnamed: 0,City,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue
12,Arkhangelsk,2,Fast Food Restaurant,Caucasian Restaurant,Argentinian Restaurant
29,Barnsley,2,Fast Food Restaurant,Indian Restaurant,English Restaurant
30,Basildon,2,American Restaurant,Fast Food Restaurant,English Restaurant
32,Belgorod,2,Fast Food Restaurant,Ukrainian Restaurant,Italian Restaurant
36,Bialystok,2,Fast Food Restaurant,Sushi Restaurant,Turkish Restaurant


In [17]:
#df7.loc[df7['Cluster Labels'] == 3, df7.columns[[0] + list(range(4, df7.shape[1]))]]

In [18]:
# create a legend for clusters
# The legend is a fixed-position HTML box with one marker icon per cluster
# colour (red, green, orange); it is attached to the map's root HTML element.
# NOTE(review): the box lists exactly three groups with placeholder names
# ("unspecific"), so it has to be edited by hand if the number of clusters or
# the palette changes.
legend_html =   '''
                <div style="position: fixed; 
                            top: 50px; left: 50px; width: 230px; height: 140px; 
                            border:4px solid grey; z-index:9999; font-size:16px;
                            ">
                            &nbsp; <b> Categories </b><br>
                            &nbsp; <i class="fa fa-map-marker fa-2x" style="color:red"></i> Group 1 unspecific  &nbsp; <br>
                            &nbsp; <i class="fa fa-map-marker fa-2x" style="color:green"></i> Group 2 unspecific &nbsp; <br>
                            &nbsp; <i class="fa fa-map-marker fa-2x" style="color:orange"></i> Group 3 unspecific &nbsp; <br>
                              
                </div>
                ''' 

# attach the legend to the map created in the previous map cell
map_clusters.get_root().html.add_child(folium.Element(legend_html))

# last expression of the cell: renders the map
map_clusters

## **Clustering Markers**
### *to reduce the amount of Markers on the map*

In [19]:
from folium import plugins

# fresh map, centred on the same coordinates as the earlier maps
latitude, longitude = 49.006889, 8.403653
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=3)

color = ['red', 'green', 'orange']

# JavaScript that draws the bubble shown in place of a group of nearby markers
# (it displays the number of grouped markers). Only the CSS size class differs
# between the three layers, so it is filled in per layer.
icon_template = '''
    function(cluster) {
    return L.divIcon({html: '<div><span><b>' + cluster.getChildCount() + '</b></span></div>',
                      className: 'marker-cluster marker-cluster-%s',
                      iconSize: new L.Point(40, 40)});
    }
'''

# one marker-cluster layer per k-means label
cities_1 = plugins.MarkerCluster(icon_create_function = icon_template % 'large').add_to(map_clusters)
cities_2 = plugins.MarkerCluster(icon_create_function = icon_template % 'small').add_to(map_clusters)
cities_3 = plugins.MarkerCluster(icon_create_function = icon_template % 'medium').add_to(map_clusters)

# k-means label -> layer that receives the cities carrying that label
layer_for_label = {0: cities_1, 1: cities_2, 2: cities_3}

# single pass over the cities: each marker goes to the layer of its label and
# is coloured with the palette entry of that label; other labels are ignored
for lat, lon, name, cluster_labels in zip(df7['Latitude'], df7['Longitude'], df7['City'], df7['Cluster Labels']):
    target_layer = layer_for_label.get(cluster_labels)
    if target_layer is None:
        continue
    folium.Marker(
        location=[lat, lon],
        icon=folium.Icon(color=color[cluster_labels], icon='circle'),
        popup=name,
    ).add_to(target_layer)

In [20]:
# create a legend for clusters
# Same fixed-position HTML legend as used for the first map: one marker icon
# per cluster colour (red, green, orange), attached to the map's root HTML.
# NOTE(review): the three entries and their placeholder names are hard-coded.
legend_html =   '''
                <div style="position: fixed; 
                            top: 50px; left: 50px; width: 230px; height: 140px; 
                            border:4px solid grey; z-index:9999; font-size:16px;
                            ">
                            &nbsp; <b> Categories </b><br>
                            &nbsp; <i class="fa fa-map-marker fa-2x" style="color:red"></i> Group 1 unspecific  &nbsp; <br>
                            &nbsp; <i class="fa fa-map-marker fa-2x" style="color:green"></i> Group 2 unspecific &nbsp; <br>
                            &nbsp; <i class="fa fa-map-marker fa-2x" style="color:orange"></i> Group 3 unspecific &nbsp; <br>
                              
                </div>
                ''' 

# attach the legend to the marker-cluster map built in the previous cell
map_clusters.get_root().html.add_child(folium.Element(legend_html))
        
# display map
map_clusters

## **Preparing the restaurants before clustering**

### *Create a dictionary of cuisines; this grouping is highly subjective and will differ from person to person*

In [21]:
# Hand-made grouping of Foursquare restaurant categories into broader cuisines.
# Each key later becomes a column name, so the keys are kept exactly as spelled.
# NOTE(review): 'Asian cusine' looks like a typo for 'Asian cuisine' - renaming
# it changes the column name in every table derived from this dictionary.
cuisine_dict = {
    'North American cuisine': [
        'New American Restaurant',
        'Grilled Meat Restaurant',
        'American Restaurant',
        'Fast Food Restaurant',
    ],
    'South American cuisine': [
        'Mexican Restaurant',
        'Caribbean Restaurant',
        'Argentinian Restaurant',
        'Latin American Restaurant',
    ],
    'North European cuisine': [
        'Scandinavian Restaurant',
        'Polish Restaurant',
        'Hungarian Restaurant',
        'English Restaurant',
        'Dutch Restaurant',
        'Czech Restaurant',
        'Bulgarian Restaurant',
        'Bavarian Restaurant',
        'Austrian Restaurant',
        'French Restaurant',
        'German Restaurant',
        'Modern European Restaurant',
        'Romanian Restaurant',
        'Russian Restaurant',
        'Scottish Restaurant',
    ],
    'South European cuisine': [
        'Sicilian Restaurant',
        'Cretan Restaurant',
        'Italian Restaurant',
        'Greek Restaurant',
        'Mediterranean Restaurant',
        'Portuguese Restaurant',
        'Seafood Restaurant',
        'Spanish Restaurant',
    ],
    'Far Eastern cuisine': [
        'Middle Eastern Restaurant',
        'Kebab Restaurant',
        'Halal Restaurant',
        'Eastern European Restaurant',
        'Doner Restaurant',
    ],
    'Asian cusine': [
        'Ramen Restaurant',
        'Asian Restaurant',
        'Chinese Restaurant',
        'Indian Restaurant',
        'Japanese Restaurant',
        'Sushi Restaurant',
        'Thai Restaurant',
        'Vietnamese Restaurant',
    ],
}

In [22]:
# function to sum the category shares that belong to one cuisine
def sum_rest(cuisine, list_restaurants):
    """Add one cuisine column to the global ``df_all_cuisines``.

    Args:
        cuisine: name of the column to create.
        list_restaurants: restaurant category names (columns of the global
            ``df4``) whose per-city values are summed into that column.

    Returns:
        None; the result is written into ``df_all_cuisines[cuisine]``.

    Categories that are not present as columns in ``df4`` (for example
    because no such restaurant was found in the downloaded cities) count as 0
    instead of raising a KeyError.
    """
    df_temp = df4.reindex(columns=list_restaurants, fill_value=0)
    df_all_cuisines[cuisine] = df_temp.sum(axis=1)

In [23]:
# Start from an independent copy of the City column: sum_rest adds columns to
# this frame, and writing into a slice of df4 triggers SettingWithCopyWarning.
df_all_cuisines = df4[['City']].copy()
# one summed column per cuisine group
for cuisine, list_restaurants in cuisine_dict.items():
    sum_rest(cuisine,list_restaurants)
df_all_cuisines.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.


Unnamed: 0,City,North American cuisine,South American cuisine,North European cuisine,South European cuisine,Far Eastern cuisine,Asian cusine
0,AMSTERDAM,0.0,0.230769,0.307692,0.153846,0.0,0.0
1,ATHINAI (Athens),0.0,0.0,0.2,0.333333,0.0,0.133333
2,Aachen,0.0,0.0,0.304348,0.347826,0.0,0.173913
3,Abakan,0.0,0.0,0.0,0.285714,0.0,0.571429
4,Aberdeen,0.0,0.083333,0.166667,0.416667,0.0,0.25


## **Next round of clustering, with adjusted datatable**

In [24]:
#function to get the highest values
# NOTE(review): this repeats the definition from the first clustering cell;
# the later definition silently replaces the earlier one.
def return_most_common_venues(row, num_top_venues):
    """Return the labels of the largest entries of ``row``.

    Args:
        row: pandas Series; its first entry is ignored (callers pass rows
            whose first field is the city name), the rest are ranked by value.
        num_top_venues: maximum number of labels to return.

    Returns:
        array of index labels, ordered from the largest value downwards and
        cut off after ``num_top_venues`` entries.
    """
    descending = row.iloc[1:].sort_values(ascending=False)
    top_labels = descending.index.values
    return top_labels[:num_top_venues]

#string extensions
indicators = ['st', 'nd', 'rd']

#create columns according to number of top venues
columns = ['City']
for ind in np.arange(num_top_venues):
    # ranks beyond the listed suffixes get the generic "th"
    suffix = indicators[ind] if ind < len(indicators) else 'th'
    columns.append('{}{} Most Common Venue'.format(ind+1, suffix))

#create a new dataframe
df10 = pd.DataFrame(columns=columns)
df10['City'] = df_all_cuisines['City']

#loop through function: fill the ranking columns city by city
for ind in np.arange(df_all_cuisines.shape[0]):
    df10.iloc[ind, 1:] = return_most_common_venues(df_all_cuisines.iloc[ind, :], num_top_venues)

#drop the City column so only the numeric cuisine shares are clustered
# (keyword form: the positional axis argument is rejected by pandas 2.x)
df11 = df_all_cuisines.drop(columns='City')

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(df11)

# add clustering labels
df10.insert(0, 'Cluster Labels', kmeans.labels_)
df10.head()

Unnamed: 0,Cluster Labels,City,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue
0,0,AMSTERDAM,North European cuisine,South American cuisine,South European cuisine
1,0,ATHINAI (Athens),South European cuisine,North European cuisine,Asian cusine
2,0,Aachen,South European cuisine,North European cuisine,Asian cusine
3,0,Abakan,Asian cusine,South European cuisine,Far Eastern cuisine
4,1,Aberdeen,South European cuisine,Asian cusine,North European cuisine


In [25]:
# merging the former df with the sorted one: look up country and coordinates by city name
df12 = df10.join(df1.set_index('City'), on='City')
# take however many "... Most Common Venue" columns were generated, so the cell
# keeps working when num_top_venues is changed from 3
venue_columns = [col for col in df10.columns if str(col).endswith('Most Common Venue')]
df12 = df12[['City', 'Country', 'Latitude','Longitude', 'Cluster Labels'] + venue_columns]
df12.head() 

Unnamed: 0,City,Country,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue
0,AMSTERDAM,Netherlands,52.37276,4.893604,0,North European cuisine,South American cuisine,South European cuisine
1,ATHINAI (Athens),Greece,37.987228,23.764359,0,South European cuisine,North European cuisine,Asian cusine
2,Aachen,Germany,50.776351,6.083862,0,South European cuisine,North European cuisine,Asian cusine
3,Abakan,Russia,53.720902,91.442435,0,Asian cusine,South European cuisine,Far Eastern cuisine
4,Aberdeen,UK,57.148243,-2.092809,1,South European cuisine,Asian cusine,North European cuisine


In [26]:
#create map, centred on Karlsruhe, Germany
latitude = 49.006889
longitude = 8.403653
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=3)

#marker colour per cluster label: 0 -> red, 1 -> green, 2 -> orange
color = ['red', 'green', 'orange']

#add one marker per city; the popup shows the city name and its cluster label
for lat, lon, poi, cluster in zip(df12['Latitude'], df12['Longitude'], df12['City'], df12['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    #cycle through the palette so that kclusters > len(color) reuses colours instead of raising IndexError
    marker_color = color[cluster % len(color)]
    folium.Marker(location=[lat, lon], popup=label, icon=folium.Icon(color=marker_color, icon='circle')).add_to(map_clusters)

## **Identify labels for legend**

In [27]:
# cities in cluster 0: keep 'City' (column 0) plus every column from position 4 onwards
in_cluster = df12['Cluster Labels'] == 0
shown_columns = df12.columns[[0, *range(4, df12.shape[1])]]
df_label1 = df12.loc[in_cluster, shown_columns]
df_label1.head()

Unnamed: 0,City,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue
0,AMSTERDAM,0,North European cuisine,South American cuisine,South European cuisine
1,ATHINAI (Athens),0,South European cuisine,North European cuisine,Asian cusine
2,Aachen,0,South European cuisine,North European cuisine,Asian cusine
3,Abakan,0,Asian cusine,South European cuisine,Far Eastern cuisine
8,Angarsk,0,Asian cusine,North American cuisine,North European cuisine


In [28]:
# cities in cluster 1: keep 'City' (column 0) plus every column from position 4 onwards
in_cluster = df12['Cluster Labels'] == 1
shown_columns = df12.columns[[0, *range(4, df12.shape[1])]]
df_label2 = df12.loc[in_cluster, shown_columns]
df_label2.head()

Unnamed: 0,City,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue
4,Aberdeen,1,South European cuisine,Asian cusine,North European cuisine
5,Alcalá de Henares,1,South European cuisine,North American cuisine,Asian cusine
6,Alicante,1,South European cuisine,Asian cusine,North European cuisine
7,Almería,1,South European cuisine,Asian cusine,North American cuisine
11,Arad,1,South European cuisine,Far Eastern cuisine,North American cuisine


In [29]:
# cities in cluster 2: keep 'City' (column 0) plus every column from position 4 onwards
in_cluster = df12['Cluster Labels'] == 2
shown_columns = df12.columns[[0, *range(4, df12.shape[1])]]
df_label3 = df12.loc[in_cluster, shown_columns]
df_label3.head()

Unnamed: 0,City,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue
12,Arkhangelsk,2,North American cuisine,Far Eastern cuisine,North European cuisine
29,Barnsley,2,North American cuisine,Asian cusine,North European cuisine
30,Basildon,2,North American cuisine,South European cuisine,North European cuisine
32,Belgorod,2,North American cuisine,South European cuisine,Asian cusine
34,Berezniki,2,Far Eastern cuisine,South European cuisine,North American cuisine


In [30]:
# legend overlay: a fixed-position HTML box with one coloured marker icon per cluster
# NOTE(review): the category texts are hand-written; confirm they still describe the clusters after re-running k-means
legend_html = '''
                <div style="position: fixed; 
                            top: 50px; left: 50px; width: 350px; height: 140px; 
                            border:4px solid grey; z-index:9999; font-size:16px;
                            ">
                            &nbsp; <b> Categories </b><br>
                            &nbsp; <i class="fa fa-map-marker fa-2x" style="color:red"></i> Asian cuisine &nbsp; <br>
                            &nbsp; <i class="fa fa-map-marker fa-2x" style="color:green"></i> South European cuisine &nbsp; <br>
                            &nbsp; <i class="fa fa-map-marker fa-2x" style="color:orange"></i> Far Eastern with South European cuisine &nbsp; <br>
                              
                </div>
                '''

# wrap the raw HTML and attach it to the root document of the map
legend_box = folium.Element(legend_html)
map_clusters.get_root().html.add_child(legend_box)

# display map
map_clusters

## **Group cities of the same category into marker clusters**

In [31]:
from folium import plugins

#start from a fresh map, centred on the same coordinates as the marker map above
latitude = 49.006889
longitude = 8.403653
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=3)

#marker colour per cluster label: 0 -> red, 1 -> green, 2 -> orange
color = ['red', 'green', 'orange']

#CSS class of the count bubble per cluster label, indexed like `color`
bubble_classes = ['marker-cluster-large', 'marker-cluster-small', 'marker-cluster-medium']

#JavaScript callback that draws a bubble showing how many markers it contains;
#%s is replaced by one of the bubble_classes
bubble_icon_template = '''
    function(cluster) {
    return L.divIcon({html: '<div><span><b>' + cluster.getChildCount() + '</b></span></div>',
                      className: 'marker-cluster %s',
                      iconSize: new L.Point(40, 40)});
    }
'''


def add_city_group(bubble_class):
    """Create a MarkerCluster whose count bubbles use `bubble_class` and add it to map_clusters.

    Parameters
    ----------
    bubble_class : str
        CSS class name inserted into the className of the bubble icon.

    Returns
    -------
    The MarkerCluster object, already attached to map_clusters.
    """
    return plugins.MarkerCluster(
        icon_create_function=bubble_icon_template % bubble_class
    ).add_to(map_clusters)


#one marker cluster object per k-means label; styles are reused cyclically when
#kclusters exceeds the number of available bubble classes
city_groups = [
    add_city_group(bubble_classes[label % len(bubble_classes)])
    for label in range(kclusters)
]

#single pass over the dataframe: each city is added to the group of its own cluster label
for lat, lon, name, cluster_label in zip(df12['Latitude'], df12['Longitude'], df12['City'], df12['Cluster Labels']):
    folium.Marker(
        location=[lat, lon],
        icon=folium.Icon(color=color[cluster_label % len(color)], icon='circle'),
        #parse_html=True treats the name as plain text, as the first map's popups already do
        popup=folium.Popup(str(name), parse_html=True),
    ).add_to(city_groups[cluster_label])

In [32]:
# create a legend for clusters: a fixed-position HTML box with one coloured marker icon per cluster
# NOTE(review): the category texts are hand-written; confirm they still describe the clusters after re-running k-means
legend_html =   '''
                <div style="position: fixed; 
                            top: 50px; left: 50px; width: 350px; height: 140px; 
                            border:4px solid grey; z-index:9999; font-size:16px;
                            ">
                            &nbsp; <b> Categories </b><br>
                            &nbsp; <i class="fa fa-map-marker fa-2x" style="color:red"></i> Asian cuisine &nbsp; <br>
                            &nbsp; <i class="fa fa-map-marker fa-2x" style="color:green"></i> South European cuisine &nbsp; <br>
                            &nbsp; <i class="fa fa-map-marker fa-2x" style="color:orange"></i> Far Eastern with South European cuisine &nbsp; <br>
                              
                </div>
                ''' 

# attach the legend to the root document of the map
map_clusters.get_root().html.add_child(folium.Element(legend_html))

# display map
map_clusters

**GitHub does not render folium maps. To view them, open:**
https://nbviewer.jupyter.org/
and enter the GitHub link of this notebook.