### Clustering by Cuisine

In [122]:
import requests #
import pandas as pd 
import numpy as np 
from geopy.geocoders import Nominatim

In [186]:
#list of cities to cluster
cities = ["Tokyo", "Delhi", "Cairo", "Mexico City", "New York", "Los Angeles", "London", "Berlin",
         "Sydney", "Melbourne", "Paris", "Rome", "Seoul", "Lisbon", "Barcelona", "Moscow"]
lat = []
long = []

for city in cities:
    
    geolocator = Nominatim(user_agent="foursquare_agent")
    location = geolocator.geocode(city)
    lat.append(location.latitude)
    long.append(location.longitude)

data = pd.DataFrame({"City":cities, "Lat": lat, "Long": long}) 
data.sort_values("City", ascending = True, inplace = True)
data.head()

Unnamed: 0,City,Lat,Long
14,Barcelona,41.382894,2.177432
7,Berlin,52.517037,13.38886
2,Cairo,30.048819,31.243666
1,Delhi,28.651718,77.221939
13,Lisbon,38.707751,-9.136592


In [154]:
#Foursquare Credentials
CLIENT_ID = #hidden 
CLIENT_SECRET = #hidden 
VERSION = '20180604'
LIMIT = 100
RADIUS = 10000
search_query = 'restaurant'

venues_list=[]
for name, lat, long in zip(data.City, data.Lat, data.Long):
    #Search for the top 50 resturants within 10km of the city centre
    url = 'https://api.foursquare.com/v2/venues/search?client_id={}&client_secret={}&ll={},{}&v={}&query={}&radius={}&limit={}'.format(
        CLIENT_ID, 
        CLIENT_SECRET, 
        lat, 
        long, 
        VERSION, 
        search_query,
        radius, 
        LIMIT)
    
    #make request
    results = requests.get(url).json()["response"]["venues"]
    
    #get relevant data
    for venue in results:
        if (len(venue["categories"]) == 0):
            cuisine = None;
        else:
            cuisine = venue["categories"][0]["shortName"]
            
        venues_list.append([(
            name, 
            lat, 
            long, 
            venue['name'], 
            cuisine)])

city_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
city_venues.columns = ['City', 
                  'City Latitude', 
                  'City Longitude', 
                  'Venue', 
                  'Cuisine']
    

Your credentails:
CLIENT_ID: #hidden 
CLIENT_SECRET:#hidden 


In [155]:
city_venues

Unnamed: 0,City,City Latitude,City Longitude,Venue,Cuisine
0,Tokyo,35.682839,139.759455,Restaurant Rosette (レストラン ロゼット),French
1,Tokyo,35.682839,139.759455,The Restaurant by AMAN (ザ･レストラン by アマン),Mediterranean
2,Tokyo,35.682839,139.759455,MOTIF RESTAURANT & BAR,Restaurant
3,Tokyo,35.682839,139.759455,Restaurant ベラージュ,French
4,Tokyo,35.682839,139.759455,Tokyo Athlete Restaurant (東京アスリート食堂),Japanese
...,...,...,...,...,...
795,Moscow,55.750446,37.617494,Restaurant Russian,B & B
796,Moscow,55.750446,37.617494,Novikov,Asian
797,Moscow,55.750446,37.617494,The Waiters,Restaurant
798,Moscow,55.750446,37.617494,Yoko,Japanese


In [184]:
city_venues.groupby("City").count()["Cuisine"]

City
Barcelona      49
Berlin         50
Cairo          50
Delhi          47
Lisbon         50
London         50
Los Angeles    48
Melbourne      50
Mexico City    48
Moscow         46
New York       49
Paris          50
Rome           49
Seoul          49
Sydney         50
Tokyo          50
Name: Cuisine, dtype: int64

In [159]:
print('{} unique cuisines.'.format(len(city_venues['Cuisine'].unique())))

113 unique cuisines.


In [163]:
#One hot encoding
cities_onehot = pd.get_dummies(city_venues[['Cuisine']], prefix="", prefix_sep="")

#Add city column back
cities_onehot['City'] = city_venues['City'] 
fixed_columns = [cities_onehot.columns[-1]] + list(cities_onehot.columns[:-1])
cities_onehot = cities_onehot[fixed_columns]

#Group cities and take mean of frequency
city_cuisines = cities_onehot.groupby('City').mean().reset_index()
city_cuisines

Unnamed: 0,City,African,American,Arepas,Asian,Australian,Austrian,B & B,BBQ,Bar,...,Tibetan,Turkish,Vegetarian / Vegan,Vietnamese,Wine Bar,Wine Shop,Winery,Wings,Yemeni Restaurant,Yoshoku
0,Barcelona,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.02,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,Berlin,0.0,0.0,0.0,0.0,0.0,0.02,0.0,0.0,0.0,...,0.0,0.06,0.02,0.04,0.0,0.0,0.0,0.0,0.0,0.0
2,Cairo,0.06,0.0,0.0,0.02,0.0,0.0,0.0,0.0,0.0,...,0.0,0.02,0.0,0.0,0.0,0.0,0.0,0.0,0.08,0.0
3,Delhi,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.02,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,Lisbon,0.02,0.0,0.0,0.04,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.02,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,London,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.04,...,0.0,0.0,0.02,0.0,0.02,0.0,0.0,0.0,0.0,0.0
6,Los Angeles,0.0,0.04,0.0,0.04,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.02,0.0,0.0,0.0,0.0,0.0,0.0
7,Melbourne,0.0,0.0,0.0,0.08,0.04,0.0,0.0,0.0,0.04,...,0.0,0.02,0.02,0.02,0.0,0.0,0.0,0.0,0.0,0.0
8,Mexico City,0.02,0.02,0.0,0.02,0.0,0.0,0.0,0.0,0.1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,Moscow,0.0,0.02,0.0,0.02,0.0,0.0,0.02,0.0,0.0,...,0.0,0.02,0.0,0.0,0.0,0.0,0.02,0.0,0.0,0.0


In [180]:
#Number of rankings
venues = 5

# create columns according to number of top venues
columns = ["City", "1", "2", "3", "4", "5"]

# create a new dataframe
top_cuisines = pd.DataFrame(columns = columns)
top_cuisines['City'] = city_cuisines['City']

for i in range(0, len(top_cuisines)):
    row = city_cuisines.iloc[i, :]
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    top_cuisines.iloc[i, 1:] = row_categories_sorted.index.values[0:venues]

Unnamed: 0,City,1,2,3,4,5
0,Barcelona,Restaurant,Spanish,Mediterranean,Chinese,Seafood
1,Berlin,Restaurant,Breakfast,German,French,Turkish
2,Cairo,Restaurant,Middle Eastern,Falafel,Yemeni Restaurant,Kebab
3,Delhi,Indian,Restaurant,Chinese,North Indian,Diner
4,Lisbon,Portuguese,Restaurant,Indian,Asian,Himalayan
5,London,Restaurant,Chinese,English,Indian,Italian
6,Los Angeles,Mexican,Chinese,Food,Japanese,American
7,Melbourne,Chinese,Korean,Indian,Asian,Restaurant
8,Mexico City,Restaurant,Mexican,Bar,Buffet,Chinese
9,Moscow,Restaurant,Russian,Seafood,Nightclub,Italian


In [181]:
from sklearn.cluster import KMeans

#Get clusters
k = 5
kmeans = KMeans(n_clusters = k, random_state=0).fit(city_cuisines.drop("City" , 1))

#Insert Clusters into our ranked cuisine data
top_cuisines.insert(0, 'Cluster Labels', kmeans.labels_)

In [182]:
top_cuisines

Unnamed: 0,Cluster Labels,City,1,2,3,4,5
0,2,Barcelona,Restaurant,Spanish,Mediterranean,Chinese,Seafood
1,2,Berlin,Restaurant,Breakfast,German,French,Turkish
2,2,Cairo,Restaurant,Middle Eastern,Falafel,Yemeni Restaurant,Kebab
3,4,Delhi,Indian,Restaurant,Chinese,North Indian,Diner
4,2,Lisbon,Portuguese,Restaurant,Indian,Asian,Himalayan
5,2,London,Restaurant,Chinese,English,Indian,Italian
6,1,Los Angeles,Mexican,Chinese,Food,Japanese,American
7,1,Melbourne,Chinese,Korean,Indian,Asian,Restaurant
8,2,Mexico City,Restaurant,Mexican,Bar,Buffet,Chinese
9,2,Moscow,Restaurant,Russian,Seafood,Nightclub,Italian


In [188]:
import folium
import matplotlib.cm as cm
import matplotlib.colors as colors

world_map = folium.Map(location=[30, 30], zoom_start = 1.5)

# set color scheme for the clusters
x = np.arange(k)
ys = [i + x + (i*x)**2 for i in range(k)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(data['Lat'], data['Long'], top_cuisines["City"], top_cuisines["Cluster Labels"]):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(world_map)
       
world_map