#  Introduction/Business Problem

A person that has spent the last 5 years is Seoul, Korea decides to relocate to the US. They loved living in Seoul and want to figure out which city in the US will be more similar to Seoul. They narrowed it done to three options New York, Los Angeles and Chicago. They hired a data scientist (us) to help them decide which of those city will offer a smilar experience to living in Seoul. Specifically they are interested on how the type of venues compare in each city, their numbers what are the top 5 more popular venues.

# Data

I will gather the venue data for the four cities Seoul, New York, Los Angeles and Chicago from Foursquare. Then I will get the all the venues for all 4 cities, I will group them by category and compare how the top 5 category venues across teh cities compare in type and number.

# Workflow

### Imports

In [1]:
import numpy as np # library to handle data in a vectorized manner

import pandas as pd # library for data analsysis
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

# !conda install -c conda-forge geopy --yes 
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

import json # library to handle JSON files
import requests # library to handle requests
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans

# !conda install -c conda-forge folium=0.5.0 --yes 
import folium # map rendering library

### Retrieve the Data

In [2]:
# Get the location data
cities = ["Thessaloniki" , "New York City", "Los Angeles", "Chicago"]
geolocator = Nominatim(user_agent="ny_explorer")
latitude = []
longitude = []
for i in cities:
    location = geolocator.geocode(i)
    latitude.append(location.latitude)
    longitude.append(location.longitude)
df = pd.DataFrame({'Cities':cities,'Latitude':latitude, 'Longitude':longitude})
df.head()

Unnamed: 0,Cities,Latitude,Longitude
0,Thessaloniki,40.640317,22.935272
1,New York City,40.712728,-74.006015
2,Los Angeles,34.053691,-118.242767
3,Chicago,41.875562,-87.624421


In [3]:
# Define Foursquare Credentials and Version
CLIENT_ID = 'RX2OMW2CT0HUGIRTUUMCEWXZS1T2NKANELQURZILTXCH2AX2' # your Foursquare ID
CLIENT_SECRET = 'EIXCEB3II0YKAZ1C1DZF5T4YSFXVMPCCG1LFOHY3Z1GTRXOP' # your Foursquare Secret
VERSION = '20180605' # Foursquare API version

In [5]:
# Get venue ddata for all the cities
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Postal Code', 
                  'Latitude', 
                  'Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

LIMIT = 100 # Limit of number of venues returned by Foursquare API
radius = 15   # number of kilometers to expand, limited by Foursquare API limit
venues = getNearbyVenues(names=df.Cities, latitudes=df.Latitude, longitudes=df.Longitude, radius=radius*1000)
venues.shape
venues.head()

Thessaloniki


KeyError: 'groups'

In [None]:
# One hot encodding
venues_onehot = pd.get_dummies(venues[['Venue Category']], prefix="", prefix_sep="")
# add neighborhood column back to dataframe
venues_onehot["Postal Code"] = venues["Postal Code"]
# move neighborhood column to the first column
fixed_columns = [venues_onehot.columns[-1]] + list(venues_onehot.columns[:-1])
venues_onehot = venues_onehot[fixed_columns]
venues_onehot

In [None]:
venues_grouped = venues_onehot.groupby('Postal Code').mean().reset_index()
venues_grouped

In [None]:
# Each  neighborhood with top 10 most common venues

num_top_venues = 10

for city in venues_grouped['Postal Code']:
    print("----"+city+"----")
    temp = venues_grouped[venues_grouped['Postal Code'] == city].T.reset_index()
    temp.columns = ['venue','freq']
    temp = temp.iloc[1:]
    temp['freq'] = temp['freq'].astype(float)
    temp = temp.round({'freq': 2})
    print(temp.sort_values('freq', ascending=False).reset_index(drop=True).head(num_top_venues))
    print('\n')

In [None]:
#  Run clustering 
# set number of clusters
kclusters = 1

venues_grouped_clustering = venues_grouped.drop('Postal Code', 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(venues_grouped_clustering)


# neighborhoods_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

# toronto_merged = neighborhoods

# # merge toronto_grouped with toronto_data to add latitude/longitude for each neighborhood
# toronto_merged = toronto_merged.join(neighborhoods_venues_sorted.set_index('Postal Code'), on='Postal Code')

# toronto_merged.head() # check the last columns!