# Battle of the Neighborhoods
## Applied Data Science Capstone Project

This notebook contains the work done for the final assignment of the Capstone Project. 

## Part 0: Import packages

In [1]:
#%%capture

import os
import warnings
import zipfile

import numpy as np
import pandas as pd

# Packages to get relevant information
from bs4 import BeautifulSoup
import requests

# Visualization on a map
import folium 

# Clustering
from sklearn.preprocessing import RobustScaler
from sklearn.cluster import KMeans

## Part 1: Obtain information through webscraping and Foursquare

Input all the countries and capitals given by the client. 

In [2]:
# Countries and capitals supplied by the client, kept in the order given.
citiesData = [
    {'Country': country, 'Capital': capital}
    for country, capital in (
        ('Austria',        'Vienna'),
        ('Belgium',        'Brussels'),
        ('Czech Republic', 'Prague'),
        ('Denmark',        'Copenhagen'),
        ('Finland',        'Helsinki'),
        ('France',         'Paris'),
        ('Germany',        'Berlin'),
        ('Italy',          'Rome'),
        ('Ireland',        'Dublin'),
        ('Netherlands',    'Amsterdam'),
        ('Norway',         'Oslo'),
        ('Poland',         'Warsaw'),
        ('Portugal',       'Lisbon'),
        ('Spain',          'Madrid'),
        ('Sweden',         'Stockholm'),
        ('Switzerland',    'Bern'),
        ('United Kingdom', 'London'),
    )
]

# One row per capital with the columns 'Country' and 'Capital'
cities = pd.DataFrame(citiesData)

### Obtain latitude and longitude for all cities.

In [3]:
# Zip archive with the simplemaps world-cities table (city, country, lat, lng, ...)
url_latitudeLongitude = 'https://simplemaps.com/static/data/world-cities/basic/simplemaps_worldcities_basicv1.73.zip'
# Local file the downloaded archive is written to
pathZip = './worldCities.zip'
# Directory the archive is extracted into and the csv is read from
pathFiles = '.'

def download_url(url, save_path, chunk_size=128, timeout=30):
    """Stream the resource at `url` into the file `save_path`.

    Parameters
    ----------
    url : str
        Address of the resource to download.
    save_path : str
        Path of the local file to (over)write with the downloaded bytes.
    chunk_size : int, optional
        Number of bytes requested per chunk while streaming.
    timeout : float, optional
        Seconds to wait for the server before giving up.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status. The check happens before
        the target file is opened, so an error page is never saved as data.
    """
    # The context manager releases the connection even when writing fails
    with requests.get(url, stream=True, timeout=timeout) as response:
        response.raise_for_status()
        with open(save_path, 'wb') as fd:
            for chunk in response.iter_content(chunk_size=chunk_size):
                fd.write(chunk)

# Fetch the archive and unpack all of its members into the working directory
download_url(url_latitudeLongitude, pathZip)

with zipfile.ZipFile(pathZip, mode='r') as archive:
    archive.extractall(path=pathFiles)

# Load the extracted city table
worldCities = pd.read_csv('{}/worldcities.csv'.format(pathFiles))

In [4]:
# Countries that are listed under a different name in the world-cities table
countryAliases = {'Czech Republic': 'Czechia'}

# Initialize variables for latitude and longitude (NaN until a value is found)
latitude  = np.full(cities.shape[0], np.nan)
longitude = np.full(cities.shape[0], np.nan)

# Loop over all capitals. `enumerate` provides the array position, so the
# lookup does not depend on the dataframe having a 0..n-1 index.
for position, (capital, country) in enumerate(zip(cities['Capital'], cities['Country'])):
    country = countryAliases.get(country, country)

    isCapital = (worldCities['country'] == country) & (worldCities['city'] == capital)
    matches = worldCities.loc[isCapital, ['lat', 'lng']]

    # Zero or several matches cannot be stored in a single array slot;
    # fail with a message that names the offending city.
    if len(matches) != 1:
        raise ValueError('Expected exactly one world-cities entry for {}, {} but found {}'.format(
            capital, country, len(matches)))

    # Extract plain scalars instead of assigning a one-element Series
    latitude[position]  = matches['lat'].iloc[0]
    longitude[position] = matches['lng'].iloc[0]

# Add latitude/longitude to cities dataframe
cities['Latitude']  = latitude
cities['Longitude'] = longitude

cities

Unnamed: 0,Country,Capital,Latitude,Longitude
0,Austria,Vienna,48.2083,16.3731
1,Belgium,Brussels,50.8467,4.3517
2,Czech Republic,Prague,50.0833,14.4167
3,Denmark,Copenhagen,55.6786,12.5635
4,Finland,Helsinki,60.1756,24.9342
5,France,Paris,48.8566,2.3522
6,Germany,Berlin,52.5167,13.3833
7,Italy,Rome,41.8931,12.4828
8,Ireland,Dublin,53.3425,-6.2658
9,Netherlands,Amsterdam,52.35,4.9166


Plot this on a map.

In [5]:
# Create a map centered on the mean latitude and longitude of the capitals
latitude_mean  = cities['Latitude'].mean()
longitude_mean = cities['Longitude'].mean()

# NOTE(review): newer folium releases may no longer provide the
# 'Stamen Watercolor' tiles by name - confirm against the installed version.
mapEurope = folium.Map(location=[latitude_mean, longitude_mean], 
                       tiles='Stamen Watercolor', 
                       zoom_start=4)

# Add a marker to the map for each capital. The loop names are deliberately
# different from the `latitude`/`longitude` arrays so those are not overwritten.
for cityLatitude, cityLongitude, capitalName in zip(cities['Latitude'], cities['Longitude'], cities['Capital']):
    folium.CircleMarker(
        [cityLatitude, cityLongitude],
        radius=5,
        # parse_html belongs to the popup: it makes the name render as text
        popup=folium.Popup(capitalName, parse_html=True),
        color='#0C855A',
        fill=True,
        fill_color='#17FFAE',
        fill_opacity=0.7).add_to(mapEurope)

# Display map
mapEurope

### Proportion of vegetarians/vegans per country
Obtain the percentage of the population who are vegetarian and vegan from the Wikipedia page on ["Vegetarianism by Country"](https://en.wikipedia.org/wiki/Vegetarianism_by_country). If there is a range of percentages, the upper bound is taken. 

In [6]:
# Get the html; stop early on an HTTP error instead of parsing an error page
url = 'https://en.wikipedia.org/wiki/Vegetarianism_by_country'
response = requests.get(url, timeout=30)
response.raise_for_status()
html = response.text

# Turn into a beautiful soup
soup = BeautifulSoup(html, 'html5lib')

# Find all html tables
tables = soup.find_all('table')

# Find the correct table index: the last table whose markup mentions the
# "Vegetarians (" column header
matchingIndices = [index for index, candidate in enumerate(tables)
                   if "Vegetarians (" in str(candidate)]
if not matchingIndices:
    # Without this check a missing table would surface as a NameError, or
    # silently reuse a stale `tableIndex` from an earlier run
    raise LookupError('No table containing "Vegetarians (" was found at {}'.format(url))

tableIndex = matchingIndices[-1]
table = tables[tableIndex]


In [7]:
tableContents = []

# Find all `tr` tags
dataRows = table.find_all('tr')
for row in dataRows:
    data = row.find_all('td')
    value = [element.text.strip() for element in data]
    # Skip rows without any `td` cells (e.g. header rows)
    if len(value) == 0:
        continue
        
    cell = {}
    cell['Country'] = value[0]
    # A range such as "5% – 8%" is reduced to its upper bound
    if "% –" in str(value[1]):
        cell['VegetariansPercentage'] = float(value[1].split('%')[-2].split(' – ')[1])
    else:
        cell['VegetariansPercentage'] = float(value[1].split('%')[0])
    
    # The vegan share is read from the fifth cell and may be empty
    if str(value[4]) == '':
        cell['VegansPercentage'] = np.nan
    else:
        cell['VegansPercentage'] = float(value[4].split('%')[0])
    tableContents.append(cell)
    
vegetarians = pd.DataFrame(tableContents,
                           columns=['Country', 'VegetariansPercentage', 'VegansPercentage'])

# Left-join the percentages onto the cities. Columns left over from an earlier
# run of this cell are dropped first so that the cell can be re-executed.
cities = (cities
          .drop(columns=['VegetariansPercentage', 'VegansPercentage'], errors='ignore')
          .set_index('Country')
          .join(vegetarians.set_index('Country'))
          .reset_index())

# Estimate the vegan share for Austria (Vienna): scale Austria's vegetarian
# share by the ratio of the mean vegan share to the mean vegetarian share of
# all other countries.
isAustria = cities['Country'] == 'Austria'
meanVegetarians = cities.loc[~isAustria, 'VegetariansPercentage'].mean()
meanVegans      = cities.loc[~isAustria, 'VegansPercentage'].mean()
cities.loc[isAustria, 'VegansPercentage'] = (
    cities.loc[isAustria, 'VegetariansPercentage'] / meanVegetarians * meanVegans)
cities


Unnamed: 0,Country,Capital,Latitude,Longitude,VegetariansPercentage,VegansPercentage
0,Austria,Vienna,48.2083,16.3731,10.0,2.864782
1,Belgium,Brussels,50.8467,4.3517,7.0,1.0
2,Czech Republic,Prague,50.0833,14.4167,5.0,1.0
3,Denmark,Copenhagen,55.6786,12.5635,10.0,4.0
4,Finland,Helsinki,60.1756,24.9342,11.0,2.0
5,France,Paris,48.8566,2.3522,5.2,1.1
6,Germany,Berlin,52.5167,13.3833,12.0,2.0
7,Italy,Rome,41.8931,12.4828,8.9,2.2
8,Ireland,Dublin,53.3425,-6.2658,8.4,2.0
9,Netherlands,Amsterdam,52.35,4.9166,5.0,1.0


### Number of existing vegetarian/vegan restaurants in city center
Obtain venue information for the city center of each capital. In this report, the city center is defined as a circle with a radius of 2 km. The venue information is obtained using Foursquare.

In [8]:
# Foursquare credentials are read from the environment so that no secret is
# stored in the notebook. The keys that used to be written here in plain text
# must be considered exposed and should be revoked.
CLIENT_ID     = os.environ.get('FOURSQUARE_CLIENT_ID', '')
CLIENT_SECRET = os.environ.get('FOURSQUARE_CLIENT_SECRET', '')
ACCESS_TOKEN  = os.environ.get('FOURSQUARE_ACCESS_TOKEN', '')
# Sent as the `v` parameter of every Foursquare request
VERSION       = '20210701' 

if not (CLIENT_ID and CLIENT_SECRET):
    warnings.warn('FOURSQUARE_CLIENT_ID and/or FOURSQUARE_CLIENT_SECRET are not set; '
                  'Foursquare requests will fail until they are exported.')

In [9]:
def getCityVegaRestaurants(capitals, latitudes, longitudes, radius, limit, query='vegan'):
    """Collect Foursquare venues matching `query` around each capital.

    Parameters
    ----------
    capitals, latitudes, longitudes : iterables of equal length
        Name and coordinates of every capital; they are walked in parallel.
    radius : number
        Search radius sent to Foursquare as the `radius` parameter.
    limit : int
        Maximum number of venues requested per capital (`limit` parameter).
    query : str, optional
        Search term sent to Foursquare; defaults to 'vegan'.

    Returns
    -------
    pandas.DataFrame
        One row per venue with the columns 'Capital', 'Capital Latitude',
        'Capital Longitude', 'Venue', 'Venue Latitude', 'Venue Longitude' and
        'Venue Category'. The columns are present even when no venue is found.

    Raises
    ------
    requests.HTTPError
        If Foursquare answers a request with an error status.
    """
    columns = ['Capital', 
               'Capital Latitude', 
               'Capital Longitude', 
               'Venue', 
               'Venue Latitude', 
               'Venue Longitude', 
               'Venue Category']

    rows = []
    for capital, latitude, longitude in zip(capitals, latitudes, longitudes):
        # Let requests build and encode the query string
        params = {
            'client_id': CLIENT_ID,
            'client_secret': CLIENT_SECRET,
            'v': VERSION,
            'll': '{},{}'.format(latitude, longitude),
            'radius': radius,
            'limit': limit,
            'query': query,
        }

        # Make the GET request
        response = requests.get('https://api.foursquare.com/v2/venues/explore',
                                params=params, timeout=30)
        response.raise_for_status()

        # A response without groups is treated as "no venues for this capital"
        groups = response.json()['response'].get('groups') or []
        items = groups[0]['items'] if groups else []

        # Save the venue's name, location and category
        for item in items:
            venue = item['venue']
            # Venues without a category get None instead of raising IndexError
            categories = venue.get('categories') or []
            rows.append((
                capital,
                latitude,
                longitude,
                venue['name'],
                venue['location']['lat'],
                venue['location']['lng'],
                categories[0]['name'] if categories else None))

    # Passing the columns to the constructor keeps the frame well-formed when
    # `rows` is empty (assigning them afterwards raises a length mismatch)
    return pd.DataFrame(rows, columns=columns)

# change radius for final run!

In [10]:
# Search settings passed to Foursquare for every capital.
# NOTE(review): the text above defines the city center as a 2 km radius while
# 1500 is used here - confirm the value for the final run.
radius, limit = 1500, 100

venues = getCityVegaRestaurants(
    capitals=cities['Capital'],
    latitudes=cities['Latitude'],
    longitudes=cities['Longitude'],
    radius=radius,
    limit=limit,
)

# Preview the first five venues
venues.head(5)

Unnamed: 0,Capital,Capital Latitude,Capital Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,Vienna,48.2083,16.3731,Veggiezz,48.213091,16.372378,Vegetarian / Vegan Restaurant
1,Vienna,48.2083,16.3731,Makro1,48.21035,16.376971,Vegetarian / Vegan Restaurant
2,Vienna,48.2083,16.3731,Swing Kitchen,48.198613,16.366252,Vegetarian / Vegan Restaurant
3,Vienna,48.2083,16.3731,Maschu Maschu,48.211896,16.375328,Vegetarian / Vegan Restaurant
4,Vienna,48.2083,16.3731,Harvest,48.215718,16.379926,Vegetarian / Vegan Restaurant


How many venues is that for each capital? 

In [11]:
# Number of venues returned per capital
restaurantCount = venues.groupby('Capital')[['Venue']].count()

# Attach the counts to the cities. A column left over from an earlier run is
# dropped first so the cell can be re-executed, and capitals for which no
# venue was returned get 0 instead of NaN.
cities = (cities
          .drop(columns=['NumberOfVegaRestaurants'], errors='ignore')
          .set_index('Capital')
          .join(restaurantCount.rename(columns={'Venue': 'NumberOfVegaRestaurants'}))
          .fillna({'NumberOfVegaRestaurants': 0})
          .astype({'NumberOfVegaRestaurants': int})
          .reset_index())
cities

Unnamed: 0,Capital,Country,Latitude,Longitude,VegetariansPercentage,VegansPercentage,NumberOfVegaRestaurants
0,Vienna,Austria,48.2083,16.3731,10.0,2.864782,61
1,Brussels,Belgium,50.8467,4.3517,7.0,1.0,55
2,Prague,Czech Republic,50.0833,14.4167,5.0,1.0,70
3,Copenhagen,Denmark,55.6786,12.5635,10.0,4.0,71
4,Helsinki,Finland,60.1756,24.9342,11.0,2.0,95
5,Paris,France,48.8566,2.3522,5.2,1.1,67
6,Berlin,Germany,52.5167,13.3833,12.0,2.0,57
7,Rome,Italy,41.8931,12.4828,8.9,2.2,36
8,Dublin,Ireland,53.3425,-6.2658,8.4,2.0,47
9,Amsterdam,Netherlands,52.35,4.9166,5.0,1.0,23


In [12]:

# The 'Venue' column was already renamed to 'NumberOfVegaRestaurants' in the
# previous cell, so repeating the rename here would be a no-op; only display
# the table.
cities

Unnamed: 0,Capital,Country,Latitude,Longitude,VegetariansPercentage,VegansPercentage,NumberOfVegaRestaurants
0,Vienna,Austria,48.2083,16.3731,10.0,2.864782,61
1,Brussels,Belgium,50.8467,4.3517,7.0,1.0,55
2,Prague,Czech Republic,50.0833,14.4167,5.0,1.0,70
3,Copenhagen,Denmark,55.6786,12.5635,10.0,4.0,71
4,Helsinki,Finland,60.1756,24.9342,11.0,2.0,95
5,Paris,France,48.8566,2.3522,5.2,1.1,67
6,Berlin,Germany,52.5167,13.3833,12.0,2.0,57
7,Rome,Italy,41.8931,12.4828,8.9,2.2,36
8,Dublin,Ireland,53.3425,-6.2658,8.4,2.0,47
9,Amsterdam,Netherlands,52.35,4.9166,5.0,1.0,23


### Disposable income
Obtain the **median** disposable income per person for each country (after taxes and transfers), as given on the Wikipedia page ["Disposable household and per capita income"](https://en.wikipedia.org/wiki/Disposable_household_and_per_capita_income). 

In [13]:
# Get the html; stop early on an HTTP error instead of parsing an error page
url = 'https://en.wikipedia.org/wiki/Disposable_household_and_per_capita_income'
response = requests.get(url, timeout=30)
response.raise_for_status()
html = response.text

# Turn into a beautiful soup
soup = BeautifulSoup(html, 'html5lib')

# Find all html tables
tables = soup.find_all('table')

# Find the correct table index: the last table whose markup mentions
# "2016 median household"
matchingIndices = [index for index, candidate in enumerate(tables)
                   if "2016 median household" in str(candidate)]
if not matchingIndices:
    # Without this check a missing table would silently reuse the
    # `tableIndex` found for a different page earlier in the notebook
    raise LookupError('No table containing "2016 median household" was found at {}'.format(url))

tableIndex = matchingIndices[-1]
table = tables[tableIndex]

In [14]:
tableContents = []

# Find all `tr` tags
dataRows = table.find_all('tr')
for row in dataRows:
    value = [element.text.strip() for element in row.find_all('td')]
    
    # Skip rows without any `td` cells (e.g. header rows)
    if len(value) == 0:
        continue
    
    cell = {}
    cell['Country'] = value[1]
    # Keep the text after the '$', remove thousands separators and cut off a
    # trailing " (20..)" note when there is one. Splitting the cell text
    # itself covers both cases, so no look at the row's HTML is needed.
    amount = value[2].split('$')[1].replace(',', '').split(' (20')[0]
    cell['DisposableIncome'] = float(amount)
        
    tableContents.append(cell)
    
disposableIncome = pd.DataFrame(tableContents, columns=['Country', 'DisposableIncome'])

# Left-join the income onto the cities. A column left over from an earlier run
# of this cell is dropped first so that the cell can be re-executed.
cities = (cities
          .drop(columns=['DisposableIncome'], errors='ignore')
          .set_index('Country')
          .join(disposableIncome.set_index('Country'))
          .reset_index())

cities

Unnamed: 0,Country,Capital,Latitude,Longitude,VegetariansPercentage,VegansPercentage,NumberOfVegaRestaurants,DisposableIncome
0,Austria,Vienna,48.2083,16.3731,10.0,2.864782,61,32496.0
1,Belgium,Brussels,50.8467,4.3517,7.0,1.0,55,29361.0
2,Czech Republic,Prague,50.0833,14.4167,5.0,1.0,70,17984.0
3,Denmark,Copenhagen,55.6786,12.5635,10.0,4.0,71,28926.0
4,Finland,Helsinki,60.1756,24.9342,11.0,2.0,95,26774.0
5,France,Paris,48.8566,2.3522,5.2,1.1,67,25865.0
6,Germany,Berlin,52.5167,13.3833,12.0,2.0,57,27569.0
7,Italy,Rome,41.8931,12.4828,8.9,2.2,36,23023.0
8,Ireland,Dublin,53.3425,-6.2658,8.4,2.0,47,25933.0
9,Netherlands,Amsterdam,52.35,4.9166,5.0,1.0,23,29571.0


## Part 2: Cluster cities
Cities will be clustered based on the percentage of vegetarians/vegans, current number of vegetarian restaurants and the median disposable income per household. Clustering will be performed using the k-means algorithm. 

In [15]:
numberOfClusters = 5

# Get the relevant features
featureColumns = ['VegetariansPercentage', 'VegansPercentage',
                  'NumberOfVegaRestaurants', 'DisposableIncome']
features = cities[featureColumns]

# Scale the features
features = RobustScaler().fit_transform(features)

# Run k-means clustering. n_init is set explicitly because its default value
# differs between scikit-learn versions, which would change the result.
clusters = KMeans(n_clusters=numberOfClusters, n_init=10, random_state=0).fit(features)

# Add to the dataframe as first column. A 'Cluster' column from an earlier run
# is removed first, because `insert` refuses to add an existing column.
cities = cities.drop(columns=['Cluster'], errors='ignore')
cities.insert(0, 'Cluster', clusters.labels_)

Display these clusters on the map.

In [16]:
# NOTE(review): newer folium releases may no longer provide the
# 'Stamen Watercolor' tiles by name - confirm against the installed version.
mapEuropeClustered = folium.Map(location=[latitude_mean, longitude_mean], 
                                tiles='Stamen Watercolor', 
                                zoom_start=4)

# One colour per cluster label; the list is indexed by the label
colors = ['purple', 'blue', 'cyan', 'yellow', 'orange', 'red']

# Add a marker to the map for each capital, coloured by its cluster
for cityLatitude, cityLongitude, capitalName, cluster in zip(cities['Latitude'], cities['Longitude'], cities['Capital'], cities['Cluster']):
    # The popup shows the capital together with its cluster number
    labelText = capitalName + ', cluster ' + str(cluster)
    folium.CircleMarker(
        [cityLatitude, cityLongitude],
        radius=5,
        popup=folium.Popup(labelText, parse_html=True),
        color=colors[cluster],
        fill=True,
        fill_color=colors[cluster],
        fill_opacity=0.7).add_to(mapEuropeClustered)

# Display map
mapEuropeClustered

Examine the obtained clusters.

In [17]:
# Rows of the cities table that were assigned to cluster 0
cities.query('Cluster == 0')

Unnamed: 0,Cluster,Country,Capital,Latitude,Longitude,VegetariansPercentage,VegansPercentage,NumberOfVegaRestaurants,DisposableIncome
2,0,Czech Republic,Prague,50.0833,14.4167,5.0,1.0,70,17984.0
5,0,France,Paris,48.8566,2.3522,5.2,1.1,67,25865.0
12,0,Portugal,Lisbon,38.7452,-9.1604,1.2,0.6,11,15403.0
13,0,Spain,Madrid,40.4189,-3.6919,1.5,0.2,62,21788.0


Cluster 0 contains the cities with low numbers of vegetarians and vegans, as well as relatively low disposable income. Surprisingly, the number of vegetarian/vegan restaurants is medium-to-high for most cities in this cluster.

In [18]:
# Rows of the cities table that were assigned to cluster 1
cities.query('Cluster == 1')

Unnamed: 0,Cluster,Country,Capital,Latitude,Longitude,VegetariansPercentage,VegansPercentage,NumberOfVegaRestaurants,DisposableIncome
0,1,Austria,Vienna,48.2083,16.3731,10.0,2.864782,61,32496.0
1,1,Belgium,Brussels,50.8467,4.3517,7.0,1.0,55,29361.0
3,1,Denmark,Copenhagen,55.6786,12.5635,10.0,4.0,71,28926.0
4,1,Finland,Helsinki,60.1756,24.9342,11.0,2.0,95,26774.0
6,1,Germany,Berlin,52.5167,13.3833,12.0,2.0,57,27569.0
7,1,Italy,Rome,41.8931,12.4828,8.9,2.2,36,23023.0
8,1,Ireland,Dublin,53.3425,-6.2658,8.4,2.0,47,25933.0
14,1,Sweden,Stockholm,59.3294,18.0686,12.0,4.0,65,29765.0


Cluster 1 contains the cities with a high percentage of vegetarians and vegans, a medium-to-high number of current vegetarian restaurants and a medium-to-high disposable income. 

In [19]:
# Rows of the cities table that were assigned to cluster 2
cities.query('Cluster == 2')

Unnamed: 0,Cluster,Country,Capital,Latitude,Longitude,VegetariansPercentage,VegansPercentage,NumberOfVegaRestaurants,DisposableIncome
16,2,United Kingdom,London,51.5072,-0.1275,21.3,4.4,100,22603.0


Cluster 2 contains the city with a very high number of vegetarians and vegans, a current number of vegetarian/vegan restaurants that is at the limit of what Foursquare can offer (so it is likely even higher!) and a medium disposable income.

In [20]:
# Rows of the cities table that were assigned to cluster 3
cities.query('Cluster == 3')

Unnamed: 0,Cluster,Country,Capital,Latitude,Longitude,VegetariansPercentage,VegansPercentage,NumberOfVegaRestaurants,DisposableIncome
9,3,Netherlands,Amsterdam,52.35,4.9166,5.0,1.0,23,29571.0
10,3,Norway,Oslo,59.9111,10.7528,9.0,4.0,19,35542.0
15,3,Switzerland,Bern,46.948,7.4474,5.0,1.0,9,37749.0


Cluster 3 contains the cities with a medium number of vegetarians and vegans, a low number of vegetarian restaurants and a high disposable income. 