In [None]:
import json
import os
import warnings

# Silence warnings before the third-party imports so their import-time warnings are suppressed too
warnings.filterwarnings('ignore')

import geojson
import geojsonio
import geopandas as gpd
import matplotlib.pyplot as plt
import networkx as nx
import numpy as np
import osmapi as osm
import osmnx as ox
import pandana
import pandas as pd
import requests
import shapely
import shapely.wkt
from descartes import PolygonPatch
from pandana.loaders import osm  # NOTE: rebinds `osm`, shadowing the `osmapi` alias imported above
from shapely.geometry import MultiPoint, Point, Polygon

In [None]:
# Load the Airbnb listings; later cells read its `id`, `latitude`, `longitude` and `air_nodes_true` columns
airbnb = pd.read_csv("airbnb3.csv")

In [None]:
# Define Foursquare Credentials
# The credentials are read from the environment so that they are never stored in the notebook.
# Set FOURSQUARE_CLIENT_ID / FOURSQUARE_CLIENT_SECRET before starting the kernel.
CLIENT_ID = os.environ.get("FOURSQUARE_CLIENT_ID")          # client ID
CLIENT_SECRET = os.environ.get("FOURSQUARE_CLIENT_SECRET")  # client secret
if not CLIENT_ID or not CLIENT_SECRET:
    # print instead of warnings.warn: warnings are globally silenced in the import cell
    print("Foursquare credentials are not set: define FOURSQUARE_CLIENT_ID and FOURSQUARE_CLIENT_SECRET")

VERSION = '20190629'  # value sent as the `v` parameter of every request
LIMIT = 5             # default number of venues requested per listing

In [None]:
# Function inspired by Graciela Carrillo

def getNearbyVenues(names, latitudes, longitudes, radius, categoryId, limit=None):
    """Query the Foursquare ``venues/explore`` endpoint around every listing.

    Parameters
    ----------
    names, latitudes, longitudes : iterables of equal length
        Listing identifier and coordinates; they are iterated in parallel.
    radius : search radius sent as the ``radius`` request parameter.
    categoryId : str
        One Foursquare category id, or several separated by commas.
    limit : int, optional
        Maximum number of venues requested per listing. Defaults to the
        module-level ``LIMIT`` at call time.

    Returns
    -------
    pandas.DataFrame
        One row per (listing, venue) with the columns ``Id``, ``Room_Latitude``,
        ``Room_Longitude``, ``Venue``, ``Venue_Latitude``, ``Venue_Longitude``,
        ``distance`` and ``Venue_Category``. The columns are present even when
        no venue was found, so callers can always select them.

    Raises
    ------
    requests.HTTPError
        If the API answers with an error status.
    """
    columns = ['Id',
               'Room_Latitude',
               'Room_Longitude',
               'Venue',
               'Venue_Latitude',
               'Venue_Longitude',
               'distance',
               'Venue_Category']
    if limit is None:
        limit = LIMIT

    rows = []
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name, lat, lng)

        # Let requests build and escape the query string
        params = {
            'client_id': CLIENT_ID,
            'client_secret': CLIENT_SECRET,
            'v': VERSION,
            'll': '{},{}'.format(lat, lng),
            'radius': radius,
            'limit': limit,
            'categoryId': categoryId,
            'sortByDistance': 1,  # results are sorted by distance from `ll`
        }

        # make the GET request; fail loudly on HTTP errors instead of on a later KeyError
        response = requests.get('https://api.foursquare.com/v2/venues/explore',
                                params=params, timeout=30)
        response.raise_for_status()
        results = response.json()["response"]["groups"][0]["items"]

        # keep only the relevant information for each nearby venue
        for item in results:
            venue = item["venue"]
            location = venue["location"]
            categories = venue.get("categories") or []
            rows.append((
                name,
                lat,
                lng,
                venue["name"],
                location["lat"],
                location["lng"],
                location["distance"],
                # some venues carry no category at all
                categories[0]["name"] if categories else None))

    # Passing `columns` here (instead of assigning them afterwards) also works for zero rows
    nearby_venues = pd.DataFrame(rows, columns=columns)

    return(nearby_venues)

Venues we want to consider: 
#### Grocery store/Supermarket
Codes: 4bf58dd8d48988d118951735, 52f2ab2ebcbc57f1066b8b46
#### Restaurant - Food category (all different types of restaurants)
Code: 4d4b7105d754a06374d81259
#### Parks
Code: 4bf58dd8d48988d163941735


Source: https://developer.foursquare.com/docs/build-with-foursquare/categories

# Restaurants

In [None]:
%%time
# Define the range for smoother and faster retrieving of data
# Run this code several times to get all the data
# airbnb1 = airbnb[a:b]
i = 0
restaurants1 = pd.DataFrame()
for k in range(10,len(airbnb1),10):
    dataset = pd.DataFrame(airbnb1[i:k])
    restaurants = getNearbyVenues(names=dataset['id'],
                                       latitudes=dataset['latitude'],
                                       longitudes=dataset['longitude'],
                                     radius = 500,
                                    categoryId = "4d4b7105d754a06374d81259" # food 
                                      )
    restaurants1 = restaurants1.append(restaurants)
    i = i+10   
    
restaurants_2_1 = restaurants_2_1.append(restaurants1)

In [None]:
# Checking if all listings are in the restaurant dataset
# (`airbnb` is used here; the name `df` is not defined anywhere in this notebook)
missing_restaurant = ~airbnb.id.isin(restaurants_2_1.Id)
listis_x = list(airbnb.id[missing_restaurant])
for x in listis_x:
    print(x)

# Rows of the listings without any restaurant yet. A boolean selection keeps the
# airbnb columns even when nothing is missing, and avoids the quadratic append loop.
edge_listings = airbnb[missing_restaurant]

In [None]:
# As no restaurants are within 500 m of the edge listings, we consider a radius of 1000 m
airbnb1 = edge_listings
restaurants = getNearbyVenues(names=airbnb1['id'],
                              latitudes=airbnb1['latitude'],
                              longitudes=airbnb1['longitude'],
                              radius=1000,
                              categoryId="4d4b7105d754a06374d81259"  # food
                              )
# Independent copy of this run's result (previously built with the removed DataFrame.append)
restaurants1 = restaurants.copy()

restaurants_2_1 = pd.concat([restaurants_2_1, restaurants1])

In [None]:
# Again, check whether all listings have some restaurants within 1000 m
# (`airbnb` is used here; the name `df` is not defined anywhere in this notebook)
still_missing_restaurant = ~airbnb.id.isin(restaurants_2_1.Id)
listis_y = list(airbnb.id[still_missing_restaurant])
for x in listis_y:
    print(x)

# Rows of the listings that still have no restaurant; the selection keeps the airbnb columns even if empty
edge_listings2 = airbnb[still_missing_restaurant]

In [None]:
# As no restaurants are within 1000 m of some listings, we consider a radius of 1500 m
airbnb1 = edge_listings2

restaurants = getNearbyVenues(names=airbnb1['id'],
                              latitudes=airbnb1['latitude'],
                              longitudes=airbnb1['longitude'],
                              radius=1500,
                              categoryId="4d4b7105d754a06374d81259"  # food
                              )
# Independent copy of this run's result (previously built with the removed DataFrame.append)
restaurants1 = restaurants.copy()

restaurants_2_1 = pd.concat([restaurants_2_1, restaurants1])

In [None]:
# Remove rows that are exact duplicates (e.g. when a fetch cell above was run more than once)
restaurants_2_1 = restaurants_2_1.drop_duplicates()

In [None]:
# Save all collected restaurant venues; the index is written as the first CSV column
restaurants_2_1.to_csv("restaurants_all.csv")

In [None]:
# At this stage we have, for each listing, the 5 nearest RECOMMENDED restaurants by geometric distance
# Note: some listings have fewer restaurants, because the first step only searched within a 500 m radius

In [None]:
# Now, we define the network of streets, convert latitude and longitude to nodes representation, so we can 
# compute the walking distance with NetworkX package

## Walking distance to restaurant

In [None]:
restaurant_clean = pd.read_csv("restaurants_all.csv")

In [None]:
# First we will drop some categories, that does not represent restaurants, this is very subjective
restaurant_clean = restaurant_clean.drop(restaurant_clean[(restaurant_clean.Venue_Category == "Bagel Shop")|
                                                     (restaurant_clean.Venue_Category =="Bakery")|
                                                     (restaurant_clean.Venue_Category == "Buffet")|
                                                     (restaurant_clean.Venue_Category == "Cafeteria")|
                                                     (restaurant_clean.Venue_Category == "Café")|
                                                     (restaurant_clean.Venue_Category == "Donut Shop")|
                                                     (restaurant_clean.Venue_Category == "Fast Food Restaurant")|
                                                     (restaurant_clean.Venue_Category == "Food Truck")|
                                                     (restaurant_clean.Venue_Category == "Food Stand")|
                                                     (restaurant_clean.Venue_Category == "Hot Dog Joint")|
                                                     (restaurant_clean.Venue_Category == "Pet Café")|
                                                     (restaurant_clean.Venue_Category == "Snack Place")].index)

In [None]:
# Listings that have no venue left in `restaurant_clean` after the category filter
no_restaurant = ~airbnb.id.isin(restaurant_clean.Id)
lis = list(airbnb.id[no_restaurant])

print(len(lis))

# Boolean selection replaces the per-id DataFrame.append loop (quadratic, and removed in pandas 2.0)
new_restaurants = airbnb[no_restaurant]

In [None]:
# For these listings (45 of them) we search for restaurants within 1500 m instead of 500 m,
# and raise the limit to 10, otherwise the same results would be retrieved
LIMIT = 10
airbnb1 = new_restaurants
restaurants = getNearbyVenues(names=airbnb1['id'],
                              latitudes=airbnb1['latitude'],
                              longitudes=airbnb1['longitude'],
                              radius=1500,
                              categoryId="4d4b7105d754a06374d81259"  # food
                              )
# Independent copy of the result (previously built with the removed DataFrame.append)
restaurants_1500 = restaurants.copy()

In [None]:
# Again, we drop all the categories that do not represent restaurants
not_restaurant_categories = [
    "Bagel Shop",
    "Bakery",
    "Buffet",
    "Cafeteria",
    "Café",
    "Donut Shop",
    "Fast Food Restaurant",
    "Food Truck",
    "Food Stand",
    "Hot Dog Joint",
    "Pet Café",
    "Snack Place",
]
is_not_restaurant = restaurants_1500.Venue_Category.isin(not_restaurant_categories)
restaurants_1500 = restaurants_1500.drop(restaurants_1500[is_not_restaurant].index)

In [None]:
# Add the 1500 m results; pd.concat replaces DataFrame.append, which was removed in pandas 2.0
restaurant_clean = pd.concat([restaurant_clean, restaurants_1500])

In [None]:
# Listings that still have no restaurant after the 1500 m search (vectorised membership test)
lis2 = list(airbnb.id[~airbnb.id.isin(restaurant_clean.Id)])

len(lis2) # all listings has information about restaurant nearby

In [None]:
%%time
# Define the network of streets in Prague
# prague = osmnx.graph.graph_from_bbox(49.941901,14.224435,50.17743,14.706787)
# Build the walking network for the place query "Prague"; G is the graph used by nearest_node and distance below
G = ox.graph_from_place("Prague", network_type = "walk")
# Plot the projected graph
fig, ax = ox.plot_graph(ox.project_graph(G))
fig.show()

In [None]:
%%time
# Note: ox.get_nearest_node accepts the coordinates in the format (lat, long), which is different from the shapely package (long, lat)
# Apply function nearest_node to find the nearest node to the GPS coordinates of each restaurant

def nearest_node(row):
    """Return the node of graph G nearest to the venue described by `row`.

    Reads `row.Venue_Latitude` and `row.Venue_Longitude` and passes them,
    latitude first, to `ox.get_nearest_node`.
    """
    return ox.get_nearest_node(G, (row.Venue_Latitude, row.Venue_Longitude))

# One lookup per venue row
restaurant_clean['restaurants_nodes1'] = restaurant_clean.apply(nearest_node, axis=1) 

In [None]:
# Map Airbnb nodes from original dataset

# listing id -> value of `air_nodes_true` (used later as the source node of the walking path)
d_airbnb_nodes = dict(zip(airbnb.id, airbnb.air_nodes_true))

# Look up each venue row's listing id to attach the listing's node
restaurant_clean["airbnb_node"] = restaurant_clean["Id"].map(d_airbnb_nodes)

In [None]:
# count the distances
def distance(row):
    """Shortest-path length in G from the listing's node to the restaurant's node.

    Uses `row.airbnb_node` as source and `row.restaurants_nodes1` as target,
    weighting edges by their `length` attribute.
    """
    origin = row.airbnb_node
    destination = row.restaurants_nodes1
    return nx.shortest_path_length(G, source=origin, target=destination, weight='length')

restaurant_clean['distance_to_restaurant'] = restaurant_clean.apply(distance, axis=1)

In [None]:
# Sort, so we get the shortest distance for each listing
restaurant_clean_final = restaurant_clean.sort_values(by = ["Id", "distance_to_restaurant"]).drop_duplicates(subset = ['airbnb_node', "Id"])

In [None]:
# Rename, so we can merge it with original dataset
restaurant_clean_final = restaurant_clean_final.rename(columns={'Id': 'id'})

In [None]:
airbnb = pd.merge(airbnb, restaurant_clean_final, on = "id", how = "outer")

In [None]:
# Drop unnecessary columns: first the "Unnamed" columns, then the venue details
unnamed_columns = ["Unnamed: 0_x", "Unnamed: 0.1", "Unnamed: 0_y"]
to_drop = ["Room_Latitude", "Room_Longitude", "Venue", "Venue_Latitude", "Venue_Longitude", "Venue_Category"]
airbnb = airbnb.drop(columns=unnamed_columns).drop(columns=to_drop)

In [None]:
airbnb = airbnb.rename(columns={'distance': 'geometry_distance_to_restaurant'})
airbnb.loc[airbnb['geometry_distance_to_restaurant'] >= 1000, "geometry_distance_to_restaurant"] = 1000

In [None]:
# We will set the highest distance to 1000m for all listing
airbnb.loc[airbnb['distance_to_restaurant'] >= 1000, "distance_to_restaurant"] = 1000

# Grocery Stores

Grocery store/Supermarket
Codes: 4bf58dd8d48988d118951735, 52f2ab2ebcbc57f1066b8b46

In [None]:
# Accumulators for the supermarket results: `supermarket1` holds the batches of one run,
# `supermarkets_2_1` collects every run and must exist before the fetch cell extends it
supermarket1 = pd.DataFrame()
supermarkets_2_1 = pd.DataFrame()

In [None]:
%%time

# Define the range for smoother and faster retrieving of data
# Run this code several times to get all the data

# airbnb1 = airbnb[a:b]
i = 0
supermarket1 = pd.DataFrame()
for k in range(10,len(airbnb1),10):
    dataset = pd.DataFrame(airbnb1[i:k])
    supermarket = getNearbyVenues(names=dataset['id'],
                                       latitudes=dataset['latitude'],
                                       longitudes=dataset['longitude'],
                                       radius = 500,
                                       categoryId = "4bf58dd8d48988d118951735,52f2ab2ebcbc57f1066b8b46" # grocery store, supermarket 
                                      )
    supermarket1 = supermarket1.append(supermarket)
    i = i+10   
    
supermarkets_2_1 = supermarkets_2_1.append(supermarket1)

In [None]:
%%time
# Apply function nearest_node to find the nearest graph node to the GPS coordinates of each supermarket

supermarkets_2_1['supermarket_nodes1'] = supermarkets_2_1.apply(nearest_node, axis=1) 

In [None]:
# Row counts per venue category, to see which categories were returned
supermarkets_2_1.groupby("Venue_Category").count()

In [None]:
# Save the supermarket venues together with their graph nodes
supermarkets_2_1.to_csv("supermarkets1.csv")

### Look for missing information for each room in supermarket dataset:

In [None]:
supermarkets = pd.read_csv("supermarkets1.csv")

In [None]:
supermarkets = supermarkets.drop_duplicates()

In [None]:
# We dropp all the categories that, do not represent the grocery store or supermarket
supermarkets = supermarkets.drop(supermarkets[(supermarkets.Venue_Category == "Café")|
                                                (supermarkets.Venue_Category =="Clothing Store")|
                                                (supermarkets.Venue_Category == "Italian Restaurant")|
                                                (supermarkets.Venue_Category == "Health Food Store")|
                                                (supermarkets.Venue_Category == "Gourmet Shop")].index)

In [None]:
# Listings without any supermarket in `supermarkets` (vectorised membership test)
no_supermarket = ~airbnb.id.isin(supermarkets.Id)
lis = list(airbnb.id[no_supermarket])

print(len(lis))

# Boolean selection replaces the per-id DataFrame.append loop (quadratic, and removed in pandas 2.0)
supermarkets_need = airbnb[no_supermarket]

In [None]:
# Find supermarkets within 1000 m for listings that do not have a supermarket within 500 m
# We increase the radius to 1000 m
# And increase the limit to 10

LIMIT = 10
airbnb1 = supermarkets_need
supermarket = getNearbyVenues(names=airbnb1['id'],
                              latitudes=airbnb1['latitude'],
                              longitudes=airbnb1['longitude'],
                              radius=1000,
                              categoryId="4bf58dd8d48988d118951735,52f2ab2ebcbc57f1066b8b46"  # grocery store, supermarket
                              )
# Independent copy of the result (previously built with the removed DataFrame.append)
supermarket1 = supermarket.copy()


In [None]:
# Apply function nearest_node to find nearest node to each GPS coordinates for the rest of supermarkets

supermarket1['supermarket_nodes1'] = supermarket1.apply(nearest_node, axis=1) 

In [None]:
# Add new supermarkets (range 1000m) into the dataset
supermarkets = supermarkets.append(supermarket1)

In [None]:
supermarkets.groupby("Venue_Category").count()

In [None]:
# Again, we dropp all the categories that, do not represent the grocery store or supermarket
supermarkets = supermarkets.drop(supermarkets[(supermarkets.Venue_Category == "Café")|
                                              (supermarkets.Venue_Category =="Clothing Store")].index)

In [None]:
supermarkets = supermarkets.drop("Unnamed: 0", axis = 1)

In [None]:
# Map Airbnb nodes from original dataset
# listing id -> value of `air_nodes_true` (used later as the source node of the walking path)
d_airbnb_nodes = dict(zip(airbnb.id, airbnb.air_nodes_true))

# Look up each venue row's listing id to attach the listing's node
supermarkets["airbnb_node"] = supermarkets["Id"].map(d_airbnb_nodes)

In [None]:
# Compute walking distance to supermarket
def distance(row):
    """Shortest-path length in G from the listing's node to the supermarket's node.

    Uses `row.airbnb_node` as source and `row.supermarket_nodes1` as target,
    weighting edges by their `length` attribute.
    """
    origin = row.airbnb_node
    destination = row.supermarket_nodes1
    return nx.shortest_path_length(G, source=origin, target=destination, weight='length')

supermarkets['distance_to_supermarket'] = supermarkets.apply(distance, axis=1)

In [None]:
# Sort to get the shortest walking distance for each listing
supermarkets_final = supermarkets.sort_values(by = ["Id", "distance_to_supermarket"]).drop_duplicates(subset = ['airbnb_node', "Id"])

In [None]:
# Rename column, so we can merge it with original dataset
supermarkets_final = supermarkets_final.rename(columns={'Id': 'id'})

In [None]:
# Check if all listings have a supermarket nearby; if not, the value 1000 will be assigned
# as the maximum distance (meaning 1000 and more)
no_supermarket_found = ~airbnb.id.isin(supermarkets_final.id)
lis = list(airbnb.id[no_supermarket_found])

print(len(lis))

# Boolean selection replaces the per-id DataFrame.append loop (quadratic, and removed in pandas 2.0)
airbnb_no_sup = airbnb[no_supermarket_found]

In [None]:
airbnb = pd.merge(airbnb, supermarkets_final, on = "id", how = "outer")

In [None]:
airbnb["distance_to_supermarket"].isna().sum()

In [None]:
# Fill nan with 1000, meaning these listings has supermarket far away
airbnb['distance_to_supermarket'] = airbnb['distance_to_supermarket'].fillna(1000)

In [None]:
airbnb.loc[airbnb['distance_to_supermarket'] >= 1000, "distance_to_supermarket"] = 1000

In [None]:
airbnb = airbnb.rename(columns={'distance': 'geometry_distance_to_supermarket'})
airbnb['geometry_distance_to_supermarket'] = airbnb['geometry_distance_to_supermarket'].fillna(1000)

In [None]:
airbnb.loc[airbnb['geometry_distance_to_supermarket'] >= 1000, "geometry_distance_to_supermarket"] = 1000

In [None]:
# Drop unnecessary columns
to_drop = ["Room_Latitude","Room_Longitude", "Venue", "Venue_Latitude", "Venue_Longitude", "Venue_Category", "airbnb_node_y",
          "supermarket_nodes1", "restaurants_nodes1"]

In [None]:
airbnb = airbnb.drop(to_drop, axis = 1)

In [None]:
airbnb = airbnb.rename(columns={'airbnb_node_x': 'airbnb_node'})

# Parks

In [None]:
parks_2_1 = pd.DataFrame()

In [None]:
# Define the range for smoother and faster retrieving of data
# Run this code several times to get all the data
# For park, we set range 1000m 
LIMIT = 3
# airbnb1 = airbnb[a:b]
i = 0
park1 = pd.DataFrame()
for k in range(10,len(airbnb1),10):
    dataset = pd.DataFrame(airbnb1[i:k])
    parks = getNearbyVenues(names=dataset['id'],
                                       latitudes=dataset['latitude'],
                                       longitudes=dataset['longitude'],
                                       radius = 1000,
                                       categoryId = "4bf58dd8d48988d163941735" # Park
                                      )
    park1 = park1.append(parks)
    i = i+10   
    
parks_2_1 = parks_2_1.append(park1)

In [None]:
only_parks = parks_2_1[parks_2_1.Venue_Category == "Park"]

In [None]:
lis = []
for x in list(airbnb.id):
    if x not in list(only_parks.Id):
        lis.append(x)

print(len(lis))

parks_need = pd.DataFrame()
for x in lis:
    y = pd.DataFrame(airbnb.loc[airbnb.id == x])
    parks_need = parks_need.append(y)

In [None]:
# The rest: query the listings without a park again, asking for up to 30 venues each

LIMIT = 30
airbnb1 = parks_need
dataset = airbnb1
parks = getNearbyVenues(names=dataset['id'],
                        latitudes=dataset['latitude'],
                        longitudes=dataset['longitude'],
                        radius=1000,
                        categoryId="4bf58dd8d48988d163941735"  # Park
                        )
# Independent copy of the result (previously built with the removed DataFrame.append)
park1 = parks.copy()

In [None]:
only_parks_rest = park1[park1.Venue_Category == "Park"]

In [None]:
parks_all = only_parks.append(only_parks_rest)

In [None]:
lis = []
for x in list(airbnb.id):
    if x not in list(parks_all.Id):
        lis.append(x)

print(len(lis))

parks_need_rest = pd.DataFrame()
for x in lis:
    y = pd.DataFrame(airbnb.loc[airbnb.id == x])
    parks_need_rest = parks_need_rest.append(y) # 200 listings does not have park within 1000

# Walking distance to nearest park

In [None]:
%%time
# Apply function nearest_node (defined in an earlier cell) to find the nearest graph node to the GPS coordinates of each park

parks_all['parks_node1'] = parks_all.apply(nearest_node, axis=1) 

In [None]:
# Map Airbnb nodes from original dataset
# listing id -> value of `air_nodes_true` (used later as the source node of the walking path)
d_airbnb_nodes = dict(zip(airbnb.id, airbnb.air_nodes_true))

# Look up each park row's listing id to attach the listing's node
parks_all["airbnb_node"] = parks_all["Id"].map(d_airbnb_nodes)

In [None]:
# Compute walking distance to park
def distance(row):
    """Shortest-path length in G from the listing's node to the park's node.

    Uses `row.airbnb_node` as source and `row.parks_node1` as target,
    weighting edges by their `length` attribute.
    """
    origin = row.airbnb_node
    destination = row.parks_node1
    return nx.shortest_path_length(G, source=origin, target=destination, weight='length')

parks_all['distance_to_park'] = parks_all.apply(distance, axis=1)

In [None]:
# Sort to get the shortest walking distance for each listing
parks_all = parks_all.sort_values(by = ["Id", "distance_to_park"]).drop_duplicates(subset = ['airbnb_node', "Id"])

In [None]:
# Rename column, so we can merge it with original dataset
parks_all = parks_all.rename(columns={'Id': 'id'})

In [None]:
# Merge
airbnb = pd.merge(airbnb, parks_all, on = "id", how = "outer")

In [None]:
# Fill nan with 1000, meaning these listings has supermarket far away
airbnb['distance_to_park'] = airbnb['distance_to_park'].fillna(1000)

In [None]:
# Some of the distances is higher than 1000 (walking distance)
# These values will be considered as 1000 (e.g.)

In [None]:
airbnb.loc[airbnb['distance_to_park'] >= 1000, "distance_to_park"] = 1000

In [None]:
airbnb = airbnb.rename(columns={'distance': 'geometry_distance_to_park'})
airbnb['geometry_distance_to_park'] = airbnb['geometry_distance_to_park'].fillna(1000)

In [None]:
# Drop unnecessary columns
to_drop = ["Unnamed: 0", "Room_Latitude","Room_Longitude", "Venue", "Venue_Latitude", "Venue_Longitude", "Venue_Category",
          "parks_node1", "airbnb_node"]

In [None]:
airbnb = airbnb.drop(to_drop, axis = 1)

In [None]:
airbnb.to_csv("airbnb4.csv")