In [None]:
import warnings
warnings.filterwarnings('ignore')

import matplotlib.pyplot as plt
from shapely.geometry import MultiPoint,Point, Polygon
import osmnx as ox
import shapely.wkt
from descartes import PolygonPatch
import geopandas as gpd
import shapely
import geojsonio
import geojson
import json
import pandas as pd
import numpy as np
import osmapi as osm
from shapely.ops import nearest_points
import networkx as nx

from pandana.loaders import osm 


In [None]:
# Load the Airbnb listings table. Later cells rely on its "coordinates" (WKT text),
# "latitude", "longitude" and "id" columns.
airbnb = pd.read_csv("airbnb2.csv") # import airbnb dataset

# Public transport stations

In [None]:
# First, load the raw stops file and extract the station records from it

In [None]:
# Parse the public-transport stops file (UTF-8 JSON) into plain Python objects.
with open('stops.json', encoding='utf-8') as stops_file:
    data = json.load(stops_file)

In [None]:
# Show the top-level keys of the parsed JSON before picking the part we need.
data.keys()

In [None]:
# Wrap the parsed JSON in a (geo)dataframe; only its "stopGroups" column is needed.
gdf = gpd.GeoDataFrame(data)
a = gdf["stopGroups"].to_dict()

# Each dict entry becomes a column, so transpose to get one row per stop group.
stations = gpd.GeoDataFrame(a).transpose()

# Build shapely points from the average position of each stop group.
# Shapely input has to be in (long, lat) order.
lon_lat_pairs = list(zip(stations["avgLon"], stations["avgLat"]))
stations["coordinates"] = lon_lat_pairs
stations["coordinates"] = stations["coordinates"].apply(Point)

# Keep only the stations located in Prague (1408 stations according to the original note).
in_prague = stations["municipality"] == "Praha"
stations_Prague = stations[in_prague]

# "cis" refers to the unique value (identifier) of the station.
stations_Prague["cis"].unique()

# Euclidean distance to 5 nearest stations

In [None]:
# The CSV stores each listing's coordinates as text, so parse the WKT strings back
# into shapely geometry objects.
# Note: shapely geometry requires GPS positions in (long, lat) order.
airbnb["coordinates"] = airbnb["coordinates"].apply(shapely.wkt.loads)

# Bundle every Prague station into a single MultiPoint so that one listing can be
# measured against all public transport stations at once.
gps_stations = stations_Prague["coordinates"].tolist()
all_stations = MultiPoint(gps_stations)

In [None]:
%%time
# First we compute the 5 nearest station for each airbnb listing (by euclidean distance with shapely geometry)
euclidean_stations = []
for point in airbnb["coordinates"]:
    destinations = all_stations
   
    nearest_geoms1 = nearest_points(point, destinations)
    
    destinations = destinations - nearest_geoms1[1]
    nearest_geoms2 = nearest_points(point, destinations)
    
    destinations = destinations - nearest_geoms2[1]
    nearest_geoms3 = nearest_points(point, destinations)
    
    destinations = destinations - nearest_geoms3[1]
    nearest_geoms4 = nearest_points(point, destinations)
    
    destinations = destinations - nearest_geoms4[1]
    nearest_geoms5 = nearest_points(point, destinations)
    
    euclidean_stations.append([nearest_geoms1[1],nearest_geoms2[1],nearest_geoms3[1],nearest_geoms4[1],nearest_geoms5[1]])

In [None]:
# Regroup the per-listing results by rank: col1 holds every listing's nearest
# station, col2 the second nearest, ..., col5 the fifth nearest.
# The length guard mirrors the original loop, which skipped ranks a listing lacks.
col1, col2, col3, col4, col5 = (
    [nearest[rank] for nearest in euclidean_stations if rank < len(nearest)]
    for rank in range(5)
)

In [None]:
# Attach the five nearest stations to every listing and expose their coordinates
# as float columns: col<k>x is the longitude and col<k>y the latitude of the
# k-th nearest station (shapely points are stored in (long, lat) order).
station_cols = ["col1", "col2", "col3", "col4", "col5"]
nearest_by_rank = [col1, col2, col3, col4, col5]

# col1..col5 keep the shapely points themselves.
for name, points in zip(station_cols, nearest_by_rank):
    airbnb[name] = points

# Read x/y directly from the geometries rather than converting the points to WKT
# text, stripping it with regular expressions and parsing the pieces back to float.
for name, points in zip(station_cols, nearest_by_rank):
    airbnb[name + "x"] = [station.x for station in points]
    airbnb[name + "y"] = [station.y for station in points]

# Walking distance with network of streets

In [None]:
# Inspired by https://github.com/smmaurer/cp255

In [None]:
%%time
# Fetch the street network for "Prague" with network_type "walk" through OSMnx.
G = ox.graph_from_place("Prague",network_type = "walk")
# Plot a projected copy of the graph; G itself is not replaced by the projected version.
fig, ax = ox.plot_graph(ox.project_graph(G))
fig.show()

In [None]:
# Original Airbnb points

In [None]:
# !!! Note: ox.get_nearest_node accept the coordinates in format (lat,long), which is different form shapely package (long,lat)
# Find the nearest node for each Airbnb GPS
%%time
def nearest_node(row):
    return ox.get_nearest_node(G, (row.latitude, row.longitude))

airbnb['air_nodes_true'] = airbnb.apply(nearest_node, axis=1) 

In [None]:
# One lookup function per nearest-station rank (1 = closest ... 5 = fifth closest).
# Here, y represents latitude, and x longitude.

def _nearest_station_node(row, rank):
    """Return the graph node closest to the row's rank-th nearest station.

    Reads the `col<rank>y` (latitude) and `col<rank>x` (longitude) fields of `row`
    and hands them to ox.get_nearest_node in the (lat, long) order it expects.
    """
    latitude = getattr(row, "col{}y".format(rank))
    longitude = getattr(row, "col{}x".format(rank))
    return ox.get_nearest_node(G, (latitude, longitude))


def nearest_node1(row):
    """Graph node nearest to the closest station (col1x / col1y)."""
    return _nearest_station_node(row, 1)


def nearest_node2(row):
    """Graph node nearest to the 2nd closest station (col2x / col2y)."""
    return _nearest_station_node(row, 2)


def nearest_node3(row):
    """Graph node nearest to the 3rd closest station (col3x / col3y)."""
    return _nearest_station_node(row, 3)


def nearest_node4(row):
    """Graph node nearest to the 4th closest station (col4x / col4y)."""
    return _nearest_station_node(row, 4)


def nearest_node5(row):
    """Graph node nearest to the 5th closest station (col5x / col5y)."""
    return _nearest_station_node(row, 5)


In [None]:
%%time
airbnb['col1_nodes_true'] = airbnb.apply(nearest_node1, axis=1)
airbnb['col2_nodes_true'] = airbnb.apply(nearest_node2, axis=1)
airbnb['col3_nodes_true'] = airbnb.apply(nearest_node3, axis=1)
airbnb['col4_nodes_true'] = airbnb.apply(nearest_node4, axis=1)
airbnb['col5_nodes_true'] = airbnb.apply(nearest_node5, axis=1)

In [None]:
# Build a "long" table with one row per (listing, candidate station): the listing's
# own graph node and its room id are repeated once for each of the 5 candidates,
# as five consecutive full copies of the listing order.
airbnb_nodes = list(airbnb.air_nodes_true) * 5
ids = list(airbnb.id) * 5

df_dist_station = pd.DataFrame({'airbnb_nodes': airbnb_nodes, 'id': ids})

In [None]:
# Stack the graph nodes of all 5 nearest stations in rank order (every col1 node,
# then every col2 node, ...) so they line up with the five repeated copies of the
# listings in df_dist_station.
# pd.concat is used because Series.append was removed in pandas 2.0.
new_df = pd.concat([
    airbnb.col1_nodes_true,
    airbnb.col2_nodes_true,
    airbnb.col3_nodes_true,
    airbnb.col4_nodes_true,
    airbnb.col5_nodes_true,
])

new_df = list(new_df)

df_dist_station["nearest_station_nodes"] = new_df

In [None]:
# Find the shortest path
%%time
def distance(row):
    return nx.shortest_path_length(G, source=row.airbnb_nodes, target=row.nearest_station_nodes, weight='length')

df_dist_station['distance_to_station'] = df_dist_station.apply(distance, axis=1)

In [None]:
# Order each listing's candidates from shortest to longest walking distance;
# drop_duplicates then keeps the first row per listing, i.e. the nearest station.
df_dist_station = (
    df_dist_station
    .sort_values(by=["id", "distance_to_station"])
    .drop_duplicates(subset=['airbnb_nodes', "id"])
)

# Attach the nearest-station result to the listings via the room id.
airbnb = airbnb.merge(df_dist_station, on="id")

In [None]:
# Helper columns that should not end up in the exported table. Some of them
# (e.g. 'col1_nodes', 'airbnb_nodes_x', 'distance') are not created anywhere in this
# notebook, so they may or may not be present depending on the input file.
columns_to_drop = [
    'Unnamed: 0', 'Unnamed: 0.1',
    'col1_nodes', 'col2_nodes', 'col3_nodes', 'col4_nodes', 'col5_nodes',
    'airbnb_nodes_x', 'airbnb_nodes_y',
    'col1_nodes_true', 'col2_nodes_true', 'col3_nodes_true', 'col4_nodes_true', 'col5_nodes_true',
    'col1x', 'col1y', 'col2x', 'col2y', 'col3x', 'col3y', 'col4x', 'col4y', 'col5x', 'col5y',
    'col1', 'col2', 'col3', 'col4', 'col5',
    'nearest_station_nodes', 'distance',
]

# errors="ignore" skips listed columns that are absent instead of raising KeyError.
airbnb = airbnb.drop(columns_to_drop, axis = 1, errors="ignore")

In [None]:
# Export the listings enriched with the nearest-station distance.
# NOTE(review): the row index is written as an extra unnamed column, which comes back
# as 'Unnamed: 0' when the file is read again — consider index=False if downstream
# readers allow it.
airbnb.to_csv("airbnb3.csv")