### Using Google Maps API

In [44]:
import os
import time

import pandas as pd
from googlemaps import Client
from pandas import DataFrame

# Credentials and machine-specific paths are read from the environment so that
# no secret is stored in the notebook.
# NOTE(review): a Google API key used to be hard-coded on this line; it should
# be treated as leaked and revoked.
api_key = os.environ.get('GOOGLE_MAPS_API_KEY')
if not api_key:
    raise RuntimeError('Set the GOOGLE_MAPS_API_KEY environment variable before running this notebook.')

# Project root. Override with AIRBNB_BASE_PATH on other machines; joining with ''
# guarantees the trailing separator that the `base_path + '...'` lookups rely on.
base_path = os.path.join(
    os.environ.get('AIRBNB_BASE_PATH', '/home/hareesh/Research/big_data/project/airbnb/'), '')
gmaps = Client(api_key)

In [45]:
# Load the listings and keep only the columns needed for the Google Maps lookups.
# `.loc` replaces the `.ix` indexer, which pandas deprecated and later removed.
df = pd.read_csv(base_path + 'notebooks/listings_with_amenities_dataframe.csv')
df = df.loc[:, ['id', 'latitude', 'longitude']]

In [46]:
# Free-text place queries that are looked up around every listing.
neighbourhood_facilities = ['bus stop', 'train station', 'restaurant', 'hospital', 'atm',
                            'shopping mall', 'cineplex', 'night club',
                            'museum', 'art gallery', 'park']

# Empty result frame: 'id' followed by one "<facility>_lat" / "<facility>_lon"
# pair per facility, in the same order as neighbourhood_facilities.
# The names are derived from the list (spaces -> underscores) so they cannot
# drift out of sync with it and all follow one naming convention.
df_neighbourhood = pd.DataFrame(
    columns=['id'] + [facility.replace(' ', '_') + suffix
                      for facility in neighbourhood_facilities
                      for suffix in ('_lat', '_lon')])

In [None]:
%%time
# Smoke test: query the Places API for a park near the first listing only.
i = 0
id_ = int(df.at[i, 'id'])
lat_ = df.at[i, 'latitude']
lon_ = df.at[i, 'longitude']
# Coordinates of the first result returned for the query.
loc = (
    gmaps.places(query='park', location=(lat_, lon_))
    ['results'][0]['geometry']['location']
)

In [None]:
%%time 
# For each of the first 30 listings, take the first Places result for every
# facility type and store its coordinates as one row of df_neighbourhood.
for i in range(30):
    id_ = int(df.at[i, 'id'])
    lat_ = df.at[i, 'latitude']
    lon_ = df.at[i, 'longitude']

    # Flat row: [id, lat_1, lon_1, lat_2, lon_2, ...] in facility order.
    lat_lon_list = [id_]
    for facility in neighbourhood_facilities:
        try:
            loc = gmaps.places(query=facility, location=(lat_, lon_))['results'][0]['geometry']['location']
            lat_lon_list.extend([loc['lat'], loc['lng']])
        except Exception as e:
            # Best effort: any failure (including an empty result list) is logged
            # and recorded as the (0, 0) placeholder so the row keeps its length.
            lat_lon_list.extend([0, 0])
            print('%s %s' % (i, e))
        finally:
            # Throttle after every request, failed ones included; previously the
            # pause was skipped on errors, i.e. exactly when the API was refusing calls.
            time.sleep(2)

    df_neighbourhood.loc[i] = lat_lon_list

Because the Google Maps API limits the number of requests we can make, we switched to the OpenStreetMap (OSM) Overpass API:

### Using OpenStreetMap API

In [4]:
import pandas as pd
from pandas import DataFrame
import time
import numpy as np
import cPickle as pickle 
from tqdm import tqdm

import os

import overpy
from geopy.distance import great_circle

# Project root. Override with AIRBNB_BASE_PATH on other machines; joining with ''
# guarantees the trailing separator that the `base_path + '...'` lookups rely on.
base_path = os.path.join(
    os.environ.get('AIRBNB_BASE_PATH', '/home/hareesh/Research/big_data/project/airbnb/'), '')

# Overpass client used for every OpenStreetMap query below.
api = overpy.Overpass()

In [2]:
# Load the listings and keep only the columns needed for the OSM Overpass queries.
# `.loc` replaces the `.ix` indexer, which pandas deprecated and later removed.
df = pd.read_csv(base_path + 'notebooks/listings_with_amenities_dataframe.csv')
df = df.loc[:, ['id', 'latitude', 'longitude']]

In [3]:
# OSM tag values to search for; each one becomes a column of df_neighbourhood.
neighbourhood_facilities = [
    'restaurant', 'atm', 'cinema', 'hospital', 'nightclub', 'park',
    'mall', 'gallery', 'museum', 'supermarket', 'bus_stop',
]

# One row per listing, every cell pre-filled with the -1 placeholder;
# the id column is then copied over from the listings frame.
df_neighbourhood = pd.DataFrame(
    -1,
    index=range(len(df)),
    columns=['id'] + neighbourhood_facilities,
)
df_neighbourhood['id'] = df['id']

In [4]:
# facility -> [OSM tag key, search radius].
# The key/value pair selects the nodes and the radius is passed to the Overpass
# `around:` filter (metres, per the Overpass QL documentation).
neighbourhood = dict(
    (name, [osm_key, radius])
    for name, osm_key, radius in [
        ('restaurant', 'amenity', 2000),
        ('atm', 'amenity', 5000),
        ('cinema', 'amenity', 15000),
        ('hospital', 'amenity', 15000),
        ('nightclub', 'amenity', 10000),
        ('park', 'leisure', 10000),
        ('mall', 'shop', 10000),
        ('museum', 'tourism', 10000),
        ('gallery', 'tourism', 10000),
        ('supermarket', 'shop', 5000),
        ('bus_stop', 'highway', 1000),
    ]
)

In [5]:
%%time
# Sanity check of the Overpass client: supermarket nodes around one fixed point.
result = api.query("""
(
  node["shop"="supermarket"](around:2000, 43.69607, -79.44855);
);
out body;
""")

# Single-argument print calls produce the same output on Python 2 and Python 3.
print(len(result.nodes))
for node in result.nodes:
    print('%s %s' % (node.lat, node.lon))

7
43.6807442 -79.4514642
43.7052158 -79.4425168
43.7037166 -79.4408628
43.6926868 -79.4657378
43.6921105 -79.4403261
43.6894510 -79.4345640
43.6995876 -79.4314530
CPU times: user 8 ms, sys: 12 ms, total: 20 ms
Wall time: 735 ms


In [None]:
%%time
# For every facility type, find the distance in metres (truncated to int) from
# each listing to the nearest matching OSM node within the configured radius and
# store it in df_neighbourhood[facility]. -1 means "nothing found" or "query failed".

# First row to process. 0 covers every listing; set it to the first unprocessed
# row to resume an interrupted run. Results are written cell by cell, so rows
# before start_row keep whatever value they already hold.
start_row = 0

query_template = '(node["{key}"="{value}"](around:{radius},{lat},{lon}););out body;'

for facility in neighbourhood_facilities:
    facility_type, circle_radius = neighbourhood[facility]

    for i in tqdm(range(start_row, df.shape[0])):
        lat_ = df.at[i, 'latitude']
        lon_ = df.at[i, 'longitude']

        input_query = query_template.format(
            key=facility_type, value=facility,
            radius=circle_radius, lat=lat_, lon=lon_)

        try:
            result = api.query(input_query)
            dist = [great_circle((lat_, lon_), (node.lat, node.lon)).meters
                    for node in result.nodes]
            # No node inside the radius is a normal outcome, not an error.
            nearest = int(min(dist)) if dist else -1
        except Exception as e:
            # Best effort: log the failure and keep going so that one bad request
            # does not abort a run that issues one query per listing per facility.
            nearest = -1
            print(str(e))

        # Writing by row label replaces the old per-facility list, which was never
        # initialised and whose length did not match the frame when resuming.
        df_neighbourhood.at[i, facility] = nearest


In [14]:
# Merge all amenities data from the original scraped Airbnb dataframe with
# all neighbourhood facilities data from OpenStreetMap
# into a single dataframe for exploratory analysis later.
df = pd.read_csv(base_path + 'notebooks/listings_with_amenities_dataframe.csv')
# Skip the first column of df_neighbourhood ('id'); the listings frame already has it.
# `.iloc` is the positional replacement for the removed `.ix` indexer.
df = pd.concat([df, df_neighbourhood.iloc[:, 1:]], axis=1)
df.shape

(9795, 81)

In [53]:
df.head(n=2)  # Preview: amenities and neighbourhood distances in one dataframe

Unnamed: 0,id,name,summary,latitude,longitude,space,description,neighborhood_overview,notes,transit,...,atm,cinema,hospital,nightclub,park,mall,gallery,museum,supermarket,bus_stop
0,8238835,Cozy BR + EnSuite @ York University,We are located at York University. Close to Yo...,43.76874,-79.502254,The house is located less than 100 meters from...,We are located at York University. Close to Yo...,"Our house is ideal for students, professionals...","Please note, this room does not have a window....",The bus stop is right in front of our home on ...,...,509,5812,5400,1060,3269,7227,-1,9607,1709,72
1,16162206,"Large sunny bedroom, 3d floor (3A)",This room is one of 4 bedrooms available in a ...,43.765573,-79.492943,,This room is one of 4 bedrooms available in a ...,,,,...,1212,5480,4588,1151,2694,6488,-1,9476,1213,77


In [15]:
# Save the merged dataframe to disk (path is relative to the working directory).
with open('df_amenities_neighbourhood_all.pkl', 'wb') as pkl_file:
    pickle.dump(df, pkl_file)

In [16]:
# Retrieve the saved pickle object.
# Only unpickle files you created yourself: loading a pickle can execute code.
with open('df_amenities_neighbourhood_all.pkl', 'rb') as pkl_file:
    df = pickle.load(pkl_file)