#### Code reference https://www.xbyte.io/how-to-use-python-to-scrape-real-estate-website-data-using-web-scraping-and-making-data-wrangling.php

In [1]:
import pandas as pd

In [2]:
property = pd.read_csv('../../data/curated/API_clean_price_with_SA2_using_geolocation.csv').iloc[:,1:]
print(property.shape)
property.iloc[1000:1050, :]

(14694, 18)


Unnamed: 0,id,time,listing_type,price,property_type,area,building_area,land_area,bedrooms,bathrooms,carspaces,street_address,suburb,postcode,latitude,longitude,weekly_rent,SA2_CODE
1000,16047502,2022-08-19T10:22:24,Rent,$450_per_week,ApartmentUnitFlat,Melbourne City Council - Greater Area,0.0,0.0,1.0,1.0,1,"S2609/231 Harbour Esplanade, Docklands",DOCKLANDS,3008,-37.813576,144.943,450,206041118.0
1001,16046673,2022-08-18T16:18:11,Rent,$500_weekly,ApartmentUnitFlat,Melbourne City Council - Greater Area,0.0,0.0,1.0,1.0,0,"1501/628 Flinders Street, Docklands",DOCKLANDS,3008,-37.82097,144.953247,500,206041118.0
1002,16045684,2022-08-18T12:07:57,Rent,$620_weekly,ApartmentUnitFlat,Melbourne City Council - Greater Area,0.0,0.0,2.0,2.0,1,"1610S/883 Collins Street, Docklands",DOCKLANDS,3008,-37.821213,144.9425,620,206041118.0
1003,16003855,2022-08-18T11:36:27,Rent,$700_per_week,ApartmentUnitFlat,Melbourne City Council - Greater Area,0.0,0.0,2.0,2.0,1,"2703/100 Harbour Esplanade, Docklands",DOCKLANDS,3008,-37.818577,144.947372,700,206041118.0
1004,16044098,2022-08-17T16:06:14,Rent,$650PW_$2824PCM,ApartmentUnitFlat,Melbourne City Council - Greater Area,0.0,0.0,2.0,1.0,1,"1212N/889 Collins Street, Docklands",DOCKLANDS,3008,-37.821163,144.94194,650,206041118.0
1005,16043764,2022-08-17T15:02:48,Rent,$400_per_week,ApartmentUnitFlat,Melbourne City Council - Greater Area,0.0,0.0,1.0,1.0,0,"402E/888 Collins Street, Docklands",DOCKLANDS,3008,-37.82073,144.942261,400,206041118.0
1006,16042012,2022-08-16T16:45:04,Rent,$550,House,Melbourne City Council - Greater Area,0.0,0.0,1.0,1.0,0,"2009N/889 Collins Street, Docklands",DOCKLANDS,3008,-37.821163,144.94194,550,206041118.0
1007,16039501,2022-08-15T15:53:03,Rent,$550_per_week,ApartmentUnitFlat,Melbourne City Council - Greater Area,0.0,0.0,2.0,1.0,1,"S1501/231 Harbour Esplanade, Docklands",DOCKLANDS,3008,-37.813576,144.943,550,206041118.0
1008,16039206,2022-08-15T14:58:31,Rent,$400_Per_Week,ApartmentUnitFlat,Melbourne City Council - Greater Area,0.0,0.0,1.0,1.0,0,"709/838 Bourke Street, Docklands",DOCKLANDS,3008,-37.819813,144.943268,400,206041118.0
1009,16038423,2022-08-15T12:00:33,Rent,$550.00,ApartmentUnitFlat,Melbourne City Council - Greater Area,0.0,0.0,2.0,2.0,1,"910/673 La Trobe Street, Docklands",DOCKLANDS,3008,-37.813873,144.949051,550,206041118.0


In [13]:
property[['latitude', 'longitude', 'street_address']].to_numpy()

array([[-37.8175163, 144.966492, '1007/238 Flinders St, Melbourne'],
       [-37.8145638, 144.95228600000002,
        '1211/260 Spencer Street, Melbourne'],
       [-37.81068, 144.959274, '504/350 La Trobe Street, Melbourne'],
       ...,
       [-38.628715500000006, 145.73580900000002,
        '14 Inverloch Parade, Inverloch'],
       [-38.6341248, 145.730682, '10 Hopetoun Street, Inverloch'],
       [-38.63146210000001, 145.72937, 'Inverloch']], dtype=object)

In [5]:
import geopy
from geopy.geocoders import GoogleV3
import googlemaps
import re

In [42]:
def rowwise_driving_distance(origin_latitude, origin_longitude, gmaps, pattern, destination_latitude, destination_longitude):
    driving_distance_matrix = gmaps.distance_matrix([str(origin_latitude) + " " + str(origin_longitude)], [str(destination_latitude) + " " + str(destination_longitude)], mode='driving')
    return pd.to_numeric(pattern.findall(driving_distance_matrix['rows'][0]['elements'][0]['distance']['text'])[0])

def rowwise_driving_time(origin_latitude, origin_longitude, gmaps, destination_latitude, destination_longitude):
    driving_distance_matrix = gmaps.distance_matrix([str(origin_latitude) + " " + str(origin_longitude)], [str(destination_latitude) + " " + str(destination_longitude)], mode='driving')
    return pd.to_numeric(driving_distance_matrix['rows'][0]['elements'][0]['duration']['value'])

def rowwise_walking_distance(origin_latitude, origin_longitude, gmaps, pattern, destination_latitude, destination_longitude):
    walking_distance_matrix = gmaps.distance_matrix([str(origin_latitude) + " " + str(origin_longitude)], [str(destination_latitude) + " " + str(destination_longitude)], mode='walking')
    return pd.to_numeric(pattern.findall(walking_distance_matrix['rows'][0]['elements'][0]['distance']['text'])[0])  

def rowwise_walking_time(origin_latitude, origin_longitude, gmaps, destination_latitude, destination_longitude):
    walking_distance_matrix = gmaps.distance_matrix([str(origin_latitude) + " " + str(origin_longitude)], [str(destination_latitude) + " " + str(destination_longitude)], mode='walking')
    return pd.to_numeric(walking_distance_matrix['rows'][0]['elements'][0]['duration']['value'])

In [43]:
import swifter
"""This function takes in a property csv (pandas dataframe) file, and compute distance, time for each property location to Melbourne CBD, using 
the latitude and longitude of each property, and return the imputed csv file"""
def add_distance(property_csv):
    # convert to dict for faster computation
    #property_dict = property_csv[['latitude', 'longitude', 'street_address']].to_dict('records')
    #property_list = property[['latitude', 'longitude', 'street_address']].to_numpy()
    pattern = re.compile(r'\d+.?\d?') # get only numeric distance or time value
    destination_latitude, destination_longitude = -37.810454, 144.962379 # melbourne central latitude and longitude
    gmaps = googlemaps.Client(key='AIzaSyCSmyP2Pxq7lMHE7w27m2he1l-RtreJdAQ') # Philip's api, please don't overuse

    property_csv['dri_dist_km'] = property_csv.swifter.apply(lambda x: rowwise_driving_distance(x['latitude'], x['longitude'], gmaps, pattern, destination_latitude, destination_longitude), axis=1)
    property_csv['dri_time_sec'] = property_csv.swifter.apply(lambda x: rowwise_driving_time(x['latitude'], x['longitude'], gmaps, pattern, destination_latitude, destination_longitude), axis=1)
    property_csv['dri_time_min'] = property_csv['dri_time_sec'] / 60
    property_csv['walk_dist_km'] = property_csv.swifter.apply(lambda x: rowwise_walking_distance(x['latitude'], x['longitude'], gmaps, pattern, destination_latitude, destination_longitude), axis=1)
    property_csv['walk_time_sec'] = property_csv.swifter.apply(lambda x: rowwise_walking_time(x['latitude'], x['longitude'], gmaps, pattern, destination_latitude, destination_longitude), axis=1)
    property_csv['walk_time_min'] = property_csv['walk_time_sec'] / 60
    
    return property_csv
     

In [26]:
import swifter
df['new'] = df.swifter.apply(lambda x : func(x['a'],x['b'],x['c'],x['d'],x['e']),axis=1)

"""This function takes in a property csv (pandas dataframe) file, and compute distance, time for each property location to Melbourne CBD, using 
the latitude and longitude of each property, and return the imputed csv file"""
def add_distance(property_csv):
    # convert to dict for faster computation
    property_dict = property_csv[['latitude', 'longitude', 'street_address']].to_dict('records')
    #property_list = property[['latitude', 'longitude', 'street_address']].to_numpy()
    pattern = re.compile(r'\d+.?\d?') # get only numeric distance or time value
    destination_latitude, destination_longitude = -37.810454, 144.962379 # melbourne central latitude and longitude
    gmaps = googlemaps.Client(key='AIzaSyCSmyP2Pxq7lMHE7w27m2he1l-RtreJdAQ') # Philip's api, please don't overuse
    driving_distance_list_km = []
    driving_time_list_mins = []
    walking_distance_list_km = []
    walking_time_list_mins = []        
    i=0
    for row in property_dict:
        print(f"Row {i}, location {row['street_address']}")
        origin_latitude, origin_longitude = row['latitude'], row['longitude']
        #print(f"Row {i}, location {row[2]}")
        #origin_latitude, origin_longitude = row[0], row[1]

        driving_distance_matrix = gmaps.distance_matrix([str(origin_latitude) + " " + str(origin_longitude)], [str(destination_latitude) + " " + str(destination_longitude)], mode='driving')
        walking_distance_matrix = gmaps.distance_matrix([str(origin_latitude) + " " + str(origin_longitude)], [str(destination_latitude) + " " + str(destination_longitude)], mode='walking')
        
        # individual values
        driving_distance_km = pd.to_numeric(pattern.findall(driving_distance_matrix['rows'][0]['elements'][0]['distance']['text'])[0])
        driving_time_mins = pd.to_numeric(pattern.findall(driving_distance_matrix['rows'][0]['elements'][0]['duration']['text'])[0])
        walking_distance_km = pd.to_numeric(pattern.findall(walking_distance_matrix['rows'][0]['elements'][0]['distance']['text'])[0])
        walking_time_mins = pd.to_numeric(pattern.findall(walking_distance_matrix['rows'][0]['elements'][0]['duration']['text'])[0])

        # store values to corresponding list
        driving_distance_list_km.append(driving_distance_km)
        driving_time_list_mins.append(driving_time_mins)
        walking_distance_list_km.append(walking_distance_km)
        walking_time_list_mins.append(walking_time_mins)

        i += 1
    property_csv['dri_dist_km'] = driving_distance_list_km
    property_csv['dri_time_min'] = driving_time_list_mins
    property_csv['dri_time_sec'] = property_csv['dri_time_min'] / 60
    property_csv['walk_dist_km'] = walking_distance_list_km
    property_csv['walk_time_min'] = walking_time_list_mins
    property_csv['walk_time_sec'] = property_csv['walk_time_min'] / 60
    
    return property_csv
     

In [70]:
def add_distance(row, gmaps, pattern):
    driving_distance_matrix = gmaps.distance_matrix([str(row['latitude']) + " " + str(row['longitude'])], [str(-37.810454) + " " + str(144.962379)], mode='driving')
    walking_distance_matrix = gmaps.distance_matrix([str(row['latitude']) + " " + str(row['longitude'])], [str(-37.810454) + " " + str(144.962379)], mode='walking')
    print(driving_distance_matrix)    
    # individual values
    #row['dri_dist_km'] = pd.to_numeric(pattern.findall(driving_distance_matrix['rows'][0]['elements'][0]['distance']['text'])[0])
    #row['dri_time_sec'] = pd.to_numeric(pattern.findall(driving_distance_matrix['rows'][0]['elements'][0]['duration']['value']))
    #row['dri_time_min'] = row['dri_time_sec'] / 60

    #row['walk_dist_km'] = pd.to_numeric(pattern.findall(walking_distance_matrix['rows'][0]['elements'][0]['distance']['text'])[0])
    #row['walk_time_sec'] = pd.to_numeric(pattern.findall(walking_distance_matrix['rows'][0]['elements'][0]['duration']['value']))
    #row['walk_time_min'] = row['walk_time_sec'] / 60
    
    return pd.to_numeric(pattern.findall(driving_distance_matrix['rows'][0]['elements'][0]['distance']['text'])[0])

In [71]:
%%time
gmaps = googlemaps.Client(key='AIzaSyCSmyP2Pxq7lMHE7w27m2he1l-RtreJdAQ') # Philip's api, please don't overuse
pattern = re.compile(r'\d+.?\d?') # get only numeric distance or time value
property['dri_dist_km'] = property.swifter.apply(add_distance, args=[gmaps, pattern] axis=1)
property.iloc[1000:1030, :]

Pandas Apply:   0%|          | 0/14694 [00:00<?, ?it/s]

{'destination_addresses': ['LG29, Melbourne Central, Swanston St, Melbourne VIC 3000, Australia'], 'origin_addresses': ['238-242 Flinders St, Melbourne VIC 3000, Australia'], 'rows': [{'elements': [{'distance': {'text': '1.6 km', 'value': 1609}, 'duration': {'text': '7 mins', 'value': 449}, 'status': 'OK'}]}], 'status': 'OK'}
{'destination_addresses': ['LG29, Melbourne Central, Swanston St, Melbourne VIC 3000, Australia'], 'origin_addresses': ['260 Spencer St, Melbourne VIC 3000, Australia'], 'rows': [{'elements': [{'distance': {'text': '1.3 km', 'value': 1280}, 'duration': {'text': '6 mins', 'value': 356}, 'status': 'OK'}]}], 'status': 'OK'}
{'destination_addresses': ['LG29, Melbourne Central, Swanston St, Melbourne VIC 3000, Australia'], 'origin_addresses': ['303/350 La Trobe St, Melbourne VIC 3000, Australia'], 'rows': [{'elements': [{'distance': {'text': '0.4 km', 'value': 382}, 'duration': {'text': '2 mins', 'value': 115}, 'status': 'OK'}]}], 'status': 'OK'}
{'destination_addresse

KeyboardInterrupt: 

In [57]:
%%time
gmaps = googlemaps.Client(key='AIzaSyCSmyP2Pxq7lMHE7w27m2he1l-RtreJdAQ') # Philip's api, please don't overuse
driving_distance_matrix = gmaps.distance_matrix([str(property[['latitude']]) + " " + str(property[['longitude']])], [str(-37.810454) + " " + str(144.962379)], mode='driving')
driving_distance_matrix

CPU times: user 28.1 ms, sys: 614 µs, total: 28.7 ms
Wall time: 299 ms


{'destination_addresses': ['-37.810454,144.962379'],
 'origin_addresses': ['Rochester, NY 14692, USA'],
 'rows': [{'elements': [{'status': 'ZERO_RESULTS'}]}],
 'status': 'OK'}

In [44]:
%%time
new_property = add_distance(property)
new_property.iloc[1000:1050, :]

Pandas Apply:   0%|          | 0/14694 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/14694 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [66]:
#gmaps.distance_matrix(property.iloc[1:]['street_address'], 'Melbourne Central')
origin_latitude, origin_longitude = property.iloc[8]['latitude'], property.iloc[8]['longitude']
destination_latitude, destination_longitude = -37.810454, 144.962379 # melbourne central
distance = gmaps.distance_matrix([str(origin_latitude) + " " + str(origin_longitude)], [str(destination_latitude) + " " + str(destination_longitude)], mode='driving')
#gmaps.distance_matrix('1216/628 Flinders Street','Melbourne Central', mode='driving')
#distance['rows'][0]['elements'][0]['distance']
distance
pattern = re.compile(r'\d+.?\d?')
pd.to_numeric(pattern.findall(distance['rows'][0]['elements'][0]['distance']['text'])[0])
pd.to_numeric(pattern.findall(distance['rows'][0]['elements'][0]['duration']['text'])[0])

5