In [1]:
import os
import json
import time
import folium
import googlemaps
import numpy as np
import pandas as pd
import seaborn as sns
import haversine as hs
from haversine import Unit
from datetime import datetime
import openrouteservice as ors
from difflib import SequenceMatcher
from IPython.display import display
from pandas.io.json import json_normalize

## Get API Keys

In [2]:
# Load the routing API keys: one key per line, trailing whitespace removed.
with open('../data/raw/APIkeys.txt') as file:
    api_keys = [line.rstrip() for line in file.readlines()]

## Merge Dataset

In [3]:
# read the datasets produced by the data-cleaning step
property_df = pd.read_csv('../data/curated/cleaned_property_data.csv', low_memory = False)
# GNR: named places with a FEATURE type and LONGITUDE/LATITUDE columns (previewed below)
GNR = pd.read_csv('../data/curated/GNR_cleaned.csv', low_memory = False)
# count_table: merged onto the property data by 'suburb' later in this notebook
count_table = pd.read_csv('../data/curated/count_table.csv', low_memory = False)

# read the postcode -> suburbs mapping
with open('../data/raw/postcode_match_suburb.json') as json_data:
    data = json.load(json_data)
# one row per postcode; the 'suburb' cell holds the JSON value stored for that postcode
postcode_match = pd.DataFrame.from_dict({'postcode':data.keys(), 'suburb':data.values()})
# JSON object keys are strings, so convert the postcodes to numbers
postcode_match['postcode'] = pd.to_numeric(postcode_match['postcode'])

In [4]:
# preview the first rows of each input frame, then show the property table's (rows, columns)
display(property_df.head(10))
display(GNR.head(10))
display(postcode_match.head(10))
property_df.shape

Unnamed: 0,index,address,rent,features,type,furnitured,pool,gym,coordinates,desc,postcode,floor,num_bed,num_bath,num_car_park,rent_weekly
0,0,1414/218-228 A'Beckett Street Melbourne VIC 3000,$400 Per Week,1 Bed1 Bath− Parking,Apartment / Unit / Flat,No,No,No,"[-37.8102832, 144.9566691]",South Melbourne Leasing,3000,14,1,1,0,400
1,1,11a/131 Lonsdale Sreet Melbourne VIC 3000,$350 per week,1 Bed1 Bath− Parking,Studio,Yes,No,No,"[-37.810779, 144.9685513]",Wimpie Santoso,3000,11,1,1,0,350
2,2,911/408 Lonsdale Street Melbourne VIC 3000,$330 per week,1 Bed1 Bath− Parking,Apartment / Unit / Flat,No,No,No,"[-37.8125979, 144.9604012]",Ender Gok,3000,9,1,1,0,330
3,3,918/422 Collins St Melbourne VIC 3000,$600 Per week fully furnished,2 Beds1 Bath− Parking,Apartment / Unit / Flat,No,No,No,"[-37.8170971, 144.9601487]",Precinct Leasing,3000,9,2,1,0,600
4,4,602/118 Franklin Street Melbourne VIC 3000,$330,1 Bed1 Bath− Parking,Apartment / Unit / Flat,No,No,No,"[-37.8082052, 144.9589035]",Manuel Flores,3000,6,1,1,0,330
5,5,1112/333-351 Exhibition Street Melbourne VIC 3000,$600 per week,2 Beds2 Baths− Parking,Apartment / Unit / Flat,Yes,No,No,"[-37.80789559999999, 144.9682873]",Mark Faranda,3000,11,2,2,0,600
6,6,3002/288 Spencer St Melbourne VIC 3000,$510,2 Beds1 Bath1 Parking,Apartment / Unit / Flat,No,Yes,Yes,"[-37.813775, 144.9520948]",Leasing Team,3000,30,2,1,1,510
7,7,4/180 Little Collins Street Melbourne VIC 3000,$500 per week,1 Bed1 Bath− Parking,Apartment / Unit / Flat,Yes,No,No,"[-37.8138601, 144.9679067]",Trish Ha,3000,4,1,1,0,500
8,8,1605/565 Flinders Street Melbourne VIC 3000,$500 per week,2 Beds2 Baths1 Parking,Apartment / Unit / Flat,No,No,No,"[-37.8210586, 144.9559072]",Justine Muscat,3000,16,2,2,1,500
9,9,612/408 Lonsdale Street Melbourne VIC 3000,$350,1 Bed1 Bath− Parking,Apartment / Unit / Flat,Yes,No,No,"[-37.8125979, 144.9604012]",Manuel Flores,3000,6,1,1,0,350


Unnamed: 0,PLACE_NAME,FEATURE,LONGITUDE,LATITUDE,geometry,suburb,postcode
0,MOUNT CLEAR SECONDARY COLLEGE,SECONDARY SCHOOL,143.8766,-37.607,POINT (2400824.055420277 2431949.3066944666),MOUNT CLEAR,3357.0
1,VIEWBANK COLLEGE,SECONDARY SCHOOL,145.0865,-37.741,POINT (2507631.5373240355 2417744.9461323526),VIEWBANK,3084.0
2,Yarram Secondary College - Devon North Campus,SECONDARY SCHOOL,146.6478,-38.519,POINT (2643730.207553347 2330161.818601803),DEVON NORTH,
3,Bass Coast College - San Remo Campus,SECONDARY SCHOOL,145.3905,-38.532,POINT (2534061.1052171467 2329809.2720654747),SAN REMO,3925.0
4,SANDRINGHAM SECONDARY COLLEGE BEAUMARIS CAMPUS,SECONDARY SCHOOL,145.0312,-37.978,POINT (2502746.435917485 2391445.676364539),BEAUMARIS,3193.0
5,SANDRINGHAM SECONDARY COLLEGE HIGHETT CAMPUS,SECONDARY SCHOOL,145.0212,-37.955,POINT (2501868.406406302 2393912.2221767586),SANDRINGHAM,3188.0
6,SANDRINGHAM SECONDARY COLLEGE SANDRINGHAM CAMPUS,SECONDARY SCHOOL,145.0254,-37.956,POINT (2502233.993617093 2393881.0541395023),SANDRINGHAM,3191.0
7,BEAUMARIS SECONDARY COLLEGE,SECONDARY SCHOOL,145.034,-37.978,POINT (2502994.186749523 2391421.1723456345),BEAUMARIS,3193.0
8,PENOLA CATHOLIC COLLEGE,SECONDARY SCHOOL,144.9079,-37.709,POINT (2491880.327970704 2421254.8904169593),GLENROY,3041.0
9,SUNSHINE NORTH SECONDARY COLLEGE,SECONDARY SCHOOL,144.8434,-37.774,POINT (2486211.2802575794 2414064.5974380244),SUNSHINE NORTH,3020.0


Unnamed: 0,postcode,suburb
0,3000,[MELBOURNE]
1,3001,[MELBOURNE]
2,3002,[EAST MELBOURNE]
3,3003,[WEST MELBOURNE]
4,3004,"[MELBOURNE, ST KILDA ROAD CENTRAL, ST KILDA RO..."
5,3006,"[SOUTH WHARF, SOUTHBANK]"
6,3008,[DOCKLANDS]
7,3010,[UNIVERSITY OF MELBOURNE]
8,3011,"[FOOTSCRAY, SEDDON, SEDDON WEST]"
9,3012,"[BROOKLYN, KINGSVILLE, MAIDSTONE, TOTTENHAM, W..."


(14527, 16)

In [5]:
# list the distinct property types present in the data
property_df['type'].unique()

array(['Apartment / Unit / Flat', 'Studio', 'Townhouse', 'House', 'Villa',
       'New House & Land', 'Penthouse', 'Terrace', 'Semi-Detached',
       'Acreage / Semi-Rural', 'Duplex', 'New Apartments / Off the Plan',
       'Carspace', 'Retirement', 'Rural', 'Farm'], dtype=object)

In [6]:
# Drop unusual property types, which are treated as outliers for this analysis.
excluded_types = ['Carspace', 'Retirement', 'Farm', 'Acreage / Semi-Rural', 'Rural', 'New House & Land']
property_df = property_df[~property_df['type'].isin(excluded_types)]

# Collapse the remaining fine-grained types into broader classes.
type_regroup = {
    'Villa': 'House',
    'Semi-Detached': 'House',
    'Duplex': 'House',
    'New Apartments / Off the Plan': 'Apartment / Unit / Flat / Penhouse',
    'Terrace': 'Apartment / Unit / Flat / Penhouse',
    'Apartment / Unit / Flat': 'Apartment / Unit / Flat / Penhouse',
    'Penthouse': 'Apartment / Unit / Flat / Penhouse',
}
property_df['type'] = property_df['type'].replace(type_regroup)
property_df = property_df.reset_index(drop=True)

In [7]:
# Encode placeholders and yes/no flags as numbers.
# A bare '-' anywhere in the frame becomes -1.
property_df = property_df.replace('-', -1)

# 'Yes' -> 1 and 'No' -> 0 for each amenity flag column.
for flag_col in ['furnitured', 'pool', 'gym']:
    property_df[flag_col] = property_df[flag_col].replace('Yes', 1).replace('No', 0)

In [8]:
# split the address and get the suburb
def extract_suburb(address):
    """Return the suburb token of an address ending in '<suburb> <STATE> <postcode>'.

    The address is split on any run of whitespace, so doubled or trailing
    spaces do not shift the token positions. The third token from the end is
    the suburb, unless that token is all digits, in which case the token
    before it is used instead.

    Only a single token is returned, so a multi-word suburb yields its last
    word. Returns '' when the address has too few tokens to contain a suburb.
    """
    tokens = address.split()
    if len(tokens) < 3:
        return ''
    if tokens[-3].isdigit():
        return tokens[-4] if len(tokens) >= 4 else ''
    return tokens[-3]
# derive an upper-case suburb name for every listing from its address
property_df['suburb'] = property_df["address"].apply(extract_suburb).str.upper()
# coerce the postcodes to a numeric dtype
property_df['postcode'] = pd.to_numeric(property_df['postcode'])

In [9]:
# keep only the columns used in the rest of the notebook
property_df = property_df[['address', 'rent_weekly', 'floor', 'suburb','postcode', 'type', 'furnitured', 'pool',
                           'gym', 'num_bed', 'num_bath', 'num_car_park', 'coordinates']].copy()
# convert coordinates from their string form (e.g. "[-37.81, 144.95]") to a list
# NOTE(review): eval executes whatever text the CSV cell holds; consider ast.literal_eval if the file is not fully trusted
property_df['coordinates'] = property_df['coordinates'].apply(eval)

In [10]:
# preview the selected columns
property_df.head(5)

Unnamed: 0,address,rent_weekly,floor,suburb,postcode,type,furnitured,pool,gym,num_bed,num_bath,num_car_park,coordinates
0,1414/218-228 A'Beckett Street Melbourne VIC 3000,400,14,MELBOURNE,3000,Apartment / Unit / Flat / Penhouse,0,0,0,1,1,0,"[-37.8102832, 144.9566691]"
1,11a/131 Lonsdale Sreet Melbourne VIC 3000,350,11,MELBOURNE,3000,Studio,1,0,0,1,1,0,"[-37.810779, 144.9685513]"
2,911/408 Lonsdale Street Melbourne VIC 3000,330,9,MELBOURNE,3000,Apartment / Unit / Flat / Penhouse,0,0,0,1,1,0,"[-37.8125979, 144.9604012]"
3,918/422 Collins St Melbourne VIC 3000,600,9,MELBOURNE,3000,Apartment / Unit / Flat / Penhouse,0,0,0,2,1,0,"[-37.8170971, 144.9601487]"
4,602/118 Franklin Street Melbourne VIC 3000,330,6,MELBOURNE,3000,Apartment / Unit / Flat / Penhouse,0,0,0,1,1,0,"[-37.8082052, 144.9589035]"


In [11]:
# calculate the similarity ratio
def similar(a, b):
    """Return the difflib SequenceMatcher ratio between two sequences."""
    return SequenceMatcher(None, a, b).ratio()

# find the most similar suburb
def most_similar(df, suburb):
    """Pick the candidate suburb that best matches `suburb`.

    Args:
        df: rows of the postcode table for one postcode. Each 'suburb' cell
            is either a list/tuple of suburb names or a single name.
        suburb: the extracted (possibly partial or misspelt) suburb name.

    Returns:
        The candidate name with the highest similarity ratio (the first one
        on ties), or `suburb` unchanged when `df` offers no candidates.
    """
    # Flatten every row's suburb list into individual names, so each name is
    # scored on its own rather than returning the first name unscored or
    # comparing `suburb` against a whole list.
    candidates = []
    for entry in df['suburb']:
        if isinstance(entry, str):
            candidates.append(entry)
        elif isinstance(entry, (list, tuple)):
            candidates.extend(entry)
    if not candidates:
        return suburb
    return max(candidates, key=lambda name: similar(suburb, name))

def correct_suburb(suburb_df, property_df):
    """Align each property's suburb with the suburbs registered for its postcode.

    Args:
        suburb_df: table with a 'postcode' column and a 'suburb' column whose
            cells hold the suburb names for that postcode.
        property_df: table with 'postcode' and 'suburb' columns; it is
            modified in place.

    Returns:
        `property_df`, with every suburb that is not listed under its
        postcode replaced by the result of most_similar(). Rows whose
        postcode is absent from `suburb_df` are left untouched.
    """
    for row in property_df.index:
        postcode = property_df.at[row, 'postcode']
        # postcode and its corresponding suburbs
        match_df = suburb_df[suburb_df['postcode'] == postcode]
        if match_df.empty:
            # unknown postcode: nothing to compare against, keep the suburb as is
            continue
        sub_lis = match_df['suburb'].iloc[0]
        current = property_df.at[row, 'suburb']
        if current not in sub_lis:
            property_df.at[row, 'suburb'] = most_similar(match_df, current)
    return property_df


In [12]:
# correct the suburb names in the property data
# (a suburb not listed under its postcode is replaced by a name from that postcode's list)
property_df = correct_suburb(postcode_match,property_df)  

In [13]:
# merge count_table onto property_df by suburb
# the left join keeps every property; fillna(0) then zero-fills every remaining NaN in the merged frame
property_df = pd.merge(property_df, count_table, on='suburb', how='left').fillna(0)
# save the merged frame to CSV
property_df.to_csv('../data/curated/final_property.csv',index=False)

In [14]:
# show the merged frame
property_df

Unnamed: 0,address,rent_weekly,floor,suburb,postcode,type,furnitured,pool,gym,num_bed,...,COAST,GOLF COURSE,HELIPORT,AIRPORT,IRON ORE PROCESSOR,MILL/TIMBER OPERATIONS,BEACH,TRAIN STATION,TRAM STATION,BUS
0,1414/218-228 A'Beckett Street Melbourne VIC 3000,400,14,MELBOURNE,3000,Apartment / Unit / Flat / Penhouse,0,0,0,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,127.0,54.0
1,11a/131 Lonsdale Sreet Melbourne VIC 3000,350,11,MELBOURNE,3000,Studio,1,0,0,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,127.0,54.0
2,911/408 Lonsdale Street Melbourne VIC 3000,330,9,MELBOURNE,3000,Apartment / Unit / Flat / Penhouse,0,0,0,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,127.0,54.0
3,918/422 Collins St Melbourne VIC 3000,600,9,MELBOURNE,3000,Apartment / Unit / Flat / Penhouse,0,0,0,2,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,127.0,54.0
4,602/118 Franklin Street Melbourne VIC 3000,330,6,MELBOURNE,3000,Apartment / Unit / Flat / Penhouse,0,0,0,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,127.0,54.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14500,16B Sandy Mount Avenue Inverloch VIC 3996,550,1,INVERLOCH,3996,House,0,0,0,2,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
14501,28 Beachcomber Drive Inverloch VIC 3996,550,1,INVERLOCH,3996,House,1,0,0,4,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
14502,14 Inverloch Parade Inverloch VIC 3996,440,1,INVERLOCH,3996,House,0,0,0,2,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
14503,Inverloch VIC 3996,580,1,INVERLOCH,3996,House,0,0,0,3,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0


# calculate distances

In [15]:
# reload the merged property data saved earlier in this notebook
property_df = pd.read_csv('../data/curated/final_property.csv', low_memory=False)
# coordinates come back from CSV as text; turn them into lists again
# NOTE(review): eval executes the cell text; consider ast.literal_eval if the file is not fully trusted
property_df['coordinates'] = property_df['coordinates'].apply(eval)

In [119]:
# keep the places inside a +/-0.01 degree box around loc (the "1.5 km" pre-filter)
def around_1500m(loc, df):
    """Return the rows of `df` within 0.01 degrees of `loc` on both axes.

    `loc` is indexed as (latitude, longitude); `df` needs LONGITUDE and
    LATITUDE columns. Both bounds are inclusive.
    NOTE(review): the box is measured in degrees, not metres. 0.01 degree of
    latitude is roughly 1.1 km, so confirm it matches the intended 1.5 km.
    """
    lat, lon = loc[0], loc[1]
    in_lon = df['LONGITUDE'].between(lon - 0.01, lon + 0.01)
    in_lat = df['LATITUDE'].between(lat - 0.01, lat + 0.01)
    return df[in_lon & in_lat]

# keep the places inside a +/-0.02 degree box around loc (the "3 km" pre-filter)
def around_3km(loc, df):
    """Return the rows of `df` within 0.02 degrees of `loc` on both axes.

    `loc` is indexed as (latitude, longitude); `df` needs LONGITUDE and
    LATITUDE columns. Both bounds are inclusive.
    NOTE(review): the box is measured in degrees, not metres. 0.02 degree of
    latitude is roughly 2.2 km, so confirm it matches the intended 3 km.
    """
    lat, lon = loc[0], loc[1]
    in_lon = df['LONGITUDE'].between(lon - 0.02, lon + 0.02)
    in_lat = df['LATITUDE'].between(lat - 0.02, lat + 0.02)
    return df[in_lon & in_lat]

# keep the places inside a +/-0.07 degree box around loc (the "10 km" pre-filter)
def around_10km(loc, df):
    """Return the rows of `df` within 0.07 degrees of `loc` on both axes.

    `loc` is indexed as (latitude, longitude); `df` needs LONGITUDE and
    LATITUDE columns. Both bounds are inclusive.
    NOTE(review): the box is measured in degrees, not metres. 0.07 degree of
    latitude is roughly 7.8 km, so confirm it matches the intended 10 km.
    """
    lat, lon = loc[0], loc[1]
    in_lon = df['LONGITUDE'].between(lon - 0.07, lon + 0.07)
    in_lat = df['LATITUDE'].between(lat - 0.07, lat + 0.07)
    return df[in_lon & in_lat]

# compare distances and keep the two closest points
def cloest_2point(dist_dict, dist, loc, stop):
    """Add a candidate point and keep only the two nearest seen so far.

    Args:
        dist_dict: list of (dist, loc, stop) tuples kept so far (at most two);
            it is updated in place.
        dist: distance of the new candidate.
        loc: coordinate of the new candidate.
        stop: name of the new candidate.

    Returns:
        `dist_dict`, holding at most the two smallest-distance points in
        ascending order of distance.
    """
    dist_dict.append((dist, loc, stop))
    # Sort on the distance only. The sort is stable, so on a tie the point
    # that was recorded first stays ahead of the new one.
    dist_dict.sort(key=lambda point: point[0])
    # Drop everything past the two nearest; this always evicts the farthest
    # point rather than whichever happened to sit at index 0.
    del dist_dict[2:]
    return dist_dict

# compare distances and keep the single closest point
def cloest_point(dist_dict, dist, loc, stop):
    """Record (dist, loc, stop) in `dist_dict` if it beats the stored point.

    An empty list simply receives the candidate. Otherwise the candidate
    replaces the first stored point only when its distance is strictly
    smaller. The list is updated in place and returned.
    """
    candidate = (dist, loc, stop)
    if not dist_dict:
        dist_dict.append(candidate)
    elif dist < dist_dict[0][0]:
        # strictly closer than the recorded point: swap it out
        dist_dict.pop(0)
        dist_dict.append(candidate)
    return dist_dict
            
# find the nearby train stations for one property
def distance_train(loc1):
    """Return the train stations kept by cloest_2point for a property.

    Args:
        loc1: property coordinate indexed as (latitude, longitude).

    Returns:
        List of (distance_in_metres, (lat, lon), station_name) tuples, as
        maintained by cloest_2point; empty when no station falls inside the
        around_10km bounding box.
    """
    # The station table is identical for every property, so parse the CSV
    # once and keep it on the function instead of re-reading it on every
    # call (this function is applied row by row). Re-running the cell that
    # defines the function discards the cached table.
    stations = getattr(distance_train, '_stations', None)
    if stations is None:
        GNR = pd.read_csv('../data/curated/GNR_cleaned.csv', low_memory = False)
        stations = GNR[GNR['FEATURE'] == 'TRAIN STATION']
        distance_train._stations = stations

    nearby = around_10km(loc1, stations)
    dist_lis = []
    for lat, lon, name in zip(nearby['LATITUDE'], nearby['LONGITUDE'], nearby['PLACE_NAME']):
        loc2 = (lat, lon)
        # straight-line distance between property and station
        dist = hs.haversine(loc1, loc2, unit=Unit.METERS)
        dist_lis = cloest_2point(dist_lis, dist, loc2, name)
    return dist_lis

# find the nearest bus stop for one property
def distance_bus(loc1):
    """Return the bus stop kept by cloest_point for a property.

    Args:
        loc1: property coordinate indexed as (latitude, longitude).

    Returns:
        List of (distance_in_metres, (lat, lon), stop_name) tuples, as
        maintained by cloest_point; empty when no bus stop falls inside the
        around_1500m bounding box.
    """
    # Parse the CSV once and keep the bus rows on the function instead of
    # re-reading the file for every property.
    stops = getattr(distance_bus, '_stops', None)
    if stops is None:
        GNR = pd.read_csv('../data/curated/GNR_cleaned.csv', low_memory = False)
        stops = GNR[GNR['FEATURE'] == 'BUS']
        distance_bus._stops = stops

    nearby = around_1500m(loc1, stops)
    dist_lis = []
    for lat, lon, name in zip(nearby['LATITUDE'], nearby['LONGITUDE'], nearby['PLACE_NAME']):
        loc2 = (lat, lon)
        # straight-line distance between property and stop
        dist = hs.haversine(loc1, loc2, unit=Unit.METERS)
        dist_lis = cloest_point(dist_lis, dist, loc2, name)
    return dist_lis

# find the nearest tram stop for one property
def distance_tram(loc1):
    """Return the tram stop kept by cloest_point for a property.

    Args:
        loc1: property coordinate indexed as (latitude, longitude).

    Returns:
        List of (distance_in_metres, (lat, lon), stop_name) tuples, as
        maintained by cloest_point; empty when no tram stop falls inside the
        around_3km bounding box.
    """
    # Parse the CSV once and keep the tram rows on the function instead of
    # re-reading the file for every property.
    stops = getattr(distance_tram, '_stops', None)
    if stops is None:
        GNR = pd.read_csv('../data/curated/GNR_cleaned.csv', low_memory = False)
        stops = GNR[GNR['FEATURE'] == 'TRAM STATION']
        distance_tram._stops = stops

    nearby = around_3km(loc1, stops)
    dist_lis = []
    for lat, lon, name in zip(nearby['LATITUDE'], nearby['LONGITUDE'], nearby['PLACE_NAME']):
        loc2 = (lat, lon)
        # straight-line distance between property and stop
        dist = hs.haversine(loc1, loc2, unit=Unit.METERS)
        dist_lis = cloest_point(dist_lis, dist, loc2, name)
    return dist_lis

# find the closest school, park, supermarket, etc. for one property
def distance_GNR(loc1):
    """Return the nearest place of every non-train feature type.

    Args:
        loc1: property coordinate indexed as (latitude, longitude).

    Returns:
        Dict keyed by 'cloest_<FEATURE>'. Each value is the list kept by
        cloest_point, holding (distance_in_metres, (lat, lon), place_name)
        tuples. Every feature type gets a key; the list is empty when no
        candidate place was available for it.
    """
    # Parse the CSV and split it by feature type once, then reuse the result
    # for every property instead of re-reading the file on each call.
    by_feature = getattr(distance_GNR, '_by_feature', None)
    if by_feature is None:
        GNR = pd.read_csv('../data/curated/GNR_cleaned.csv', low_memory = False)
        # train stations are handled separately
        GNR = GNR[GNR['FEATURE']!= 'TRAIN STATION']
        by_feature = [(feature_type, GNR[GNR['FEATURE'] == feature_type])
                      for feature_type in GNR['FEATURE'].unique()]
        distance_GNR._by_feature = by_feature

    dist_GNR = {}
    for feature_type, places in by_feature:
        key = 'cloest_' + feature_type
        # tram and bus stops are pre-filtered to a bounding box first
        if 'TRAM' in key:
            places = around_3km(loc1, places)
        elif 'BUS' in key:
            places = around_1500m(loc1, places)

        dist_lis = []
        for lat, lon, name in zip(places['LATITUDE'], places['LONGITUDE'], places['PLACE_NAME']):
            loc2 = (lat, lon)
            # straight-line distance between property and place
            dist = hs.haversine(loc1, loc2, unit=Unit.METERS)
            dist_lis = cloest_point(dist_lis, dist, loc2, name)
        # record the result even when it is empty, so every key is present
        dist_GNR[key] = dist_lis

    return dist_GNR

# unpack the closest-place distances into one column per feature
def extract_cloest(df):
    """Add a 'cloest_<FEATURE>' distance column for every non-train feature.

    Args:
        df: frame with a 'GNR' column whose cells are dicts mapping
            'cloest_<FEATURE>' to a list of (distance, loc, name) tuples.

    Returns:
        `df` (modified in place) with one float column per feature type found
        in the notebook-level GNR table. A cell is NaN when the row has no
        entry, or an empty entry, for that feature.
    """
    # column names come from the notebook-level GNR table
    feature_lis = ['cloest_' + s for s in list(GNR['FEATURE'].unique())]
    # train stations are not part of the 'GNR' results
    feature_lis = [x for x in feature_lis if ('TRAIN STATION' not in x)]
    for feature in feature_lis:
        distances = []
        for nearest in df['GNR']:
            hits = nearest.get(feature)
            # first tuple, first field = distance of the recorded place
            distances.append(hits[0][0] if hits else np.nan)
        df[feature] = pd.Series(distances, index=df.index, dtype=float)
    return df

In [18]:
# time the train-station search over all properties
start=datetime.now()
# find the closest train stations
property_df['cloest_train_station'] = property_df["coordinates"].apply(distance_train)
print(datetime.now()-start)

KeyboardInterrupt: 

In [None]:
# time the nearest-place search over all properties
start=datetime.now()
# find the closest place of every non-train feature type
property_df['GNR'] = property_df["coordinates"].apply(distance_GNR)
print(datetime.now()-start)

In [93]:
# spread the per-row 'GNR' results into one distance column per feature
property_df = extract_cloest(property_df)
# the raw per-row results are no longer needed once unpacked
property_df = property_df.drop(columns=['GNR'])

In [None]:
# show the frame with the new distance columns
property_df

In [None]:
# save the frame (without the index) for the routing step below
property_df.to_csv('../data/curated/property_cleaned.csv',index=False)

## Find the closest point

In [None]:
# reload the saved frame; list-valued columns come back from CSV as text
property_df = pd.read_csv('../data/curated/property_cleaned.csv', low_memory=False)
# NOTE(review): eval executes the cell text; consider ast.literal_eval if the file is not fully trusted
property_df['coordinates'] = property_df['coordinates'].apply(eval)
property_df['cloest_train_station'] = property_df['cloest_train_station'].apply(eval)

In [131]:
# route between two coordinates with openrouteservice
# callers in this notebook pass each coordinate as [latitude, longitude];
# both are swapped to (longitude, latitude) before the request is sent
def calculate_distance_between_coordinates(coordinate1, coordinate2, api_key):
    """Return (distance, duration) of the driving route between two points.

    Args:
        coordinate1: start point; index 0 and 1 are swapped for the request.
        coordinate2: end point, same layout as `coordinate1`.
        api_key: openrouteservice API key.

    Returns:
        Tuple of the 'distance' and 'duration' fields of the first segment
        of the first returned route.
    """
    ors_client = ors.Client(key = api_key)

    origin = (coordinate1[1], coordinate1[0])
    destination = (coordinate2[1], coordinate2[0])

    route = ors_client.directions(
        coordinates=[origin, destination],
        profile='driving-car',
        format='geojson',
    )

    # pause after every request (presumably to stay under the API rate limit)
    time.sleep(1.5)

    first_segment = route['features'][0]['properties']['segments'][0]
    return first_segment['distance'], first_segment['duration']

In [157]:
def find_driving(coor, lis, api_key):
    """Route from `coor` to the first two candidate points in `lis`.

    Args:
        coor: start coordinate, passed to calculate_distance_between_coordinates.
        lis: list of (dist, loc, name) tuples; only the first two are routed.
        api_key: routing API key.

    Returns:
        Dict with parallel lists under 'dist', 'name' and 'duration', one
        entry per routed candidate (empty lists when `lis` is empty).
    """
    result = {"dist": [], "name": [], "duration": []}

    for candidate in lis[:2]:
        dist, duration = calculate_distance_between_coordinates(coor, candidate[1], api_key)
        result['dist'].append(dist)
        result['name'].append(candidate[2])
        result['duration'].append(duration)

    return result

In [198]:
def transportation_time(df, api):
    """Attach the quickest-to-drive candidate train station to a property row.

    Args:
        df: one property row (a Series) with 'coordinates' and
            'cloest_train_station' entries; it is modified in place.
        api: routing API key handed to find_driving.

    Returns:
        `df` with 'cloest_TRAIN_STATION', 'cloest_TRAIN_STATION_duration' and
        'cloest_TRAIN_STATION_distance' set from the candidate with the
        shortest driving duration. All three stay NaN when the row has no
        candidate station.
    """
    df['cloest_TRAIN_STATION'] = np.nan
    df['cloest_TRAIN_STATION_duration'] = np.nan
    df['cloest_TRAIN_STATION_distance'] = np.nan
    result = find_driving(df['coordinates'],df['cloest_train_station'],api)

    durations = result['duration']
    if not durations:
        # nothing was routed: keep the NaN placeholders
        return df

    # index of the shortest duration; works for one or two candidates and
    # the first candidate wins a tie
    idx = min(range(len(durations)), key=durations.__getitem__)

    df['cloest_TRAIN_STATION'] = result['name'][idx]
    df['cloest_TRAIN_STATION_duration'] = result['duration'][idx]
    df['cloest_TRAIN_STATION_distance'] = result['dist'][idx]
    return df

In [181]:
def find_shortest(lis, df):
    """Fill the train-station fields of a row from straight-line distances.

    Used when no driving route is available, so the duration is left as NaN.

    Args:
        lis: list of (dist, loc, name) candidate tuples; may be empty.
        df: the row (a Series) to fill; it is modified in place.

    Returns:
        `df` with 'cloest_TRAIN_STATION' and 'cloest_TRAIN_STATION_distance'
        taken from the smallest-distance candidate (the first one on ties),
        and 'cloest_TRAIN_STATION_duration' set to NaN. Name and distance
        are NaN too when `lis` is empty.
    """
    name = np.nan
    nearest_dist = np.nan
    if len(lis) > 0:
        nearest = min(lis, key=lambda point: point[0])
        nearest_dist, name = nearest[0], nearest[2]

    df['cloest_TRAIN_STATION'] = name
    df['cloest_TRAIN_STATION_duration'] = np.nan
    df['cloest_TRAIN_STATION_distance'] = nearest_dist
    return df
    

In [199]:
# Route every property to its candidate train stations, one request at a
# time, moving to the next entry of api_keys whenever a key runs out of quota.
rows = []            # processed rows, turned into a frame once at the end
i = 0                # position of the property being processed
key = 0              # position of the API key currently in use
exhausted_keys = 0   # consecutive keys rejected for quota
while i < property_df.shape[0]:
    df_i = property_df.iloc[i].copy()
    try:
        rows.append(transportation_time(df_i, api_keys[key]))
        exhausted_keys = 0
    except ors.exceptions.ApiError as err:
        print(err)
        if 'Quota exceeded' in str(err):
            exhausted_keys += 1
            if exhausted_keys >= len(api_keys):
                # every key was rejected in a row: stop instead of spinning forever
                print("All API keys are out of quota; stopped at row ", i)
                break
            key = (key + 1) % len(api_keys)
            print("Update Key ", key)
            # retry the same row with the next key instead of dropping it
            continue
        elif 'Could not find routable point within a radius of 350.0 meters of specified coordinate' in str(err):
            # no drivable route: fall back to the straight-line nearest station
            rows.append(find_shortest(df_i['cloest_train_station'], df_i))
        else:
            # any other API error: the row is left out of df (error printed above)
            print("Skipped row ", i)
    i+=1
    if i % 1000 == 0:
        print("DONE ROW ",i)

# build the result frame in one go rather than concatenating row by row
df = pd.DataFrame(rows).reset_index(drop=True)

In [None]:
# NOTE(review): this writes property_df, while the routing loop above accumulates its results in `df` - confirm which frame should be saved
property_df.to_csv("../data/curated/property_train.csv")

## Not really needed (exploratory cells)

In [None]:
# Try the Google Maps Directions API on the first property only.
try:
    properity_coor = property_df['coordinates'][0]
    closest_train_station = property_df['cloest_train_station'][0][0][1]
    # read the key from the environment so that it is not stored in the notebook
    gmaps = googlemaps.Client(key=os.environ['GOOGLE_MAPS_API_KEY'])
    now = datetime.now()
    directions_result = gmaps.directions(properity_coor, closest_train_station, mode='driving', departure_time=now)
    print(directions_result)
except googlemaps.exceptions.ApiError as err:
    print(err)

In [None]:

# read the key from the environment so that it is not stored in the notebook
gmaps = googlemaps.Client(key=os.environ['GOOGLE_MAPS_API_KEY'])
now = datetime.now()
# look up, for every property, the coordinates of its recorded nearest stops
for i in range(property_df.shape[0]):
    properity_coor = property_df['coordinates'][i]
    closest_train_station = property_df['cloest_train_station'][i][0][1]
    closest_tram_stop = property_df['cloest_tram_stop'][i][0][1]
    closest_bus_stop = property_df['cloest_bus_stop'][i][0][1]

# NOTE(review): the cell used to end with `gmaps.directions()` called without an
# origin or destination, which cannot succeed; no request is issued here until
# the intended origin/destination pair is decided.

In [None]:
# google map code below:

In [None]:
# find the driving directions between each property and its closest train station
final_direction_result = []
# read the key from the environment so that it is not stored in the notebook
gmaps = googlemaps.Client(key=os.environ['GOOGLE_MAPS_API_KEY'])
now = datetime.now()
for i in range(property_df.shape[0]):
    properity_coor = property_df['coordinates'][i]
    if len(property_df['cloest_train_station'][i]) < 1:
        # -1 marks a property that has no candidate station
        final_direction_result.append(-1)
    else:
        closest_train_station = property_df['cloest_train_station'][i][0][1]
        directions_result = gmaps.directions(properity_coor, closest_train_station, mode='driving', departure_time=now)
        final_direction_result.append(directions_result)
final_direction_result

In [None]:
# request driving directions for the first property and show the raw response
properity_coor = property_df['coordinates'][0]
closest_train_station = property_df['cloest_train_station'][0][0][1]
# read the key from the environment so that it is not stored in the notebook
gmaps = googlemaps.Client(key=os.environ['GOOGLE_MAPS_API_KEY'])
now = datetime.now()
directions_result = gmaps.directions(properity_coor, closest_train_station, mode='driving', departure_time=now)
directions_result

In [None]:
# show the coordinate of the first property
properity_coor = property_df['coordinates'][0]
properity_coor

In [None]:
# read the train station and rental data
# Use the same relative location the rest of this notebook reads from. The
# earlier os.getcwd().replace("notebooks", "") form only produced a valid
# path when the working directory ended in "notebooks" and the word appeared
# nowhere else in it.
path = "../data/curated/"
train = pd.read_csv(path+'train_station.csv', low_memory = False)
property_df = pd.read_csv(path+'cleaned_rent.csv', low_memory = False)
# pair latitude and longitude into one coordinate tuple per station
train["coordinates"] = list(zip(train.LATITUDE, train.LONGITUDE))

In [None]:
# preview the train station table
train.head(5)

In [None]:
# combine latitude and longitude into coordinates and keep the first 10 properties as a sample
property_df["coordinates"] = list(zip(property_df.latitude, property_df.longitude))
property_df = property_df.iloc[:10]
property_df.head(5)

In [None]:

# compare distances and keep the three closest points
# NOTE: this redefines cloest_point, replacing the single-nearest version
# defined earlier in this notebook for every cell that runs afterwards
def cloest_point(dist_dict, dist, loc, stop):
    """Add a candidate point and keep only the three nearest seen so far.

    Args:
        dist_dict: list of (dist, loc, stop) tuples kept so far (at most
            three); it is updated in place.
        dist: distance of the new candidate.
        loc: coordinate of the new candidate.
        stop: name of the new candidate.

    Returns:
        `dist_dict`, holding at most the three smallest-distance points in
        ascending order of distance.
    """
    dist_dict.append((dist, loc, stop))
    # Sort on the distance only. The sort is stable, so on a tie the point
    # that was recorded first stays ahead of the new one.
    dist_dict.sort(key=lambda point: point[0])
    # Always evict the farthest points, never a closer one that merely sits
    # at the front of the list.
    del dist_dict[3:]
    return dist_dict
            
# find the closest train stations for one property
def distance(loc1):
    """Return the train stations kept by cloest_point for a property.

    Args:
        loc1: property coordinate indexed as (latitude, longitude).

    Returns:
        List of (distance_in_metres, (lat, lon), stop_name) tuples, as
        maintained by cloest_point.
    """
    # Parse the station CSV once and keep it on the function instead of
    # re-reading the file for every property. `path` is the notebook-level
    # data folder prefix.
    train = getattr(distance, '_train', None)
    if train is None:
        train = pd.read_csv(path+'train_station.csv', low_memory = False)
        distance._train = train

    dist_lis = []
    for lat, lon, name in zip(train['LATITUDE'], train['LONGITUDE'], train['STOP_NAME']):
        loc2 = (lat, lon)
        # straight-line distance between property and station
        dist = hs.haversine(loc1, loc2, unit=Unit.METERS)
        dist_lis = cloest_point(dist_lis, dist, loc2, name)
    return dist_lis

In [None]:
# record the candidate train stations for each sampled property
property_df["train_station"]= property_df["coordinates"].apply(distance)  # calculate distance for each property row

In [None]:
# preview the sample with its new 'train_station' column
property_df.head(5)

In [None]:
# inspect the candidate stations recorded for the first property
property_df["train_station"][0]

In [None]:
# route between two coordinates with openrouteservice
# coordinates are passed as [longitude, latitude]
# NOTE: this redefines calculate_distance_between_coordinates; the version
# defined earlier in this notebook swaps each coordinate's two entries before
# the request, while this one sends them as given
def calculate_distance_between_coordinates(coordinate1, coordinate2, api_key=None):
    """Return (distance, duration) of the driving route between two points.

    Args:
        coordinate1: start point as [longitude, latitude].
        coordinate2: end point as [longitude, latitude].
        api_key: openrouteservice API key. When omitted it is read from the
            ORS_API_KEY environment variable, so no key is stored in the
            notebook.

    Returns:
        Tuple of the 'distance' and 'duration' fields of the first segment
        of the first returned route.

    Raises:
        KeyError: if `api_key` is omitted and ORS_API_KEY is not set.
    """
    if api_key is None:
        api_key = os.environ['ORS_API_KEY']

    # connect to open route service
    client = ors.Client(key = api_key)

    route = client.directions(
        coordinates=[coordinate1, coordinate2],
        profile='driving-car',
        format='geojson',
    )

    first_segment = route['features'][0]['properties']['segments'][0]
    dist = first_segment['distance']
    duration = first_segment['duration']
    return dist, duration

In [None]:
# For every sampled property, route to each candidate station and keep the
# one with the shortest driving duration.
nearest_train_list = []
nearest_distance_list = []
nearest_duration_list = []
for i in range(len(property_df)):
    # the routing helper takes [longitude, latitude]
    property_coordinate = [property_df["longitude"][i], property_df["latitude"][i]]

    # candidate (dist, (lat, lon), name) tuples for this property
    stations = property_df["train_station"][i]

    durations = []
    distances = []
    for station in stations:
        station_coord = station[1]
        station_position = [station_coord[1], station_coord[0]]
        dist_in_between, duration_in_between = calculate_distance_between_coordinates(property_coordinate, station_position)
        durations.append(duration_in_between)
        distances.append(dist_in_between)

    if not durations:
        # no candidate station: keep the result lists aligned with the rows
        nearest_train_list.append(np.nan)
        nearest_distance_list.append(np.nan)
        nearest_duration_list.append(np.nan)
        continue

    # shortest duration over all candidates (the first one wins a tie)
    nearest_point_index = min(range(len(durations)), key=durations.__getitem__)
    nearest_train = stations[nearest_point_index]
    print(nearest_train)
    nearest_train_list.append(nearest_train[2])
    nearest_distance_list.append(distances[nearest_point_index])
    nearest_duration_list.append(durations[nearest_point_index])

In [None]:
# attach the routing results to the sampled properties
property_df["nearest_train"] = nearest_train_list
property_df["nearest_distance(m)"] = nearest_distance_list
property_df["nearest_duration(s)"] = nearest_duration_list
# keep only the address, coordinates and station columns
property_df = property_df[['address','coordinates', 'train_station', 'nearest_train','nearest_distance(m)', 'nearest_duration(s)']]

# save the result (the index is written as the first CSV column)
property_df.to_csv("../data/curated/dist_property_train.csv")

In [None]:
# show the final sample frame
property_df