In [52]:
import ast
import json
import os
import time
from difflib import SequenceMatcher

import folium
import haversine as hs
import numpy as np
import openrouteservice as ors
import pandas as pd
import seaborn as sns
from haversine import Unit
from IPython.display import display
from pandas.io.json import json_normalize

## Get API Keys

In [56]:
# Load the API keys, one per line. Blank or whitespace-only lines are skipped so
# that an empty string is never passed on as a key.
with open('../data/raw/APIkeys.txt') as key_file:
    api_keys = [line.strip() for line in key_file if line.strip()]

## Merge Dataset

In [3]:
# load the curated tables written by the data-cleaning step
external, property_df, rental, GNR = (
    pd.read_csv(csv_path, low_memory = False)
    for csv_path in (
        '../data/curated/external.csv',
        '../data/curated/cleaned_property_data.csv',
        '../data/curated/rental_median.csv',
        '../data/curated/GNR.csv',
    )
)

# load the postcode -> suburb names lookup and turn it into a two-column frame
with open('../data/raw/postcode_match_suburb.json') as json_data:
    data = json.load(json_data)
postcode_match = pd.DataFrame.from_dict({'postcode':data.keys(), 'suburb':data.values()})
# JSON object keys are strings; make the postcodes numeric
postcode_match['postcode'] = pd.to_numeric(postcode_match['postcode'])

In [4]:
# preview the first ten rows of every loaded table
for frame in (external, property_df, rental, GNR, postcode_match):
    display(frame.head(10))
# (rows, columns) of the property listings
property_df.shape

Unnamed: 0,postcode,locality,state,LGA,SA3_NAME_2016,SA2_Code,SA2_Name,2001_population,2002_population,2003_population,...,Percent_total_Jun_2019,Percent_total_Sep_2019,Percent_total_Dec_2019,Percent_total_Mar_2020,Percent_total_Jun_2020,Percent_total_Sep_2020,Percent_total_Dec_2020,Percent_total_Mar_2021,Percent_total_Jun_2021,Percent_total_Sep_2021
0,3000,MELBOURNE,VIC,Melbourne,Melbourne City,206041122.0,Melbourne,,,,...,0.016,0.012,0.014,0.011,0.011,0.012,0.016,0.016,0.026,0.028
1,3001,MELBOURNE,VIC,Moonee Valley,Melbourne City,206041122.0,Melbourne,,,,...,0.013,0.021,0.012,0.01,0.023,0.022,0.018,0.013,0.031,0.033
2,3002,EAST MELBOURNE,VIC,Yarra,Melbourne City,206041119.0,East Melbourne,3731.0,3859.0,4243.0,...,0.006,0.007,0.009,0.008,0.008,0.004,0.008,0.007,0.019,0.014
3,3003,WEST MELBOURNE,VIC,Melbourne,Melbourne City,206041127.0,West Melbourne,0.0,0.0,0.0,...,0.016,0.012,0.014,0.011,0.011,0.012,0.016,0.016,0.026,0.028
4,3004,MELBOURNE,VIC,Yarra,Melbourne City,206041126.0,Southbank,,,,...,0.006,0.007,0.009,0.008,0.008,0.004,0.008,0.007,0.019,0.014
5,3004,ST KILDA ROAD CENTRAL,VIC,Yarra,Melbourne City,206041125.0,South Yarra - West,5317.0,5233.0,5176.0,...,0.006,0.007,0.009,0.008,0.008,0.004,0.008,0.007,0.019,0.014
6,3004,ST KILDA ROAD MELBOURNE,VIC,Yarra,Melbourne City,206041125.0,South Yarra - West,5317.0,5233.0,5176.0,...,0.006,0.007,0.009,0.008,0.008,0.004,0.008,0.007,0.019,0.014
7,3005,WORLD TRADE CENTRE,VIC,Melbourne,Melbourne City,206041118.0,Docklands,154.0,926.0,1913.0,...,0.016,0.012,0.014,0.011,0.011,0.012,0.016,0.016,0.026,0.028
8,3006,SOUTH WHARF,VIC,Port Phillip,Melbourne City,206041126.0,Southbank,,,,...,0.007,0.008,0.01,0.006,0.009,0.008,0.006,0.009,0.01,0.011
9,3006,SOUTHBANK,VIC,Port Phillip,Port Phillip,206051132.0,South Melbourne,,,,...,0.007,0.008,0.01,0.006,0.009,0.008,0.006,0.009,0.01,0.011


Unnamed: 0,index,address,rent,features,type,furnitured,pool,gym,coordinates,desc,postcode,floor,num_bed,num_bath,num_car_park,rent_weekly
0,0,1414/218-228 A'Beckett Street Melbourne VIC 3000,$400 Per Week,1 Bed1 Bath− Parking,Apartment / Unit / Flat,No,No,No,"[-37.8102832, 144.9566691]",South Melbourne Leasing,3000,14,1,1,0,400
1,1,11a/131 Lonsdale Sreet Melbourne VIC 3000,$350 per week,1 Bed1 Bath− Parking,Studio,Yes,No,No,"[-37.810779, 144.9685513]",Wimpie Santoso,3000,11,1,1,0,350
2,2,911/408 Lonsdale Street Melbourne VIC 3000,$330 per week,1 Bed1 Bath− Parking,Apartment / Unit / Flat,No,No,No,"[-37.8125979, 144.9604012]",Ender Gok,3000,9,1,1,0,330
3,3,918/422 Collins St Melbourne VIC 3000,$600 Per week fully furnished,2 Beds1 Bath− Parking,Apartment / Unit / Flat,No,No,No,"[-37.8170971, 144.9601487]",Precinct Leasing,3000,9,2,1,0,600
4,4,602/118 Franklin Street Melbourne VIC 3000,$330,1 Bed1 Bath− Parking,Apartment / Unit / Flat,No,No,No,"[-37.8082052, 144.9589035]",Manuel Flores,3000,6,1,1,0,330
5,5,1112/333-351 Exhibition Street Melbourne VIC 3000,$600 per week,2 Beds2 Baths− Parking,Apartment / Unit / Flat,Yes,No,No,"[-37.80789559999999, 144.9682873]",Mark Faranda,3000,11,2,2,0,600
6,6,3002/288 Spencer St Melbourne VIC 3000,$510,2 Beds1 Bath1 Parking,Apartment / Unit / Flat,No,Yes,Yes,"[-37.813775, 144.9520948]",Leasing Team,3000,30,2,1,1,510
7,7,4/180 Little Collins Street Melbourne VIC 3000,$500 per week,1 Bed1 Bath− Parking,Apartment / Unit / Flat,Yes,No,No,"[-37.8138601, 144.9679067]",Trish Ha,3000,4,1,1,0,500
8,8,1605/565 Flinders Street Melbourne VIC 3000,$500 per week,2 Beds2 Baths1 Parking,Apartment / Unit / Flat,No,No,No,"[-37.8210586, 144.9559072]",Justine Muscat,3000,16,2,2,1,500
9,9,612/408 Lonsdale Street Melbourne VIC 3000,$350,1 Bed1 Bath− Parking,Apartment / Unit / Flat,Yes,No,No,"[-37.8125979, 144.9604012]",Manuel Flores,3000,6,1,1,0,350


Unnamed: 0,Suburb,Rental_total_Mar_2000_1b_flat,Rental_total_Jun_2000_1b_flat,Rental_total_Sep_2000_1b_flat,Rental_total_Dec_2000_1b_flat,Rental_total_Mar_2001_1b_flat,Rental_total_Jun_2001_1b_flat,Rental_total_Sep_2001_1b_flat,Rental_total_Dec_2001_1b_flat,Rental_total_Mar_2002_1b_flat,...,Rental_total_Mar_2019,Rental_total_Jun_2019,Rental_total_Sep_2019,Rental_total_Dec_2019,Rental_total_Mar_2020,Rental_total_Jun_2020,Rental_total_Sep_2020,Rental_total_Dec_2020,Rental_total_Mar_2021,Rental_total_Jun_2021
0,Armadale,150,150,155,160,160,160,165,165,165,...,480,480,480,485,500,500,500,495,450,440
1,Carlton North,150,155,150,150,160,160,160,160,165,...,580,577,580,580,585,590,590,590,580,580
2,Docklands,-,-,-,-,-,-,-,-,265,...,570,580,575,580,570,550,500,460,420,400
3,East Melbourne,180,180,188,185,190,195,200,210,210,...,495,500,520,550,550,550,515,495,450,450
4,East St Kilda,140,145,145,150,150,150,152,155,160,...,420,420,425,425,425,425,420,410,395,385
5,Elwood,150,155,160,160,170,170,175,175,180,...,450,450,450,450,450,450,450,440,425,425
6,Fitzroy,140,140,150,150,150,150,160,165,173,...,585,595,595,590,572,570,565,550,550,540
7,Port Melbourne,280,280,280,275,270,275,280,290,290,...,610,600,620,620,630,600,590,590,575,590
8,South Melbourne,170,175,190,225,230,240,240,240,250,...,530,540,550,550,549,530,510,490,450,430
9,South Yarra,170,175,175,180,180,185,190,190,190,...,475,480,490,495,485,475,450,435,420,400


Unnamed: 0,PLACE_NAME,FEATURE,LONGITUDE,LATITUDE,geometry
0,MOUNT CLEAR SECONDARY COLLEGE,SECONDARY SCHOOL,143.8766,-37.607,POINT (2400824.055420277 2431949.3066944666)
1,MOUNT CLEAR TECHNICAL ANNEXE,SECONDARY SCHOOL,143.8766,-37.607,POINT (2400824.055420277 2431949.3066944666)
2,MOUNT CLEAR COLLEGE,SECONDARY SCHOOL,143.8766,-37.607,POINT (2400824.055420277 2431949.3066944666)
3,VIEWBANK COLLEGE,SECONDARY SCHOOL,145.0865,-37.741,POINT (2507631.5373240355 2417744.9461323526)
4,Yarram Secondary College - Devon North Campus,SECONDARY SCHOOL,146.6478,-38.519,POINT (2643730.207553347 2330161.818601803)
5,Bass Coast College - San Remo Campus,SECONDARY SCHOOL,145.3905,-38.532,POINT (2534061.1052171467 2329809.2720654747)
6,SANDRINGHAM SECONDARY COLLEGE BEAUMARIS CAMPUS,SECONDARY SCHOOL,145.0312,-37.978,POINT (2502746.435917485 2391445.676364539)
7,BEAUMARIS HIGH SCHOOL,SECONDARY SCHOOL,145.0312,-37.978,POINT (2502746.435917485 2391445.676364539)
8,SANDRINGHAM SECONDARY COLLEGE HIGHETT CAMPUS,SECONDARY SCHOOL,145.0212,-37.955,POINT (2501868.406406302 2393912.2221767586)
9,HIGHETT HIGH SCHOOL,SECONDARY SCHOOL,145.0212,-37.955,POINT (2501868.406406302 2393912.2221767586)


Unnamed: 0,postcode,suburb
0,3000,[MELBOURNE]
1,3001,[MELBOURNE]
2,3002,[EAST MELBOURNE]
3,3003,[WEST MELBOURNE]
4,3004,"[MELBOURNE, ST KILDA ROAD CENTRAL, ST KILDA RO..."
5,3006,"[SOUTH WHARF, SOUTHBANK]"
6,3008,[DOCKLANDS]
7,3010,[UNIVERSITY OF MELBOURNE]
8,3011,"[FOOTSCRAY, SEDDON, SEDDON WEST]"
9,3012,"[BROOKLYN, KINGSVILLE, MAIDSTONE, TOTTENHAM, W..."


(14527, 16)

In [5]:
# list the distinct property types present in the listings
property_df['type'].unique()

array(['Apartment / Unit / Flat', 'Studio', 'Townhouse', 'House', 'Villa',
       'New House & Land', 'Penthouse', 'Terrace', 'Semi-Detached',
       'Acreage / Semi-Rural', 'Duplex', 'New Apartments / Off the Plan',
       'Carspace', 'Retirement', 'Rural', 'Farm'], dtype=object)

In [6]:
# unusual listing types, treated as outliers and removed
EXCLUDED_TYPES = ['Carspace', 'Retirement', 'Farm', 'Acreage / Semi-Rural', 'Rural', 'New House & Land']
# rare listing types folded into a broader category
TYPE_REMAP = {
    'Villa': 'House',
    'Semi-Detached': 'House',
    'Duplex': 'House',
    'New Apartments / Off the Plan': 'Apartment / Unit / Flat',
    'Terrace': 'Apartment / Unit / Flat',
}

# .copy() makes the filtered frame independent of the original, so the column
# assignment below does not write into a slice (SettingWithCopyWarning)
property_df = property_df[~property_df['type'].isin(EXCLUDED_TYPES)].copy()
# re-classify the property data
property_df['type'] = property_df['type'].replace(TYPE_REMAP)
# rows were dropped, so rebuild a contiguous 0..n-1 index
property_df = property_df.reset_index(drop=True)

In [7]:
# split the address and get the suburb
def extract_suburb(address):
    """Return the suburb token of an address such as '<street> <suburb> VIC <postcode>'.

    The third token from the end is returned, unless that token is all digits,
    in which case the fourth token from the end is used instead. Only a single
    token is returned, so a multi-word suburb ('South Yarra') yields its last
    word ('Yarra').

    Runs of whitespace are treated as one separator. An address with too few
    tokens returns an empty string instead of raising IndexError.
    """
    # split() without an argument never produces empty tokens, unlike split(" ")
    tokens = address.split()
    if len(tokens) < 3:
        return ""
    if tokens[-3].isdigit():
        return tokens[-4] if len(tokens) >= 4 else ""
    return tokens[-3]
# derive an upper-case suburb for every listing from its address
property_df['suburb'] = property_df["address"].apply(extract_suburb).str.upper()
# make sure the postcodes are numeric
property_df['postcode'] = pd.to_numeric(property_df['postcode'])

In [8]:
# select the features we need; .copy() so that the column assignment below
# works on an independent frame rather than a slice of the previous one
property_df = property_df[['address', 'rent_weekly', 'floor', 'suburb','postcode', 'type', 'furnitured', 'pool',
                           'gym', 'num_bed', 'num_bath', 'num_car_park', 'coordinates']].copy()
# convert coordinates from str to list
# ast.literal_eval only accepts Python literals; the previous eval() would have
# executed any expression found in the CSV text
property_df['coordinates'] = property_df['coordinates'].apply(ast.literal_eval)

In [9]:
# preview the first five rows
property_df.head(5)

Unnamed: 0,address,rent_weekly,floor,suburb,postcode,type,furnitured,pool,gym,num_bed,num_bath,num_car_park,coordinates
0,1414/218-228 A'Beckett Street Melbourne VIC 3000,400,14,MELBOURNE,3000,Apartment / Unit / Flat,No,No,No,1,1,0,"[-37.8102832, 144.9566691]"
1,11a/131 Lonsdale Sreet Melbourne VIC 3000,350,11,MELBOURNE,3000,Studio,Yes,No,No,1,1,0,"[-37.810779, 144.9685513]"
2,911/408 Lonsdale Street Melbourne VIC 3000,330,9,MELBOURNE,3000,Apartment / Unit / Flat,No,No,No,1,1,0,"[-37.8125979, 144.9604012]"
3,918/422 Collins St Melbourne VIC 3000,600,9,MELBOURNE,3000,Apartment / Unit / Flat,No,No,No,2,1,0,"[-37.8170971, 144.9601487]"
4,602/118 Franklin Street Melbourne VIC 3000,330,6,MELBOURNE,3000,Apartment / Unit / Flat,No,No,No,1,1,0,"[-37.8082052, 144.9589035]"


In [10]:
# calculate the similarity ratio between two sequences
def similar(a, b):
    """Return difflib.SequenceMatcher(None, a, b).ratio()."""
    return SequenceMatcher(None, a, b).ratio()


# find the most similar suburb
def most_similar(df, suburb):
    """Return the name in ``df['suburb']`` that is most similar to ``suburb``.

    Each entry of ``df['suburb']`` may be either a single name or a
    list/tuple/set of names; all names are pooled into one candidate list.
    Previously a one-row frame short-circuited to ``iloc[0][0]``, which returned
    the first name of a list without comparing anything, or the first character
    when the entry was a plain string.

    Returns:
        The candidate with the highest similarity ratio; the first one wins on
        ties. If there is only one candidate it is returned as is. If no
        candidate scores above zero, the first candidate is returned for a
        one-row frame and the string "None" otherwise, matching the fallbacks
        of the earlier implementation.
    """
    candidates = []
    for entry in df['suburb']:
        if isinstance(entry, (list, tuple, set)):
            candidates.extend(entry)
        else:
            candidates.append(entry)

    if not candidates:
        return "None"
    if len(candidates) == 1:
        return candidates[0]

    # keep the historical fallbacks for the "nothing is similar" case
    best_name = candidates[0] if df.shape[0] == 1 else "None"
    best_score = 0
    for name in candidates:
        score = similar(suburb, name)
        if score > best_score:
            best_score, best_name = score, name
    return best_name

def correct_suburb(suburb_df, property_df):
    """Replace suburb names that do not belong to the row's postcode.

    Note: a later cell defines another function with this same name (the rental
    version), which shadows this one once that cell has run.

    Args:
        suburb_df: frame with a 'postcode' column and a 'suburb' column holding
            the collection of suburb names for that postcode.
        property_df: frame with 'postcode' and 'suburb' columns; it is modified
            in place. Index labels are expected to be unique.

    Returns:
        The same ``property_df`` object. Rows whose suburb is already listed for
        their postcode are untouched; other rows get the name chosen by
        ``most_similar``. Rows whose postcode is missing from ``suburb_df`` are
        left unchanged (this used to raise IndexError).
    """
    # iterate over the real index labels so a non-default index also works
    for row in property_df.index:
        postcode = property_df.loc[row, 'postcode']
        # get the postcode and corresponding suburbs
        match_df = suburb_df[suburb_df['postcode'] == postcode]
        if match_df.empty:
            # nothing to validate against for this postcode
            continue
        sub_lis = match_df['suburb'].iloc[0]
        current = property_df.loc[row, 'suburb']
        # if the suburb does not match, replace it with the most similar one
        if current not in sub_lis:
            property_df.at[row, 'suburb'] = most_similar(match_df, current)
    return property_df


In [11]:
# correct the suburb names in property data
# (this must use the postcode-based correct_suburb; a later cell redefines that name)
property_df = correct_suburb(postcode_match,property_df)  

In [12]:
# (rows, columns) of the property frame at this point
property_df.shape

(14505, 13)

In [13]:
# rename the rental key column so it matches property_df
rental = rental.rename(columns={'Suburb': 'suburb'})
# upper-case the suburb names
rental['suburb'] = rental['suburb'].str.upper()
# let CBD represent MELBOURNE 3000
rental = rental.replace('CBD', 'MELBOURNE')

In [14]:
# inspect the normalised rental table
rental

Unnamed: 0,suburb,Rental_total_Mar_2000_1b_flat,Rental_total_Jun_2000_1b_flat,Rental_total_Sep_2000_1b_flat,Rental_total_Dec_2000_1b_flat,Rental_total_Mar_2001_1b_flat,Rental_total_Jun_2001_1b_flat,Rental_total_Sep_2001_1b_flat,Rental_total_Dec_2001_1b_flat,Rental_total_Mar_2002_1b_flat,...,Rental_total_Mar_2019,Rental_total_Jun_2019,Rental_total_Sep_2019,Rental_total_Dec_2019,Rental_total_Mar_2020,Rental_total_Jun_2020,Rental_total_Sep_2020,Rental_total_Dec_2020,Rental_total_Mar_2021,Rental_total_Jun_2021
0,ARMADALE,150,150,155,160,160,160,165,165,165,...,480,480,480,485,500,500,500,495,450,440
1,CARLTON NORTH,150,155,150,150,160,160,160,160,165,...,580,577,580,580,585,590,590,590,580,580
2,DOCKLANDS,-,-,-,-,-,-,-,-,265,...,570,580,575,580,570,550,500,460,420,400
3,EAST MELBOURNE,180,180,188,185,190,195,200,210,210,...,495,500,520,550,550,550,515,495,450,450
4,EAST ST KILDA,140,145,145,150,150,150,152,155,160,...,420,420,425,425,425,425,420,410,395,385
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
136,AVONDALE HEIGHTS,110,120,120,125,120,120,125,128,130,...,420,420,420,420,420,410,410,420,410,410
137,NEWPORT,100,100,100,100,110,110,110,110,110,...,480,480,495,495,500,495,495,500,495,490
138,SPOTSWOOD,100,100,100,100,110,110,110,110,110,...,480,480,495,495,500,495,495,500,495,490
139,ST ALBANS,95,95,99,100,100,100,105,110,105,...,350,350,355,360,360,360,360,355,350,350


In [15]:
# move a leading "EAST"/"WEST"/"NORTH"/"SOUTH" to the end of the suburb name in some cases
def change_word_position(suburb):
    """Return ``suburb`` with a leading compass word moved to the end.

    The name is split on single spaces. It is rewritten only when it has at
    least two words, the first word is EAST, WEST, NORTH or SOUTH, and the
    second word is not one of the listed names that keep the compass word in
    front. In every other case the input is returned unchanged.
    """
    directions = ("EAST", "WEST", "NORTH", "SOUTH")
    # names whose compass word stays in front
    keep_leading = ("MELBOURNE", 'GEELONG', 'BENDIGO', 'YEOBURN', 'WANGARATTA', 'WARBURTON', 'SALE',
                    'BAIRNSDALE', 'YARRA', 'FOOTSCRAY')
    words = suburb.split(" ")
    if len(words) < 2 or words[0] not in directions or words[1] in keep_leading:
        return suburb
    # remaining words first, compass word last
    return " ".join(words[1:] + [words[0]])
# apply the reordering to every rental suburb name
rental['suburb'] = rental['suburb'].apply(change_word_position)

In [16]:
# find the most similar suburb names for rental data
def correct_suburb(suburb_df, df):
    """Replace suburb names in ``df`` that are not known to ``suburb_df``.

    Note: this redefines the postcode-based ``correct_suburb`` from an earlier
    cell; after this cell runs, the name refers to this version.

    Args:
        suburb_df: frame whose 'suburb' column holds a collection of names per row.
        df: frame with a 'suburb' column; it is modified in place. Index labels
            are expected to be unique.

    Returns:
        The same ``df`` object. A name that appears anywhere in ``suburb_df`` is
        kept; any other name is replaced by the result of ``most_similar`` over
        all known names.
    """
    # Build the lookup once: it does not depend on the row being processed.
    known = {name for names in suburb_df['suburb'] for name in names}
    # sorted() gives most_similar a stable candidate order, so ties are broken
    # the same way on every run (set iteration order is not stable across runs)
    candidates = pd.DataFrame.from_dict({'suburb': sorted(known)})
    # iterate over the real index labels so a non-default index also works
    for row in df.index:
        current = df.loc[row, 'suburb']
        if current not in known:
            df.at[row, 'suburb'] = most_similar(candidates, current)
    return df
# align rental suburb names with the names listed in postcode_match
rental = correct_suburb(postcode_match,rental)

In [17]:
# merge rental data and property_df
# left join on 'suburb': every listing is kept, whether or not `rental` has its suburb
# NOTE(review): .fillna(np.nan) looks like a no-op here - confirm and remove
# NOTE(review): if `rental` holds the same suburb twice after name correction, the
# join duplicates listings - verify the row count after this cell
property_df = pd.merge(property_df, rental, on='suburb', how='left').fillna(np.nan)

In [18]:
# inspect the merged frame
property_df

Unnamed: 0,address,rent_weekly,floor,suburb,postcode,type,furnitured,pool,gym,num_bed,...,Rental_total_Mar_2019,Rental_total_Jun_2019,Rental_total_Sep_2019,Rental_total_Dec_2019,Rental_total_Mar_2020,Rental_total_Jun_2020,Rental_total_Sep_2020,Rental_total_Dec_2020,Rental_total_Mar_2021,Rental_total_Jun_2021
0,1414/218-228 A'Beckett Street Melbourne VIC 3000,400,14,MELBOURNE,3000,Apartment / Unit / Flat,No,No,No,1,...,500.0,500.0,510.0,510.0,510.0,495.0,460.0,420.0,380.0,370.0
1,11a/131 Lonsdale Sreet Melbourne VIC 3000,350,11,MELBOURNE,3000,Studio,Yes,No,No,1,...,500.0,500.0,510.0,510.0,510.0,495.0,460.0,420.0,380.0,370.0
2,911/408 Lonsdale Street Melbourne VIC 3000,330,9,MELBOURNE,3000,Apartment / Unit / Flat,No,No,No,1,...,500.0,500.0,510.0,510.0,510.0,495.0,460.0,420.0,380.0,370.0
3,918/422 Collins St Melbourne VIC 3000,600,9,MELBOURNE,3000,Apartment / Unit / Flat,No,No,No,2,...,500.0,500.0,510.0,510.0,510.0,495.0,460.0,420.0,380.0,370.0
4,602/118 Franklin Street Melbourne VIC 3000,330,6,MELBOURNE,3000,Apartment / Unit / Flat,No,No,No,1,...,500.0,500.0,510.0,510.0,510.0,495.0,460.0,420.0,380.0,370.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14500,16B Sandy Mount Avenue Inverloch VIC 3996,550,1,INVERLOCH,3996,House,No,No,No,2,...,,,,,,,,,,
14501,28 Beachcomber Drive Inverloch VIC 3996,550,1,INVERLOCH,3996,House,Yes,No,No,4,...,,,,,,,,,,
14502,14 Inverloch Parade Inverloch VIC 3996,440,1,INVERLOCH,3996,House,No,No,No,2,...,,,,,,,,,,
14503,Inverloch VIC 3996,580,1,INVERLOCH,3996,House,No,No,No,3,...,,,,,,,,,,


## Calculate distances

In [36]:
# keep a running record of the k closest points seen so far
def cloest_point(dist_dict, dist, loc, stop, k=2):
    """Insert a candidate point and keep only the ``k`` nearest records.

    The previous version removed whichever record the new distance happened to
    beat first, which could evict the closer of the two records and keep the
    farther one. For the sequence 5, 10, 3 it ended with [10, 3] instead of
    [3, 5].

    Args:
        dist_dict: list of (distance, location, name) tuples collected so far;
            it is modified in place.
        dist: distance of the new candidate.
        loc: location of the new candidate.
        stop: name of the new candidate.
        k: how many records to keep (default 2, the size the original kept).

    Returns:
        ``dist_dict`` holding at most ``k`` records, sorted by ascending distance.
    """
    dist_dict.append((dist, loc, stop))
    # the list never holds more than k + 1 items, so sorting it is cheap
    dist_dict.sort(key=lambda record: record[0])
    del dist_dict[k:]
    return dist_dict
            
# FEATURE value -> [(latitude, longitude, place_name), ...], filled on first use.
# GNR.csv used to be re-read on every call, i.e. once per property row for each
# of the three transport types. Call _GNR_POINTS.clear() to force a re-read.
_GNR_POINTS = {}


def _gnr_points(feature):
    """Return the (latitude, longitude, place_name) rows of GNR.csv for one FEATURE value."""
    if not _GNR_POINTS:
        gnr = pd.read_csv('../data/curated/GNR.csv', low_memory = False)
        # groupby keeps the file's row order inside each group
        for feature_name, rows in gnr.groupby('FEATURE'):
            _GNR_POINTS[feature_name] = list(zip(rows["LATITUDE"], rows["LONGITUDE"], rows["PLACE_NAME"]))
    return _GNR_POINTS.get(feature, [])


def _closest_features(loc1, feature):
    """Run every point of ``feature`` through cloest_point and return the records it kept."""
    dist_lis = []
    for latitude, longitude, place_name in _gnr_points(feature):
        loc2 = (latitude, longitude)
        # haversine distance between the property and the point, in metres
        dist = hs.haversine(loc1,loc2,unit=Unit.METERS)
        dist_lis = cloest_point(dist_lis,dist,loc2,place_name)
    return dist_lis


# find the closest train stations for one property
def distance_train(loc1):
    """Return the (distance, (lat, lon), name) records kept by cloest_point for 'TRAIN STATION' rows.

    ``loc1`` is the property's (latitude, longitude) pair.
    """
    return _closest_features(loc1, 'TRAIN STATION')


# find the closest bus stops for one property
def distance_bus(loc1):
    """Return the (distance, (lat, lon), name) records kept by cloest_point for 'BUS' rows.

    ``loc1`` is the property's (latitude, longitude) pair.
    """
    return _closest_features(loc1, 'BUS')


# find the closest tram stops for one property
def distance_tram(loc1):
    """Return the (distance, (lat, lon), name) records kept by cloest_point for 'TRAM STATION' rows.

    ``loc1`` is the property's (latitude, longitude) pair.
    """
    return _closest_features(loc1, 'TRAM STATION')

In [None]:
# find the closest train stations for every listing
property_df['cloest_train_station'] = property_df["coordinates"].apply(distance_train)

In [None]:
# find the closest tram stops for every listing
property_df['cloest_tram_stop'] = property_df["coordinates"].apply(distance_tram)

In [None]:
# find the closest bus stops for every listing
property_df['cloest_bus_stop'] = property_df["coordinates"].apply(distance_bus)

In [128]:
# small sample for trying out the distance functions
# .loc[:10] is label-based and includes label 10
test = property_df.loc[:10].copy()
test['cloest_train_station'] = test["coordinates"].apply(distance_train)
test['cloest_tram_stop'] = test["coordinates"].apply(distance_tram)
test['cloest_bus_stop'] = test["coordinates"].apply(distance_bus)

In [43]:
# coordinates of the sample rows, stored as [latitude, longitude] lists
test['coordinates']

0            [-37.8102832, 144.9566691]
1             [-37.810779, 144.9685513]
2            [-37.8125979, 144.9604012]
3            [-37.8170971, 144.9601487]
4            [-37.8082052, 144.9589035]
5     [-37.80789559999999, 144.9682873]
6             [-37.813775, 144.9520948]
7            [-37.8138601, 144.9679067]
8            [-37.8210586, 144.9559072]
9            [-37.8125979, 144.9604012]
10            [-37.813775, 144.9520948]
Name: coordinates, dtype: object

In [47]:
# row 0 -> first stored record -> element 2 of the (distance, location, name) tuple
test['cloest_train_station'][0][0][2]

'Flagstaff Railway Station (Melbourne City)'

In [137]:
# route between two coordinates by car and return (distance, duration)
# each coordinate is indexed as [0] = latitude, [1] = longitude
def calculate_distance_between_coordinates(coordinate1, coordinate2, api_key):
    """Request a 'driving-car' route from openrouteservice and return its distance and duration.

    Args:
        coordinate1: start point; element 0 and element 1 are swapped before the
            request so the pair is sent as (element 1, element 0).
        coordinate2: end point, handled the same way.
        api_key: key used to create the openrouteservice client.

    Returns:
        ``(distance, duration)`` taken from the first segment of the first
        feature in the response. Units are whatever the service reports
        (presumably metres and seconds - confirm against the API docs).
    """
    start = (coordinate1[1], coordinate1[0])
    end = (coordinate2[1], coordinate2[0])

    # a new client is created for every request
    route = ors.Client(key = api_key).directions(
        coordinates=[start, end],
        profile='driving-car',
        format='geojson',
    )

    # fixed pause after every request (presumably to respect the service's rate limit)
    time.sleep(1.5)

    segment = route['features'][0]['properties']['segments'][0]
    return segment['distance'], segment['duration']

In [138]:
def find_driving(coor, lis, api_key, limit=2):
    """Look up the driving distance and duration from ``coor`` to the first stops in ``lis``.

    Args:
        coor: origin coordinate, passed through to
            calculate_distance_between_coordinates.
        lis: records shaped like (straight_line_distance, location, name); only
            elements 1 and 2 of each record are used.
        api_key: key passed through to the routing call.
        limit: maximum number of records to route to (default 2, the number the
            original hard-coded).

    Returns:
        dict with parallel lists under 'dist', 'name' and 'duration', one entry
        per routed record. A list shorter than ``limit`` gives shorter results
        instead of raising IndexError.
    """
    result = {"dist": [], "name": [], "duration": []}
    for record in lis[:limit]:
        dist, duration = calculate_distance_between_coordinates(coor, record[1], api_key)
        result['dist'].append(dist)
        result['name'].append(record[2])
        result['duration'].append(duration)
    return result

In [None]:
def transportation_time(df, api1, api2=None, api3=None):
    """Add driving distance/duration lookups to train, tram and bus stops for every row.

    The previous body passed an undefined name ``api`` to find_driving, so it
    raised NameError on the first row. Each transport type now uses its own key.

    Args:
        df: frame with 'coordinates', 'cloest_train_station', 'cloest_tram_stop'
            and 'cloest_bus_stop' columns; it is modified in place.
        api1: key used for the train lookups.
        api2: key used for the tram lookups; defaults to ``api1``.
        api3: key used for the bus lookups; defaults to ``api1``.

    Returns:
        The same ``df`` with 'driving_to_train', 'driving_to_tram' and
        'driving_to_bus' columns holding the dicts returned by find_driving.
    """
    # allow the single-key call style: transportation_time(df, key)
    api2 = api1 if api2 is None else api2
    api3 = api1 if api3 is None else api3

    train = []
    tram = []
    bus = []
    for row in df.index.to_list():
        coor = df.loc[row, 'coordinates']
        train.append(find_driving(coor, df.loc[row, 'cloest_train_station'], api1))
        tram.append(find_driving(coor, df.loc[row, 'cloest_tram_stop'], api2))
        bus.append(find_driving(coor, df.loc[row, 'cloest_bus_stop'], api3))
    df['driving_to_train'] = train
    df['driving_to_tram'] = tram
    df['driving_to_bus'] = bus
    return df

In [None]:
# Route the listings in small batches because of the request limit on each key.
CHUNK_SIZE = 2000      # rows handled with one set of keys
KEYS_PER_CHUNK = 3     # one key each for the train, tram and bus lookups

# The batch count follows the real size of property_df. The old fixed bound of
# 30000 ran empty batches past the end of the data and walked api_keys out of range.
n_chunks = (len(property_df) + CHUNK_SIZE - 1) // CHUNK_SIZE
if len(api_keys) < n_chunks * KEYS_PER_CHUNK:
    # fail before any request is sent rather than part-way through the run
    raise ValueError(
        f"{n_chunks * KEYS_PER_CHUNK} API keys are needed for {n_chunks} batches, "
        f"but only {len(api_keys)} were loaded"
    )

chunks = []
for chunk_no, i in enumerate(range(0, len(property_df), CHUNK_SIZE)):
    key = chunk_no * KEYS_PER_CHUNK
    # copy only the slice being processed; transportation_time adds columns to it
    df_i = property_df.iloc[i:i+CHUNK_SIZE].copy()
    df_i = transportation_time(df_i,api_keys[key],api_keys[key+1],api_keys[key+2])
    chunks.append(df_i)
    # progress: first row position of the batch that just finished
    print(i)
# concatenate once at the end instead of growing the frame inside the loop
df = pd.concat(chunks, ignore_index=True) if chunks else pd.DataFrame()

In [143]:
# Try the routing step on the small sample. transportation_time takes one key
# per transport type, so the first key is passed for all three.
tesy = transportation_time(test, api_keys[0], api_keys[0], api_keys[0])
tesy

Unnamed: 0,address,rent_weekly,floor,suburb,postcode,type,furnitured,pool,gym,num_bed,...,Rental_total_Sep_2020,Rental_total_Dec_2020,Rental_total_Mar_2021,Rental_total_Jun_2021,cloest_train_station,cloest_tram_stop,cloest_bus_stop,driving_to_train,driving_to_tram,driving_to_bus
0,1414/218-228 A'Beckett Street Melbourne VIC 3000,400,14,MELBOURNE,3000,Apartment / Unit / Flat,No,No,No,1,...,460.0,420.0,380.0,370.0,"[(208.7885192155693, (-37.811981, 144.955654),...","[(122.48588269536778, (-37.809743, 144.955454)...","[(145.52607550888214, (-37.80933, 144.955534),...","{'dist': [282.0, 622.5], 'name': ['Flagstaff R...","{'dist': [196.0, 500.5], 'name': ['8-Franklin ...","{'dist': [489.3, 35.7], 'name': ['Queen Victor..."
1,11a/131 Lonsdale Sreet Melbourne VIC 3000,350,11,MELBOURNE,3000,Studio,Yes,No,No,1,...,460.0,420.0,380.0,370.0,"[(531.6128151500881, (-37.809939, 144.962594),...","[(195.49052985170448, (-37.812468, 144.969169)...","[(46.1280390868096, (-37.810444, 144.968861), ...","{'dist': [1130.9, 762.1], 'name': ['Melbourne ...","{'dist': [459.6, 763.3], 'name': ['7-Russell S...","{'dist': [655.3, 608.9], 'name': ['Exhibition ..."
2,911/408 Lonsdale Street Melbourne VIC 3000,330,9,MELBOURNE,3000,Apartment / Unit / Flat,No,No,No,1,...,460.0,420.0,380.0,370.0,"[(422.63001113194304, (-37.811981, 144.955654)...","[(205.39104815345, (-37.811105, 144.961778), 5...","[(21.831936026455327, (-37.812636, 144.960645)...","{'dist': [669.1, 515.3], 'name': ['Flagstaff R...","{'dist': [508.5, 585.6], 'name': ['5-Melbourne...","{'dist': [18.6, 238.9], 'name': ['Hardware Lan..."
3,918/422 Collins St Melbourne VIC 3000,600,9,MELBOURNE,3000,Apartment / Unit / Flat,No,No,No,2,...,460.0,420.0,380.0,370.0,"[(692.4760990676032, (-37.811981, 144.955654),...","[(84.91961925921264, (-37.817607, 144.959429),...","[(112.47418871191125, (-37.816648, 144.961296)...","{'dist': [1103.0, 2428.3], 'name': ['Flagstaff...","{'dist': [355.7, 483.1], 'name': ['3-William S...","{'dist': [152.5, 206.1], 'name': ['Collins St/..."
4,602/118 Franklin Street Melbourne VIC 3000,330,6,MELBOURNE,3000,Apartment / Unit / Flat,No,No,No,1,...,460.0,420.0,380.0,370.0,"[(507.70605016321656, (-37.811981, 144.955654)...","[(192.08204040095504, (-37.806561, 144.959574)...","[(167.36806435020796, (-37.809659, 144.95841),...","{'dist': [902.0, 533.6], 'name': ['Flagstaff R...","{'dist': [343.7, 485.4], 'name': ['7-Queen Vic...","{'dist': [427.9, 619.0], 'name': ['A'Beckett S..."
5,1112/333-351 Exhibition Street Melbourne VIC 3000,600,11,MELBOURNE,3000,Apartment / Unit / Flat,Yes,No,No,2,...,460.0,420.0,380.0,370.0,"[(549.3521437399352, (-37.809939, 144.962594),...","[(38.94556666050394, (-37.808185, 144.968537),...","[(120.6828396590071, (-37.808841, 144.968962),...","{'dist': [1120.8, 983.1], 'name': ['Melbourne ...","{'dist': [400.2, 405.9], 'name': ['8-Exhibitio...","{'dist': [543.1, 196.2], 'name': ['Little Lons..."
6,3002/288 Spencer St Melbourne VIC 3000,510,30,MELBOURNE,3000,Apartment / Unit / Flat,No,Yes,Yes,2,...,460.0,420.0,380.0,370.0,"[(466.56544569832755, (-37.817936, 144.951411)...","[(100.48796546084708, (-37.813211, 144.951201)...","[(116.32056387560505, (-37.814821, 144.952077)...","{'dist': [1881.1, 619.8], 'name': ['Southern C...","{'dist': [1047.4, 615.1], 'name': ['119-Spence...","{'dist': [450.0, 1356.2], 'name': ['Lonsdale S..."
7,4/180 Little Collins Street Melbourne VIC 3000,500,4,MELBOURNE,3000,Apartment / Unit / Flat,Yes,No,No,1,...,460.0,420.0,380.0,370.0,"[(539.0886706587495, (-37.811054, 144.972911),...","[(127.52924039328948, (-37.812769, 144.968354)...","[(359.3030307887556, (-37.811567, 144.965025),...","{'dist': [743.6, 2303.4], 'name': ['Parliament...","{'dist': [520.3, 197.3], 'name': ['7-Russell S...","{'dist': [669.7, 698.8], 'name': ['Melbourne C..."
8,1605/565 Flinders Street Melbourne VIC 3000,500,16,MELBOURNE,3000,Apartment / Unit / Flat,No,No,No,2,...,460.0,420.0,380.0,370.0,"[(525.867016936961, (-37.817936, 144.951411), ...","[(25.4241223662484, (-37.820834, 144.955853), ...","[(285.5406557151809, (-37.819325, 144.953509),...","{'dist': [931.9, 1548.8], 'name': ['Southern C...","{'dist': [185.5, 42.8], 'name': ['1-Spencer St...","{'dist': [383.9, 1295.4], 'name': ['Southern C..."
9,612/408 Lonsdale Street Melbourne VIC 3000,350,6,MELBOURNE,3000,Apartment / Unit / Flat,Yes,No,No,1,...,460.0,420.0,380.0,370.0,"[(422.63001113194304, (-37.811981, 144.955654)...","[(205.39104815345, (-37.811105, 144.961778), 5...","[(21.831936026455327, (-37.812636, 144.960645)...","{'dist': [669.1, 515.3], 'name': ['Flagstaff R...","{'dist': [508.5, 585.6], 'name': ['5-Melbourne...","{'dist': [18.6, 238.9], 'name': ['Hardware Lan..."


In [97]:
# driving lookup stored for the first sample row
# NOTE(review): the saved output of this cell does not look like the dict that
# find_driving returns - it was probably produced by an older version; re-run to confirm
test['driving_to_train'][0]

[[-37.8102832, 144.9566691],
 [(208.7885192155693,
   (-37.811981, 144.955654),
   'Flagstaff Railway Station (Melbourne City)'),
  (521.9037087436633,
   (-37.809939, 144.962594),
   'Melbourne Central Railway Station (Melbourne City)')]]

In [102]:
# index labels of the sample frame
test.index.to_list()

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10]

In [None]:
# read the train station and rental data
# The data directory is addressed relative to the notebook directory, as in the
# cells above. os.getcwd().replace("notebooks", "") rewrote every occurrence of
# "notebooks" in the path and broke when the kernel started elsewhere.
# The trailing "" keeps a path separator at the end, so `path + filename` still works.
path = os.path.join(os.pardir, "data", "curated", "")
train = pd.read_csv(path+'train_station.csv', low_memory = False)
# NOTE: this rebinds property_df to a different dataset than the one used above
property_df = pd.read_csv(path+'cleaned_rent.csv', low_memory = False)
train["coordinates"] = list(zip(train.LATITUDE, train.LONGITUDE))
# TODO: decide whether rows sharing the same coordinates should be dropped
# (an earlier draft had a commented-out drop_duplicates on 'coordinates')

In [None]:
# preview the first five train station rows
train.head(5)

In [None]:
# build (latitude, longitude) tuples for every listing
property_df["coordinates"] = list(zip(property_df["latitude"], property_df["longitude"]))
# keep only the first 10 listings as a sample
property_df = property_df.head(10)
property_df.head(5)

In [None]:

# keep a running record of the k closest points seen so far
def cloest_point(dist_dict, dist, loc, stop, k=3):
    """Insert a candidate point and keep only the ``k`` nearest records.

    The previous version removed whichever record the new distance happened to
    beat first, which could evict a closer record and keep a farther one. For
    the sequence 5, 10, 20, 3 it ended with [10, 20, 3] instead of [3, 5, 10].

    Note: this redefines the earlier ``cloest_point`` (which kept two records).

    Args:
        dist_dict: list of (distance, location, name) tuples collected so far;
            it is modified in place.
        dist: distance of the new candidate.
        loc: location of the new candidate.
        stop: name of the new candidate.
        k: how many records to keep (default 3, the size the original kept).

    Returns:
        ``dist_dict`` holding at most ``k`` records, sorted by ascending distance.
    """
    dist_dict.append((dist, loc, stop))
    # the list never holds more than k + 1 items, so sorting it is cheap
    dist_dict.sort(key=lambda record: record[0])
    del dist_dict[k:]
    return dist_dict
            
# CSV location -> [(latitude, longitude, stop_name), ...], filled on first use.
# The file used to be re-read on every call, i.e. once per property row.
# Call _TRAIN_STOPS.clear() to force a re-read.
_TRAIN_STOPS = {}


# find the closest train stations for one property
def distance(loc1):
    """Return the (distance, (lat, lon), stop_name) records kept by cloest_point.

    ``loc1`` is the property's (latitude, longitude) pair. Stops are read from
    'train_station.csv' under the module-level ``path``.
    """
    csv_path = path+'train_station.csv'
    if csv_path not in _TRAIN_STOPS:
        stops = pd.read_csv(csv_path, low_memory = False)
        _TRAIN_STOPS[csv_path] = list(zip(stops["LATITUDE"], stops["LONGITUDE"], stops["STOP_NAME"]))

    dist_lis = []
    for latitude, longitude, stop_name in _TRAIN_STOPS[csv_path]:
        loc2 = (latitude, longitude)
        # haversine distance between the property and the stop, in metres
        dist = hs.haversine(loc1,loc2,unit=Unit.METERS)
        dist_lis = cloest_point(dist_lis,dist,loc2,stop_name)
    return dist_lis

In [None]:
# Run `distance` on every property's (lat, lon) tuple and store what it returns in a new column
property_df["train_station"]= property_df["coordinates"].apply(distance)  # calculate distance for each property row

In [None]:
# Preview the first five properties, now including the 'train_station' column
property_df.head(5)

In [None]:
# Inspect the 'train_station' value recorded for the property at index label 0
property_df["train_station"][0]

In [None]:
# Take two coordinates and return the driving distance and duration between them
# coordinate form [longitude, latitude]
def calculate_distance_between_coordinates(coordinate1, coordinate2, api_key=None):
    """Query openrouteservice for the driving route between two points.

    Parameters
    ----------
    coordinate1, coordinate2 : sequence
        Start and end points, each as ``[longitude, latitude]``.
    api_key : str, optional
        openrouteservice API key. When omitted, the ``ORS_API_KEY``
        environment variable is used.

    Returns
    -------
    tuple
        ``(distance, duration)`` of the first route segment. The notebook
        stores these as metres and seconds.

    Raises
    ------
    ValueError
        If no API key is supplied and ``ORS_API_KEY`` is not set.

    Notes
    -----
    The key used to be hard-coded in this cell. Credentials must not live in
    the notebook source; the previously committed key should be treated as
    leaked and revoked.
    """
    key = api_key or os.environ.get("ORS_API_KEY")
    if not key:
        raise ValueError(
            "No openrouteservice API key: pass api_key=... or set the "
            "ORS_API_KEY environment variable"
        )

    # connect to openrouteservice
    client = ors.Client(key=key)

    # request the driving route between the two points
    route = client.directions(
        coordinates=[coordinate1, coordinate2],
        profile='driving-car',
        format='geojson',
    )

    # two waypoints produce a single segment carrying both figures
    segment = route['features'][0]['properties']['segments'][0]
    return segment['distance'], segment['duration']

In [None]:
# For every sampled property, route to each candidate station and keep the one
# with the shortest driving duration.
nearest_train_list = []
nearest_distance_list = []
nearest_duration_list = []

for row_pos in range(len(property_df)):
    # openrouteservice expects [longitude, latitude]
    property_coordinate = [
        property_df["longitude"].iloc[row_pos],
        property_df["latitude"].iloc[row_pos],
    ]

    # Candidate stations as (straight_line_distance, (lat, lon), stop_name).
    # Local names deliberately differ from `train`, `distance` and `duration`
    # so the `train` DataFrame and the `distance` function defined in earlier
    # cells are not overwritten by this loop.
    candidate_stations = property_df["train_station"].iloc[row_pos]

    route_durations = []
    route_distances = []
    for station in candidate_stations:
        station_lat, station_lon = station[1]
        route_distance, route_duration = calculate_distance_between_coordinates(
            property_coordinate, [station_lon, station_lat]
        )
        route_durations.append(route_duration)
        route_distances.append(route_distance)

    if not route_durations:
        # No candidate station: record missing values instead of failing or
        # reusing the previous property's result
        nearest_train_list.append(None)
        nearest_distance_list.append(np.nan)
        nearest_duration_list.append(np.nan)
        continue

    # Index of the overall shortest duration. The earlier pairwise comparison
    # only ever kept the better of the last two candidates. min() returns the
    # first minimum, so ties still favour the earlier candidate.
    best = min(range(len(route_durations)), key=route_durations.__getitem__)

    nearest_train_list.append(candidate_stations[best][2])
    nearest_distance_list.append(route_distances[best])
    nearest_duration_list.append(route_durations[best])

In [None]:
# Attach the routing results to the sample frame, one column per result list
for column_label, column_values in (
    ("nearest_train", nearest_train_list),
    ("nearest_distance(m)", nearest_distance_list),
    ("nearest_duration(s)", nearest_duration_list),
):
    property_df[column_label] = column_values

# Keep only the address, coordinate and station columns, then write them to CSV
property_df = property_df.loc[
    :,
    ['address', 'coordinates', 'train_station', 'nearest_train', 'nearest_distance(m)', 'nearest_duration(s)'],
]
property_df.to_csv("../data/curated/dist_property_train.csv")

In [None]:
# Display the sampled properties with their nearest-station columns
property_df

#### for further coding

In [None]:
def find_closest(loc1, train_points):
    """Return the driving distance and duration to the quickest-to-reach point.

    Parameters
    ----------
    loc1 : sequence
        Origin as ``[longitude, latitude]``; it is passed unchanged to
        `calculate_distance_between_coordinates`.
    train_points : iterable
        Records whose second element is a ``(latitude, longitude)`` pair, as
        produced by `cloest_point`.

    Returns
    -------
    tuple
        ``(distance, duration)`` of the candidate with the shortest driving
        duration (the first one wins on ties).

    Raises
    ------
    ValueError
        If `train_points` is empty.

    Notes
    -----
    The earlier version returned the figures of whichever point came last and
    passed the stored ``(lat, lon)`` pair straight to the router, which
    expects ``[lon, lat]``.
    """
    best = None
    for point in train_points:
        point_lat, point_lon = point[1]
        # the router wants [longitude, latitude]
        dist, duration = calculate_distance_between_coordinates(loc1, [point_lon, point_lat])
        if best is None or duration < best[1]:
            best = (dist, duration)
    if best is None:
        raise ValueError("train_points is empty; there is no route to evaluate")
    return best

In [None]:
# Build the per-property container that will hold, for every feature type,
# the distance and coordinates of its closest points
def generate_dict(features):
    """Return a dict mapping each feature name to its own new empty list."""
    return {feature: [] for feature in features}

# Compare distances and keep, per feature type, the three closest points
def cloest_point(point,dist_dict,dist,loc):
    """Add a candidate point to the three-closest record of its feature type.

    Parameters
    ----------
    point : mapping
        Row-like object providing ``"FEATURE"`` (the feature type) and
        ``"PLACE_NAME"`` (the point's name).
    dist_dict : dict
        Maps a feature type to a list of ``(dist, loc, place_name)`` tuples.
        It is modified in place; a missing feature key is created.
    dist : float
        Distance from the property to the candidate point.
    loc : tuple
        Coordinates of the candidate point.

    Returns
    -------
    dict
        The same `dist_dict`, where the candidate's feature list holds at
        most three records ordered from the closest to the furthest.

    Notes
    -----
    The previous if/elif chain removed whichever slot the new distance
    happened to beat, not the furthest record, so closer points could be
    evicted. Sorting and truncating always keeps the true three smallest.

    This notebook also defines an earlier four-argument
    ``cloest_point(dist_dict, dist, loc, stop)``; running this cell rebinds
    the name to this version.
    """
    nearest = dist_dict.setdefault(point["FEATURE"], [])
    nearest.append((dist, loc, point["PLACE_NAME"]))
    # Sort on the distance only; list.sort is stable, so ties keep arrival order
    nearest.sort(key=lambda record: record[0])
    # Drop everything beyond the three closest
    del nearest[3:]
    return dist_dict
            
# Calculate the three closest points of interest, per feature type, for one property
def distance(loc1, poi=None):
    """Return, for each feature type, the records kept by `cloest_point`.

    Parameters
    ----------
    loc1 : tuple
        Property coordinates as ``(latitude, longitude)``.
    poi : pandas.DataFrame, optional
        Points of interest with ``FEATURE``, ``PLACE_NAME``, ``LATITUDE`` and
        ``LONGITUDE`` columns. When omitted, ``GNR_suburb.csv`` is read from
        the notebook-level ``path`` on every call (the original behaviour).
        Passing an already-loaded frame avoids re-reading the file for each
        property when this function is used with ``Series.apply``.

    Returns
    -------
    dict
        Maps each feature type to the list of
        ``(distance_in_metres, (lat, lon), place_name)`` tuples accumulated
        by `cloest_point`.

    Notes
    -----
    This notebook also defines an earlier ``distance`` for train stations;
    running this cell rebinds the name to this version.
    """
    if poi is None:
        # read point of interest data
        poi = pd.read_csv(path+'GNR_suburb.csv', low_memory = False)
    # every distinct feature type, in order of first appearance
    feature_types = list(poi["FEATURE"].unique())
    # one empty record list per feature type
    dist_lis = generate_dict(feature_types)
    for feature in feature_types:
        # rows belonging to this feature type
        feature_rows = poi[poi["FEATURE"] == feature]
        # iterrows yields each row once, avoiding repeated .iloc lookups
        for _, row in feature_rows.iterrows():
            loc2 = (row["LATITUDE"], row["LONGITUDE"])
            # great-circle distance between property and point of interest
            dist = hs.haversine(loc1, loc2, unit=Unit.METERS)
            # update the running record for this feature type
            dist_lis = cloest_point(row, dist_lis, dist, loc2)
    return dist_lis