In [1]:
import pandas as pd 
import numpy as np 
import json 
import requests 
from ast import literal_eval
import time 
import re 

In [21]:
# Load the pre-processed scraped rental listings and discard the stale index column.
rental_df = pd.read_csv("../data/curated/Preprocessed_Scraped_Rental.csv")
rental_df = rental_df.drop('Unnamed: 0', axis = 1)

# Drop entries without lat/lon.
# NOTE - There aren't any; this is just a precautionary step.
rental_df = rental_df.dropna(subset = ['coordinates'], axis = 0)

# Obtain all suburb names in Victoria, lower-cased and with any parenthesised
# qualifier removed.
state_suburbs_df = pd.read_csv("../data/raw/georef-australia-state-suburb.csv")
state_suburbs_df = state_suburbs_df[state_suburbs_df["Official Name State"] == "Victoria"]
suburbs = [
    re.sub(r'\(.*\)', '', suburb.lower()).strip()
    for suburb in state_suburbs_df["Official Name Suburb"]
]

# Names that appear in the listings but are absent from the reference file.
extra_suburbs = ["preston west", "prahran east", "sanctuary lakes", "mallacoota"]
suburbs.extend(extra_suburbs)

# Set used for constant-time membership tests while matching below.
suburb_lookup = set(suburbs)

# The last whitespace-separated token of the address is the postcode (kept as int).
house_postcodes = [int(i[-1]) for i in rental_df['name'].str.split()]
rental_df.loc[:,'house_postcode'] = house_postcodes

# Convert address to list
rental_df["name"] = rental_df["name"].str.split(' ')

# Create new column for postcodes (string form of the last address token)
rental_df["postcode"] = [row[-1] for row in rental_df["name"]]

# The final two tokens are skipped; the suburb is taken from the one, two or
# three tokens before them. Prefer the longest candidate that is a known suburb.
# Results are collected in row order and assigned positionally, so the match is
# correct regardless of the frame's index labels.
matched_suburbs = []
for address_tokens in rental_df["name"]:
    candidates = [
        ' '.join(address_tokens[-5:-2]).lower(),
        ' '.join(address_tokens[-4:-2]).lower(),
        address_tokens[-3].lower(),
    ]
    matched_suburbs.append(
        next((candidate for candidate in candidates if candidate in suburb_lookup), np.nan)
    )
rental_df["suburb"] = matched_suburbs

# Drop rows containing at least one null value in primary dataset
# (this also removes listings whose suburb could not be matched).
rental_df = rental_df.dropna(axis=0, how ="any")

# Work on the first 100 listings only.
sample_rental_df = rental_df.iloc[0:100].reset_index(drop = True)
sample_rental_df.head(5)

Unnamed: 0,coordinates,cost_text,desc,name,rooms,house_postcode,postcode,suburb
0,"[-37.9032708, 145.0770553]",$680,Joseph Chatziconstantis,"[15, Fintonia, Street, Hughesdale, VIC, 3166]","['3 Beds', '2 Baths', '2 Parking']",3166,3166,hughesdale
1,"[-37.9077745, 145.092738]",$650.00,James Drakopoulos,"[41, Bishop, Street, Oakleigh, VIC, 3166]","['3 Beds', '2 Baths', '2 Parking']",3166,3166,oakleigh
2,"[-37.8922004, 145.0899378]",$500,Nigel Chee,"[109/6, Dalgety, Street, Oakleigh, VIC, 3166]","['1 Bed', '1 Bath', '1 Parking']",3166,3166,oakleigh
3,"[-37.8972564, 145.0871089]",$570,Matthew Swinnerton,"[16/30, Swindon, Road, Oakleigh, VIC, 3166]","['3 Beds', '2 Baths', '2 Parking']",3166,3166,oakleigh
4,"[-37.894307, 145.082706]",$475,Jason Gu,"[1/14, Bletchley, Road, Hughesdale, VIC, 3166]","['2 Beds', '1 Bath', '1 Parking']",3166,3166,hughesdale


## Calculating distance from the house to CBD

#### This function calculates the road distance (in km) from any latitude/longitude point to the Melbourne CBD

In [3]:
def cbd_distance(house_coords, timeout=10):
    """Return the driving distance in kilometres from a house to the Melbourne CBD.

    Parameters
    ----------
    house_coords : sequence
        Two-element sequence; index 0 is used as latitude and index 1 as
        longitude when building the request.
    timeout : float, optional
        Seconds to wait for the routing service before giving up.

    Returns
    -------
    float
        Distance of the first leg of the first returned route, in km.

    Raises
    ------
    requests.RequestException
        If the request times out, fails, or returns an HTTP error status.
    KeyError, IndexError
        If the response does not contain a route.
    """
    melb_cbd = [-37.815207, 144.963937]

    # The URL is built with longitude first, then latitude, for each point.
    journey = requests.get(
        f"http://router.project-osrm.org/route/v1/car/{house_coords[1]},{house_coords[0]};{melb_cbd[1]},{melb_cbd[0]}?overview=false",
        timeout=timeout,
    )
    # Fail loudly on HTTP errors instead of on a confusing missing-key error later.
    journey.raise_for_status()

    distance_meters = journey.json()['routes'][0]['legs'][0]['distance']

    return distance_meters / 1000

#### This function loops through the dataframe and returns a list of CBD distances for the houses

In [4]:
def calculate_cbd_distance(dataframe):
    """Compute the CBD distance for every row of ``dataframe``.

    Parameters
    ----------
    dataframe : mapping or DataFrame
        Must provide a ``'coordinates'`` entry that iterates over strings,
        each parsed with ``literal_eval`` before being passed to
        ``cbd_distance``.

    Returns
    -------
    list
        One distance per row, in row order. Rows whose coordinates cannot be
        parsed or whose lookup fails are recorded as ``np.nan``.
    """
    CBD_distances = []
    for coords_text in dataframe['coordinates']:
        try:
            CBD_distances.append(cbd_distance(literal_eval(coords_text)))
        except Exception:
            # Best-effort: a failed lookup must not abort the whole run.
            # Catching Exception (not a bare except) keeps Ctrl-C working.
            CBD_distances.append(np.nan)

    return CBD_distances

#### Creating new feature called 'CBD_Distance' 

In [5]:
# Attach the CBD distance as a new column, then checkpoint the frame to disk.
sample_rental_df['CBD_Distance'] = calculate_cbd_distance(sample_rental_df)
sample_rental_df.to_csv('../data/raw/tmp_with_cbd_dist.csv')

# Preview the first rows with the new feature.
sample_rental_df.head(5)

Unnamed: 0,coordinates,cost_text,desc,name,rooms,house_postcode,postcode,suburb,CBD_Distance
0,"[-37.9032708, 145.0770553]",$680,Joseph Chatziconstantis,"[15, Fintonia, Street, Hughesdale, VIC, 3166]","['3 Beds', '2 Baths', '2 Parking']",3166,3166,hughesdale,18.978
1,"[-37.9077745, 145.092738]",$650.00,James Drakopoulos,"[41, Bishop, Street, Oakleigh, VIC, 3166]","['3 Beds', '2 Baths', '2 Parking']",3166,3166,oakleigh,19.0444
2,"[-37.8922004, 145.0899378]",$500,Nigel Chee,"[109/6, Dalgety, Street, Oakleigh, VIC, 3166]","['1 Bed', '1 Bath', '1 Parking']",3166,3166,oakleigh,17.1238
3,"[-37.8972564, 145.0871089]",$570,Matthew Swinnerton,"[16/30, Swindon, Road, Oakleigh, VIC, 3166]","['3 Beds', '2 Baths', '2 Parking']",3166,3166,oakleigh,17.3138
4,"[-37.894307, 145.082706]",$475,Jason Gu,"[1/14, Bletchley, Road, Hughesdale, VIC, 3166]","['2 Beds', '1 Bath', '1 Parking']",3166,3166,hughesdale,17.5619


## Calculating distance from the house to the nearest school

#### Cleaning up the school dataset and retaining only the necessary columns 

In [6]:
# Only these columns are read from the school locations file.
retained_cols = ['SCHOOL_NO', 'X' , 'Y', 'Postal_Postcode', 'Address_Town']
vic_schools = pd.read_csv('../data/raw/dv309_schoollocations2021.csv', encoding='cp1252',
                         usecols = retained_cols)

# Drop schools with a missing location or postcode BEFORE building the
# coordinate pair: once X/Y are packed into a list the cell is never null,
# so a dropna on 'coordinates' would not remove them.
vic_schools = vic_schools.dropna(subset = ['X', 'Y', 'Postal_Postcode'], how = 'any').reset_index(drop = True)

# Coordinates are stored as [Y, X] pairs.
vic_schools['coordinates'] = vic_schools[['Y','X']].values.tolist()
# Lower-cased for comparison against house['suburb'].
vic_schools['Address_Town'] = vic_schools['Address_Town'].str.lower()

vic_schools = vic_schools.drop(['X', 'Y'], axis=1)

vic_schools.head(5)

Unnamed: 0,SCHOOL_NO,Address_Town,Postal_Postcode,coordinates
0,1,alberton,3971,"[-38.617713, 146.666601]"
1,3,allansford,3277,"[-38.386281, 142.590393]"
2,4,avoca,3467,"[-37.084502, 143.475649]"
3,8,avenel,3664,"[-36.901368, 145.234722]"
4,12,warrandyte,3113,"[-37.742675, 145.21398]"


#### Function to calculate road distance between any two points 

In [7]:
def calculate_proximity(point1, point2, timeout=10):
    """Return the driving distance in kilometres between two points.

    Parameters
    ----------
    point1, point2 : sequence
        Two-element sequences; index 0 is used as latitude and index 1 as
        longitude when building the request.
    timeout : float, optional
        Seconds to wait for the routing service before giving up.

    Returns
    -------
    float
        Distance of the first leg of the first returned route, in km.

    Raises
    ------
    requests.RequestException
        If the request times out, fails, or returns an HTTP error status.
    KeyError, IndexError
        If the response does not contain a route.
    """
    # The URL is built with longitude first, then latitude, for each point.
    journey = requests.get(
        f"http://router.project-osrm.org/route/v1/car/{point1[1]},{point1[0]};{point2[1]},{point2[0]}?overview=false",
        timeout=timeout,
    )
    # Surface HTTP errors directly rather than as a missing-key error below.
    journey.raise_for_status()
    distance_meters = journey.json()['routes'][0]['legs'][0]['distance']

    return distance_meters / 1000

#### Function to calculate nearest school distance from house

Algorithm 

* Find the schools in the same suburb - Calculate distance to each of them and then choose closest one 
* If no school in same suburb - Expand search to same postcode
* If no school in same postcode - Expand search to nearby postcodes 

In [22]:
def nearest_school_distance(house):
    """Return the road distance (km) from a house to its nearest candidate school.

    Candidate schools are chosen in tiers:

    1. schools whose ``Address_Town`` equals the house's suburb;
    2. otherwise, schools whose postcode is numerically closest to the
       house's postcode (an exact postcode match is a gap of zero; if there
       is none, the window widens to the smallest whole-number gap that
       contains at least one school).

    The road distance to every candidate is computed with
    ``calculate_proximity`` and the smallest is returned.

    Parameters
    ----------
    house : mapping or Series
        Must provide ``'suburb'``, ``'coordinates'`` (a string parsed with
        ``literal_eval``) and ``'house_postcode'``.

    Returns
    -------
    float
        Distance in km, or ``np.nan`` when no school has a usable postcode
        (previously this case would loop forever).
    """
    curr_house_lat_lon = literal_eval(house['coordinates'])

    candidate_schools = vic_schools[vic_schools['Address_Town'] == house['suburb']]

    if candidate_schools.empty:
        # Fall back to postcode proximity. Gap 0 means "same postcode".
        postcode_gap = (vic_schools['Postal_Postcode'] - house['house_postcode']).abs()
        nearest_gap = postcode_gap.min()

        if pd.isna(nearest_gap):
            # No school to compare against at all.
            return np.nan

        # Round up so the window matches a +/- whole-number postcode range.
        candidate_schools = vic_schools[postcode_gap <= np.ceil(nearest_gap)]

    return min(
        calculate_proximity(curr_house_lat_lon, school_lat_lon)
        for school_lat_lon in candidate_schools['coordinates']
    )

#### Loop through the dataframe and calculate the distance to the closest school for all houses

In [23]:
def calculate_school_distance(dataframe):
    """Compute the nearest-school distance for every row of ``dataframe``.

    Rows are visited in order with ``iterrows`` so the result does not depend
    on the frame having a default 0..n-1 index.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Each row is passed to ``nearest_school_distance``.

    Returns
    -------
    list
        One distance per row, in row order; ``np.nan`` where the lookup failed.
    """
    school_distances = []
    for _, house in dataframe.iterrows():
        try:
            school_distances.append(nearest_school_distance(house))
        except Exception:
            # Best-effort: record the failure and move on to the next house.
            # Catching Exception (not a bare except) keeps Ctrl-C working.
            school_distances.append(np.nan)

    return school_distances

#### Create new feature for nearest school and save temporary data frame

In [24]:
# Time the whole school-distance pass, including the checkpoint write.
start_time = time.time()

sample_rental_df['Nearest_School_Distance'] = calculate_school_distance(sample_rental_df)
sample_rental_df.to_csv('../data/raw/tmp_with_cbd_dist_school.csv')

# Not the last expression of the cell, so this preview is not rendered.
sample_rental_df.head(5)

end_time = time.time()

# Elapsed wall-clock seconds.
print(end_time - start_time)

180.2337248325348


## Calculating distance from the house to the nearest hospital

#### Cleaning up hospital dataset 

In [11]:
# Columns read from the hospital contact-details file.
retained_columns = ['Hospital name','State','Suburb', 'Latitude', 'Longitude', 'Postcode', 'Hospital_ID']

# Single pipeline: read -> keep rows whose State is 'Vic' -> lower-case suburb
# -> pack [Latitude, Longitude] into one column -> drop the separate columns.
hospitals = (
    pd.read_csv('../data/raw/myhospitals-contact-details.csv', encoding='cp1252',
                usecols = retained_columns)
    .loc[lambda frame: frame['State'] == 'Vic']
    .reset_index(drop = True)
    .assign(
        Suburb = lambda frame: frame['Suburb'].str.lower(),
        coordinates = lambda frame: frame[['Latitude','Longitude']].values.tolist(),
    )
    .drop(['Latitude','Longitude'], axis = 1)
)

#### Calculating distance from a given house to the nearest hospital

In [12]:
def nearest_hospital(house):
    """Return the road distance (km) from a house to its nearest candidate hospital.

    Candidate hospitals are chosen in tiers:

    1. hospitals whose ``Suburb`` equals the house's suburb;
    2. otherwise, hospitals whose postcode is numerically closest to the
       house's postcode (an exact postcode match is a gap of zero; if there
       is none, the window widens to the smallest whole-number gap that
       contains at least one hospital).

    The road distance to every candidate is computed with
    ``calculate_proximity`` and the smallest is returned.

    Parameters
    ----------
    house : mapping or Series
        Must provide ``'suburb'``, ``'coordinates'`` (a string parsed with
        ``literal_eval``) and ``'house_postcode'``.

    Returns
    -------
    float
        Distance in km, or ``np.nan`` when no hospital has a usable postcode
        (previously this case would loop forever).
    """
    curr_house_lat_lon = literal_eval(house['coordinates'])

    candidate_hospitals = hospitals[hospitals['Suburb'] == house['suburb']]

    if candidate_hospitals.empty:
        # Fall back to postcode proximity. Gap 0 means "same postcode".
        postcode_gap = (hospitals['Postcode'] - house['house_postcode']).abs()
        nearest_gap = postcode_gap.min()

        if pd.isna(nearest_gap):
            # No hospital to compare against at all.
            return np.nan

        # Round up so the window matches a +/- whole-number postcode range.
        candidate_hospitals = hospitals[postcode_gap <= np.ceil(nearest_gap)]

    return min(
        calculate_proximity(curr_house_lat_lon, hospital_lat_lon)
        for hospital_lat_lon in candidate_hospitals['coordinates']
    )

In [13]:
def calculate_hospital_distance(dataframe):
    """Compute the nearest-hospital distance for every row of ``dataframe``.

    Rows are visited in order with ``iterrows`` so the result does not depend
    on the frame having a default 0..n-1 index.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Each row is passed to ``nearest_hospital``.

    Returns
    -------
    list
        One distance per row, in row order; ``np.nan`` where the lookup failed.
    """
    hospital_distances = []
    for _, house in dataframe.iterrows():
        try:
            hospital_distances.append(nearest_hospital(house))
        except Exception:
            # Best-effort: record the failure and move on to the next house.
            # Catching Exception (not a bare except) keeps Ctrl-C working.
            hospital_distances.append(np.nan)

    return hospital_distances

In [14]:
# Attach the nearest-hospital distance as a new column, then checkpoint to disk.
sample_rental_df.loc[:, 'nearest_hospital'] = calculate_hospital_distance(sample_rental_df)
sample_rental_df.to_csv('../data/raw/tmp_with_cbd_dist_school_hosp.csv')

# Preview the first rows with the new feature.
sample_rental_df.head(5)

Unnamed: 0,coordinates,cost_text,desc,name,rooms,house_postcode,postcode,suburb,CBD_Distance,Nearest_School_Distance,nearest_hospital
0,"[-37.9032708, 145.0770553]",$680,Joseph Chatziconstantis,"[15, Fintonia, Street, Hughesdale, VIC, 3166]","['3 Beds', '2 Baths', '2 Parking']",3166,3166,hughesdale,18.978,0.3622,3.6709
1,"[-37.9077745, 145.092738]",$650.00,James Drakopoulos,"[41, Bishop, Street, Oakleigh, VIC, 3166]","['3 Beds', '2 Baths', '2 Parking']",3166,3166,oakleigh,19.0444,1.2317,4.4447
2,"[-37.8922004, 145.0899378]",$500,Nigel Chee,"[109/6, Dalgety, Street, Oakleigh, VIC, 3166]","['1 Bed', '1 Bath', '1 Parking']",3166,3166,oakleigh,17.1238,0.4685,5.719
3,"[-37.8972564, 145.0871089]",$570,Matthew Swinnerton,"[16/30, Swindon, Road, Oakleigh, VIC, 3166]","['3 Beds', '2 Baths', '2 Parking']",3166,3166,oakleigh,17.3138,0.4079,5.0104
4,"[-37.894307, 145.082706]",$475,Jason Gu,"[1/14, Bletchley, Road, Hughesdale, VIC, 3166]","['2 Beds', '1 Bath', '1 Parking']",3166,3166,hughesdale,17.5619,1.298,5.1099


## Calculating distance from the house to the nearest train station

#### Handling metro and regional station data

In [15]:
# Locality -> postcode lookup; localities are lower-cased to match station suburbs.
postcodes = pd.read_csv('../data/curated/victoria_postcodes.csv')
postcodes['locality'] = postcodes['locality'].str.lower()

retained_cols = ['STOP_NAME', 'LATITUDE', 'LONGITUDE']


def _load_stations(csv_path, station_type):
    """Read one station file and attach suburb, type and postcode columns.

    The suburb is the text inside the parentheses of ``STOP_NAME``
    (lower-cased). Stations whose suburb has no entry in ``postcodes`` are
    discarded by the inner merge, and rows with any missing value are dropped.
    """
    stations = pd.read_csv(csv_path, usecols = retained_cols)
    # Raw string: '\(' is not a valid escape in a normal string literal.
    stations['Suburb'] = (
        stations['STOP_NAME'].str.extract(r'.*\((.*)\).*', expand = False).str.lower()
    )
    stations['Type'] = station_type

    stations = stations.merge(postcodes[['locality', 'postcode']],
                              left_on = 'Suburb', right_on = 'locality', how = 'inner')
    return stations.drop('locality', axis = 1).dropna(how = 'any').reset_index(drop = True)


metro = _load_stations(
    '../data/raw/datasource-VIC_Govt_PTV-VIC_Govt_DELWP_datavic_PTV_METRO_TRAIN_STATION.csv',
    'Metro')

regional = _load_stations(
    '../data/raw/datasource-VIC_Govt_PTV-VIC_Govt_DELWP_datavic_PTV_REGIONAL_TRAIN_STATION.csv',
    'Regional')

# Stack both networks and pack [LATITUDE, LONGITUDE] into one column.
all_stations = pd.concat([metro, regional]).reset_index(drop = True)
all_stations['coordinates'] = all_stations[['LATITUDE','LONGITUDE']].values.tolist()
all_stations = all_stations.drop(['LATITUDE','LONGITUDE'], axis = 1)

all_stations.head(5)

Unnamed: 0,STOP_NAME,Suburb,Type,postcode,coordinates
0,Royal Park Railway Station (Parkville),parkville,Metro,3052,"[-37.7812, 144.9523]"
1,Flemington Bridge Railway Station (North Melbo...,north melbourne,Metro,3051,"[-37.7881, 144.9393]"
2,Macaulay Railway Station (North Melbourne),north melbourne,Metro,3051,"[-37.7943, 144.9362]"
3,North Melbourne Railway Station (West Melbourne),west melbourne,Metro,3003,"[-37.8074, 144.9426]"
4,Clifton Hill Railway Station (Clifton Hill),clifton hill,Metro,3068,"[-37.7887, 144.9954]"


#### Function to find nearest train station for a given house 

In [16]:
def nearest_station(house):
    """Return the road distance (km) from a house to its nearest candidate station.

    Candidate stations are chosen in tiers:

    1. stations whose ``Suburb`` equals the house's suburb;
    2. otherwise, stations whose postcode is numerically closest to the
       house's postcode (an exact postcode match is a gap of zero; if there
       is none, the window widens to the smallest whole-number gap that
       contains at least one station).

    The road distance to every candidate is computed with
    ``calculate_proximity`` and the smallest is returned.

    Parameters
    ----------
    house : mapping or Series
        Must provide ``'suburb'``, ``'coordinates'`` (a string parsed with
        ``literal_eval``) and ``'house_postcode'``.

    Returns
    -------
    float
        Distance in km, or ``np.nan`` when no station has a usable postcode
        (previously this case would loop forever).
    """
    curr_house_lat_lon = literal_eval(house['coordinates'])

    candidate_stations = all_stations[all_stations['Suburb'] == house['suburb']]

    if candidate_stations.empty:
        # Fall back to postcode proximity. Gap 0 means "same postcode".
        postcode_gap = (all_stations['postcode'] - house['house_postcode']).abs()
        nearest_gap = postcode_gap.min()

        if pd.isna(nearest_gap):
            # No station to compare against at all.
            return np.nan

        # Round up so the window matches a +/- whole-number postcode range.
        candidate_stations = all_stations[postcode_gap <= np.ceil(nearest_gap)]

    return min(
        calculate_proximity(curr_house_lat_lon, station_lat_lon)
        for station_lat_lon in candidate_stations['coordinates']
    )

In [17]:
def calculate_train_distance(dataframe):
    """Compute the nearest-train-station distance for every row of ``dataframe``.

    Rows are visited in order with ``iterrows`` so the result does not depend
    on the frame having a default 0..n-1 index.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Each row is passed to ``nearest_station``.

    Returns
    -------
    list
        One distance per row, in row order; ``np.nan`` where the lookup failed.
    """
    train_distances = []

    for _, house in dataframe.iterrows():
        try:
            train_distances.append(nearest_station(house))
        except Exception:
            # Best-effort: record the failure and move on to the next house.
            # Catching Exception (not a bare except) keeps Ctrl-C working.
            train_distances.append(np.nan)

    return train_distances

In [18]:
# Attach the nearest-station distance as a new column, then checkpoint to disk.
sample_rental_df.loc[:, 'nearest_train_station'] = calculate_train_distance(sample_rental_df)
sample_rental_df.to_csv('../data/raw/tmp_with_cbd_dist_school_hosp_train.csv')

# Preview the first rows with the new feature.
sample_rental_df.head(5)

Unnamed: 0,coordinates,cost_text,desc,name,rooms,house_postcode,postcode,suburb,CBD_Distance,Nearest_School_Distance,nearest_hospital,nearest_train_station
0,"[-37.9032708, 145.0770553]",$680,Joseph Chatziconstantis,"[15, Fintonia, Street, Hughesdale, VIC, 3166]","['3 Beds', '2 Baths', '2 Parking']",3166,3166,hughesdale,18.978,0.3622,3.6709,1.3443
1,"[-37.9077745, 145.092738]",$650.00,James Drakopoulos,"[41, Bishop, Street, Oakleigh, VIC, 3166]","['3 Beds', '2 Baths', '2 Parking']",3166,3166,oakleigh,19.0444,1.2317,4.4447,1.115
2,"[-37.8922004, 145.0899378]",$500,Nigel Chee,"[109/6, Dalgety, Street, Oakleigh, VIC, 3166]","['1 Bed', '1 Bath', '1 Parking']",3166,3166,oakleigh,17.1238,0.4685,5.719,1.3874
3,"[-37.8972564, 145.0871089]",$570,Matthew Swinnerton,"[16/30, Swindon, Road, Oakleigh, VIC, 3166]","['3 Beds', '2 Baths', '2 Parking']",3166,3166,oakleigh,17.3138,0.4079,5.0104,0.6788
4,"[-37.894307, 145.082706]",$475,Jason Gu,"[1/14, Bletchley, Road, Hughesdale, VIC, 3166]","['2 Beds', '1 Bath', '1 Parking']",3166,3166,hughesdale,17.5619,1.298,5.1099,0.6205
