In [None]:
import pandas as pd 
import numpy as np 
import json 
import requests 
from ast import literal_eval
import time 
import re 

In [None]:
# Load the feature-engineered scrape and tidy it in a single pass.
rental_df = (
    pd.read_csv("../data/curated/Feature_Engineered_Scraped_Dataset.csv")
    # 'Unnamed: 0' is presumably the index column written by an earlier to_csv.
    .drop(columns='Unnamed: 0')
    # Duplicate the postcode under the name the nearest_* lookups read.
    .assign(house_postcode=lambda listings: listings['postcode'])
    # Discard every listing that has a missing value in any column.
    .dropna(axis=0, how="any")
    .reset_index(drop=True)
)

# Only the first 12100 complete listings are carried through the rest of the notebook.
sample_rental_df = rental_df.iloc[:12100].reset_index(drop=True)

## Calculating distance from the house to CBD

####  This function calculates the distance (in km) between any latitude and longitude to Melbourne CBD 

In [None]:
def cbd_distance(house_coords, timeout=30):
    """Return the driving distance in km from a house to Melbourne CBD.

    Sends one request to the public OSRM route service and reads the
    distance of the first leg of the first route in the response.

    Parameters
    ----------
    house_coords : sequence
        ``[latitude, longitude]`` of the house.
    timeout : float, optional
        Seconds to wait for the server before giving up (default 30).
        Without a timeout a stalled connection would block forever.

    Returns
    -------
    float
        Route distance in kilometres.

    Raises
    ------
    requests.RequestException
        On connection failure, timeout, or a non-2xx HTTP status.
    KeyError, IndexError, ValueError
        If the response body is not JSON or contains no route.
    """
    # cbd coordinates, stored as [latitude, longitude]
    melb_cbd = [-37.815207, 144.963937]

    # The URL takes "longitude,latitude" pairs, hence the swapped indices.
    url = (
        "http://router.project-osrm.org/route/v1/car/"
        f"{house_coords[1]},{house_coords[0]};{melb_cbd[1]},{melb_cbd[0]}"
        "?overview=false"
    )
    journey = requests.get(url, timeout=timeout)
    # Fail loudly on HTTP errors instead of trying to parse an error page.
    journey.raise_for_status()

    # extract distance from trip information nested dictionary
    distance_meters = json.loads(journey.content)['routes'][0]['legs'][0]['distance']

    return distance_meters / 1000

#### This function loops through the dataframe and returns a list of CBD distances, one per house

In [None]:
def calculate_cbd_distance(dataframe):
    """Compute the CBD road distance for every row of ``dataframe``.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Must have a ``coordinates`` column holding string literals that
        ``ast.literal_eval`` can parse into ``[latitude, longitude]``.

    Returns
    -------
    list
        One entry per row, in row order: the distance in km, or ``np.nan``
        when parsing or the routing lookup failed for that row.
    """
    CBD_distances = []
    for raw_coords in dataframe['coordinates']:
        try:
            CBD_distances.append(cbd_distance(literal_eval(raw_coords)))
        except Exception:
            # Best effort: one bad row must not abort the whole run.
            # `Exception` (not a bare except) lets Ctrl-C / kernel interrupt
            # still stop this long loop.
            CBD_distances.append(np.nan)

    return CBD_distances

#### Creating new feature called 'CBD_Distance' 

In [None]:
# One routing request is issued per row, so this cell is slow on the full sample.
cbd_distances = calculate_cbd_distance(sample_rental_df)

In [None]:
# Attach the distances as a new column, then checkpoint the frame to disk
# so the slow CBD lookups do not have to be repeated.
sample_rental_df.loc[:,'CBD_Distance'] = cbd_distances
sample_rental_df.to_csv('../data/raw/tmp_with_cbd_dist.csv') 
sample_rental_df.head(5)

## Calculating distance from the house to the nearest school

#### Cleaning up the school dataset and retaining only the necessary columns 

In [None]:
# setup school dataset coordinates and location to calculate proximity 

retained_cols = ['SCHOOL_NO', 'X' , 'Y', 'Postal_Postcode', 'Address_Town']
vic_schools = pd.read_csv('../data/raw/dv309_schoollocations2021.csv', encoding='cp1252',
                         usecols = retained_cols)

# Drop incomplete rows BEFORE building the coordinate lists: a list cell such
# as [nan, nan] is never treated as missing by dropna, so filtering on the
# combined 'coordinates' column would silently keep schools with no location.
vic_schools = vic_schools.dropna(subset = ['X', 'Y', 'Postal_Postcode'], how = 'any').reset_index(drop = True)

# Stored as [Y, X]; presumably Y is latitude and X is longitude, matching the
# [latitude, longitude] order the distance functions index into.
vic_schools['coordinates'] = vic_schools[['Y','X']].values.tolist()
# Lower-cased so it can be compared with the house 'suburb' value.
vic_schools['Address_Town'] = vic_schools['Address_Town'].str.lower()

vic_schools = vic_schools.drop(['X', 'Y'], axis=1)

vic_schools.head(5)

#### Function to calculate road distance between any two points 

In [None]:
def calculate_proximity(point1, point2, timeout=30):
    """Return the driving distance in km between two points.

    Sends one request to the public OSRM route service and reads the
    distance of the first leg of the first route in the response.

    Parameters
    ----------
    point1, point2 : sequence
        ``[latitude, longitude]`` of the start and end of the journey.
    timeout : float, optional
        Seconds to wait for the server before giving up (default 30).
        Without a timeout a stalled connection would block forever.

    Returns
    -------
    float
        Route distance in kilometres.

    Raises
    ------
    requests.RequestException
        On connection failure, timeout, or a non-2xx HTTP status.
    KeyError, IndexError, ValueError
        If the response body is not JSON or contains no route.
    """
    # The URL takes "longitude,latitude" pairs, hence the swapped indices.
    url = (
        "http://router.project-osrm.org/route/v1/car/"
        f"{point1[1]},{point1[0]};{point2[1]},{point2[0]}"
        "?overview=false"
    )
    journey = requests.get(url, timeout=timeout)
    # Fail loudly on HTTP errors instead of trying to parse an error page.
    journey.raise_for_status()

    distance_meters = json.loads(journey.content)['routes'][0]['legs'][0]['distance']

    return distance_meters / 1000

#### Function to calculate nearest school distance from house

Algorithm 

* Find the schools in the same suburb - Calculate distance to each of them and then choose closest one 
* If no school in same suburb - Expand search for schools in the same postcode
* If no school in same suburb and same postcode - Expand search to nearby postcodes 

In [None]:
def nearest_school_distance(house):
    """Return the road distance (km) from a house to its nearest school.

    Candidate schools are chosen in stages, stopping at the first stage
    that yields at least one school:

    1. schools whose ``Address_Town`` equals the house's ``suburb``;
    2. schools whose ``Postal_Postcode`` equals the house's ``house_postcode``;
    3. schools inside the smallest whole-number postcode window
       ``[postcode - r, postcode + r]`` (``r >= 1``) that contains a school.

    The road distance to every candidate is computed with
    ``calculate_proximity`` and the smallest one is returned.

    Parameters
    ----------
    house : mapping (e.g. a DataFrame row)
        Needs ``suburb``, ``house_postcode`` and ``coordinates`` (a string
        literal of ``[latitude, longitude]``).

    Returns
    -------
    float
        Distance in km, or ``np.nan`` when no school can be matched at all
        (empty school table or a missing house postcode).

    Raises
    ------
    Exception
        Whatever ``literal_eval`` or ``calculate_proximity`` raises.
    """
    house_coords = literal_eval(house['coordinates'])

    candidates = vic_schools[vic_schools['Address_Town'] == house['suburb']]

    if candidates.empty:
        house_postcode = house['house_postcode']
        candidates = vic_schools[vic_schools['Postal_Postcode'] == house_postcode]

        if candidates.empty:
            # Distance of every school's postcode from the house's postcode.
            offsets = (vic_schools['Postal_Postcode'] - house_postcode).abs()
            if offsets.isna().all():
                # Nothing comparable: widening the window could never succeed.
                return np.nan
            # Smallest whole-number window that captures at least one school;
            # same result as widening by 1 repeatedly, but always terminates.
            radius = max(1, np.ceil(offsets.min()))
            candidates = vic_schools[offsets <= radius]

    return min(calculate_proximity(house_coords, school_coords)
               for school_coords in candidates['coordinates'])

#### Loop through the dataframe and calculate the distance to the closest school for all houses

In [None]:
def calculate_school_distance(dataframe):
    """Compute the nearest-school road distance for every row of ``dataframe``.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Each row is passed to ``nearest_school_distance``.

    Returns
    -------
    list
        One entry per row, in row order: the distance in km, or ``np.nan``
        when the lookup failed for that row.
    """
    school_distances = []
    # iterrows walks rows by position, so this works for any index; looking
    # index labels up with .iloc is only right for a 0..n-1 index.
    for _, house in dataframe.iterrows():
        try:
            school_distances.append(nearest_school_distance(house))
        except Exception:
            # Best effort per row; `Exception` (not a bare except) keeps
            # Ctrl-C / kernel interrupt working during this long loop.
            school_distances.append(np.nan)

    return school_distances

#### Create new feature for nearest school and save temporary data frame

In [None]:
# Every house triggers one routing request per candidate school, so this cell is slow.
school_distances = calculate_school_distance(sample_rental_df)

In [None]:
# Attach the school distances (km, NaN where the lookup failed) as a new column.
sample_rental_df.loc[:,'Nearest_School_Distance'] = school_distances
sample_rental_df.head(5)

## Calculating distance from the house to the nearest hospital

#### Cleaning up hospital dataset 

In [None]:
retained_columns = ['Hospital name','State','Suburb', 'Latitude', 'Longitude', 'Postcode', 'Hospital_ID']

# Read, filter and reshape the hospital table in one chain.
hospitals = (
    pd.read_csv('../data/raw/myhospitals-contact-details.csv', encoding='cp1252',
                usecols=retained_columns)
    # Keep only rows whose State is 'Vic'.
    .loc[lambda frame: frame['State'] == 'Vic']
    .reset_index(drop=True)
    .assign(
        # Lower-cased so it can be compared with the house 'suburb' value.
        Suburb=lambda frame: frame['Suburb'].str.lower(),
        # [latitude, longitude] list per hospital, as the distance helpers expect.
        coordinates=lambda frame: frame[['Latitude','Longitude']].values.tolist(),
    )
    .drop(['Latitude','Longitude'], axis=1)
)

#### Calculating distance from a given house to the nearest hospital

In [None]:
def nearest_hospital(house):
    """Return the road distance (km) from a house to its nearest hospital.

    Candidate hospitals are chosen in stages, stopping at the first stage
    that yields at least one hospital:

    1. hospitals whose ``Suburb`` equals the house's ``suburb``;
    2. hospitals whose ``Postcode`` equals the house's ``house_postcode``;
    3. hospitals inside the smallest whole-number postcode window
       ``[postcode - r, postcode + r]`` (``r >= 1``) that contains a hospital.

    The road distance to every candidate is computed with
    ``calculate_proximity`` and the smallest one is returned.

    Parameters
    ----------
    house : mapping (e.g. a DataFrame row)
        Needs ``suburb``, ``house_postcode`` and ``coordinates`` (a string
        literal of ``[latitude, longitude]``).

    Returns
    -------
    float
        Distance in km, or ``np.nan`` when no hospital can be matched at all
        (empty hospital table or a missing house postcode).

    Raises
    ------
    Exception
        Whatever ``literal_eval`` or ``calculate_proximity`` raises.
    """
    house_coords = literal_eval(house['coordinates'])

    candidates = hospitals[hospitals['Suburb'] == house['suburb']]

    if candidates.empty:
        house_postcode = house['house_postcode']
        candidates = hospitals[hospitals['Postcode'] == house_postcode]

        if candidates.empty:
            # Distance of every hospital's postcode from the house's postcode.
            offsets = (hospitals['Postcode'] - house_postcode).abs()
            if offsets.isna().all():
                # Nothing comparable: widening the window could never succeed.
                return np.nan
            # Smallest whole-number window that captures at least one hospital;
            # same result as widening by 1 repeatedly, but always terminates.
            radius = max(1, np.ceil(offsets.min()))
            candidates = hospitals[offsets <= radius]

    # A single candidate needs no special case: min() of one value is that value.
    return min(calculate_proximity(house_coords, hospital_coords)
               for hospital_coords in candidates['coordinates'])

In [None]:
def calculate_hospital_distance(dataframe):
    """Compute the nearest-hospital road distance for every row of ``dataframe``.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Each row is passed to ``nearest_hospital``.

    Returns
    -------
    list
        One entry per row, in row order: the distance in km, or ``np.nan``
        when the lookup failed for that row.
    """
    hospital_distances = []
    # iterrows walks rows by position, so this works for any index; looking
    # index labels up with .iloc is only right for a 0..n-1 index.
    for _, house in dataframe.iterrows():
        try:
            hospital_distances.append(nearest_hospital(house))
        except Exception:
            # Best effort per row; `Exception` (not a bare except) keeps
            # Ctrl-C / kernel interrupt working during this long loop.
            hospital_distances.append(np.nan)

    return hospital_distances

In [None]:
# Every house triggers one routing request per candidate hospital, so this cell is slow.
hospital_distances = calculate_hospital_distance(sample_rental_df)

In [None]:
# Attach the hospital distances (km, NaN where the lookup failed) as a new column.
sample_rental_df['nearest_hospital'] = hospital_distances
sample_rental_df.head(5)

## Calculating distance from the house to the nearest train station

#### Handling metro and regional station data

In [None]:
postcodes = pd.read_csv('../data/curated/victoria_postcodes.csv')
postcodes['locality'] = postcodes['locality'].str.lower()

retained_cols = ['STOP_NAME', 'LATITUDE', 'LONGITUDE']


def _load_stations(csv_path, station_type):
    """Read one PTV station file and attach suburb, type and postcode.

    The suburb is the text inside the parentheses of ``STOP_NAME``,
    lower-cased, and is joined to ``postcodes`` on ``locality`` (inner join,
    so stations whose suburb has no postcode entry are dropped). Rows with
    any missing value are removed.

    Parameters
    ----------
    csv_path : str
        Path of the station CSV.
    station_type : str
        Label stored in the ``Type`` column.

    Returns
    -------
    pandas.DataFrame
        Columns from ``retained_cols`` plus ``Suburb``, ``Type``, ``postcode``.
    """
    stations = pd.read_csv(csv_path, usecols = retained_cols)
    # Raw string: '\(' in a normal string literal is an invalid escape sequence.
    stations['Suburb'] = stations.STOP_NAME.str.extract(r'.*\((.*)\).*')
    stations['Suburb'] = stations['Suburb'].str.lower()
    stations['Type'] = station_type

    stations = stations.merge(postcodes[['locality', 'postcode']],
                              left_on='Suburb', right_on = 'locality', how='inner')
    stations = stations.drop('locality', axis = 1)
    return stations.dropna(how = 'any').reset_index(drop = True)


metro = _load_stations('../data/raw/datasource-VIC_Govt_PTV-VIC_Govt_DELWP_datavic_PTV_METRO_TRAIN_STATION.csv',
                       'Metro')
regional = _load_stations('../data/raw/datasource-VIC_Govt_PTV-VIC_Govt_DELWP_datavic_PTV_REGIONAL_TRAIN_STATION.csv',
                          'Regional')

# One combined table of all stations with a [latitude, longitude] list per row.
all_stations = pd.concat([metro, regional]).reset_index(drop = True)
all_stations['coordinates']= all_stations[['LATITUDE','LONGITUDE']].values.tolist()
all_stations = all_stations.drop(['LATITUDE','LONGITUDE'], axis = 1)

all_stations.head(5)

#### Function to find nearest train station for a given house 

In [None]:
def nearest_station(house):
    """Return the road distance (km) from a house to its nearest train station.

    Candidate stations are chosen in stages, stopping at the first stage
    that yields at least one station:

    1. stations whose ``Suburb`` equals the house's ``suburb``;
    2. stations whose ``postcode`` equals the house's ``house_postcode``;
    3. stations inside the smallest whole-number postcode window
       ``[postcode - r, postcode + r]`` (``r >= 1``) that contains a station.

    The road distance to every candidate is computed with
    ``calculate_proximity`` and the smallest one is returned.

    Parameters
    ----------
    house : mapping (e.g. a DataFrame row)
        Needs ``suburb``, ``house_postcode`` and ``coordinates`` (a string
        literal of ``[latitude, longitude]``).

    Returns
    -------
    float
        Distance in km, or ``np.nan`` when no station can be matched at all
        (empty station table or a missing house postcode).

    Raises
    ------
    Exception
        Whatever ``literal_eval`` or ``calculate_proximity`` raises.
    """
    house_coords = literal_eval(house['coordinates'])

    candidates = all_stations[all_stations['Suburb'] == house['suburb']]

    if candidates.empty:
        house_postcode = house['house_postcode']
        candidates = all_stations[all_stations['postcode'] == house_postcode]

        if candidates.empty:
            # Distance of every station's postcode from the house's postcode.
            offsets = (all_stations['postcode'] - house_postcode).abs()
            if offsets.isna().all():
                # Nothing comparable: widening the window could never succeed.
                return np.nan
            # Smallest whole-number window that captures at least one station;
            # same result as widening by 1 repeatedly, but always terminates.
            radius = max(1, np.ceil(offsets.min()))
            candidates = all_stations[offsets <= radius]

    # A single candidate needs no special case: min() of one value is that value.
    return min(calculate_proximity(house_coords, station_coords)
               for station_coords in candidates['coordinates'])

In [None]:
def calculate_train_distance(dataframe):
    """Compute the nearest-station road distance for every row of ``dataframe``.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Each row is passed to ``nearest_station``.

    Returns
    -------
    list
        One entry per row, in row order: the distance in km, or ``np.nan``
        when the lookup failed for that row.
    """
    train_distances = []
    # iterrows walks rows by position, so this works for any index; looking
    # index labels up with .iloc is only right for a 0..n-1 index.
    for _, house in dataframe.iterrows():
        try:
            train_distances.append(nearest_station(house))
        except Exception:
            # Best effort per row; `Exception` (not a bare except) keeps
            # Ctrl-C / kernel interrupt working during this long loop.
            train_distances.append(np.nan)

    return train_distances

In [None]:
# Every house triggers one routing request per candidate station, so this cell is slow.
station_distances = calculate_train_distance(sample_rental_df)

In [None]:
# Attach the station distances (km, NaN where the lookup failed) and write the
# frame with all distance features to the curated folder.
sample_rental_df['nearest_train_station'] = station_distances 
# to_csv is called without index=False, so the row index is written as the first column.
sample_rental_df.to_csv('../data/curated/Scraped_Primary_All_Features.csv') 

sample_rental_df.head(5)