### This notebook follows ``ors_iteration_add_rentalDistance.ipynb``, which computed the distance/time from each property to every place and to the CBD. Here we keep only the closest distances.

In [1]:
import pandas as pd
import sys
import os
# caution: path[0] is reserved for script path (or '' in REPL)
sys.path.insert(1, '../../scripts/')
from add_distance import get_min_distance_time

In [2]:
# Ensure the ../../data/distance folder exists (a later cell reads the
# per-year `{year}_min_distance.csv` files from it).
# exist_ok=True makes this a no-op when the folder is already present and
# avoids the check-then-create race of a separate os.path.exists() test.
os.makedirs('../../data/distance', exist_ok=True)

In [19]:
YEARS = [2013, 2014, 2015, 2016, 2017, 2018, 2019, 2020, 2021, 2022]
for year in YEARS:
    distance_df = pd.read_csv(f"../../data/featured/{year}_distance_rental_place.csv")

    # Rows whose distance AND time are both exactly 0 are leftovers of failed
    # API requests: collect their index labels and discard them.
    is_failed_request = (distance_df['dist_to_place_M'] == 0.0) & (distance_df['time_to_place_S'] == 0.0)
    failed_labels = distance_df[is_failed_request].index
    clean_distance_df = distance_df.drop(failed_labels)

    # Count the distinct (address, month) pairs that survive the cleaning so
    # the figure can be compared with what get_min_distance_time reports.
    is_repeated_pair = clean_distance_df.duplicated(subset=['address', 'month'])
    addr_month_pair = int((~is_repeated_pair).sum())
    print(f"Originally {addr_month_pair} address-month pairs, shape = {clean_distance_df.shape}")

    # Project helper from ../../scripts/add_distance.py; its result is kept in
    # a notebook variable but not used further in this cell.
    min_distance_df = get_min_distance_time(clean_distance_df, year)

Originally 10940 address-month pairs, shape = (153844, 25)
Completed Year 2013, updated 10940 address-month pairs, shape = (34707, 22)
Originally 12088 address-month pairs, shape = (195824, 25)
Completed Year 2014, updated 12088 address-month pairs, shape = (40289, 22)
Originally 12490 address-month pairs, shape = (203384, 25)
Completed Year 2015, updated 12490 address-month pairs, shape = (41674, 22)
Originally 14751 address-month pairs, shape = (292277, 25)
Completed Year 2016, updated 14751 address-month pairs, shape = (49700, 22)
Originally 16683 address-month pairs, shape = (329737, 25)
Completed Year 2017, updated 16683 address-month pairs, shape = (56149, 22)
Originally 19034 address-month pairs, shape = (373103, 25)
Completed Year 2018, updated 19034 address-month pairs, shape = (64021, 22)
Originally 20864 address-month pairs, shape = (404342, 25)
Completed Year 2019, updated 20864 address-month pairs, shape = (70029, 22)
Originally 20472 address-month pairs, shape = (411940, 

### Transform the dataset: map row-wise distances to column-wise distances

In [20]:
# Ensure the output folder for the transformed per-year files exists.
# exist_ok=True makes this a no-op when the folder is already present and
# avoids the check-then-create race of a separate os.path.exists() test.
os.makedirs('../../data/curated/min_distance', exist_ok=True)

In [25]:
YEARS = [2013, 2014, 2015, 2016, 2017, 2018, 2019, 2020, 2021, 2022]

# Property-level columns carried over from the row-wise input.
house_columns = [
    'address', 'latitude_ori', 'longitude_ori', 'nbed', 'nbath', 'ncar',
    'weekly_rent', 'postcode', 'year', 'month', 'residence_type',
    'SA2_CODE', 'dist_to_cbd_KM',
]
# Value of `place_type` in the input -> name of the distance column to create.
place_columns = {
    'park': 'min_distance_to_park',
    'primary': 'min_distance_to_prim',
    'secondary': 'min_distance_to_second',
    'station': 'min_distance_to_train',
}
# Columns identifying one property; year/month are deliberately absent, so
# all rows of a property within a yearly file collapse into one output row.
group_keys = [
    'address', 'latitude_ori', 'longitude_ori', 'nbed', 'nbath', 'ncar',
    'postcode', 'SA2_CODE', 'residence_type', 'dist_to_cbd_KM',
]
# 'first' keeps the first non-null distance found in the group;
# weekly_rent is averaged over every row of the group.
aggregations = {
    'min_distance_to_park': 'first',
    'min_distance_to_prim': 'first',
    'min_distance_to_second': 'first',
    'min_distance_to_train': 'first',
    'weekly_rent': 'mean',
}

for year in YEARS:
    min_distance_df = pd.read_csv(f"../../data/distance/{year}_min_distance.csv")
    print(f"original min distance shape {min_distance_df.shape}")

    house_df = min_distance_df[house_columns]

    # Add one distance column per place type. The join is on the row index,
    # so a row only receives a value in the column matching its own
    # place_type and NaN in the others.
    added_df = house_df
    for place_type, column_name in place_columns.items():
        is_this_type = min_distance_df['place_type'] == place_type
        nearest = min_distance_df.loc[is_this_type, 'min_distance_to_place_KM']
        added_df = added_df.join(nearest.rename(column_name))

    # Collapse the rows of each property into a single wide row.
    transformed_df = (
        added_df.groupby(group_keys, as_index=False)
        .agg(aggregations)
        .drop(columns=['latitude_ori', 'longitude_ori'])
    )

    print(f"transformed df has shape {transformed_df.shape}")
    transformed_df.to_csv(f"../../data/curated/min_distance/{year}_min_distance.csv", index=False)

original min distance shape (34707, 22)
transformed df has shape (8526, 13)
original min distance shape (40289, 22)
transformed df has shape (9352, 13)
original min distance shape (41674, 22)
transformed df has shape (10191, 13)
original min distance shape (49700, 22)
transformed df has shape (11663, 13)
original min distance shape (56149, 22)
transformed df has shape (12873, 13)
original min distance shape (64021, 22)
transformed df has shape (14155, 13)
original min distance shape (70029, 22)
transformed df has shape (15928, 13)
original min distance shape (68584, 22)
transformed df has shape (15777, 13)
original min distance shape (83279, 22)
transformed df has shape (19203, 13)
original min distance shape (231399, 22)
transformed df has shape (54094, 13)


In [27]:
# Re-run the row-wise -> column-wise transformation on the 2020 file only,
# without writing anything, to eyeball the resulting table.
min_distance_df = pd.read_csv(f"../../data/distance/2020_min_distance.csv")
print(f"original min distance shape {min_distance_df.shape}")

house_df = min_distance_df[[
    'address', 'latitude_ori', 'longitude_ori', 'nbed', 'nbath', 'ncar',
    'weekly_rent', 'postcode', 'year', 'month', 'residence_type',
    'SA2_CODE', 'dist_to_cbd_KM',
]]

# (place_type value, output column) pairs, in output column order.
place_targets = (
    ('park', 'min_distance_to_park'),
    ('primary', 'min_distance_to_prim'),
    ('secondary', 'min_distance_to_second'),
    ('station', 'min_distance_to_train'),
)

# Index-aligned join: each row gets a value only in the column that matches
# its own place_type, NaN elsewhere.
added_df = house_df
for place_type, target_column in place_targets:
    distances = min_distance_df.loc[
        min_distance_df['place_type'] == place_type, 'min_distance_to_place_KM'
    ]
    added_df = added_df.join(distances.rename(target_column))

# One row per property: first non-null distance per place type, mean rent
# over all of the property's rows.
property_keys = ['address', 'latitude_ori', 'longitude_ori', 'nbed', 'nbath', 'ncar',
                 'postcode', 'SA2_CODE', 'residence_type', 'dist_to_cbd_KM']
how_to_aggregate = {target_column: 'first' for _, target_column in place_targets}
how_to_aggregate['weekly_rent'] = 'mean'

grouped = added_df.groupby(property_keys, as_index=False).agg(how_to_aggregate)
transformed_df = grouped.drop(columns=['latitude_ori', 'longitude_ori'])

print(f"transformed df has shape {transformed_df.shape}")
transformed_df.head(20)

original min distance shape (68584, 22)
transformed df has shape (15777, 13)


Unnamed: 0,address,nbed,nbath,ncar,postcode,SA2_CODE,residence_type,dist_to_cbd_KM,min_distance_to_park,min_distance_to_prim,min_distance_to_second,min_distance_to_train,weekly_rent
0,". 'WILLOW COTTAGE' TARCOMBE RD, AVENEL",3.0,1.0,1,3664,204011058,House,126.1111,18.377,9.08981,,10.60099,250.0
1,". 'WILLOW COTTAGE' TARCOMBE ROAD, AVENEL",3.0,1.0,1,3664,204011058,House,127.42434,19.69024,10.40304,,11.91423,250.0
2,". UNDER APPLICATION ., DOLLAR",3.0,2.0,0,3871,205031087,Other,158.61377,6.05915,6.98571,19.96922,15.76803,370.0
3,". UNDER APPLICATION ., FAIRBANK",3.0,2.0,2,3951,205031090,Other,135.618,10.3244,10.17627,9.78421,6.72449,420.0
4,".275 SOLDIERS SETTLERS ROAD, TALLANGATTA VALLEY",3.0,1.0,2,3701,204031072,House,367.34544,12.32206,5.23339,15.68916,5.419,280.0
5,".4 BURNETT STREET, YARRAGON",3.0,1.0,1,3823,205011078,House,117.01551,0.91479,0.60114,9.59293,0.6766,300.0
6,"0 MANAGERS RESIDENCE, BOLINDA VALE, CLARKEFIELD",3.0,1.0,2,3430,210021235,House,50.62358,15.87879,0.88293,14.93019,0.85544,330.0
7,"003/903 DANDENONG ROAD, MALVERN EAST",1.0,1.0,0,3145,208041195,Other,13.86135,0.9325,0.9188,2.494,2.208,305.0
8,"005/903 DANDENONG ROAD, MALVERN EAST",1.0,1.0,0,3145,208041195,Other,13.86135,0.9325,0.9188,2.494,2.208,270.0
9,"01/17 MACQUARIE STREET, PRAHRAN",1.0,1.0,0,3181,206061136,Other,6.76544,,0.87658,0.16912,0.65365,260.0
