In [None]:
import os

import geopy
import pandas as pd
from geopy.extra.rate_limiter import RateLimiter
from tqdm import tqdm

In [None]:
# base directory for the data files; it ends with a slash because the
# other cells build paths by appending sub-paths directly to this string
property_files = '../data/'

In [None]:
# load the curated past-listing records
df = pd.read_csv(f'{property_files}curated/past_listing.csv')
# print the number of rows to get a rough idea of
# how much data is dropped at each step
print(len(df))
# `errors` only accepts 'ignore' or 'raise'; the previous value 'coerce' is not
# a valid option and was silently treated like 'ignore', so state that explicitly
df = df.rename(columns={'code': 'postcode'}, errors='ignore')
# the address must have been listed more than once,
# as we want to predict the future price using past listing data
listings_per_address = df.groupby('address')['address'].transform('count')
df = df[listings_per_address > 1]

In [None]:
# preview the first five rows that survived the repeat-listing filter
df.head(n=5)

In [None]:
# read in the postcode information
postcode_df = pd.read_csv(f'{property_files}raw/abs/australian_postcodes.csv')
# only retain the rows and columns that are relevant, for faster running time
postcode_df = postcode_df.loc[postcode_df['state'] == 'VIC', ['postcode', 'lgaregion']]
# the lookup may list the same postcode on several rows (presumably one row per
# locality -- TODO confirm against the file); identical (postcode, lgaregion)
# pairs would otherwise multiply every matching listing row in the merge below.
# NOTE: a postcode that maps to more than one LGA still yields one row per LGA.
postcode_df = postcode_df.drop_duplicates()
# attach the LGA name to every listing (inner join on postcode)
df = pd.merge(df, postcode_df, on='postcode')

In [None]:
# LGAs that make up metropolitan Melbourne, as defined by the Victorian government
metro_melb = [
    'Banyule', 'Bayside', 'Boroondara', 'Brimbank', 'Cardinia', 'Casey',
    'Darebin', 'Frankston', 'Glen Eira', 'Greater Dandenong', 'Hobsons Bay',
    'Hume', 'Kingston', 'Knox', 'Manningham', 'Maribyrnong', 'Maroondah',
    'Melbourne', 'Melton', 'Monash', 'Moonee Valley', 'Moreland',
    'Mornington Peninsula', 'Nillumbik', 'Port Phillip', 'Stonnington',
    'Whitehorse', 'Whittlesea', 'Wyndham', 'Yarra', 'Yarra Ranges',
]
# only retain records whose LGA is in metropolitan Melbourne
in_metro = df['lgaregion'].isin(metro_melb)
df = df.loc[in_metro]

In [None]:
# number of records left after the metropolitan filter
df.shape[0]

In [None]:
# there are still a lot of entries and API usage is restricted, so
# we only use properties with fairly recent records, namely those
# that have been listed in 2021 or 2022
recent_years = [2021, 2022]
is_recent = df['year'].isin(recent_years)
# deduplicate: a property listed three times has three rows, but must
# appear in the list only once.
# drop_duplicates keeps first-appearance order, so the list (and therefore
# addresses[0] and the geocoding order) is the same on every run; the previous
# list(set(...)) ordering changed between kernel sessions
addresses = df.loc[is_recent, 'address'].drop_duplicates().tolist()
# check how many addresses are going to be geocoded
len(addresses)

In [None]:
# show one sample entry from the list of addresses to be geocoded
addresses[0]

In [None]:
# start both accumulators empty
# NOTE(review): neither name is read by any other cell visible here; the
# original kept a commented-out line deriving coded_l from geocode_df.address
geocode_df, coded_l = [], []

In [None]:
# as there is a request limit of 1 request per second for the Nominatim API
# and the data is large, the estimated run time would be multiple days;
# hence, we need to cache responses
# this chunk reads the responses cached by earlier runs
cache_path = f'{property_files}raw/geo.csv'
if os.path.exists(cache_path):
    # the first CSV column is the index written out by to_csv, so skip it
    df2 = pd.read_csv(cache_path).iloc[:, 1:]
    # a set gives constant-time membership tests; the previous list lookup
    # rescanned every cached address for each candidate address
    addresses_saved = set(df2['address'])
    # remove the addresses that have already been requested
    addresses = [address for address in addresses if address not in addresses_saved]
# number of addresses that still need to be requested (kept at the top level
# of the cell so the notebook actually displays it)
len(addresses)

In [None]:
# request geocode for each address
# build the client once and reuse it instead of re-creating it per address
locator = geopy.Nominatim(user_agent="myGeocoder")
# enforce the 1 request per second limit mentioned above; RateLimiter also
# retries failed service calls a few times, and with swallow_exceptions=False
# a persistent failure still raises instead of being silently skipped
geocode = RateLimiter(locator.geocode, min_delay_seconds=1, swallow_exceptions=False)
geo_data = []
for address in tqdm(addresses):
    location = geocode(address + ", VICTORIA", timeout=None)
    # geocode returns None when nothing matched; such addresses are skipped
    if location is not None:
        # save the response as one row: query, resolved address, latitude, longitude
        geo_data.append([address, location.address, location.latitude, location.longitude])

In [None]:
if os.path.exists(f'{property_files}raw/geo.csv'):
    # merge with the previously cached responses (df2 was loaded from the cache).
    # DataFrame.append was removed in pandas 2.0, so use pd.concat; skip it
    # entirely when this run produced no new rows
    if geo_data:
        new_rows = pd.DataFrame(geo_data, columns=df2.columns)
        # the index is discarded when the cache is read back, so renumber it
        df2 = pd.concat([df2, new_rows], ignore_index=True)
else:
    # no cache yet: the new responses become the cache dataframe
    df2 = pd.DataFrame(geo_data, columns=['address', 'loc_address', 'lat', 'lon'])

In [None]:
# look at the first five geocoded rows
df2.head(n=5)

In [None]:
# save/cache the result to a csv file
# property_files already ends with '/', so no extra slash is added; this is the
# same path the cache is read from. The index column is written on purpose:
# the reader drops the first column when loading the cache.
df2.to_csv(f'{property_files}raw/geo.csv')