# Pre-processing

In [14]:
from IPython.display import display
import pandas as pd
import re


# Locations of the raw scrape output and of the curated (cleaned) data.
DIR_RAW = "../data/raw/"
DIR_CUR = "../data/curated/"

# --- Regular expressions --------------------------------------------------
# Each pattern exposes the value of interest as a capture group.

# Dollar amount that is either followed by a weekly-rent suffix
# ("per week", "pw", "p/w", "wk", "weekly", ...) or ends the string;
# group 1 is the amount itself.
PATTERN_PRICE = r"\$?\s*(\d[\d\.,]+)(([\s\/]*((per[\s\/]week)|(weekly)|(p[\/.]*w[k\.]*)|(wk)|(a week)|(w)|(week)|(p\/week)|(per weekly)|(per wk))\b)|$)"

# Property feature counts, e.g. "3 beds", "1 bath", "2 parking".
PATTERN_BED = r"(\d+) beds?"
PATTERN_BATH = r"(\d+) baths?"
PATTERN_CAR = r"(\d+) parking"

# Address parts: the word preceding a four-digit number, and the four
# digits following "vic".
PATTERN_STATE = r".+ (\w+) \d{4}"
PATTERN_POSTCODE = r"vic (\d{4})"

# Listing details.
PATTERN_BOND = r"bond \$?(\d+)"
PATTERN_INTERNAL_AREA = r"internal area ([\d\.]+)m"
PATTERN_LAND_AREA = r"land area ([\d\.]+)m"
PATTERN_LAST_SOLD = r"last sold in (\d{4})"
PATTERN_OTHER_SOLD = r"(\d+) other"
PATTERN_FIRST_LISTED = r"first listed on (\d+ \w+),"

# Generic numeric shapes: decimal number, number with optional m/k suffix,
# integer with commas, and "A to B" / "N+" ranges.
PATTERN_PERCENTAGE = r"(\d+\.?\d*)"
PATTERN_PERFOMANCE_PRICE = r"(\d+\.?\d*[mk]?)"
PATTERN_INT = r"([\d,]+)"
PATTERN_RANGE = r"(\d+ to \d+)|(\d+\+)"


# --- Converters applied to captured text -----------------------------------
def FUNC_NONE(x):
    """Return *x* unchanged."""
    return x


def FUNC_STR_TO_NUM(x):
    """Strip commas from *x* and parse the remainder as a float."""
    return float(x.replace(",", ""))


def FUNC_PRICE_CONVERT(x):
    """Parse a price such as "1.5m", "680k" or "500" into a float.

    A trailing m/M multiplies by 1,000,000 and a trailing k/K by 1,000;
    anything else is parsed as a plain number.
    """
    suffix = x[-1]
    if suffix in "mM":
        return float(x[0:-1]) * 1000000
    if suffix in "kK":
        return float(x[0:-1]) * 1000
    return float(x)


def FUNC_PERCENTAGE(x):
    """Turn a percentage figure such as "26" into the fraction 0.26."""
    return float(x) / 100


# Do not truncate the column list when a DataFrame is displayed.
pd.options.display.max_columns = None


def pattern_match(df, feature, pattern, function=None):
    """Extract one value per entry of ``df[feature]`` with a regular expression.

    Every entry is converted with ``str()`` and searched case-insensitively
    for ``pattern``. The extracted text is the first capture group that took
    part in the match, so alternations such as ``(a)|(b)`` yield whichever
    branch matched instead of ``None``. If the pattern defines no capture
    group, the whole match is used.

    Args:
        df: Object supporting ``df[feature]`` that yields an iterable of
            entries (for example a DataFrame column).
        feature: Key of the column to scan.
        pattern: Regular expression source string.
        function: Optional callable applied to the extracted text. When
            omitted (``None``) the text is returned unchanged.

    Returns:
        A list with one item per entry: the (converted) extracted text, or
        ``None`` where the pattern did not match or no group participated.
    """
    # Compile once instead of resolving the pattern again for every entry.
    regex = re.compile(pattern, flags=re.IGNORECASE)

    values = []
    for instance in df[feature]:
        match = regex.search(str(instance))

        captured = None
        if match:
            groups = match.groups()
            if groups:
                # group(1) alone is None when a later alternative matched.
                captured = next((g for g in groups if g is not None), None)
            else:
                captured = match.group(0)

        if captured is None:
            values.append(None)
        elif function is None:
            values.append(captured)
        else:
            values.append(function(captured))

    return values

In [15]:
# Load the raw scrape from JSON, and start an empty frame that will
# receive the cleaned columns one by one.
df_raw = pd.read_json(f"{DIR_RAW}scrape_2022-09-02_04-12-25-567025.json")
df_cur = pd.DataFrame()

## Clean Scraped Data
- TODO: also extract the suburb from the address.
- TODO: remove any listing with the address "Barooga NSW 3644" (a NSW town with a Victorian postcode).
- Open question: should latitude/longitude be stored as float or as str?
- Some features should probably be renamed to be more meaningful.

In [16]:
def _percent_spec(column):
    """Spec for a percentage column whose raw and curated names are equal."""
    return (column, column, PATTERN_PERCENTAGE, FUNC_PERCENTAGE)


def _count_spec(column):
    """Spec for an integer-like column whose raw and curated names are equal."""
    return (column, column, PATTERN_INT, FUNC_STR_TO_NUM)


# One entry per curated column, in output order:
#   (curated column, raw column, pattern, converter)
# A pattern of None means the raw column is copied across unchanged.
# NOTE: "first_listed" (PATTERN_FIRST_LISTED on "domain_says") is currently
# not extracted.
_COLUMN_SPECS = [
    ("url", "url", None, None),
    ("postcode", "address", PATTERN_POSTCODE, FUNC_NONE),
    ("state", "address", PATTERN_STATE, FUNC_NONE),
    ("weekly_rent", "price", PATTERN_PRICE, FUNC_STR_TO_NUM),
    ("bond", "bond", PATTERN_BOND, FUNC_STR_TO_NUM),
    ("num_beds", "num_beds", PATTERN_BED, FUNC_STR_TO_NUM),
    ("num_baths", "num_bath", PATTERN_BATH, FUNC_STR_TO_NUM),
    ("num_parking", "num_car", PATTERN_CAR, FUNC_STR_TO_NUM),
    ("property_type", "property_type", None, None),
    ("internal_area", "internal_area", PATTERN_INTERNAL_AREA, FUNC_STR_TO_NUM),
    ("land_area", "land_area", PATTERN_LAND_AREA, FUNC_STR_TO_NUM),
    ("last_sold", "domain_says", PATTERN_LAST_SOLD, FUNC_NONE),
    ("other_sold_n_bed_suburb", "domain_says", PATTERN_OTHER_SOLD, FUNC_STR_TO_NUM),
    _percent_spec("neighbourhood_under_20"),
    _percent_spec("neighbourhood_20_to_39"),
    _percent_spec("neighbourhood_40_to_59"),
    _percent_spec("neighbourhood_above_60"),
    _percent_spec("neighbourhood_long_term_residents"),
    _percent_spec("neighbourhood_owners"),
    _percent_spec("neighbourhood_renter"),
    _percent_spec("neighbourhood_family"),
    _percent_spec("neighbourhood_single"),
    ("performance_median_price", "performance_median_price", PATTERN_PERFOMANCE_PRICE, FUNC_PRICE_CONVERT),
    _percent_spec("performance_auction_clearance"),
    _count_spec("performance_sold_this_year"),
    _count_spec("performance_avg_days_on_market"),
    _count_spec("demographic_population"),
    _percent_spec("demographic_owner"),
    _percent_spec("demographic_renter"),
    _percent_spec("demographic_family"),
    _percent_spec("demographic_single"),
    ("demographic_average_age", "demographic_average_age", PATTERN_RANGE, FUNC_NONE),
]

for _target, _source, _pattern, _converter in _COLUMN_SPECS:
    if _pattern is None:
        df_cur[_target] = df_raw[_source]
    else:
        df_cur[_target] = pattern_match(df_raw, _source, _pattern, _converter)

# Coordinates are cast to float rather than pattern-matched.
for _coordinate in ("latitude", "longitude"):
    df_cur[_coordinate] = df_raw[_coordinate].astype(float)

## Merging Data Sets
- More data sets are still to be merged.

In [17]:
# Read the property-to-school table and join it onto the cleaned listings by
# "url". The join type is left at the pandas default (inner), so listings
# with no matching url in df_school are dropped.
df_school = pd.read_csv(f"{DIR_CUR}property_to_school.csv")
df_cur = pd.merge(df_cur, df_school, on="url")

## Save 'Current' Pre-Processed Data

In [18]:
# Report the final row count and preview the first ten rows.
print(f"There are {df_cur.shape[0]} rows")
#print(df_cur.dtypes)
display(df_cur.head(10))

# DIR_CUR already ends with "/", so no extra separator is added here; this
# avoids a "curated//pre_processed_data.csv" path and matches how the other
# file paths in this notebook are built. The row index is not written.
df_cur.to_csv(f"{DIR_CUR}pre_processed_data.csv", index=False)

There are 15271 rows


Unnamed: 0,url,postcode,state,weekly_rent,bond,num_beds,num_baths,num_parking,property_type,internal_area,land_area,last_sold,other_sold_n_bed_suburb,neighbourhood_under_20,neighbourhood_20_to_39,neighbourhood_40_to_59,neighbourhood_above_60,neighbourhood_long_term_residents,neighbourhood_owners,neighbourhood_renter,neighbourhood_family,neighbourhood_single,performance_median_price,performance_auction_clearance,performance_sold_this_year,performance_avg_days_on_market,demographic_population,demographic_owner,demographic_renter,demographic_family,demographic_single,demographic_average_age,latitude,longitude,school_duration,school_distance
0,https://www.domain.com.au/40-esmond-street-ard...,3022,VIC,400.0,1738.0,3.0,2.0,1.0,House,,,2018.0,37.0,0.26,0.26,0.26,0.21,0.66,0.74,0.26,0.49,0.51,680000.0,0.56,37.0,44.0,3099.0,0.67,0.33,0.41,0.59,20 to 39,-37.775134,144.797067,8.303333,5332.2
1,https://www.domain.com.au/11-grant-avenue-gisb...,3437,VIC,600.0,2608.0,4.0,2.0,1.0,House,,,2022.0,92.0,0.25,0.23,0.28,0.24,0.67,0.81,0.19,0.62,0.38,1098000.0,0.69,92.0,41.0,9000.0,0.83,0.17,0.58,0.42,40 to 59,-37.493363,144.595569,1.713333,717.6
2,https://www.domain.com.au/15-balmoral-avenue-b...,3083,VIC,575.0,2499.0,3.0,1.0,3.0,House,,,2014.0,229.0,0.23,0.26,0.27,0.25,0.79,0.84,0.16,0.57,0.43,830000.0,0.75,229.0,52.0,28681.0,0.72,0.28,0.47,0.53,20 to 39,-37.707414,145.065424,0.858333,214.6
3,https://www.domain.com.au/6-39-wellington-stre...,3182,VIC,600.0,2607.0,2.0,3.0,2.0,Apartment / Unit / Flat,,,2011.0,361.0,0.11,0.54,0.28,0.07,0.35,0.24,0.76,0.47,0.53,630000.0,0.53,361.0,72.0,20218.0,0.35,0.65,0.22,0.78,20 to 39,-37.856525,144.985485,1.955,795.0
4,https://www.domain.com.au/7-9-sheffield-street...,3058,VIC,370.0,,2.0,1.0,1.0,Apartment / Unit / Flat,,,,126.0,0.18,0.37,0.25,0.2,0.62,0.64,0.36,0.47,0.53,557000.0,0.42,126.0,131.0,26188.0,0.65,0.35,0.42,0.58,20 to 39,-37.74628,144.966701,1.346667,549.4
5,https://www.domain.com.au/1305-151-berkeley-st...,3000,VIC,700.0,3042.0,2.0,2.0,1.0,Apartment / Unit / Flat,,,,592.0,0.07,0.78,0.12,0.02,0.13,0.06,0.94,0.11,0.89,542000.0,0.44,592.0,147.0,47279.0,0.3,0.7,0.24,0.76,20 to 39,-37.802205,144.958215,1.423333,638.7
6,https://www.domain.com.au/302-30-newquay-prome...,3008,VIC,1100.0,4780.0,3.0,2.0,1.0,Apartment / Unit / Flat,103.0,,2022.0,70.0,0.14,0.56,0.23,0.07,0.2,0.42,0.58,0.48,0.52,1150000.0,,70.0,112.0,10962.0,0.36,0.64,0.39,0.61,20 to 39,-37.814536,144.942408,1.143333,327.0
7,https://www.domain.com.au/1-glee-street-wyndha...,3024,VIC,360.0,1564.0,3.0,2.0,2.0,House,,251.0,2016.0,280.0,0.0,0.0,0.37,0.63,0.61,0.56,0.44,0.59,0.41,505000.0,0.79,280.0,42.0,23294.0,0.72,0.28,0.51,0.49,20 to 39,-37.848156,144.593031,6.601667,4002.5
8,https://www.domain.com.au/108-mcconnell-street...,3031,VIC,680.0,2955.0,4.0,2.0,1.0,House,,,2022.0,24.0,0.2,0.34,0.31,0.15,0.49,0.57,0.43,0.51,0.49,1514000.0,0.74,24.0,,10821.0,0.47,0.53,0.34,0.66,20 to 39,-37.793445,144.92873,0.711667,189.4
9,https://www.domain.com.au/6-drum-st-rye-vic-39...,3941,VIC,650.0,2824.0,4.0,2.0,2.0,House,,,2019.0,104.0,0.31,0.17,0.27,0.25,0.66,0.77,0.23,0.64,0.36,1523000.0,0.79,104.0,37.0,8412.0,0.75,0.25,0.48,0.52,,-38.385274,144.802606,5.371667,2977.6
