# Pre-processing

In [1]:
from IPython.display import display
import pandas as pd
import re


DIR_RAW = "../data/raw/"
DIR_CUR = "../data/curated/"

PATTERN_PRICE = r"\$?\s*(\d[\d\.,]+)(([\s\/]*((per[\s\/]week)|(weekly)|(p[\/.]*w[k\.]*)|(wk)|(a week)|(w)|(week)|(p\/week)|(per weekly)|(per wk))\b)|$)"
PATTERN_BED = r"(\d+) beds?"
PATTERN_BATH = r"(\d+) baths?"
PATTERN_CAR = r"(\d+) parking"
PATTERN_BOND = r"bond \$?(\d+)"
PATTERN_INTERNAL_AREA = r"internal area ([\d\.]+)m"
PATTERN_LAND_AREA = r"land area ([\d\.]+)m"
PATTERN_LAST_SOLD = r"last sold in (\d{4})"
PATTERN_OTHER_SOLD = r"(\d+) other"
PATTERN_FIRST_LISTED = r"first listed on (\d+ \w+),"
PATTERN_POSTCODE = r"vic (\d{4})"
PATTERN_PERCENTAGE = r"(\d+\.?\d*)"
PATTERN_PERFOMANCE_PRICE = r"(\d+\.?\d*[mk]?)"
PATTERN_INT = r"(\d+)"
PATTERN_RANGE = r"(\d+ to \d+)|(\d+\+)"

FUNC_NONE = lambda x: x
FUNC_STR_TO_NUM = lambda x: float(x.replace(",", ""))
FUNC_PRICE_CONVERT = lambda x: (float(x[0:-1])*1000000 if x[-1] in "mM"
                                else float(x[0:-1])*1000 if x[-1] in "kK"
                                else float(x))
FUNC_PERCENTAGE = lambda x:float(x) / 100


def pattern_match(df, feature, pattern, function=FUNC_NONE):
    instances = df[feature]

    values = []
    for instance in instances:
        instance = str(instance)
        match = re.search(pattern, instance, flags=re.IGNORECASE)
        if match:
            values.append(function(match.group(1)))
        else:
            values.append(None)
    
    return values

In [2]:
df_raw = pd.read_json(f"{DIR_RAW}scrape_2022-09-02_04-12-25-567025.json")
df_cur = pd.DataFrame()

## Clean Scrapped Data
- need to also grab suburb
- should remove any address with Barooga NSW 3644 (vic postcode nsw town)
- should latitude/longitude be float or just str?
- probably should rename some features to be more meaningful

In [3]:
df_cur["url"] = df_raw["url"]
df_cur["postcode"] = pattern_match(df_raw, "address", PATTERN_POSTCODE)
# also need suburb

df_cur["weekly_rent"] = pattern_match(df_raw, "price", PATTERN_PRICE, FUNC_STR_TO_NUM)
df_cur["bond"] = pattern_match(df_raw, "bond", PATTERN_BOND, FUNC_STR_TO_NUM)

df_cur["num_beds"] = pattern_match(df_raw, "num_beds", PATTERN_BED, FUNC_STR_TO_NUM)
df_cur["num_baths"] = pattern_match(df_raw, "num_bath", PATTERN_BATH, FUNC_STR_TO_NUM)
df_cur["num_parking"] = pattern_match(df_raw, "num_car", PATTERN_CAR, FUNC_STR_TO_NUM)

df_cur["internal_area_m^2"] = pattern_match(df_raw, "internal_area", PATTERN_INTERNAL_AREA, FUNC_STR_TO_NUM)
df_cur["land_area_m^2"] = pattern_match(df_raw, "land_area", PATTERN_LAND_AREA, FUNC_STR_TO_NUM)

df_cur["last_sold"] = pattern_match(df_raw, "domain_says", PATTERN_LAST_SOLD)
df_cur["other_sold_n_bed_suburn"] = pattern_match(df_raw, "domain_says", PATTERN_OTHER_SOLD, FUNC_STR_TO_NUM)
#df_cur["first_listed"] = pattern_match(df_raw, "domain_says", PATTERN_FIRST_LISTED)

df_cur["neighbourhood_under_20"] = pattern_match(df_raw, "neighbourhood_under_20", PATTERN_PERCENTAGE, FUNC_PERCENTAGE)
df_cur["neighbourhood_20_to_39"] = pattern_match(df_raw, "neighbourhood_20_to_39", PATTERN_PERCENTAGE, FUNC_PERCENTAGE)
df_cur["neighbourhood_40_to_59"] = pattern_match(df_raw, "neighbourhood_40_to_59", PATTERN_PERCENTAGE, FUNC_PERCENTAGE)
df_cur["neighbourhood_above_60"] = pattern_match(df_raw, "neighbourhood_above_60", PATTERN_PERCENTAGE, FUNC_PERCENTAGE)
df_cur["neighbourhood_long_term_residents"] = pattern_match(df_raw, "neighbourhood_long_term_residents", PATTERN_PERCENTAGE, FUNC_PERCENTAGE)
df_cur["neighbourhood_owners"] = pattern_match(df_raw, "neighbourhood_owners", PATTERN_PERCENTAGE, FUNC_PERCENTAGE)
df_cur["neighbourhood_renter"] = pattern_match(df_raw, "neighbourhood_renter", PATTERN_PERCENTAGE, FUNC_PERCENTAGE)
df_cur["neighbourhood_family"] = pattern_match(df_raw, "neighbourhood_family", PATTERN_PERCENTAGE, FUNC_PERCENTAGE)
df_cur["neighbourhood_single"] = pattern_match(df_raw, "neighbourhood_single", PATTERN_PERCENTAGE, FUNC_PERCENTAGE)

df_cur["performance_median_price"] = pattern_match(df_raw, "performance_median_price", PATTERN_PERFOMANCE_PRICE, FUNC_PRICE_CONVERT)
df_cur["performance_auction_clearance"] = pattern_match(df_raw, "performance_auction_clearance", PATTERN_PERCENTAGE, FUNC_PERCENTAGE)
df_cur["performance_sold_this_year"] = pattern_match(df_raw, "performance_sold_this_year", PATTERN_INT, FUNC_STR_TO_NUM)
df_cur["performance_avg_days_on_market"] = pattern_match(df_raw, "performance_avg_days_on_market", PATTERN_INT, FUNC_STR_TO_NUM)

df_cur["demographic_population"] = pattern_match(df_raw, "demographic_population", PATTERN_INT, FUNC_STR_TO_NUM)
df_cur["demographic_owner"] = pattern_match(df_raw, "demographic_owner", PATTERN_PERCENTAGE, FUNC_PERCENTAGE)
df_cur["demographic_renter"] = pattern_match(df_raw, "demographic_renter", PATTERN_PERCENTAGE, FUNC_PERCENTAGE)
df_cur["demographic_family"] = pattern_match(df_raw, "demographic_family", PATTERN_PERCENTAGE, FUNC_PERCENTAGE)
df_cur["demographic_single"] = pattern_match(df_raw, "demographic_single", PATTERN_PERCENTAGE, FUNC_PERCENTAGE)
df_cur["demographic_average_age"] = pattern_match(df_raw, "demographic_average_age", PATTERN_RANGE)

df_cur["latitude"] = df_raw["latitude"].astype(float)
df_cur["longitude"] = df_raw["longitude"].astype(float)

## Merging Data Sets
- more to be merged

In [4]:
df_school = pd.read_csv(f"{DIR_CUR}property_to_school.csv")
df_cur = df_cur.merge(df_school, on="url")

## Save 'Current' Pre-Processed Data

In [5]:
print(f"There are {df_cur.shape[0]} rows")
#print(df_cur.dtypes)
display(df_cur.head(10))
df_cur.to_csv(f"{DIR_CUR}/pre_processed_data.csv", index=False)

There are 15271 rows


Unnamed: 0,url,postcode,weekly_rent,bond,num_beds,num_baths,num_parking,internal_area_m^2,land_area_m^2,last_sold,...,demographic_population,demographic_owner,demographic_renter,demographic_family,demographic_single,demographic_average_age,latitude,longitude,school_duration,school_distance
0,https://www.domain.com.au/40-esmond-street-ard...,3022,400.0,1738.0,3.0,2.0,1.0,,,2018.0,...,3.0,0.67,0.33,0.41,0.59,20 to 39,-37.775134,144.797067,8.303333,5332.2
1,https://www.domain.com.au/11-grant-avenue-gisb...,3437,600.0,2608.0,4.0,2.0,1.0,,,2022.0,...,9.0,0.83,0.17,0.58,0.42,40 to 59,-37.493363,144.595569,1.713333,717.6
2,https://www.domain.com.au/15-balmoral-avenue-b...,3083,575.0,2499.0,3.0,1.0,3.0,,,2014.0,...,28.0,0.72,0.28,0.47,0.53,20 to 39,-37.707414,145.065424,0.858333,214.6
3,https://www.domain.com.au/6-39-wellington-stre...,3182,600.0,2607.0,2.0,3.0,2.0,,,2011.0,...,20.0,0.35,0.65,0.22,0.78,20 to 39,-37.856525,144.985485,1.955,795.0
4,https://www.domain.com.au/7-9-sheffield-street...,3058,370.0,,2.0,1.0,1.0,,,,...,26.0,0.65,0.35,0.42,0.58,20 to 39,-37.74628,144.966701,1.346667,549.4
5,https://www.domain.com.au/1305-151-berkeley-st...,3000,700.0,3042.0,2.0,2.0,1.0,,,,...,47.0,0.3,0.7,0.24,0.76,20 to 39,-37.802205,144.958215,1.423333,638.7
6,https://www.domain.com.au/302-30-newquay-prome...,3008,1100.0,4780.0,3.0,2.0,1.0,103.0,,2022.0,...,10.0,0.36,0.64,0.39,0.61,20 to 39,-37.814536,144.942408,1.143333,327.0
7,https://www.domain.com.au/1-glee-street-wyndha...,3024,360.0,1564.0,3.0,2.0,2.0,,251.0,2016.0,...,23.0,0.72,0.28,0.51,0.49,20 to 39,-37.848156,144.593031,6.601667,4002.5
8,https://www.domain.com.au/108-mcconnell-street...,3031,680.0,2955.0,4.0,2.0,1.0,,,2022.0,...,10.0,0.47,0.53,0.34,0.66,20 to 39,-37.793445,144.92873,0.711667,189.4
9,https://www.domain.com.au/6-drum-st-rye-vic-39...,3941,650.0,2824.0,4.0,2.0,2.0,,,2019.0,...,8.0,0.75,0.25,0.48,0.52,,-38.385274,144.802606,5.371667,2977.6
