# Pre-processing

In [1]:
from IPython.display import display
import pandas as pd
import re
import numpy as np


DIR_RAW = "../data/raw/"
DIR_CUR = "../data/curated/"


df_postcode = pd.read_csv(f"{DIR_RAW}postcodes.csv")
df_postcode = df_postcode[df_postcode["state"] == "VIC"]
suburbs = set(map(lambda x: x.lower(), df_postcode["locality"].unique()))


PATTERN_PRICE = r"\$?\s*(\d[\d\.,]+)(([\s\/]*((per[\s\/]week)|(weekly)|(p[\/.]*w[k\.]*)|(wk)|(a week)|(w)|(week)|(p\/week)|(per weekly)|(per wk))\b)|$)"
PATTERN_BED = r"^(\d+) beds?"
PATTERN_BATH = r"^(\d+) baths?"
PATTERN_CAR = r"^(\d+) parking"
PATTERN_STATE = r".+ (\w+) \d{4}"
PATTERN_SUBURB = f"({'|'.join(suburbs)}|sanctuary lakes)( vic)?"
PATTERN_BOND = r"bond \$?(\d+)"
PATTERN_INTERNAL_AREA = r"internal area ([\d\.]+)m"
PATTERN_LAND_AREA = r"land area ([\d\.]+)m"
PATTERN_LAST_SOLD = r"last sold in (\d{4})"
PATTERN_OTHER_SOLD = r"(\d+) other"
PATTERN_FIRST_LISTED = r"first listed on (\d+ \w+),"
PATTERN_POSTCODE = r"vic (\d{4})"
PATTERN_PERCENTAGE = r"(\d+\.?\d*)"
PATTERN_PERFOMANCE_PRICE = r"(\d+\.?\d*[mk]?)"
PATTERN_INT = r"([\d,]+)"
PATTERN_RANGE = r"(\d+ to \d+)|(\d+\+)"

FUNC_NONE = lambda x: x
FUNC_STR_TO_NUM = lambda x: float(x.replace(",", ""))
FUNC_PRICE_CONVERT = lambda x: (float(x[0:-1])*1000000 if x[-1] in "mM"
                                else float(x[0:-1])*1000 if x[-1] in "kK"
                                else float(x))
FUNC_PERCENTAGE = lambda x:float(x) / 100


# show all attributes when displayed and don't truncate values
pd.options.display.max_columns = None
pd.set_option('display.max_colwidth', None)


def pattern_match(df, feature, pattern, function=FUNC_NONE):
    instances = df[feature]

    values = []
    for instance in instances:
        instance = str(instance).lower()
        match = re.search(pattern, instance, flags=re.IGNORECASE)
        if match:
            values.append(function(match.group(1)))
        else:
            values.append(None)
    
    return values

In [2]:
df_raw = pd.read_json(f"{DIR_RAW}scrape_2022-09-02_04-12-25-567025.json")
df_cur = pd.DataFrame()

## Clean Scrapped Data

In [3]:
df_cur["url"] = df_raw["url"]
df_cur["postcode"] = pattern_match(df_raw, "address", PATTERN_POSTCODE)
df_cur["suburb"] = pattern_match(df_raw, "address", PATTERN_SUBURB)  # takes a long time
df_cur["state"] = pattern_match(df_raw, "address", PATTERN_STATE)

df_cur["weekly_rent"] = pattern_match(df_raw, "price", PATTERN_PRICE, FUNC_STR_TO_NUM)
df_cur["bond"] = pattern_match(df_raw, "bond", PATTERN_BOND, FUNC_STR_TO_NUM)

df_cur["num_beds"] = pattern_match(df_raw, "num_beds", PATTERN_BED, FUNC_STR_TO_NUM)
df_cur["num_baths"] = pattern_match(df_raw, "num_bath", PATTERN_BATH, FUNC_STR_TO_NUM)
df_cur["num_parking"] = pattern_match(df_raw, "num_car", PATTERN_CAR, FUNC_STR_TO_NUM)

df_cur["property_type"] = df_raw["property_type"]

df_cur["internal_area"] = pattern_match(df_raw, "internal_area", PATTERN_INTERNAL_AREA, FUNC_STR_TO_NUM)
df_cur["land_area"] = pattern_match(df_raw, "land_area", PATTERN_LAND_AREA, FUNC_STR_TO_NUM)

df_cur["last_sold"] = pattern_match(df_raw, "domain_says", PATTERN_LAST_SOLD)
df_cur["other_sold_n_bed_suburb"] = pattern_match(df_raw, "domain_says", PATTERN_OTHER_SOLD, FUNC_STR_TO_NUM)
#df_cur["first_listed"] = pattern_match(df_raw, "domain_says", PATTERN_FIRST_LISTED)

df_cur["neighbourhood_under_20"] = pattern_match(df_raw, "neighbourhood_under_20", PATTERN_PERCENTAGE, FUNC_PERCENTAGE)
df_cur["neighbourhood_20_to_39"] = pattern_match(df_raw, "neighbourhood_20_to_39", PATTERN_PERCENTAGE, FUNC_PERCENTAGE)
df_cur["neighbourhood_40_to_59"] = pattern_match(df_raw, "neighbourhood_40_to_59", PATTERN_PERCENTAGE, FUNC_PERCENTAGE)
df_cur["neighbourhood_above_60"] = pattern_match(df_raw, "neighbourhood_above_60", PATTERN_PERCENTAGE, FUNC_PERCENTAGE)
df_cur["neighbourhood_long_term_residents"] = pattern_match(df_raw, "neighbourhood_long_term_residents", PATTERN_PERCENTAGE, FUNC_PERCENTAGE)
df_cur["neighbourhood_owners"] = pattern_match(df_raw, "neighbourhood_owners", PATTERN_PERCENTAGE, FUNC_PERCENTAGE)
df_cur["neighbourhood_renter"] = pattern_match(df_raw, "neighbourhood_renter", PATTERN_PERCENTAGE, FUNC_PERCENTAGE)
df_cur["neighbourhood_family"] = pattern_match(df_raw, "neighbourhood_family", PATTERN_PERCENTAGE, FUNC_PERCENTAGE)
df_cur["neighbourhood_single"] = pattern_match(df_raw, "neighbourhood_single", PATTERN_PERCENTAGE, FUNC_PERCENTAGE)

df_cur["performance_median_price"] = pattern_match(df_raw, "performance_median_price", PATTERN_PERFOMANCE_PRICE, FUNC_PRICE_CONVERT)
df_cur["performance_auction_clearance"] = pattern_match(df_raw, "performance_auction_clearance", PATTERN_PERCENTAGE, FUNC_PERCENTAGE)
df_cur["performance_sold_this_year"] = pattern_match(df_raw, "performance_sold_this_year", PATTERN_INT, FUNC_STR_TO_NUM)
df_cur["performance_avg_days_on_market"] = pattern_match(df_raw, "performance_avg_days_on_market", PATTERN_INT, FUNC_STR_TO_NUM)

df_cur["demographic_population"] = pattern_match(df_raw, "demographic_population", PATTERN_INT, FUNC_STR_TO_NUM)
df_cur["demographic_owner"] = pattern_match(df_raw, "demographic_owner", PATTERN_PERCENTAGE, FUNC_PERCENTAGE)
df_cur["demographic_renter"] = pattern_match(df_raw, "demographic_renter", PATTERN_PERCENTAGE, FUNC_PERCENTAGE)
df_cur["demographic_family"] = pattern_match(df_raw, "demographic_family", PATTERN_PERCENTAGE, FUNC_PERCENTAGE)
df_cur["demographic_single"] = pattern_match(df_raw, "demographic_single", PATTERN_PERCENTAGE, FUNC_PERCENTAGE)
df_cur["demographic_average_age"] = pattern_match(df_raw, "demographic_average_age", PATTERN_RANGE)

df_cur["latitude"] = df_raw["latitude"].astype(float)
df_cur["longitude"] = df_raw["longitude"].astype(float)

## Merging Data Sets

In [4]:
API_FEATURES = ["school_duration", "school_distance",
                "park_duration", "park_distance",
                "shop_duration", "shop_distance"]


df_school = pd.read_csv(f"{DIR_CUR}property_to_school.csv")
df_cur = df_cur.merge(df_school, on="url")

# zero distances/durations are NaN
df_cur[API_FEATURES] = df_cur[API_FEATURES].replace(0, np.nan)

## Outlier Detection

### Remove NSW town
All VIC postcodes start with 3 and and all NSW postcodes start with 2. There is a NSW town (Barooga NSW 3644) who uses a VIC postcode

In [5]:
print(f"Instances before outlier removal: {len(df_cur.index)}")
df_cur = df_cur[df_cur["state"] != "nsw"]
print(f"Instances after  outlier removal: {len(df_cur.index)}")

Instances before outlier removal: 15271
Instances after  outlier removal: 15268


### Remove carspaces
Any property listed as carspace is removed

In [6]:
print(f"Instances before outlier removal: {len(df_cur.index)}")
df_cur = df_cur[df_cur["property_type"] != "Carspace"]
print(f"Instances after  outlier removal: {len(df_cur.index)}")

Instances before outlier removal: 15268
Instances after  outlier removal: 15259


### Remove outliers based on the lower fence and 2 standard deviations

this just removes extreme outliers where data has been input incorrectly

In [7]:
Q1 = df_cur["weekly_rent"].quantile(0.25)
Q3 = df_cur["weekly_rent"].quantile(0.75)
IQR = Q3 - Q1

mean = df_cur["weekly_rent"].mean()
std = df_cur["weekly_rent"].std()

lower_fence = Q1 - 1.5*IQR
upper_bounds = mean + 2*std

df_cur = df_cur[df_cur["weekly_rent"].between(lower_fence, upper_bounds)]

### Remove outliers based on cook's distance

In [8]:
from sklearn.cluster import KMeans
from statsmodels.stats.outliers_influence import OLSInfluence
import statsmodels.regression.linear_model as lm


PRED = ["num_beds", "num_baths", "num_parking", "bond",
        "school_distance", "park_distance", "shop_distance"]

TARG = "weekly_rent"

# impute missing values
df_impute = df_cur.copy(deep=True)
df_impute[PRED + [TARG]] = df_cur[PRED + [TARG]].fillna(df_cur[PRED + [TARG]].mean())

# Fit an ordinary linear model
model = lm.OLS(df_impute[[TARG]], df_impute[PRED])
influence = OLSInfluence(model.fit())

print(f"Instances before outleir removal: {len(df_cur.index)}")
df_cur =  df_cur[influence.cooks_distance[0] < 0.002]
print(f"Instances after  outleir removal: {len(df_cur.index)}")

Instances before outleir removal: 14605
Instances after  outleir removal: 14490


## Save 'Current' Pre-Processed Data

In [9]:
print(f"There are {df_cur.shape[0]} rows")
#print(df_cur.dtypes)
display(df_cur.head(10))
df_cur.to_csv(f"{DIR_CUR}/pre_processed_data.csv", index=False)

There are 14490 rows


Unnamed: 0.5,url,postcode,suburb,state,weekly_rent,bond,num_beds,num_baths,num_parking,property_type,internal_area,land_area,last_sold,other_sold_n_bed_suburb,neighbourhood_under_20,neighbourhood_20_to_39,neighbourhood_40_to_59,neighbourhood_above_60,neighbourhood_long_term_residents,neighbourhood_owners,neighbourhood_renter,neighbourhood_family,neighbourhood_single,performance_median_price,performance_auction_clearance,performance_sold_this_year,performance_avg_days_on_market,demographic_population,demographic_owner,demographic_renter,demographic_family,demographic_single,demographic_average_age,latitude,longitude,Unnamed: 0.4,Unnamed: 0.3,Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,school_duration,school_distance,park_duration,park_distance,shop_duration,shop_distance
0,https://www.domain.com.au/40-esmond-street-ardeer-vic-3022-16035224,3022,esmond,vic,400.0,1738.0,3.0,2.0,1.0,House,,,2018.0,37.0,0.26,0.26,0.26,0.21,0.66,0.74,0.26,0.49,0.51,680000.0,0.56,37.0,44.0,3099.0,0.67,0.33,0.41,0.59,20 to 39,-37.775134,144.797067,0,0,0,0,0,8.303333,5332.2,0.101667,42.6,5.595,2903.6
1,https://www.domain.com.au/11-grant-avenue-gisborne-vic-3437-16035741,3437,gisborne,vic,600.0,2608.0,4.0,2.0,1.0,House,,,2022.0,92.0,0.25,0.23,0.28,0.24,0.67,0.81,0.19,0.62,0.38,1098000.0,0.69,92.0,41.0,9000.0,0.83,0.17,0.58,0.42,40 to 59,-37.493363,144.595569,1,1,1,1,1,1.713333,717.6,16.075,15921.0,17.856667,17363.3
2,https://www.domain.com.au/15-balmoral-avenue-bundoora-vic-3083-16058638,3083,balmoral,vic,575.0,2499.0,3.0,1.0,3.0,House,,,2014.0,229.0,0.23,0.26,0.27,0.25,0.79,0.84,0.16,0.57,0.43,830000.0,0.75,229.0,52.0,28681.0,0.72,0.28,0.47,0.53,20 to 39,-37.707414,145.065424,2,2,2,2,2,0.858333,214.6,1.67,423.2,7.698333,3781.9
3,https://www.domain.com.au/6-39-wellington-street-st-kilda-vic-3182-16053159,3182,st kilda,vic,600.0,2607.0,2.0,3.0,2.0,Apartment / Unit / Flat,,,2011.0,361.0,0.11,0.54,0.28,0.07,0.35,0.24,0.76,0.47,0.53,630000.0,0.53,361.0,72.0,20218.0,0.35,0.65,0.22,0.78,20 to 39,-37.856525,144.985485,3,3,3,3,3,1.955,795.0,1.28,518.2,3.696667,1723.3
4,https://www.domain.com.au/7-9-sheffield-street-coburg-vic-3058-13008792,3058,coburg,vic,370.0,,2.0,1.0,1.0,Apartment / Unit / Flat,,,,126.0,0.18,0.37,0.25,0.2,0.62,0.64,0.36,0.47,0.53,557000.0,0.42,126.0,131.0,26188.0,0.65,0.35,0.42,0.58,20 to 39,-37.74628,144.966701,4,4,4,4,4,1.346667,549.4,0.783333,404.1,0.958333,453.9
5,https://www.domain.com.au/1305-151-berkeley-street-melbourne-vic-3000-16010201,3000,melbourne,vic,700.0,3042.0,2.0,2.0,1.0,Apartment / Unit / Flat,,,,592.0,0.07,0.78,0.12,0.02,0.13,0.06,0.94,0.11,0.89,542000.0,0.44,592.0,147.0,47279.0,0.3,0.7,0.24,0.76,20 to 39,-37.802205,144.958215,5,5,5,5,5,1.423333,638.7,0.983333,406.2,1.708333,813.8
6,https://www.domain.com.au/302-30-newquay-promenade-docklands-vic-3008-16061690,3008,docklands,vic,1100.0,4780.0,3.0,2.0,1.0,Apartment / Unit / Flat,103.0,,2022.0,70.0,0.14,0.56,0.23,0.07,0.2,0.42,0.58,0.48,0.52,1150000.0,,70.0,112.0,10962.0,0.36,0.64,0.39,0.61,20 to 39,-37.814536,144.942408,6,6,6,6,6,1.143333,327.0,,,1.881667,742.4
7,https://www.domain.com.au/1-glee-street-wyndham-vale-vic-3024-15342537,3024,wyndham vale,vic,360.0,1564.0,3.0,2.0,2.0,House,,251.0,2016.0,280.0,0.0,0.0,0.37,0.63,0.61,0.56,0.44,0.59,0.41,505000.0,0.79,280.0,42.0,23294.0,0.72,0.28,0.51,0.49,20 to 39,-37.848156,144.593031,7,7,7,7,7,6.601667,4002.5,21.043333,14623.6,18.41,12043.6
8,https://www.domain.com.au/108-mcconnell-street-kensington-vic-3031-16003703,3031,kensington,vic,680.0,2955.0,4.0,2.0,1.0,House,,,2022.0,24.0,0.2,0.34,0.31,0.15,0.49,0.57,0.43,0.51,0.49,1514000.0,0.74,24.0,,10821.0,0.47,0.53,0.34,0.66,20 to 39,-37.793445,144.92873,8,8,8,8,8,0.711667,189.4,0.638333,169.6,3.298333,905.5
9,https://www.domain.com.au/6-drum-st-rye-vic-3941-16057271,3941,rye,vic,650.0,2824.0,4.0,2.0,2.0,House,,,2019.0,104.0,0.31,0.17,0.27,0.25,0.66,0.77,0.23,0.64,0.36,1523000.0,0.79,104.0,37.0,8412.0,0.75,0.25,0.48,0.52,,-38.385274,144.802606,9,9,9,9,9,5.371667,2977.6,,,56.715,70202.4
