In [64]:
import json
import os
import re
import pandas as pd

In [65]:
# Directory that holds the listing JSON files read by this notebook.
# The path is relative, so it is resolved against the current working directory.
DATA_DIRECTORY = "../scrape_data/"

In [66]:
# Names of the JSON files found directly inside DATA_DIRECTORY.
# os.listdir returns entries in arbitrary order, so the names are sorted to
# keep the order in which files are loaded reproducible across runs and machines.
data_files = sorted(
    file for file in os.listdir(DATA_DIRECTORY) if file.endswith(".json")
)

In [67]:
# Accumulator for the flattened listing rows (one dict per listing); it is
# filled by the loading loop and then converted into a DataFrame.
data_flat = []

In [68]:
def extract_number(text):
    """Return the bond amount mentioned in ``text`` as an int, or None.

    The amount is the run of digits that follows the literal ``Bond``, one
    whitespace character and a dollar sign, e.g. ``"Bond $3259"`` -> ``3259``.
    Well-formed thousands separators are accepted (``"Bond $9,125"`` ->
    ``9125``); previously such an amount was cut off at the first comma.
    Anything after the whole-dollar part (cents, trailing text) is ignored.

    Parameters
    ----------
    text : str
        Text to search.

    Returns
    -------
    int or None
        The parsed amount, or None when ``text`` contains no bond amount.

    Raises
    ------
    TypeError
        If ``text`` is not a string (raised by ``re.search``).
    """
    # First alternative: comma-grouped digits where every group has exactly
    # three digits and is not followed by another digit, so "3000,2 weeks"
    # still parses as 3000. Second alternative: a plain run of digits.
    match = re.search(r'Bond\s\$(\d{1,3}(?:,\d{3})+(?!\d)|\d+)', text)
    if match is None:
        return None
    return int(match.group(1).replace(",", ""))

In [69]:
def flatten_row(row):
    """Flatten one listing record into a single-level dict.

    Parameters
    ----------
    row : dict
        Record providing the keys "url", "price", "address",
        "property_type", "latitude", "longitude", "features" (iterable of
        strings) and "summary" (iterable of strings).

    Returns
    -------
    dict or None
        The flattened row. It contains the six scalar fields copied as-is,
        one entry per two-token feature (``"4 Beds"`` becomes
        ``{"Beds": "4"}``; values stay strings), and a ``"bond"`` entry with
        the amount found in the summary, or None when no summary item
        mentions one. None is returned, after printing the exception, when
        the record cannot be processed (for example a required key is
        missing).
    """
    try:
        row_flat = {"url": row["url"], "price": row["price"], "address": row["address"],
                    "property_type": row["property_type"], "latitude": row["latitude"], "longitude": row["longitude"]}

        for feature in row["features"]:
            # Only features made of exactly two whitespace-separated tokens
            # ("<value> <name>") are kept; everything else is ignored.
            if len(feature.split()) == 2:
                # NOTE(review): the length check runs before the "- " prefix is
                # stripped, so a feature such as "- Parking" leaves a single
                # token here; the unpacking then raises and the whole row is
                # skipped by the handler below. Confirm whether that is intended.
                feature_value, feature_name, *_ = feature.replace("- ", "").split()
                row_flat[feature_name] = feature_value

        # Scan every summary item and remember the bond amount that was found.
        # Assigning the result of each item unconditionally would let a later
        # item without a bond reset the value to None.
        bond = None
        for summary_item in row["summary"]:
            amount = extract_number(summary_item)
            if amount is not None:
                bond = amount
        row_flat["bond"] = bond

        return row_flat
    except Exception as e:
        # Best effort: report the problem and let the caller skip this row.
        print(e)
        return None

In [70]:
# Load every JSON file and collect its flattened listings in `data_flat`.
# The list is reset first so that re-running this cell does not append the
# same rows a second time.
data_flat = []
for file in data_files:
    # Decode as UTF-8 explicitly (the JSON interchange encoding) instead of
    # relying on the platform default; the data contains non-ASCII characters
    # such as "−".
    with open(os.path.join(DATA_DIRECTORY, file), encoding="utf-8") as f:
        content = json.load(f)
    # The file is closed at this point; flattening only needs the parsed content.
    for row in content:
        flat_row = flatten_row(row)
        # flatten_row returns None for records it could not process.
        if flat_row is not None:
            data_flat.append(flat_row)

print(len(data_flat))

'url'
9828


In [71]:
# Build a DataFrame with one row per flattened listing. Keys that are absent
# from a row (e.g. "Bed" vs "Beds") show up as missing values in that column.
df = pd.DataFrame(data_flat)
df.head()

Unnamed: 0,url,price,address,property_type,latitude,longitude,Beds,Baths,Parking,bond,Bath,Bed
0,https://www.domain.com.au/10-allara-court-donv...,"$1,400.00","10 Allara Court, Donvale VIC 3111",Townhouse,-37.774273,145.181126,4,3.0,2,9125.0,,
1,https://www.domain.com.au/7-pine-ridge-donvale...,$750 per week,"7 Pine Ridge, Donvale VIC 3111",House,-37.791251,145.175649,4,2.0,−,3259.0,,
2,https://www.domain.com.au/20-mulsanne-way-donv...,$1300 per week,"20 Mulsanne Way, Donvale VIC 3111",House,-37.797232,145.181264,5,2.0,2,5649.0,,
3,https://www.domain.com.au/3-monterey-crescent-...,$825pw / $3585pcm,"3 Monterey Crescent, Donvale VIC 3111",House,-37.792402,145.174323,3,,5,3585.0,1.0,
4,https://www.domain.com.au/3-49-leslie-street-d...,$680.00,"3/49 Leslie Street, Donvale VIC 3111",Townhouse,-37.781012,145.180705,3,2.0,2,2955.0,,


In [72]:
# Normalise the feature-count columns (Beds, Baths, Parking) to numeric values.

# Features named in the singular (e.g. "1 Bath") end up in their own Bed/Bath
# columns; fold them into the plural columns and drop the singular ones.
# The membership check keeps this cell safe to re-run after the columns have
# already been dropped.
for singular, plural in (("Bed", "Beds"), ("Bath", "Baths")):
    if singular in df.columns:
        df[plural] = df[plural].fillna(df[singular])
        df = df.drop(columns=[singular])

# The "−" placeholder (a non-ASCII dash, not "-") is treated as a count of 0.
# It is replaced after the merge so placeholders coming from the singular
# columns are handled the same way. Any other non-numeric value becomes NaN.
for column in ("Beds", "Baths", "Parking"):
    df[column] = pd.to_numeric(df[column].replace(["−"], 0), errors='coerce')

In [73]:
# Preview the frame after cleaning: the singular Bed/Bath columns are gone and
# Beds, Baths and Parking hold numeric values.
df.head()

Unnamed: 0,url,price,address,property_type,latitude,longitude,Beds,Baths,Parking,bond
0,https://www.domain.com.au/10-allara-court-donv...,"$1,400.00","10 Allara Court, Donvale VIC 3111",Townhouse,-37.774273,145.181126,4.0,3.0,2.0,9125.0
1,https://www.domain.com.au/7-pine-ridge-donvale...,$750 per week,"7 Pine Ridge, Donvale VIC 3111",House,-37.791251,145.175649,4.0,2.0,0.0,3259.0
2,https://www.domain.com.au/20-mulsanne-way-donv...,$1300 per week,"20 Mulsanne Way, Donvale VIC 3111",House,-37.797232,145.181264,5.0,2.0,2.0,5649.0
3,https://www.domain.com.au/3-monterey-crescent-...,$825pw / $3585pcm,"3 Monterey Crescent, Donvale VIC 3111",House,-37.792402,145.174323,3.0,1.0,5.0,3585.0
4,https://www.domain.com.au/3-49-leslie-street-d...,$680.00,"3/49 Leslie Street, Donvale VIC 3111",Townhouse,-37.781012,145.180705,3.0,2.0,2.0,2955.0


In [74]:
# Save as parquet
# Create the target directory first so the write does not fail when it is
# missing (e.g. on a fresh checkout); exist_ok makes this a no-op otherwise.
OUTPUT_PATH = "../data/raw/domain.parquet"
os.makedirs(os.path.dirname(OUTPUT_PATH), exist_ok=True)
df.to_parquet(OUTPUT_PATH)