In [2]:
import polars as pl
import os
import json
from IPython.core.interactiveshell import InteractiveShell
from IPython.display import display_html
# Ask IPython to display the value of every top-level expression in a cell,
# not only the last one (IPython's `ast_node_interactivity` option).
InteractiveShell.ast_node_interactivity = "all"

# Directory that is scanned for listing JSON files further down the notebook.
# The path is relative, so it resolves against the kernel's working directory.
domain_dir = '../data/landing/domain/'

In [3]:
def _parse_lookup_path(path):
    '''
    Turn a path such as '["props"]["items"][0]' into a list of keys/indices.

    Only bracketed string or integer literals are accepted; anything else
    raises ValueError so that arbitrary expressions are never evaluated.
    '''
    import ast
    import re

    # One subscript: [ "double quoted" | 'single quoted' | integer ]
    subscript = re.compile(
        r"""\s*\[\s*("(?:[^"\\]|\\.)*"|'(?:[^'\\]|\\.)*'|-?\d+)\s*\]\s*"""
    )

    text = path.strip()
    steps = []
    position = 0
    while position < len(text):
        match = subscript.match(text, position)
        if match is None:
            raise ValueError(f"Unsupported lookup path: {path!r}")
        try:
            # literal_eval gives the same str/int value Python itself would
            # produce for the literal, without executing any code.
            steps.append(ast.literal_eval(match.group(1)))
        except (ValueError, SyntaxError) as exc:
            raise ValueError(f"Unsupported lookup path: {path!r}") from exc
        position = match.end()
    return steps


def get_first_valid(data, *keys):
    '''
    Get the first non-None value reachable through any of the given paths.

    Parameters:
        data: nested structure of dicts/lists (e.g. a parsed JSON document).
        *keys: candidate paths, tried in order. Each path is a string made of
            bracketed literal subscripts, e.g.
            '["props"]["listingSummary"]["beds"]' or '["items"][0]'.

    Returns:
        The value at the first path that exists and is not None, or None when
        no path yields a value.

    Raises:
        ValueError: if a path is not a plain chain of string/integer
            subscripts. Paths are parsed, never evaluated as Python code, so
            values interpolated into a path cannot execute anything.
    '''
    for key in keys:
        steps = _parse_lookup_path(key)
        value = data
        try:
            for step in steps:
                value = value[step]
        except (KeyError, IndexError, TypeError):
            # Missing key, out-of-range index, or a non-subscriptable value
            # along the way: this candidate does not exist, try the next one.
            continue
        if value is not None:
            return value
    return None

In [4]:
def read_json_file(file_path):
    '''
    Read one listing JSON file and flatten the fields of interest into a dict.

    Parameters:
        file_path: path of the JSON file to read.

    Returns:
        A dict with one entry per extracted field. Every value comes from
        get_first_valid, so a field whose candidate paths are all missing
        (or null) is None.

    Not extracted yet (notes carried over from the original implementation):
        - Surrounding suburbs: ["props"]["footer"]["surroundingSuburbs"]
        - Features: process ["props"]["structuredFeatures"] into individual
          binary features.
        - School data from ["props"]["schoolCatchment"]["schools"], e.g.
          educationLevel, distance, postCode (distance to nearest primary
          school, number of schools within a certain radius).
    '''
    # Decode explicitly as UTF-8 instead of the platform's locale encoding,
    # and close the file as soon as the document has been parsed.
    with open(file_path, 'r', encoding='utf-8') as file:
        data = json.load(file)

    def first(*paths):
        # Shorthand: first non-None value among the candidate paths.
        return get_first_valid(data, *paths)

    listing_id = first('["props"]["id"]', '["props"]["listingId"]')

    # Shared path prefixes.
    listing_v2 = '["props"]["rootGraphQuery"]["listingByIdV2"]'
    page_info = '["digitalData"]["page"]["pageInfo"]'
    page_property = page_info + '["property"]'
    nbn_details = page_property + '["nbnDetails"]'
    # The per-listing model is stored under the listing id rendered as text.
    listing_model = f'["props"]["listingsMap"]["{listing_id}"]["listingModel"]'

    return {
        "agency_id": first(listing_v2 + '["agency"]["agencyId"]'),
        "listing_id": listing_id,
        "limited_agency_mode": first('["props"]["limitedAgencyMode"]'),
        "promotion_level": first(listing_v2 + '["promoLevel"]'),
        "photo_count": first(page_property + '["photoCount"]'),
        "property_type": first(
            '["props"]["listingSummary"]["propertyType"]',
            '["props"]["propertyType"]',
        ),
        "num_bedrooms": first(
            '["props"]["beds"]',
            '["props"]["listingSummary"]["beds"]',
        ),
        "num_bathrooms": first(
            listing_model + '["features"]["baths"]',
            '["props"]["listingSummary"]["baths"]',
            listing_v2 + '["bathrooms"]',
        ),
        "parking": first(
            listing_model + '["features"]["parking"]',
            '["props"]["listingSummary"]["parking"]',
            listing_v2 + '["carspaces"]',
        ),
        "is_rural": first(listing_model + '["features"]["isRural"]'),
        "price": first(listing_v2 + '["priceDetails"]["rawValues"]["exactPriceV2"]'),
        "bond": first(listing_v2 + '["priceDetails"]["rawValues"]["bond"]'),
        "energy_eff_rating": first(listing_v2 + '["energyEfficiencyRating"]'),
        "nbn_download": first(nbn_details + '["downloadSpeedInMbps"]'),
        "nbn_upload": first(nbn_details + '["uploadSpeedInMbps"]'),
        "nbn_tech": first(nbn_details + '["techType"]'),
        "nbn_speed_tier": first(nbn_details + '["speedTier"]'),
        "nbn_service_status": first(nbn_details + '["serviceStatus"]'),
        # NOTE(review): this reads the same path as "address" below, which
        # looks like a copy-paste slip. Kept as-is; confirm the intended key
        # for the unit number against the source data.
        "unit_number": first('["props"]["address"]'),
        "street_number": first('["props"]["streetNumber"]'),
        "address": first('["props"]["address"]'),
        "suburb": first(listing_model + '["address"]["suburb"]'),
        "state": first(
            listing_model + '["address"]["state"]',
            '["props"]["stateAbbreviation"]',
        ),
        "listing_date": first(
            '["props"]["createdOn"]',
            '["props"]["domainSays"]["firstListedDate"]',
            listing_v2 + '["dateCreated"]["isoDate"]',
            page_info + '["issueDate"]',
        ),
        "actual_listing_date": first(page_property + '["dateListed"]'),
        "num_days_listed": first(page_property + '["daysListed"]'),
        "inspections_count": first(page_property + '["inspectionsCount"]'),
        "last_modification_date": first(
            '["props"]["modifiedOn"]',
            listing_v2 + '["dateUpdated"]["isoDate"]',
        ),
        "postcode": first('["props"]["postcode"]'),
        "latitude": first(
            '["props"]["map"]["latitude"]',
            listing_model + '["address"]["lat"]',
        ),
        "longitude": first(
            '["props"]["map"]["longitude"]',
            listing_model + '["address"]["lng"]',
        ),
        "has_photo": first(page_property + '["hasPhoto"]'),
        "has_floorplan": first(page_property + '["hasFloorplan"]'),
        "num_visible_schools": first('["props"]["schoolCatchment"]["numberOfVisibleSchools"]'),
    }
    
def read_json_directory(directory_path):
    '''
    Read every '.json' file in a directory into one Polars DataFrame.

    Parameters:
        directory_path: directory to scan (not recursive).

    Returns:
        A Polars DataFrame with one row per JSON file, built from the dicts
        returned by read_json_file. Rows follow the sorted filename order.
    '''
    # os.listdir returns entries in arbitrary order; sort so the row order is
    # the same on every run and on every machine.
    records = [
        read_json_file(os.path.join(directory_path, filename))
        for filename in sorted(os.listdir(directory_path))
        if filename.endswith('.json')
    ]

    # Many fields are None for most listings. Inspect all rows when inferring
    # column types rather than only the leading ones, so a column whose first
    # non-null value appears late is still typed from real data.
    return pl.DataFrame(records, infer_schema_length=None)

In [5]:
# Build the listings DataFrame from every JSON file found in `domain_dir`.
# When the directory is missing, only a message is printed and `df` is not set.
if os.path.exists(domain_dir):
    df = read_json_directory(domain_dir)
else:
    print(f"Directory {domain_dir} does not exist")


Read 10011763.json
Read 10026864.json
Read 10073916.json
Read 10119117.json
Read 10128494.json
Read 10161062.json
Read 10174178.json
Read 10174988.json
Read 10208672.json
Read 10225364.json
Read 10229561.json
Read 10247037.json
Read 10254026.json
Read 10259497.json
Read 10282068.json
Read 10300988.json
Read 10310393.json
Read 10327239.json
Read 10344261.json
Read 10348185.json
Read 10357190.json
Read 10371427.json
Read 10386479.json
Read 10402244.json
Read 10424094.json
Read 10473058.json
Read 10483352.json
Read 10495039.json
Read 10522941.json
Read 10534630.json
Read 10544843.json
Read 10561886.json
Read 10566702.json
Read 10568315.json
Read 10586387.json
Read 10610449.json
Read 10610874.json
Read 10615662.json
Read 10621148.json
Read 10669332.json
Read 10694414.json
Read 10715072.json
Read 10726643.json
Read 10738030.json
Read 10739431.json
Read 10744418.json
Read 10801345.json
Read 10809711.json
Read 10826804.json
Read 10840290.json
Read 10856328.json
Read 10861845.json
Read 1088471

KeyboardInterrupt: 