In [1]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.functions import hour, dayofweek, month, date_trunc, col

In [2]:
# Build (or reuse) the Spark session that executes every job in this notebook.
# Options are applied in the order listed below.
spark_options = {
    "spark.sql.repl.eagerEval.enabled": True,
    "spark.sql.parquet.cacheMetadata": "true",
    "spark.sql.session.timeZone": "Etc/UTC",
    "spark.driver.memory": "15g",
}

session_builder = SparkSession.builder.appName("MAST30034 Tutorial 1")
for option_name, option_value in spark_options.items():
    session_builder = session_builder.config(option_name, option_value)

spark = session_builder.getOrCreate()

24/09/19 13:58:23 WARN Utils: Your hostname, coldbrew.local resolves to a loopback address: 127.0.0.1; using 10.12.240.157 instead (on interface en0)
24/09/19 13:58:23 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/09/19 13:58:23 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [3]:
# Load the raw Domain listings from parquet.
domain_all = spark.read.format("parquet").load('../data/raw/domain.parquet')

In [4]:
# Quick look at the raw frame: total row count, then the column schema.
num_rows = domain_all.count()
print(f"Number of rows {num_rows}")
domain_all.printSchema()

Number of rows 9828
root
 |-- url: string (nullable = true)
 |-- price: string (nullable = true)
 |-- address: string (nullable = true)
 |-- property_type: string (nullable = true)
 |-- latitude: double (nullable = true)
 |-- longitude: double (nullable = true)
 |-- Beds: double (nullable = true)
 |-- Baths: double (nullable = true)
 |-- Parking: double (nullable = true)
 |-- bond: double (nullable = true)
 |-- extracted_price: double (nullable = true)
 |-- geometry: binary (nullable = true)
 |-- sa2_code: string (nullable = true)
 |-- sa2_name: string (nullable = true)
 |-- chg_flag: string (nullable = true)
 |-- chg_lbl: string (nullable = true)
 |-- sa3_code: string (nullable = true)
 |-- sa3_name: string (nullable = true)
 |-- sa4_code: string (nullable = true)
 |-- sa4_name: string (nullable = true)
 |-- gcc_code: string (nullable = true)
 |-- gcc_name: string (nullable = true)
 |-- ste_code: string (nullable = true)
 |-- ste_name: string (nullable = true)
 |-- aus_code: string (n

In [5]:
# Count missing (null) values for every column in a single Spark job.
# `when(...)` yields 1 for null cells and null otherwise, and `count` only
# counts non-null inputs, so each aggregate is that column's null count.
# One aggregation replaces a separate filter + count action per column.
null_counts_row = domain_all.select(
    [
        F.count(F.when(domain_all[name].isNull(), 1)).alias(name)
        for name in domain_all.columns
    ]
).first()

# Dictionary of {column name: number of nulls}, in schema order.
missing_counts = null_counts_row.asDict()

# Display the counts
for column, n_missing in missing_counts.items():
    print(f"Column {column} has {n_missing} missing values.")

Column url has 0 missing values.
Column price has 0 missing values.
Column address has 0 missing values.
Column property_type has 0 missing values.
Column latitude has 0 missing values.
Column longitude has 0 missing values.
Column Beds has 143 missing values.
Column Baths has 74 missing values.
Column Parking has 11 missing values.
Column bond has 1272 missing values.
Column extracted_price has 162 missing values.
Column geometry has 0 missing values.
Column sa2_code has 2 missing values.
Column sa2_name has 2 missing values.
Column chg_flag has 2 missing values.
Column chg_lbl has 2 missing values.
Column sa3_code has 2 missing values.
Column sa3_name has 2 missing values.
Column sa4_code has 2 missing values.
Column sa4_name has 2 missing values.
Column gcc_code has 2 missing values.
Column gcc_name has 2 missing values.
Column ste_code has 2 missing values.
Column ste_name has 2 missing values.
Column aus_code has 2 missing values.
Column aus_name has 2 missing values.
Column areas

In [6]:
# Discard listings whose extracted_price is null.
domain_all = domain_all.na.drop(subset=["extracted_price"])

In [7]:
# Display (up to 200) listings where any of Beds, Baths or Parking is null.
missing_room_info = (
    domain_all["Beds"].isNull()
    | domain_all["Baths"].isNull()
    | domain_all["Parking"].isNull()
)
domain_all.filter(missing_room_info).show(200)

24/09/19 13:58:30 WARN SparkStringUtils: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.


+--------------------+--------------------+--------------------+--------------------+-------------------+------------------+----+-----+-------+------+---------------+--------------------+---------+--------------------+--------+-----------+--------+--------------------+--------+--------------------+--------+-----------------+--------+--------+--------+---------+--------+--------------------+-----------------+
|                 url|               price|             address|       property_type|           latitude|         longitude|Beds|Baths|Parking|  bond|extracted_price|            geometry| sa2_code|            sa2_name|chg_flag|    chg_lbl|sa3_code|            sa3_name|sa4_code|            sa4_name|gcc_code|         gcc_name|ste_code|ste_name|aus_code| aus_name|areasqkm|            loci_uri|__index_level_0__|
+--------------------+--------------------+--------------------+--------------------+-------------------+------------------+----+-----+-------+------+---------------+----------

In [8]:
# Replace null Beds / Baths / Parking values with 0.
room_columns = ["Beds", "Baths", "Parking"]
domain_all = domain_all.na.fill(0, subset=room_columns)

In [9]:
# Keep only listings that have a bond value; rows with a null bond are dropped.
# The column is referenced by its exact schema name ("bond") so the lookup does
# not depend on Spark's case-insensitive column resolution.
domain_all = domain_all.filter(domain_all["bond"].isNotNull())

In [10]:
# Trim extracted_price outliers: keep only rows with 0 < price < p99.
# Rows at or above the 99th percentile, and rows with a non-positive price,
# are removed. The third argument (relativeError) is 0, which requests the
# exact quantile rather than an approximation.
q = domain_all.approxQuantile("extracted_price", [0.99], 0)
price_col = domain_all["extracted_price"]
domain_all = domain_all.filter((price_col < q[0]) & (price_col > 0))

In [11]:
# Persist the cleaned listings as parquet, replacing any previous output.
domain_all.write.parquet("../data/curated/domain_cleaned", mode="overwrite")

                                                                                