# Preprocess Domain Data Part 2
Impute missing values and remove outliers.
Written by Daksh Agrawal

In [13]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, count, date_trunc, dayofweek, hour, month, when

In [14]:
# Build (or reuse) the Spark session that executes every job in this notebook.
# Settings are applied one at a time so each can be read and tuned on its own.
session_builder = SparkSession.builder.appName("MAST30034 Tutorial 1")
session_builder = session_builder.config("spark.sql.repl.eagerEval.enabled", True)
session_builder = session_builder.config("spark.sql.parquet.cacheMetadata", "true")
session_builder = session_builder.config("spark.sql.session.timeZone", "Etc/UTC")
session_builder = session_builder.config("spark.driver.memory", "15g")
spark = session_builder.getOrCreate()

In [15]:
# Load the raw Domain listings. The first line of the file is used as the header;
# no schema is supplied or inferred, so every column is read as a string.
domain_all = spark.read.option("header", True).csv('../data/raw/domain.csv')

In [16]:
# Summarise the raw data: how many listings there are and what columns they carry
num_rows = domain_all.count()
print(f"Number of rows {num_rows}")
domain_all.printSchema()

Number of rows 9828
root
 |-- _c0: string (nullable = true)
 |-- url: string (nullable = true)
 |-- price: string (nullable = true)
 |-- address: string (nullable = true)
 |-- property_type: string (nullable = true)
 |-- latitude: string (nullable = true)
 |-- longitude: string (nullable = true)
 |-- Beds: string (nullable = true)
 |-- Baths: string (nullable = true)
 |-- Parking: string (nullable = true)
 |-- bond: string (nullable = true)
 |-- extracted_price: string (nullable = true)
 |-- geometry: string (nullable = true)
 |-- index_right: string (nullable = true)
 |-- sa2_code: string (nullable = true)
 |-- sa2_name: string (nullable = true)
 |-- chg_flag: string (nullable = true)
 |-- chg_lbl: string (nullable = true)
 |-- sa3_code: string (nullable = true)
 |-- sa3_name: string (nullable = true)
 |-- sa4_code: string (nullable = true)
 |-- sa4_name: string (nullable = true)
 |-- gcc_code: string (nullable = true)
 |-- gcc_name: string (nullable = true)
 |-- ste_code: string (nul

In [17]:
# Count the missing values of every column in a single Spark job.
# `when(...)` yields 1 for a null cell and null otherwise, and `count` only counts
# non-null values, so each aggregate is the number of nulls in that column.
# (Filtering and counting column by column would launch one job, and one scan of
# the CSV, per column.)
null_count_row = domain_all.select(
    [
        count(when(domain_all[name].isNull(), 1)).alias(name)
        for name in domain_all.columns
    ]
).first()

# Mapping of column name -> number of missing values, in column order
missing_counts = null_count_row.asDict()

# Display the counts
for column, n_missing in missing_counts.items():
    print(f"Column {column} has {n_missing} missing values.")

24/10/03 12:18:25 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: 
 Schema: _c0
Expected: _c0 but found: 
CSV file: file:///Users/dakshagrawal/PycharmProjects/project-2-group-real-estate-industry-project-3/data/raw/domain.csv


Column _c0 has 0 missing values.
Column url has 0 missing values.
Column price has 0 missing values.
Column address has 0 missing values.
Column property_type has 0 missing values.
Column latitude has 0 missing values.
Column longitude has 0 missing values.
Column Beds has 143 missing values.
Column Baths has 74 missing values.
Column Parking has 11 missing values.
Column bond has 1272 missing values.
Column extracted_price has 162 missing values.
Column geometry has 0 missing values.
Column index_right has 2 missing values.
Column sa2_code has 2 missing values.
Column sa2_name has 2 missing values.
Column chg_flag has 2 missing values.
Column chg_lbl has 2 missing values.
Column sa3_code has 2 missing values.
Column sa3_name has 2 missing values.
Column sa4_code has 2 missing values.
Column sa4_name has 2 missing values.
Column gcc_code has 2 missing values.
Column gcc_name has 2 missing values.
Column ste_code has 2 missing values.
Column ste_name has 2 missing values.
Column aus_cod

In [18]:
# Discard listings that lack either an extracted price or an SA2 code
required_columns = ["extracted_price", "sa2_code"]
domain_all = domain_all.na.drop(subset=required_columns)

In [19]:
# Display (up to 200) listings where the bed, bath or parking count is missing
has_missing_feature = (
    domain_all["Beds"].isNull()
    | domain_all["Baths"].isNull()
    | domain_all["Parking"].isNull()
)
domain_all.filter(has_missing_feature).show(200)

+----+--------------------+--------------------+--------------------+--------------------+-------------------+------------------+----+-----+-------+------+---------------+--------------------+-----------+---------+--------------------+--------+-----------+--------+--------------------+--------+--------------------+--------+-----------------+--------+--------+--------+---------+--------+--------------------+--------------------+
| _c0|                 url|               price|             address|       property_type|           latitude|         longitude|Beds|Baths|Parking|  bond|extracted_price|            geometry|index_right| sa2_code|            sa2_name|chg_flag|    chg_lbl|sa3_code|            sa3_name|sa4_code|            sa4_name|gcc_code|         gcc_name|ste_code|ste_name|aus_code| aus_name|areasqkm|            loci_uri|       geometry_proj|
+----+--------------------+--------------------+--------------------+--------------------+-------------------+------------------+----+--

24/10/03 12:18:27 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: , url, price, address, property_type, latitude, longitude, Beds, Baths, Parking, bond, extracted_price, geometry, index_right, sa2_code, sa2_name, chg_flag, chg_lbl, sa3_code, sa3_name, sa4_code, sa4_name, gcc_code, gcc_name, ste_code, ste_name, aus_code, aus_name, areasqkm, loci_uri, geometry_proj
 Schema: _c0, url, price, address, property_type, latitude, longitude, Beds, Baths, Parking, bond, extracted_price, geometry, index_right, sa2_code, sa2_name, chg_flag, chg_lbl, sa3_code, sa3_name, sa4_code, sa4_name, gcc_code, gcc_name, ste_code, ste_name, aus_code, aus_name, areasqkm, loci_uri, geometry_proj
Expected: _c0 but found: 
CSV file: file:///Users/dakshagrawal/PycharmProjects/project-2-group-real-estate-industry-project-3/data/raw/domain.csv


In [20]:
# Remove the first CSV column: its header is empty, so Spark named it "_c0".
# NOTE(review): presumably a row index written by an earlier export step - confirm.
unnamed_column = "_c0"
domain_all = domain_all.drop(unnamed_column)

In [21]:
# Replace missing bed, bath and parking counts with 0
# NOTE(review): these columns were read from the CSV as strings - confirm the
# integer replacement is applied to them as intended.
feature_defaults = {"Beds": 0, "Baths": 0, "Parking": 0}
domain_all = domain_all.na.fill(feature_defaults)

In [22]:
# Turn the extracted_price strings into floats so they can be compared numerically
price_as_float = col("extracted_price").cast("float")
domain_all = domain_all.withColumn("extracted_price", price_as_float)

In [23]:
# Remove price outliers: keep only listings whose extracted_price is strictly
# positive and strictly below the 99th percentile (the percentile value itself
# is excluded). A relative error of 0 asks Spark for the exact quantile.
q = domain_all.approxQuantile("extracted_price", [0.99], 0)
upper_bound = q[0]
price = domain_all["extracted_price"]
domain_all = domain_all.filter((price < upper_bound) & (price > 0))

In [24]:
# Collect the cleaned data to the driver as a pandas DataFrame and save it as a
# CSV file in the curated data folder (the pandas index is written as the first,
# unnamed column).
domain_clean_pdf = domain_all.toPandas()
domain_clean_pdf.to_csv("../data/curated/domain_data.csv", header=True)