final outputs:

ERP + household + school_count + PTV -> by sa2
SA2_info

ERP + interest_rate + price_index -> by year, SA2
history_data



In [8]:
import os
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
import re
from itertools import compress


# Build (or reuse) the Spark session shared by every cell below.
# The three settings are applied one at a time, in the same order as before:
# REPL eager evaluation, parquet metadata caching, and a UTC session time zone.
spark_builder = SparkSession.builder.appName("Assignment_2")
spark_builder = spark_builder.config("spark.sql.repl.eagerEval.enabled", True)
spark_builder = spark_builder.config("spark.sql.parquet.cacheMetadata", "true")
spark_builder = spark_builder.config("spark.sql.session.timeZone", "Etc/UTC")
spark = spark_builder.getOrCreate()

# Relative directories holding the raw inputs and receiving the curated outputs.
INPUT_DIR = "../data/raw/"
OUTPUT_DIR = "../data/curated/"

## SA2_info
#### ERP + household + school_count + PTV -> by sa2
Read and prepare all data sets and create TempView

In [5]:
# --- Schools ---------------------------------------------------------------
# Load the school location file, then count the (non-null) school names in
# each SA2 and expose the result to Spark SQL as the "school" view.
school_sdf = spark.read.option("header", True).csv(
    f"{INPUT_DIR}ACARA/School_location/School_location.csv"
)
school_count = school_sdf.groupBy("SA2").agg(
    F.count("School Name").alias("school_count")
)
school_count.show(n=1, truncate=100, vertical=True)
school_count.createOrReplaceTempView("school")

# --- ERP -------------------------------------------------------------------
# Population per SA2 and year; only the 2021 rows are kept for the "ERP" view.
ERP_sdf = (
    spark.read.option("header", True)
    .csv(f"{INPUT_DIR}ABS/ERP/ERP.csv")
    .where(F.col("year") == 2021)
)
ERP_sdf.show(n=1, truncate=100, vertical=True)
ERP_sdf.createOrReplaceTempView("ERP")

# --- Household income ------------------------------------------------------
# Median household income per SA2, registered unchanged as the "household" view.
household_sdf = spark.read.option("header", True).csv(
    f"{INPUT_DIR}ABS/Household_income/Household_income.csv"
)
household_sdf.show(n=2, truncate=100, vertical=True)
household_sdf.createOrReplaceTempView("household")

# --- Public transport ------------------------------------------------------
# Per-SA2 public transport counts (keyed by SA2_CODE21), registered as "ptv".
ptv_sdf = spark.read.option("header", True).csv(
    f"{INPUT_DIR}PTV/public_trans.csv"
)
ptv_sdf.show(n=1, truncate=100, vertical=True)
ptv_sdf.createOrReplaceTempView("ptv")

-RECORD 0-----------------
 SA2          | 209031212 
 school_count | 9         
only showing top 1 row

-RECORD 0---------------
 SA2        | 201011481 
 year       | 2021      
 population | 9656      
only showing top 1 row

-RECORD 0------------------
 SA2           | 213051589 
 median_income | 1862      
-RECORD 1------------------
 SA2           | 209041437 
 median_income | 1979      
only showing top 2 rows

-RECORD 0----------------------------------------------------------------------------------------------------------------
 SA2_CODE21       | 201011001                                                                                            
 geometry         | POLYGON ((143.78282104711133 -37.566657808073295, 143.75557764214773 -37.56346721632544, 143.7480... 
 metrobus_count   | 0                                                                                                    
 metrotrain_count | 0                                                                     

Merge data

In [9]:
# Show the column lists of the frames feeding the join, one list per line.
print(school_count.columns, ERP_sdf.columns, household_sdf.columns, sep="\n")

# Combine the four temp views on the SA2 code into one row per SA2.
# NOTE(review): every join is an INNER JOIN, so an SA2 that is missing from
# any one of the views (e.g. an SA2 with no school rows) is dropped from the
# result — confirm this is intended.
combine_sdf = spark.sql("""
    SELECT  school.SA2, school.school_count, 
        ERP.population AS ERP_population, median_income, 
        metrobus_count, metrotrain_count, metrotram_count, 
        regbus_count, regcoach_count, regtrain_count, skybus_count
    FROM school
    INNER JOIN ERP ON school.SA2 = ERP.SA2
    INNER JOIN household ON school.SA2 = household.SA2
    INNER JOIN ptv ON school.SA2 = ptv.SA2_CODE21
""")
combine_sdf.show(n=1, truncate=100, vertical=True)

# Persist the merged table as CSV with a header row, replacing any previous run.
combine_sdf.write.csv(
    f"{OUTPUT_DIR}sa2_info.csv",
    mode="overwrite",
    header=True,
)

['SA2', 'school_count']
['SA2', 'year', 'population']
['SA2', 'median_income']
-RECORD 0---------------------
 SA2              | 202011018 
 school_count     | 13        
 ERP_population   | 14951     
 median_income    | 1267      
 metrobus_count   | 0         
 metrotrain_count | 0         
 metrotram_count  | 0         
 regbus_count     | 142       
 regcoach_count   | 2         
 regtrain_count   | 1         
 skybus_count     | 0         
only showing top 1 row



## history_data

#### ERP + interest_rate + price_index -> year, SA2

Read and prepare all data sets and create TempView

In [11]:
# NOTE(review): all three inputs below are restricted to year 2021, so the
# "history" output built from them covers a single year — confirm that this
# filter is intended here and not carried over from the SA2_info section.

# --- ERP -------------------------------------------------------------------
# Population per SA2 and year, registered as the "ERP" view.
ERP_sdf = (
    spark.read.option("header", True)
    .csv(f"{INPUT_DIR}ABS/ERP/ERP.csv")
    .where(F.col("year") == 2021)
)
ERP_sdf.show(n=1, truncate=100, vertical=True)
ERP_sdf.createOrReplaceTempView("ERP")

# --- Interest rate ---------------------------------------------------------
# Interest rate file (year, bond), registered as the "interest" view.
# Its first header cell is blank, so Spark names that column `_c0`.
interest_sdf = (
    spark.read.option("header", True)
    .csv(f"{INPUT_DIR}rba/interest_rate/interest_rate.csv")
    .where(F.col("year") == 2021)
)
interest_sdf.show(n=1, truncate=100, vertical=True)
interest_sdf.createOrReplaceTempView("interest")

# --- Property price index --------------------------------------------------
# Price index per year, registered as the "index" view.
index_sdf = (
    spark.read.option("header", True)
    .csv(f"{INPUT_DIR}ABS/Price_index/Price_index.csv")
    .where(F.col("year") == 2021)
)
index_sdf.show(n=1, truncate=100, vertical=True)
index_sdf.createOrReplaceTempView("index")

-RECORD 0---------------
 SA2        | 201011481 
 year       | 2021      
 population | 9656      
only showing top 1 row

22/09/12 22:48:14 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: , year, bond
 Schema: _c0, year, bond
Expected: _c0 but found: 
CSV file: file:///home/bruce/projects/ass2/data/raw/rba/interest_rate/interest_rate.csv
-RECORD 0----
 _c0  | 112  
 year | 2021 
 bond | 0.93 

-RECORD 0-----------
 year        | 2021 
 price_index | 185  



Merge data

In [13]:

# Show the column lists of the frames feeding the join, one list per line.
print(ERP_sdf.columns, interest_sdf.columns, index_sdf.columns, sep="\n")

# Attach the yearly interest rate (bond) and price index to every ERP row by
# inner-joining the "interest" and "index" views on the year.
combine_sdf = spark.sql("""
    SELECT  SA2, ERP.year, population, bond, price_index
    FROM ERP
    INNER JOIN interest ON ERP.year = interest.year
    INNER JOIN index ON ERP.year = index.year
""")
combine_sdf.show(n=1, truncate=100, vertical=True)

# Persist the merged table as CSV with a header row, replacing any previous run.
combine_sdf.write.csv(
    f"{OUTPUT_DIR}history_data.csv",
    mode="overwrite",
    header=True,
)

['SA2', 'year', 'population']
['_c0', 'year', 'bond']
['year', 'price_index']
-RECORD 0----------------
 SA2         | 201011481 
 year        | 2021      
 population  | 9656      
 bond        | 0.93      
 price_index | 185       
only showing top 1 row



## Match SA2 to Rental data