# ERP + household + school_count + PTV + median_rent-> by sa2
##### INPUT:
School location
- ../data/raw/ACARA/School_location/School_location.csv
- School Name (String), SA2 (int), Latitude (float), Longitude (float), School Type (String)

ERP
- ../data/raw/ABS/ERP/ERP.csv
- SA2 (int), year (int), population (int)

Median household income (weekly) (2021) (By SA2)
- ../data/raw/ABS/Household_income/Household_income.csv
- SA2 (int), year (int),  household_type (String), income_level (String), popultaion (int)

School location
- ../data/raw/ACARA/School_location/School_location.csv
- School Name (String), SA2 (int), Latitude (float), Longitude (float), School Type (String)

Median rent
- ../data/raw/DHHS/history_rent.csv
- SA2 (int), year (int), month (int), count (int), median (float)
##### OUTPUT:
SA2_info
- ../data/curated/sa2_info.csv
- SA2 (int), school_count (int), ERP_population (int), median_income (float), metrobus_count (int), metrotrain_count (int), metrotram_count (int), regbus_count (int), regcoach_count (int), regtrain_count (int), skybus_count (int), recr_count (int), comm_count (int), 





# ERP + price_index + interest_rate -> year, SA2
- median rent
##### INPUT:
ERP
- ../data/raw/ABS/ERP/ERP.csv
- SA2 (int), year (int), population (int)

Residential Property Price Index (1/2011 - 12/2021) (AUS)
- ../data/raw/ABS/Price_index/Price_index.csv
- year (int), price_index (int)

Interest rate (2013 - 2021) (AUS)
- ../data/raw/rba/interest_rate/interest_rate.csv
- time(int, year), bond (float, risk free interest rate)

Median rent
- ../data/raw/DHHS/history_rent.csv
- SA2 (int), year (int), month (int), count (int), median (float)

##### OUTPUT:
history_info
- ../data/curated/history_info.csv
- SA2 (int), year (int), population (int), bond (float), price_index (int), 




# SA2(ERP + household + school_count + PTV + FOI) + rent(basic + distance) -> rent_price
##### INPUT:
SA2:
- ALL: ../data/curated/sa2_info.csv

rent information with distance
- ../data/curated/rent_distance.csv
- rent_index (int), SA2 (int), rent (float), bedroom (int), baths (int), parking (int), Latitude (float), Longitude (float), school_dis (float), station_dis (float)

##### OUTPUT:
- ../data/curated/rent_info.csv
- rent (float), bedroom (int), baths (int), parking (int), school_dis (float), station_dis (float), SA2 (int), school_count (int), ERP_population (int), median_income (float), metrobus_count (int), metrotrain_count (int), metrotram_count (int), regbus_count (int), regcoach_count (int), regtrain_count (int), skybus_count (int), recr_count (int), comm_count

In [77]:
import os
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
import re
from itertools import compress
import pandas as pd
import geopandas as gpd
import zipfile


spark = (
    SparkSession.builder.appName("Assignment_2")
    .config("spark.sql.repl.eagerEval.enabled", True)
    .config("spark.sql.parquet.cacheMetadata", "true")
    .config("spark.sql.session.timeZone", "Etc/UTC")
    .getOrCreate()
)


headers = {"accept": "text/csv"}

In [78]:
def gpd_station_merge(poly_gdf, file_path, by_id_name = "SA2_CODE21",\
    station_id_name = "STOP_ID", method={"STOP_ID": "count"}):
    """
        A function used to merge shape file in path: {file_path} to a 
        geopandas dataframe {poly_gdf} with POLYGON geometry. 
        poly_gdf: a geopandas.GeoDataFrame object contains POLYGON geometry
        file_path: a String of file path to read target shape file
        by_id_name: a String of id name to perform groupby option
        station_id_name: a String of id name stated in the readed gdf
        method: a Dict of operations to perform after groupby
    """

    ### read station file
    station_gdf = gpd.read_file(file_path)

    # metro bus station feature selection
    station_gdf = station_gdf[[station_id_name, "geometry"]]
    


    # merge tabels
    join_gdf = gpd.sjoin(poly_gdf, station_gdf, how="left")
    join_gdf = join_gdf.groupby(by_id_name).agg(method)
    
    return join_gdf

## SA2_info
#### ERP + household + school_count + PTV + median_rent-> by sa2
Read and prepare all data sets and create TempView

In [105]:
# read school file
school_sdf = spark.read.csv(f"../data/raw/ACARA/School_location/School_location.csv", header=True)

# count school by SA2
school_count = school_sdf.groupBy("SA2").agg({
    "School Name": "count"
})
school_count = school_count.withColumnRenamed( "count(School Name)", "school_count")
school_count.show(1, vertical = True, truncate=100)
school_count.createOrReplaceTempView("school")



# read ERP file and create tempview
ERP_sdf = spark.read.csv(f"../data/raw/ABS/ERP/ERP.csv", header=True)
ERP_sdf = ERP_sdf.filter(F.col("year") == 2021)
ERP_sdf.show(1, vertical = True, truncate=100)
ERP_sdf.createOrReplaceTempView("ERP")



# read median household income file and create tempview
household_sdf = spark.read.csv(f"../data/raw/ABS/Household_income/Household_income.csv", header=True)
household_sdf.show(2, vertical = True, truncate=100)
household_sdf.createOrReplaceTempView("household")



# read PTV station file and create tempview
ptv_sdf = spark.read.csv(f"../data/raw/PTV/public_trans.csv", header=True)
ptv_sdf.show(1, vertical = True, truncate=100)
ptv_sdf.createOrReplaceTempView("ptv")



# read PTV station file and create tempview
foi_Sdf = spark.read.csv(f"../data/raw/FOI/foi_count_by_sa2.csv", header=True)
foi_Sdf.show(1, vertical = True, truncate=100)
foi_Sdf.createOrReplaceTempView("foi")



# read median rent file and create tempview
mrent_sdf = spark.read.csv(f"../data/raw/DHHS/history_rent.csv", header=True).dropna()
mrent_sdf = mrent_sdf.filter((F.col("year") == 2021) & (F.col("month") == 3))
mrent_sdf.show(1, vertical = True, truncate=100)
mrent_sdf.createOrReplaceTempView("mrent")


-RECORD 0-----------------
 SA2          | 209031212 
 school_count | 9         
only showing top 1 row

-RECORD 0---------------
 SA2        | 201011481 
 year       | 2021      
 population | 9656      
only showing top 1 row

-RECORD 0------------------
 SA2           | 213051589 
 median_income | 1862      
-RECORD 1------------------
 SA2           | 209041437 
 median_income | 1979      
only showing top 2 rows

-RECORD 0----------------------------------------------------------------------------------------------------------------
 SA2_CODE21       | 201011001                                                                                            
 geometry         | POLYGON ((143.78282104711133 -37.566657808073295, 143.75557764214773 -37.56346721632544, 143.7480... 
 metrobus_count   | 0                                                                                                    
 metrotrain_count | 0                                                                     

Merge data

In [106]:
# inner join
print(school_count.columns)
print(ERP_sdf.columns)
print(household_sdf.columns)
print(ptv_sdf.columns)
print(foi_Sdf.columns)
print(mrent_sdf.columns)
combine_sdf = spark.sql("""
    SELECT  school.SA2, school.school_count, 
        ERP.population AS ERP_population, median_income, 
        metrobus_count, metrotrain_count, metrotram_count, 
        regbus_count, regcoach_count, regtrain_count, skybus_count,
        recr_count, comm_count, mrent.count AS deal_count,
        mrent.median AS median_rent
    FROM school
    INNER JOIN ERP ON school.SA2 = ERP.SA2
    INNER JOIN household ON school.SA2 = household.SA2
    INNER JOIN ptv ON school.SA2 = ptv.SA2_CODE21
    INNER JOIN foi ON school.SA2 = foi.SA2
    INNER JOIN mrent ON school.SA2 = mrent.SA2
""")
combine_sdf.show(1, vertical = True, truncate=100)
print(combine_sdf.columns)
combine_sdf.toPandas().to_csv(
    f"../data/curated/sa2_info.csv"
)

['SA2', 'school_count']
['SA2', 'year', 'population']
['SA2', 'median_income']
['SA2_CODE21', 'geometry', 'metrobus_count', 'metrotrain_count', 'metrotram_count', 'regbus_count', 'regcoach_count', 'regtrain_count', 'skybus_count']
['SA2', 'recr_count', 'comm_count']
['_c0', 'SA2', 'year', 'month', 'count', 'median']
22/09/19 00:21:52 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: , SA2, year, month, count, median
 Schema: _c0, SA2, year, month, count, median
Expected: _c0 but found: 
CSV file: file:///home/bruce/projects/ass2/data/raw/DHHS/history_rent.csv
-RECORD 0---------------------
 SA2              | 202011018 
 school_count     | 13        
 ERP_population   | 14951     
 median_income    | 1267      
 metrobus_count   | 0         
 metrotrain_count | 0         
 metrotram_count  | 0         
 regbus_count     | 142       
 regcoach_count   | 2         
 regtrain_count   | 1         
 skybus_count     | 0         
 recr_count       | 1         
 comm_

## history_info

#### ERP + interest_rate + price_index -> year, SA2

Read and prepare all data sets and create TempView

In [81]:
# read ERP file and create tempview
ERP_sdf = spark.read.csv(f"../data/raw/ABS/ERP/ERP.csv", header=True)
ERP_sdf = ERP_sdf.filter(F.col("year") == 2021)
ERP_sdf.show(1, vertical = True, truncate=100)
ERP_sdf.createOrReplaceTempView("ERP")

# read population projection file and create tempview
interest_sdf = spark.read.csv(f"../data/raw/rba/interest_rate/interest_rate.csv", header=True)
interest_sdf = interest_sdf.filter(F.col("year") == 2021)
interest_sdf.show(1, vertical = True, truncate=100)
interest_sdf.createOrReplaceTempView("interest")

# read property price index file and create tempview
index_sdf = spark.read.csv(f"../data/raw/ABS/Price_index/Price_index.csv", header=True)
index_sdf = index_sdf.filter(F.col("year") == 2021)
index_sdf.show(1, vertical = True, truncate=100)
index_sdf.createOrReplaceTempView("index")


# # read median property rent file and create tempview
# mrent_sdf = spark.read.csv(f"../data/raw/DHHS/history_rent.csv", header=True)
# mrent_sdf = mrent_sdf.filter(F.col("year") == 2021)
# mrent_sdf.show(1, vertical = True, truncate=100)
# mrent_sdf.createOrReplaceTempView("mrent")

-RECORD 0---------------
 SA2        | 201011481 
 year       | 2021      
 population | 9656      
only showing top 1 row

22/09/18 23:22:07 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: , year, bond
 Schema: _c0, year, bond
Expected: _c0 but found: 
CSV file: file:///home/bruce/projects/ass2/data/raw/rba/interest_rate/interest_rate.csv
-RECORD 0----
 _c0  | 112  
 year | 2021 
 bond | 0.93 

-RECORD 0-----------
 year        | 2021 
 price_index | 185  



Merge data

In [82]:

# inner join
print(ERP_sdf.columns)
print(interest_sdf.columns)
print(index_sdf.columns)
print(mrent_sdf.columns)
combine_sdf = spark.sql("""
    SELECT  ERP.SA2, ERP.year, population, bond, price_index
    FROM ERP
    INNER JOIN interest ON ERP.year = interest.year
    INNER JOIN index ON ERP.year = index.year
""")
combine_sdf.show(1, vertical = True, truncate=100)
print(combine_sdf.columns)
combine_sdf.toPandas().to_csv(
    f"../data/curated/history_info.csv"
)

['SA2', 'year', 'population']
['_c0', 'year', 'bond']
['year', 'price_index']
['_c0', 'SA2', 'year', 'month', 'count', 'median']
-RECORD 0----------------
 SA2         | 201011481 
 year        | 2021      
 population  | 9656      
 bond        | 0.93      
 price_index | 185       
only showing top 1 row

['SA2', 'year', 'population', 'bond', 'price_index']


## rent_info
- SA2(ERP + household + school_count + PTV + FOI) + rent(basic + distance) -> rent_price
- ../data/curated/rent_info.csv
- rent (float), bedroom (int), baths (int), parking (int), school_dis (float), station_dis (float), SA2 (int), school_count (int), ERP_population (int), median_income (float), metrobus_count (int), metrotrain_count (int), metrotram_count (int), regbus_count (int), regcoach_count (int), regtrain_count (int), skybus_count (int), recr_count (int), comm_count

In [83]:
# read SA2 file and create tempview
SA2_sdf = spark.read.csv(f"../data/curated/sa2_info.csv", header=True)
SA2_sdf.show(1, vertical = True, truncate=100)
SA2_sdf.createOrReplaceTempView("SA2")

# read rent data file and create tempview
rent_sdf = spark.read.csv(f"../data/curated/rent_distance.csv", header=True)
rent_sdf.show(1, vertical = True, truncate=100)
rent_sdf.createOrReplaceTempView("rent")

22/09/18 23:22:07 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: , SA2, school_count, ERP_population, median_income, metrobus_count, metrotrain_count, metrotram_count, regbus_count, regcoach_count, regtrain_count, skybus_count, recr_count, comm_count
 Schema: _c0, SA2, school_count, ERP_population, median_income, metrobus_count, metrotrain_count, metrotram_count, regbus_count, regcoach_count, regtrain_count, skybus_count, recr_count, comm_count
Expected: _c0 but found: 
CSV file: file:///home/bruce/projects/ass2/data/curated/sa2_info.csv
-RECORD 0---------------------
 _c0              | 0         
 SA2              | 202011018 
 school_count     | 13        
 ERP_population   | 14951     
 median_income    | 1267      
 metrobus_count   | 0         
 metrotrain_count | 0         
 metrotram_count  | 0         
 regbus_count     | 142       
 regcoach_count   | 2         
 regtrain_count   | 1         
 skybus_count     | 0         
 recr_count       | 1    

Merge data

In [84]:
# inner join
print(SA2_sdf.columns)
print(rent_sdf.columns)
combine_sdf = spark.sql("""
    SELECT  rent, bedroom, baths, parking, school_dis, station_dis, rent.SA2,
        school_count, ERP_population, median_income, metrobus_count, 
        metrotrain_count, metrotram_count, regbus_count, regcoach_count,
        regtrain_count, skybus_count, recr_count, comm_count
    FROM SA2
    INNER JOIN rent ON SA2.SA2 = rent.SA2
""")
combine_sdf.show(1, vertical = True, truncate=100)
print(combine_sdf.columns)
combine_sdf.toPandas().to_csv(
    f"../data/curated/rent_info.csv"
)

['_c0', 'SA2', 'school_count', 'ERP_population', 'median_income', 'metrobus_count', 'metrotrain_count', 'metrotram_count', 'regbus_count', 'regcoach_count', 'regtrain_count', 'skybus_count', 'recr_count', 'comm_count']
['rent_index', 'SA2', 'rent', 'bedroom', 'baths', 'parking', 'Latitude', 'Longitude', 'school_dis', 'station_dis']
-RECORD 0---------------------
 rent             | 490.0     
 bedroom          | 4         
 baths            | 2         
 parking          | 2         
 school_dis       | 1651.7    
 station_dis      | 5895.5    
 SA2              | 201011001 
 school_count     | 4         
 ERP_population   | 16823     
 median_income    | 1952      
 metrobus_count   | 0         
 metrotrain_count | 0         
 metrotram_count  | 0         
 regbus_count     | 43        
 regcoach_count   | 0         
 regtrain_count   | 0         
 skybus_count     | 0         
 recr_count       | 0         
 comm_count       | 0         
only showing top 1 row

['rent', 'bedroom', 'b

By Junhua Liu for study use only