# ERP + household + school_count + PTV + median_rent + cbd_dis-> by sa2
##### INPUT:
School location
- ../data/raw/ACARA/School_location/School_location.csv
- School Name (String), SA2 (int), Latitude (float), Longitude (float), School Type (String)

Estimated Resident Population (ERP) (2001 to 2021) (By SA2)
- ../data/raw/ABS/ERP/ERP.csv
- SA2 (int), year (int), population (int)

Median household income (weekly) (2021) (By SA2)
- ../data/raw/ABS/Household_income/Household_income.csv
- SA2 (int), year (int), household_type (String), income_level (String), population (int)

School location
- ../data/raw/ACARA/School_location/School_location.csv
- School Name (String), SA2 (int), Latitude (float), Longitude (float), School Type (String)

Median rent
- ../data/raw/DHHS/history_rent.csv
- SA2 (int), year (int), quarter (int), count (int), median (float)

distance to cbd
- ../data/curated/sa2_to_cbd.csv
- SA2 (int), cbd_dis (float)
##### OUTPUT:
SA2_info
- ../data/curated/sa2_info.csv
- SA2 (int), school_count (int), ERP_population (int), median_income (float), metrobus_count (int), metrotrain_count (int), metrotram_count (int), regbus_count (int), regcoach_count (int), regtrain_count (int), skybus_count (int), recr_count (int), comm_count (int), deal_count (int), median_rent (float), cbd_dis (float)





# ERP + price_index + interest_rate + median_rent + exchange_rate + immigration + debt_income_ratio -> BY year and SA2
##### INPUT:
ERP
- ../data/raw/ABS/ERP/ERP.csv
- SA2 (int), year (int), population (int)

Residential Property Price Index (1/2011 - 12/2021) (AUS)
- ../data/raw/ABS/Price_index/Price_index.csv
- year (int), quarter (int), price_index (int)

Interest rate (2013 - 2021) (AUS)
- ../data/raw/rba/interest_rate/interest_rate.csv
- year(int), quarter(int), month(int), bond (float, risk free interest rate)

Median rent (1999 - 2021) BY sa2
- ../data/raw/DHHS/history_rent.csv
- SA2 (int), year (int), quarter (int), count (int), median (float)

Exchange Rate (2010 March - 2022 June) AUS
- ../data/raw/rba/exchange_rate/exchange_rate.csv
- year (int), quarter (int), month (int), to_USD (float)

Immigration data, 2004 - 2019, Victoria only
- ../data/raw/ABS/immigration/immigration.csv
- year (int), immi_count (int)

Household debt-to-income ratio, measured every two years, in millions (2009-2019) (Victoria only)
- ../data/raw/ABS/debt_income_ratio/debt_income_ratio.csv
- year (int), debt_ratio (float)

##### OUTPUT:
history_info
- ../data/curated/history_info.csv
- SA2 (int), year (int), quarter (int), population (int), bond (float), price_index (float), deal_count (int), median_rent (float), to_USD (float), immi_count (int), debt_ratio (float)
- ERP + price_index + interest_rate + median_rent + exchange_rate + immigration + debt_income_ratio
- year: ERP, immigration, debt_ratio (2 years)
- quarter: price_index, median_rent
- month: interest_rate, exchange_rate


# SA2(ERP + household + school_count + PTV + FOI + Median_rent + cbd_distance) + rent(basic + distance) -> rent_price
##### INPUT:
SA2:
- ALL: ../data/curated/sa2_info.csv

rent information with distance
- ../data/curated/rent_distance.csv
- rent_index (int), SA2 (int), rent (float), bedroom (int), baths (int), parking (int), Latitude (float), Longitude (float), school_dis (float), station_dis (float)

##### OUTPUT:
- ../data/curated/rent_info.csv
- rent (float), bedroom (int), baths (int), parking (int), school_dis (float), station_dis (float), cbd_dis (float), median_rent (float), SA2 (int), school_count (int), ERP_population (int), median_income (float), metrobus_count (int), metrotrain_count (int), metrotram_count (int), regbus_count (int), regcoach_count (int), regtrain_count (int), skybus_count (int), recr_count (int), comm_count (int)

In [1]:
import os
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
import re
from itertools import compress
import pandas as pd
import geopandas as gpd
import zipfile


# Build (or reuse) the Spark session used by every cell below. The options
# are applied one by one so that adding or removing a setting is a one-line
# change.
builder = SparkSession.builder.appName("Assignment_2")
for option, value in [
    ("spark.sql.repl.eagerEval.enabled", True),
    ("spark.sql.parquet.cacheMetadata", "true"),
    ("spark.sql.session.timeZone", "Etc/UTC"),
]:
    builder = builder.config(option, value)
spark = builder.getOrCreate()


# "accept: text/csv" header mapping (presumably for HTTP downloads elsewhere
# in the project; it is not used in the cells shown here).
headers = {"accept": "text/csv"}

In [2]:
def gpd_station_merge(poly_gdf, file_path, by_id_name="SA2_CODE21",
                      station_id_name="STOP_ID", method=None):
    """
        Spatially join the stations stored in the shape file {file_path} onto
        the polygons of {poly_gdf} and aggregate them per polygon.

        poly_gdf: a geopandas.GeoDataFrame object containing POLYGON geometry
        file_path: a String path of the station shape file to read
        by_id_name: a String, the {poly_gdf} column used as the groupby key
        station_id_name: a String, the station id column of the file read
        method: a Dict of aggregations passed to ``groupby(...).agg``;
            defaults to counting {station_id_name}, i.e.
            ``{station_id_name: "count"}``

        Returns the aggregated frame indexed by {by_id_name}. Because the
        join is a left join, polygons without any station keep a row with a
        missing station id, which "count" reports as 0.
    """
    # Build the default per call: a dict default would be shared between
    # calls and would stay tied to "STOP_ID" even when a different
    # station_id_name is requested.
    if method is None:
        method = {station_id_name: "count"}

    # Only the station id and its geometry are needed for the join.
    station_gdf = gpd.read_file(file_path)[[station_id_name, "geometry"]]

    # Attach stations to the polygons, then aggregate per polygon id.
    join_gdf = gpd.sjoin(poly_gdf, station_gdf, how="left")
    return join_gdf.groupby(by_id_name).agg(method)

## SA2_info
#### ERP + household + school_count + PTV + median_rent-> by sa2
Read and prepare all data sets and create TempView

In [3]:
# Each input below is loaded, previewed with show(), and registered as a
# temp view so the next cell can join them in SQL.

# Schools: one row per school, reduced to the number of schools per SA2.
school_sdf = spark.read.csv("../data/raw/ACARA/School_location.csv", header=True)
school_count = school_sdf.groupBy("SA2").agg(
    F.count("School Name").alias("school_count")
)
school_count.show(1, vertical=True, truncate=100)
school_count.createOrReplaceTempView("school")

# Estimated resident population: keep the 2021 rows only.
ERP_sdf = spark.read.csv("../data/raw/ABS/ERP/ERP.csv", header=True)
ERP_sdf = ERP_sdf.where(F.col("year") == 2021)
ERP_sdf.show(1, vertical=True, truncate=100)
ERP_sdf.createOrReplaceTempView("ERP")

# Median household income per SA2.
household_sdf = spark.read.csv(
    "../data/raw/ABS/Household_income/Household_income.csv", header=True
)
household_sdf.show(2, vertical=True, truncate=100)
household_sdf.createOrReplaceTempView("household")

# PTV public transport stop counts per SA2.
ptv_sdf = spark.read.csv("../data/raw/PTV/public_trans.csv", header=True)
ptv_sdf.show(1, vertical=True, truncate=100)
ptv_sdf.createOrReplaceTempView("ptv")

# FOI counts per SA2 (recr_count, comm_count).
foi_Sdf = spark.read.csv("../data/raw/FOI/foi_count_by_sa2.csv", header=True)
foi_Sdf.show(1, vertical=True, truncate=100)
foi_Sdf.createOrReplaceTempView("foi")

# Median rent history: drop incomplete rows, keep 2020 Q4 only.
mrent_sdf = spark.read.csv("../data/raw/DHHS/history_rent.csv", header=True).dropna()
mrent_sdf = mrent_sdf.filter(F.col("year").cast("int") == 2020).filter(
    F.col("quarter").cast("int") == 4
)
mrent_sdf.show(1, vertical=True, truncate=100)
mrent_sdf.createOrReplaceTempView("mrent")

# Distance from each SA2 to the CBD, without incomplete rows.
cbd_sdf = spark.read.csv("../data/curated/sa2_to_cbd.csv", header=True).dropna()
cbd_sdf.show(1, vertical=True, truncate=100)
cbd_sdf.createOrReplaceTempView("cbd")

-RECORD 0-----------------
 SA2          | 209031212 
 school_count | 9         
only showing top 1 row

-RECORD 0---------------
 SA2        | 201011481 
 year       | 2021      
 population | 9656      
only showing top 1 row

-RECORD 0------------------
 SA2           | 213051589 
 median_income | 1862      
-RECORD 1------------------
 SA2           | 209041437 
 median_income | 1979      
only showing top 2 rows

-RECORD 0----------------------------------------------------------------------------------------------------------------
 SA2_CODE21       | 201011001                                                                                            
 geometry         | POLYGON ((143.78282104711133 -37.566657808073295, 143.75557764214773 -37.56346721632544, 143.7480... 
 metrobus_count   | 0                                                                                                    
 metrotrain_count | 0                                                                     

Merge data

In [4]:
# Print the columns of each input before joining.
for source_sdf in (school_count, ERP_sdf, household_sdf, ptv_sdf, foi_Sdf, mrent_sdf):
    print(source_sdf.columns)

# Inner-join every SA2-level table onto the school counts: an SA2 missing
# from any one of the inputs is dropped from the result.
combine_sdf = spark.sql("""
    SELECT  school.SA2, school.school_count, 
        ERP.population AS ERP_population, median_income, 
        metrobus_count, metrotrain_count, metrotram_count, 
        regbus_count, regcoach_count, regtrain_count, skybus_count,
        recr_count, comm_count, mrent.count AS deal_count,
        mrent.median AS median_rent, cbd_dis
    FROM school
    INNER JOIN ERP ON school.SA2 = ERP.SA2
    INNER JOIN household ON school.SA2 = household.SA2
    INNER JOIN ptv ON school.SA2 = ptv.SA2_CODE21
    INNER JOIN foi ON school.SA2 = foi.SA2
    INNER JOIN mrent ON school.SA2 = mrent.SA2
    INNER JOIN cbd ON school.SA2 = cbd.SA2
""")
combine_sdf.show(1, vertical=True, truncate=100)
print(combine_sdf.columns)
print(combine_sdf.count())

# to_csv is called with its default index setting, so the pandas row index
# is written as an unnamed first column of the file.
sa2_info_df = combine_sdf.toPandas()
sa2_info_df.to_csv("../data/curated/sa2_info.csv")

['SA2', 'school_count']
['SA2', 'year', 'population']
['SA2', 'median_income']
['SA2_CODE21', 'geometry', 'metrobus_count', 'metrotrain_count', 'metrotram_count', 'regbus_count', 'regcoach_count', 'regtrain_count', 'skybus_count']
['SA2', 'recr_count', 'comm_count']
['index', 'SA2', 'year', 'quarter', 'count', 'median']
-RECORD 0---------------------
 SA2              | 202011018 
 school_count     | 13        
 ERP_population   | 14951     
 median_income    | 1267      
 metrobus_count   | 0         
 metrotrain_count | 0         
 metrotram_count  | 0         
 regbus_count     | 142       
 regcoach_count   | 2         
 regtrain_count   | 1         
 skybus_count     | 0         
 recr_count       | 1         
 comm_count       | 1         
 deal_count       | 709       
 median_rent      | 350.0     
 cbd_dis          | 152998.1  
only showing top 1 row

['SA2', 'school_count', 'ERP_population', 'median_income', 'metrobus_count', 'metrotrain_count', 'metrotram_count', 'regbus_cou

## history_info

#### ERP + price_index + interest_rate + median_rent + exchange_rate + immigration + debt_income_ratio -> BY year and SA2

#### Read and prepare all data sets and create TempView

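Yearly series are expanded to quarterly ones: each missing quarter is filled with the mean of its two neighbouring values.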

In [5]:
## Read the yearly debt-to-income ratio and expand it to a quarterly series.
ratio_sdf = spark.read.csv("../data/raw/ABS/debt_income_ratio/debt_income_ratio.csv", header=True)
ratio_df = ratio_sdf.toPandas()[['year', 'debt_ratio']]
ratio_df["year"] = ratio_df["year"].astype(int)
ratio_df["debt_ratio"] = ratio_df["debt_ratio"].astype(float)
ratio_df = ratio_df.sort_values(by=["year"]).reset_index(drop=True)
# Each observed value is treated as the quarter-4 figure of its year.
ratio_df["quarter"] = 4

## Fill the missing quarters with the mean of the two neighbouring rows.
## Pass 1 (step 2) inserts quarter 2 between consecutive quarter-4 rows;
## pass 2 (step 1) inserts quarter 1 (before a quarter 2) and quarter 3
## (before a quarter 4). A new row takes the year of the later neighbour.
## The shifted frames replace the former row-by-row ``new_df.loc[i] = ...``
## loop, which was slow and turned the integer columns into floats.
for step in (2, 1):
    earlier = ratio_df.iloc[:-1].reset_index(drop=True)
    later = ratio_df.iloc[1:].reset_index(drop=True)
    midpoints = later.copy()
    midpoints["debt_ratio"] = (earlier["debt_ratio"] + later["debt_ratio"]) / 2
    midpoints["quarter"] = later["quarter"] - step
    ratio_df = pd.concat([ratio_df, midpoints])
    ratio_df = ratio_df.sort_values(by=['year', 'quarter']).reset_index(drop=True)

## create tempview
ratio_sdf = spark.createDataFrame(ratio_df)
ratio_sdf.show(1, vertical=True, truncate=100)
ratio_sdf.createOrReplaceTempView("ratio")

-RECORD 0----------
 year       | 2009 
 debt_ratio | 0.97 
 quarter    | 4    
only showing top 1 row



In [6]:
## Read the yearly immigration counts and expand them to a quarterly series.
immigration_sdf = spark.read.csv("../data/raw/ABS/immigration/immigration.csv", header=True)
immi_df = immigration_sdf.toPandas()[['year', 'immi_count']].astype(int)
immi_df = immi_df.sort_values(by=["year"]).reset_index(drop=True)
# Each observed value is treated as the quarter-4 figure of its year.
immi_df["quarter"] = 4

## Fill the missing quarters with the mean of the two neighbouring rows.
## Pass 1 (step 2) inserts quarter 2 between consecutive quarter-4 rows;
## pass 2 (step 1) inserts quarter 1 (before a quarter 2) and quarter 3
## (before a quarter 4). A new row takes the year of the later neighbour.
## The shifted frames replace the former row-by-row ``new_df.loc[i] = ...``
## loop and keep every column as a native integer.
for step in (2, 1):
    earlier = immi_df.iloc[:-1].reset_index(drop=True)
    later = immi_df.iloc[1:].reset_index(drop=True)
    midpoints = later.copy()
    # astype(int) truncates toward zero, exactly like int(mean) did.
    midpoints["immi_count"] = (
        (earlier["immi_count"] + later["immi_count"]) / 2
    ).astype(int)
    midpoints["quarter"] = later["quarter"] - step
    immi_df = pd.concat([immi_df, midpoints])
    immi_df = immi_df.sort_values(by=['year', 'quarter']).reset_index(drop=True)

## create tempview
immigration_sdf = spark.createDataFrame(immi_df)
immigration_sdf.createOrReplaceTempView("immigration")
immigration_sdf.show(1, vertical=True, truncate=100)



-RECORD 0-----------
 year       | 2004  
 immi_count | 81230 
 quarter    | 4     
only showing top 1 row



In [7]:
## Read the yearly population per SA2 and expand it to a quarterly series.
ERP_sdf = spark.read.csv("../data/raw/ABS/ERP/ERP.csv", header=True)
ERP_df = ERP_sdf.toPandas()[['SA2', 'year', 'population']].astype(int)
ERP_df = ERP_df.sort_values(by=["SA2", "year"]).reset_index(drop=True)
# Each observed value is treated as the quarter-4 figure of its year.
ERP_df["quarter"] = 4

## Fill the missing quarters with the mean of the two neighbouring rows of
## the same SA2. Pass 1 (step 2) inserts quarter 2 between consecutive
## quarter-4 rows; pass 2 (step 1) inserts quarter 1 (before a quarter 2)
## and quarter 3 (before a quarter 4). A new row takes the SA2 and year of
## the later neighbour. The shifted frames replace the former row-by-row
## ``new_df.loc[i] = ...`` loop, which is very slow on a table with one row
## per SA2 and year.
for step in (2, 1):
    earlier = ERP_df.iloc[:-1].reset_index(drop=True)
    later = ERP_df.iloc[1:].reset_index(drop=True)
    # Never average across the boundary between two different SA2s.
    same_sa2 = earlier["SA2"] == later["SA2"]
    midpoints = later[same_sa2].copy()
    # astype(int) truncates toward zero, exactly like int(mean) did.
    midpoints["population"] = (
        (earlier.loc[same_sa2, "population"] + midpoints["population"]) / 2
    ).astype(int)
    midpoints["quarter"] = midpoints["quarter"] - step
    ERP_df = pd.concat([ERP_df, midpoints])
    ERP_df = ERP_df.sort_values(by=["SA2", "year", "quarter"]).reset_index(drop=True)

## create tempview
ERP_sdf = spark.createDataFrame(ERP_df)
ERP_sdf.createOrReplaceTempView("ERP")
ERP_sdf.show(1, vertical=True, truncate=100)


-RECORD 0---------------
 SA2        | 201011001 
 year       | 2010      
 population | 7894      
 quarter    | 4         
only showing top 1 row



Read other data

In [8]:
# Months 3, 6, 9 and 12: only rows with one of these months are kept from
# the monthly series (interest rate, exchange rate) before the month column
# is dropped.
month_l = [3, 6, 9, 12]

# Interest rate: typed columns, rows for the months above, month dropped.
interest_sdf = (
    spark.read.csv("../data/raw/rba/interest_rate/interest_rate.csv", header=True)
    .select(
        F.col("year").cast("int").alias("year"),
        F.col("quarter").cast("int").alias("quarter"),
        F.col("month").cast("int").alias("month"),
        F.col("bond").cast("double").alias("bond"),
    )
    .filter(F.col("month").isin(month_l))
    .select("year", "quarter", "bond")
)
interest_sdf.show(1, vertical=True, truncate=100)
interest_sdf.createOrReplaceTempView("interest")

# Residential property price index.
index_sdf = spark.read.csv(
    "../data/raw/ABS/Price_index/Price_index.csv", header=True
).select(
    F.col("year").cast("int").alias("year"),
    F.col("quarter").cast("int").alias("quarter"),
    F.col("price_index").cast("double").alias("price_index"),
)
index_sdf.show(1, vertical=True, truncate=100)
index_sdf.createOrReplaceTempView("index")

# Median rent and deal count per SA2.
mrent_sdf = spark.read.csv(
    "../data/raw/DHHS/history_rent.csv", header=True
).select(
    F.col("SA2").cast("int").alias("SA2"),
    F.col("year").cast("int").alias("year"),
    F.col("quarter").cast("int").alias("quarter"),
    F.col("count").cast("int").alias("count"),
    F.col("median").cast("double").alias("median"),
)
mrent_sdf.show(1, vertical=True, truncate=100)
mrent_sdf.createOrReplaceTempView("mrent")

# Exchange rate: typed columns, rows for the months above, month dropped.
exchange_sdf = (
    spark.read.csv("../data/raw/rba/exchange_rate/exchange_rate.csv", header=True)
    .select(
        F.col("year").cast("int").alias("year"),
        F.col("quarter").cast("int").alias("quarter"),
        F.col("month").cast("int").alias("month"),
        F.col("to_USD").cast("double").alias("to_USD"),
    )
    .filter(F.col("month").isin(month_l))
    .select("year", "quarter", "to_USD")
)
exchange_sdf.show(1, vertical=True, truncate=100)
exchange_sdf.createOrReplaceTempView("exchange")

-RECORD 0-------
 year    | 2013 
 quarter | 3    
 bond    | 2.9  
only showing top 1 row

-RECORD 0------------
 year        | 2011  
 quarter     | 1     
 price_index | 104.0 
only showing top 1 row

-RECORD 0------------
 SA2     | 201011001 
 year    | 1999      
 quarter | 2         
 count   | 687       
 median  | 136.5     
only showing top 1 row

-RECORD 0---------
 year    | 2010   
 quarter | 1      
 to_USD  | 0.9159 
only showing top 1 row



Merge data

In [9]:

# Print the columns of each input before joining.
for source_sdf in (ERP_sdf, interest_sdf, index_sdf, mrent_sdf,
                   exchange_sdf, immigration_sdf, ratio_sdf):
    print(source_sdf.columns)

# Inner-join everything onto the quarterly ERP rows: a (year, quarter) --
# or, for the rent table, an (SA2, year, quarter) -- missing from any input
# is dropped, so the result covers only the period all series share.
combine_sdf = spark.sql("""
    SELECT ERP.SA2, ERP.year, ERP.quarter, population, bond, price_index,
        count AS deal_count, median AS median_rent, to_USD, immi_count, debt_ratio
    FROM ERP
    INNER JOIN interest ON ((ERP.year = interest.year)
        AND (ERP.quarter = interest.quarter))
    INNER JOIN index ON (ERP.year = index.year) 
        AND (ERP.quarter = index.quarter)
    INNER JOIN mrent ON (ERP.year = mrent.year) 
        AND (ERP.quarter = mrent.quarter)
        AND (ERP.SA2 = mrent.SA2)
    INNER JOIN exchange ON (ERP.year = exchange.year) 
        AND (ERP.quarter = exchange.quarter)
    INNER JOIN immigration ON (ERP.year = immigration.year) 
        AND (ERP.quarter = immigration.quarter)
    INNER JOIN ratio ON (ERP.year = ratio.year) 
        AND (ERP.quarter = ratio.quarter)
""").orderBy("SA2", "year", "quarter")

combine_sdf.show(1, vertical=True, truncate=100)
print(combine_sdf.columns)
# Number of distinct SA2s, then the distinct years, present in the result.
print(combine_sdf.select("SA2").distinct().count())
print(combine_sdf.select("year").distinct().collect())

# to_csv is called with its default index setting, so the pandas row index
# is written as an unnamed first column of the file.
history_info_df = combine_sdf.toPandas()
history_info_df.to_csv("../data/curated/history_info.csv")

['SA2', 'year', 'population', 'quarter']
['year', 'quarter', 'bond']
['year', 'quarter', 'price_index']
['SA2', 'year', 'quarter', 'count', 'median']
['year', 'quarter', 'to_USD']
['year', 'immi_count', 'quarter']
['year', 'debt_ratio', 'quarter']
-RECORD 0-------------------------
 SA2         | 201011001          
 year        | 2013               
 quarter     | 3                  
 population  | 9550               
 bond        | 2.9                
 price_index | 105.0              
 deal_count  | 1027               
 median_rent | 280.0              
 to_USD      | 0.9309             
 immi_count  | 121500             
 debt_ratio  | 0.8587499999999999 
only showing top 1 row

['SA2', 'year', 'quarter', 'population', 'bond', 'price_index', 'deal_count', 'median_rent', 'to_USD', 'immi_count', 'debt_ratio']
501
[Row(year=2014), Row(year=2016), Row(year=2018), Row(year=2017), Row(year=2013), Row(year=2019), Row(year=2015)]


In [10]:
# Preview the joined history of a single SA2 (201011481). show() prints the
# table itself and returns None, so wrapping it in print() only added a
# stray "None" line after the table.
combine_sdf.filter(F.col("SA2").cast("int") == 201011481).show(50)

+---------+----+-------+----------+----+-----------+----------+-----------+------+----------+------------------+
|      SA2|year|quarter|population|bond|price_index|deal_count|median_rent|to_USD|immi_count|        debt_ratio|
+---------+----+-------+----------+----+-----------+----------+-----------+------+----------+------------------+
|201011481|2013|      3|      8981| 2.9|      105.0|      1140|      275.0|0.9309|    121500|0.8587499999999999|
|201011481|2013|      4|      9008|2.96|      109.0|      1146|      275.0|0.8948|    122250|              0.86|
|201011481|2014|      1|      9024|2.97|      110.0|      1344|      280.0|0.9221|    123730|0.8587499999999999|
|201011481|2014|      2|      9041| 2.8|      112.0|      1181|      280.0| 0.942|    125210|0.8574999999999999|
|201011481|2014|      3|      9057| 2.8|      113.0|      1153|      285.0|0.8752|    126690|           0.85625|
|201011481|2014|      4|      9074|2.28|      115.0|      1186|      285.0|0.8202|    128170|   

## rent_info
- SA2(ERP + household + school_count + PTV + FOI) + rent(basic + distance) -> rent_price
- ../data/curated/rent_info.csv
- rent (float), bedroom (int), baths (int), parking (int), school_dis (float), station_dis (float), SA2 (int), school_count (int), ERP_population (int), median_income (float), metrobus_count (int), metrotrain_count (int), metrotram_count (int), regbus_count (int), regcoach_count (int), regtrain_count (int), skybus_count (int), recr_count (int), comm_count (int)

In [11]:
# SA2-level features from ../data/curated/sa2_info.csv, exposed as the
# "SA2" temp view.
SA2_sdf = spark.read.option("header", True).csv("../data/curated/sa2_info.csv")
SA2_sdf.show(1, vertical=True, truncate=100)
SA2_sdf.createOrReplaceTempView("SA2")

# Per-listing rent records with their distance features, exposed as the
# "rent" temp view.
rent_sdf = spark.read.option("header", True).csv("../data/curated/rent_distance.csv")
rent_sdf.show(1, vertical=True, truncate=100)
rent_sdf.createOrReplaceTempView("rent")

-RECORD 0---------------------
 _c0              | 0         
 SA2              | 202011018 
 school_count     | 13        
 ERP_population   | 14951     
 median_income    | 1267      
 metrobus_count   | 0         
 metrotrain_count | 0         
 metrotram_count  | 0         
 regbus_count     | 142       
 regcoach_count   | 2         
 regtrain_count   | 1         
 skybus_count     | 0         
 recr_count       | 1         
 comm_count       | 1         
 deal_count       | 709       
 median_rent      | 350.0     
 cbd_dis          | 152998.1  
only showing top 1 row

-RECORD 0------------------
 _c0         | 0           
 SA2         | 201011001   
 rent        | 490.0       
 bedroom     | 4           
 baths       | 2           
 parking     | 2           
 Latitude    | -37.5630731 
 Longitude   | 143.7938749 
 school_dis  | 1651.7      
 station_dis | 5895.5      
 cbd_dis     | 350.0       
only showing top 1 row



Merge data

In [12]:
# Print the columns of both inputs before joining.
for source_sdf in (SA2_sdf, rent_sdf):
    print(source_sdf.columns)

# Attach the SA2-level features to every rent record; records whose SA2 is
# absent from the SA2 table are dropped by the inner join.
# NOTE(review): both tables carry a cbd_dis column and the query keeps
# rent.cbd_dis -- confirm that this is the intended one.
combine_sdf = spark.sql("""
    SELECT  rent, bedroom, baths, parking, school_dis, station_dis, 
    rent.cbd_dis, median_rent, rent.SA2, school_count, ERP_population, 
    median_income, metrobus_count, metrotrain_count, metrotram_count, 
    regbus_count, regcoach_count, regtrain_count, skybus_count, recr_count, comm_count
    FROM SA2
    INNER JOIN rent ON SA2.SA2 = rent.SA2
""")
combine_sdf.show(1, vertical=True, truncate=100)
print(combine_sdf.columns)

# to_csv is called with its default index setting, so the pandas row index
# is written as an unnamed first column of the file.
rent_info_df = combine_sdf.toPandas()
rent_info_df.to_csv("../data/curated/rent_info.csv")

['_c0', 'SA2', 'school_count', 'ERP_population', 'median_income', 'metrobus_count', 'metrotrain_count', 'metrotram_count', 'regbus_count', 'regcoach_count', 'regtrain_count', 'skybus_count', 'recr_count', 'comm_count', 'deal_count', 'median_rent', 'cbd_dis']
['_c0', 'SA2', 'rent', 'bedroom', 'baths', 'parking', 'Latitude', 'Longitude', 'school_dis', 'station_dis', 'cbd_dis']
-RECORD 0---------------------
 rent             | 490.0     
 bedroom          | 4         
 baths            | 2         
 parking          | 2         
 school_dis       | 1651.7    
 station_dis      | 5895.5    
 cbd_dis          | 350.0     
 median_rent      | 367.5     
 SA2              | 201011001 
 school_count     | 4         
 ERP_population   | 16823     
 median_income    | 1952      
 metrobus_count   | 0         
 metrotrain_count | 0         
 metrotram_count  | 0         
 regbus_count     | 43        
 regcoach_count   | 0         
 regtrain_count   | 0         
 skybus_count     | 0         
 r

By Junhua Liu for study use only