In [1]:
import sys
from pathlib import Path

from pyspark.sql import SparkSession
from pyspark.sql.functions import *
import pandas as pd
import openpyxl

sys.path.append('../scripts')
from preprocess_script import count_outliers
from download_zip import download_zip_file

In [2]:
# Build (or reuse) the Spark session for this notebook.
# The settings are kept in one mapping and applied in insertion order, so the
# configuration can be read at a glance.
spark_options = {
    "spark.sql.repl.eagerEval.enabled": True,
    "spark.sql.parquet.cacheMetadata": "true",
    "spark.driver.memory": "9g",
    "spark.sql.session.timeZone": "Etc/UTC",
}

spark_builder = SparkSession.builder.appName("Dataset Joining")
for option_name, option_value in spark_options.items():
    spark_builder = spark_builder.config(option_name, option_value)

spark = spark_builder.getOrCreate()

# Set the SparkContext log level to "OFF" for the rest of the notebook.
spark.sparkContext.setLogLevel("OFF")

24/10/05 04:33:43 WARN Utils: Your hostname, codespaces-c6855a resolves to a loopback address: 127.0.0.1; using 10.0.0.128 instead (on interface eth0)
24/10/05 04:33:43 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/10/05 04:33:44 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


The dataset is downloaded from <https://data.gov.au/dataset/ds-dga-2c79581f-600e-4560-80a8-98adb1922dfc/details?q=correspondence%20asgs>; the data dictionary is available from the same source. The data is to be placed in `tables/correspondence/`.

In [3]:
# Download SA2 codes to postcodes correspondence file
# `fn` is the workbook to obtain, `folder` the directory it should end up in,
# and `src` the URL of the zip archive on data.gov.au.
fn = "CG_POSTCODE_2021_SA2_2021.xlsx"
folder = "../data/tables/correspondence"
src = "https://data.gov.au/data/dataset/2c79581f-600e-4560-80a8-98adb1922dfc/resource/33d822ba-138e-47ae-a15f-460279c3acc3/download/asgs2021_correspondences.zip"

# `download_zip_file` is a project helper (imported from ../scripts); judging by
# the message it prints, it downloads the archive and extracts `fn` into
# `folder`.
# NOTE(review): whether it skips the download when the file already exists is
# not visible here - confirm in download_zip.py.
download_zip_file(fn, folder, src)

File 'CG_POSTCODE_2021_SA2_2021.xlsx' downloaded and extracted to ../data/tables/correspondence.


In [4]:
# Read the postcode -> SA2 correspondence workbook into a dataframe.
# The two code columns are converted with `str` so they stay text (postcodes
# such as "0800" must not become numbers); the ratio is converted to float.
col_types = {"POSTCODE": str, "SA2_CODE_2021": str, "RATIO_FROM_TO": float}
correspondence_df = pd.read_excel(
    "../data/tables/correspondence/CG_POSTCODE_2021_SA2_2021.xlsx",
    converters=col_types,
)

# Assign SA2 codes to postcode by the greatest area: for every postcode keep
# the row whose RATIO_FROM_TO is the largest within that postcode.
preferred_SA2_ratio = (
    correspondence_df.groupby("POSTCODE")["RATIO_FROM_TO"].max().reset_index()
)
preferred_SA2 = (
    correspondence_df.merge(preferred_SA2_ratio, on=["POSTCODE", "RATIO_FROM_TO"])
    # Two SA2s can tie for a postcode's maximum ratio, in which case the merge
    # returns both rows. Keep the first (workbook order) so the lookup stays
    # strictly one SA2 per postcode and later joins cannot fan out.
    .drop_duplicates(subset="POSTCODE", keep="first")
    .reset_index(drop=True)
)

# Select the output columns by name rather than by position, so a change in the
# workbook's column order cannot silently pick the wrong columns.
sa2_poa_codes = preferred_SA2.rename(
    columns={
        "POSTCODE": "postcode",
        "SA2_CODE_2021": "sa2_code",
        "SA2_NAME_2021": "sa2_name",
    }
)[["postcode", "sa2_code", "sa2_name"]]

# Invariant relied on by anything that joins on postcode.
assert sa2_poa_codes["postcode"].is_unique, "each postcode must map to exactly one SA2"
sa2_poa_codes

Unnamed: 0,postcode,sa2_code,sa2_name
0,0800,701011002,Darwin City
1,0810,701021021,Lyons (NT)
2,0812,701021019,Karama
3,0820,701011008,Stuart Park
4,0822,702041063,East Arnhem
...,...,...,...
2636,7466,604031097,West Coast (Tas.)
2637,7467,604031097,West Coast (Tas.)
2638,7468,604031097,West Coast (Tas.)
2639,7469,604031097,West Coast (Tas.)


In [5]:
# Saving the correspondence
# Write the postcode -> SA2 lookup as parquet. The parent directory is created
# first because `to_parquet` does not create missing directories, which would
# make this cell fail on a fresh checkout where `../data/curated/` is absent.
correspondence_out_path = '../data/curated/correspondence.parquet'
Path(correspondence_out_path).parent.mkdir(parents=True, exist_ok=True)
sa2_poa_codes.to_parquet(correspondence_out_path)