# ABS preprocessing

This notebook extracts the relevant fields from the ABS census data and writes them to one neatly formatted CSV per census year, in which each row corresponds to a single SA2 (Statistical Area 2).

In [21]:
import os

import pandas as pd

In [22]:
# years we are dealing with
years = [(2021, "SA2_CODE_2021"), (2016, "SA2_MAINCODE_2016")]

# fields we want to extract and the file the data is in : 
# Creating the list of tuples with the shorthand name and G01 or G02 group.
fields_with_names = [
    ("Median_age_persons", "G02"),
    ("Median_mortgage_repay_monthly", "G02"),
    ("Median_tot_prsnl_inc_weekly", "G02"),
    ("Median_rent_weekly", "G02"),
    ("Median_tot_fam_inc_weekly", "G02"),
    ("Average_num_psns_per_bedroom", "G02"),
    ("Median_tot_hhd_inc_weekly", "G02"),
    ("Average_household_size", "G02")
]


fields_with_names

[('Median_age_persons', 'G02'),
 ('Median_mortgage_repay_monthly', 'G02'),
 ('Median_tot_prsnl_inc_weekly', 'G02'),
 ('Median_rent_weekly', 'G02'),
 ('Median_tot_fam_inc_weekly', 'G02'),
 ('Average_num_psns_per_bedroom', 'G02'),
 ('Median_tot_hhd_inc_weekly', 'G02'),
 ('Average_household_size', 'G02')]

In [23]:
datasets = []

path = '../../data/landing/'

for year, code in years:
    # Create an empty DataFrame for each year to append data into
    yearly_df = pd.DataFrame()
    
    full_path = path + f"{year}_GCP_SA2_for_VIC_short-header/{year} Census GCP Statistical Area 2 for VIC/"

    for field, group in fields_with_names:
        # Construct the file path based on year and group
        file = f"{full_path}{year}Census_{group}_VIC_SA2.csv"
        
        # Read the CSV into a pandas DataFrame
        pandas_df = pd.read_csv(file)
        
        # Filter to only keep the fields of interest (code and the specific field)
        pandas_df = pandas_df[[code, field]]
        
        # If the yearly_df is empty, initialize it with the code column
        if yearly_df.empty:
            yearly_df[code] = pandas_df[code]
        
        # Add the specific field to the yearly DataFrame
        yearly_df[field] = pandas_df[field]

    # Append the complete DataFrame for the year to the datasets list
    datasets.append(yearly_df)

datasets[0].head()

Unnamed: 0,SA2_CODE_2021,Median_age_persons,Median_mortgage_repay_monthly,Median_tot_prsnl_inc_weekly,Median_rent_weekly,Median_tot_fam_inc_weekly,Average_num_psns_per_bedroom,Median_tot_hhd_inc_weekly,Average_household_size
0,201011001,34,1698,865,370,2218,0.8,1952,2.8
1,201011002,45,1700,842,313,2276,0.7,1573,2.2
2,201011005,42,1662,805,330,2270,0.8,1927,2.7
3,201011006,33,1500,775,360,1855,0.7,1627,2.6
4,201011007,41,1733,802,350,2236,0.8,2065,3.0


In [24]:
# standardise the first column name
for dataset in datasets:
    dataset.rename(columns={dataset.columns[0]: "SA2_CODE"}, inplace=True)

In [25]:
# export the files

export_path = '../../data/curated/'

for i,dataset in enumerate(datasets):
    dataset.to_csv(f"{export_path}ABS_data_{years[i][0]}.csv", index=False)