In [1]:
# for unzipping
import requests, zipfile
from io import BytesIO
import os

# for data handling
import pandas as pd
import numpy as np

In [2]:
# Folder (relative to this notebook) under which all landing data is stored.
RELATIVE_PATH = "../../data/landing/"

# Common name given to the SA2 identifier column once tables are renamed.
SA2_CODE_NAME = "SA2 code"

# Column names that are allowed to act as join keys when tables are merged.
MERGE_COLUMNS = [
    "SA2_MAINCODE_2016",
    "SA2_CODE_2021",
    SA2_CODE_NAME,
    "year",
]

# Name of the combined file written into the ABS folder by the merge step.
MERGE_FILE_NAME = "merge.csv"

## Creating the file system

In [3]:
# Sub-folders of the landing area that the rest of the notebook writes into.
DIRECTORIES = ["ABS", "ABS/2021", "ABS/2016"]
directory_paths = [RELATIVE_PATH + sub_directory for sub_directory in DIRECTORIES]

# exist_ok=True means re-running this cell is harmless when the folders exist.
for directory_path in directory_paths:
    os.makedirs(directory_path, exist_ok=True)

## Getting the zip file

#### Downloading and unzipping

In [4]:
import shutil

def delete_contents(directory):
    """Empty `directory` without removing the directory itself.

    Files and symlinks are unlinked and sub-directories are removed
    recursively. A failure on one entry is reported with a message and does
    not stop the remaining entries from being processed. If `directory` does
    not exist, a message is printed and nothing else happens.

    Args:
        directory: Path of the directory to empty.

    Returns:
        None.
    """
    # Guard clause: nothing to clear when the directory is missing.
    if not os.path.exists(directory):
        print(f"Directory '{directory}' does not exist.")
        return

    for entry_name in os.listdir(directory):
        file_path = os.path.join(directory, entry_name)
        try:
            is_file_or_link = os.path.isfile(file_path) or os.path.islink(file_path)
            if is_file_or_link:
                # plain files and symlinks are removed the same way
                os.unlink(file_path)
            elif os.path.isdir(file_path):
                # real directories are removed together with their contents
                shutil.rmtree(file_path)
        except Exception as e:
            # best effort: report the problem and carry on with the next entry
            print(f"Failed to delete {file_path}. Reason: {e}")

In [5]:
# Census DataPack archives to fetch; `names` gives, position by position,
# the sub-folder of ABS/ that each archive is extracted into.
urls = ["https://www.abs.gov.au/census/find-census-data/datapacks/download/2021_GCP_SA2_for_VIC_short-header.zip",
        "https://www.abs.gov.au/census/find-census-data/datapacks/download/2016_GCP_SA2_for_VIC_short-header.zip"]
names = ["2021", "2016"]

# download and unzip each url
for url, name in zip(urls, names):
    print(f"Downloading started for {name}")
    # The timeout keeps a stalled connection from hanging the notebook forever.
    response = requests.get(url, timeout=120)
    # Stop here on an HTTP error rather than handing an error page to zipfile.
    response.raise_for_status()
    print("Download completed")

    # prepare the outfile
    out_path = RELATIVE_PATH + "ABS/" + name

    # Open the archive BEFORE clearing the previous extraction: if the payload
    # is not a valid zip this raises while the old files are still in place.
    # The `with` block also closes the archive once extraction is done.
    with zipfile.ZipFile(BytesIO(response.content)) as zipfile_obj:
        delete_contents(out_path)
        zipfile_obj.extractall(out_path)

    print("Finishing extracting")

Downloading started for 2021


Download completed
Finishing extracting
Downloading started for 2016
Download completed
Finishing extracting


#### Renaming files

In [6]:
# RENAME the data file
# For each year, the single entry whose name contains "Census GCP" is renamed
# to "data" so that later cells can rely on a fixed path.
paths = [RELATIVE_PATH + "ABS/" + name + "/" for name in names]

for path in paths:
    new_path = path + "data"

    # Already renamed by an earlier run of this cell: nothing left to do.
    # Without this check a second run would find no "Census GCP" entry and
    # report an error even though everything is in place.
    if os.path.exists(new_path):
        continue

    # find the data file
    data_names = [entry for entry in os.listdir(path) if "Census GCP" in entry]

    if len(data_names) != 1:
        # Say which folder is affected and how many candidates were found
        # (zero and several are both possible), then move on to the next year
        # instead of abandoning it as well.
        print(f"something wrong, expected exactly one 'Census GCP' entry in {path} but found {len(data_names)}")
        continue

    # rename the file
    curr_path = path + data_names[0]
    os.rename(curr_path, new_path)

In [7]:
# RENAME the meta data file
# For each year, the single entry in Metadata/ whose name contains "Metadata"
# is renamed to "column_names.xlsx" so that later cells can rely on a fixed path.
paths = [RELATIVE_PATH + "ABS/" + name + "/Metadata/" for name in names]

for path in paths:
    new_path = path + "column_names.xlsx"

    # Already renamed by an earlier run of this cell: nothing left to do.
    # Without this check a second run would find no "Metadata" entry and
    # report an error even though everything is in place.
    if os.path.exists(new_path):
        continue

    # find the metadata file
    data_names = [entry for entry in os.listdir(path) if "Metadata" in entry]

    if len(data_names) != 1:
        # Say which folder is affected and how many candidates were found
        # (zero and several are both possible), then move on to the next year
        # instead of abandoning it as well.
        print(f"something wrong, expected exactly one 'Metadata' entry in {path} but found {len(data_names)}")
        continue

    # rename the file
    curr_path = path + data_names[0]
    os.rename(curr_path, new_path)

## Functions

#### Merging

In [8]:
def get_next_merge(new_df, final_df, merge_keys=None):
    """Inner-join the not-yet-present columns of `new_df` onto `final_df`.

    Args:
        new_df: Frame providing the additional columns.
        final_df: Frame accumulated so far. If it is empty, `new_df` is
            returned unchanged.
        merge_keys: Candidate join-key column names. Defaults to the
            module-level MERGE_COLUMNS.

    Returns:
        The merged frame: the columns of `final_df` followed by those columns
        of `new_df` that `final_df` did not already contain. Rows without a
        match on the join keys are dropped (inner join); the number of rows
        dropped relative to `final_df` is printed.

    Raises:
        ValueError: if the two frames share none of the candidate key columns.
    """
    if final_df.empty:
        return new_df

    if merge_keys is None:
        merge_keys = MERGE_COLUMNS

    # Walk the columns in their existing order rather than going through sets:
    # the iteration order of a set of strings can differ between interpreter
    # runs, which made the column order of the result (and of every CSV
    # written from it) change from run to run.
    merge_columns = [column for column in final_df.columns
                     if column in new_df.columns and column in merge_keys]

    if not merge_columns:
        raise ValueError("the data frames share no merge column to join on")

    # columns of new_df that final_df does not have yet (join keys are in
    # final_df by construction, so they are excluded here as well)
    new_columns = [column for column in new_df.columns
                   if column not in final_df.columns]

    # merge columns
    merged_df = pd.merge(final_df, new_df[merge_columns + new_columns], on=merge_columns, how="inner")

    # check if any rows lost
    print(f"lost {final_df.shape[0] - merged_df.shape[0]} out of {final_df.shape[0]} records")

    return merged_df

In [9]:
def get_merged_df(df_list):
    """Combine a sequence of data frames into one by merging left to right.

    The first frame is the starting point; each following frame is merged in
    with `get_next_merge`, in the order given.

    Args:
        df_list: Non-empty sequence of data frames.

    Returns:
        The frame obtained after merging every element of `df_list`. For a
        single-element sequence this is that element itself.
    """
    merged_so_far = df_list[0]

    # fold the remaining frames into the running result, one at a time
    position = 1
    while position < len(df_list):
        merged_so_far = get_next_merge(df_list[position], merged_so_far)
        position += 1

    return merged_so_far

#### Renaming

In [10]:
""""Note: uses the actual index name"""
def rename_dict(df, csv_dict):
    """Keep the join key and the configured columns of `df`, with readable names.

    Note: uses the actual index name (presumably: the keys of
    csv_dict["rename"] must match the column names of `df` exactly).

    Args:
        df: Frame that contains exactly one of the MERGE_COLUMNS plus every
            column named as a key of csv_dict["rename"].
        csv_dict: Configuration with the entries "name" (prefix for the new
            column names) and "rename" (mapping old column name -> new label).

    Returns:
        A new frame whose first column is named SA2_CODE_NAME and whose other
        columns are named "<name>: <label>", in the order of
        csv_dict["rename"]. None (after printing a message) when `df` does not
        contain exactly one merge column.
    """
    # find the merge columns present in the data frame
    merge_columns = [column for column in df.columns if column in MERGE_COLUMNS]

    if len(merge_columns) != 1:
        # this fires for zero key columns as well as for several
        print(f"expected exactly one merge column in data frame, found {len(merge_columns)}")
        return None

    old_names = list(csv_dict["rename"].keys())
    new_names = [csv_dict["name"] + ": " + new_name for new_name in csv_dict["rename"].values()]

    # .copy() yields an independent frame, so relabelling it here (and adding
    # columns to it later) neither affects the caller's frame nor triggers
    # pandas' SettingWithCopyWarning.
    renamed_df = df[merge_columns + old_names].copy()
    renamed_df.columns = [SA2_CODE_NAME] + new_names

    return renamed_df

#### Reading the data

In [11]:
def write_csv(csv_dict):
    """Build one tidy CSV from the census files described by `csv_dict`.

    Every file in ABS/<year>/data/ whose name contains csv_dict["code"] is
    read, the frames are merged, the configured columns are selected and
    renamed, and the result is written to the ABS folder.

    Args:
        csv_dict: Configuration with the entries
            "year"     - census year as a string (also the folder name),
            "code"     - text that must appear in the names of the files to read,
            "name"     - output file stem and prefix of the new column names,
            "rename"   - mapping old column name -> new label,
            "add year" - optional; when truthy a "year" column is added and
                         the file is written as <name>_<year>.csv instead of
                         <name>.csv.

    Raises:
        FileNotFoundError: if no file name in the data folder contains the code.
        ValueError: if the merged table could not be renamed.
    """
    year = csv_dict["year"]
    data_dir = RELATIVE_PATH + "ABS/" + year + "/data/"

    # sorted(): os.listdir returns names in arbitrary order; sorting makes the
    # merge order identical on every machine
    code_paths = [data_dir + file_name for file_name in sorted(os.listdir(data_dir))
                  if csv_dict["code"] in file_name]

    if not code_paths:
        # fail with an explanation instead of an IndexError deep inside the merge
        raise FileNotFoundError(f"no file in {data_dir} contains the code {csv_dict['code']}")

    # create a new dataframe for each path
    code_dataframes = [pd.read_csv(code_path) for code_path in code_paths]

    # merge the data frames
    out_df = get_merged_df(code_dataframes)

    # rename the columns
    out_df = rename_dict(out_df, csv_dict)
    if out_df is None:
        # rename_dict has already printed the reason
        raise ValueError(f"could not rename the columns for '{csv_dict['name']}'")

    if csv_dict.get("add year"):
        # year-specific output: add the year column and a year suffix
        out_df = out_df.assign(year=int(year))
        out_name = csv_dict["name"] + "_" + year + ".csv"
    else:
        # output without a year suffix
        out_name = csv_dict["name"] + ".csv"

    # index=False: the positional index carries no information, and writing it
    # makes it come back as an "Unnamed: 0" column when the file is read again
    # (it then ends up in the merged output). The merge step writes with
    # index=False as well.
    out_df.to_csv(RELATIVE_PATH + "ABS/" + out_name, index=False)

In [12]:
def get_columns(code, metadata, start=False, ending=False):
    """Look up the short and long column names of one census table.

    Args:
        code: Prefix that the "Profiletable" value of a row must start with.
        metadata: Frame with at least the columns "Profiletable", "Short" and
            "Long".
        start: Optional text the "Long" name must start with. A falsy value
            (the default) disables this filter.
        ending: Optional text the "Long" name must end with. A falsy value
            (the default) disables this filter.

    Returns:
        A dict mapping each selected "Short" name to its lower-cased "Long"
        name, or None (after printing a message) when no row matches `code`.
    """
    # filter for the code; na=False treats rows without a value as "no match"
    selected = metadata[metadata["Profiletable"].str.startswith(code, na=False)]

    # if table is empty, print a warning
    if selected.empty:
        print(f"no corresponding code to {code}")
        return None

    # narrow down to the values of interest
    if ending:
        selected = selected[selected["Long"].str.endswith(ending, na=False)]
    if start:
        selected = selected[selected["Long"].str.startswith(start, na=False)]

    # Read exactly the two needed columns. Previously the frame was reduced to
    # these columns only inside the filter branches, so calling the function
    # without `start`/`ending` failed while unpacking the full-width rows.
    return {short: long.lower() for short, long in zip(selected["Short"], selected["Long"])}


## Execution

#### Read metadata

In [13]:
# Load the 2021 cell-descriptor sheet. The first ten rows of the sheet are
# skipped, the next row is used as the header and the first column becomes
# the index.
metadata = pd.read_excel(
    RELATIVE_PATH + "ABS/2021/Metadata/column_names.xlsx",
    sheet_name="Cell Descriptors Information",
    index_col=0,
    header=0,
    skiprows=list(range(10)),
)

# quick sanity check of what was loaded
print(metadata.shape)
print(metadata.dtypes)
metadata.head(5)

(16984, 5)
Short                                object
Long                                 object
DataPackfile                         object
Profiletable                         object
Columnheadingdescriptioninprofile    object
dtype: object


Unnamed: 0_level_0,Short,Long,DataPackfile,Profiletable,Columnheadingdescriptioninprofile
Sequential,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
G1,Tot_P_M,Total_Persons_Males,G01,G01,Males
G2,Tot_P_F,Total_Persons_Females,G01,G01,Females
G3,Tot_P_P,Total_Persons_Persons,G01,G01,Persons
G4,Age_0_4_yr_M,Age_groups_0_4_years_Males,G01,G01,Males
G5,Age_0_4_yr_F,Age_groups_0_4_years_Females,G01,G01,Females


#### Types of relationships

In [14]:
# Export configuration for the 2021 relationship counts.
CSV_RELATIONSHIPS_2021 = {
    # output file stem and prefix of the renamed columns
    "name": "relationships",
    # census year (folder name) and text identifying the files to read
    "year": "2021",
    "code": "G27",
    # falsy: the output gets no year column and no year suffix
    "add year": False,
    # source column -> readable label
    "rename": {
        "P_Ptn_in_RM_Tot": "married",
        "P_Ptn_in_DFM_Tot": "defacto",
        "P_LonePnt_Tot": "lone parents",
        "P_CU15_Tot": "child under 15",
        "P_DpStu_Tot": "dependent student",
        "P_NDpChl_Tot": "non dependent child",
        "P_OthRI_Tot": "other related individual",
        "P_GrpH_Mem_Tot": "group household",
        "P_LonePsn_Tot": "lone persons",
    },
}

In [15]:
# Writes relationships.csv into the ABS folder; "add year" is False, so the
# file name carries no year suffix and no year column is added.
write_csv(CSV_RELATIONSHIPS_2021)

lost 0 out of 524 records


In [16]:
# Export configuration for the 2016 relationship counts.
CSV_RELATIONSHIPS_2016 = {
    # output file stem and prefix of the renamed columns
    "name": "relationships",
    # census year (folder name) and text identifying the files to read
    "year": "2016",
    "code": "G23",
    # truthy: a year column is added and the file name gets a year suffix
    "add year": True,
    # source column -> readable label
    "rename": {
        "P_H_or_W_in_RM_Tot": "married",
        "P_Ptn_in_DFM_Tot": "defacto",
        "P_LonePnt_Tot": "lone parents",
        "P_CU15_Tot": "child under 15",
        "P_DpStu_Tot": "dependent student",
        "P_NDpChl_Tot": "non dependent child",
        "P_OthRI_Tot": "other related individual",
        "P_GrpH_Mem_Tot": "group household",
        "P_LonePsn_Tot": "lone persons",
    },
}

In [17]:
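# NOTE: the 2016 export is currently disabled. Uncomment the call below to
# write relationships_2016.csv.
#write_csv(CSV_RELATIONSHIPS_2016)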

#### Overseas

In [18]:
# Export configuration for the 2021 "overseas" column. There is no "add year"
# entry, so the output is written without a year column or year suffix.
CSV_OVERSEAS = {
    # output file stem and prefix of the renamed columns
    "name": "overseas",
    # census year (folder name) and text identifying the files to read
    "year": "2021",
    "code": "G45",
    # source column -> readable label
    "rename": {
        "Difnt_Usl_add_5_yr_ago_OS_P": "5 years",
    },
}

In [19]:
# Writes overseas.csv into the ABS folder (the config has no "add year" entry,
# so no year suffix is used).
write_csv(CSV_OVERSEAS)

#### Birth country

In [20]:
# Short -> long (lower-cased) names of the G09 columns whose long name starts
# with "PERSONS" and ends with "Total".
birth_dict = get_columns("G09", metadata, start="PERSONS", ending="Total")

# get the keys and values
birth_dict_keys = birth_dict.keys()

# remove the start and end: split each long name on "_", drop the first and
# the last token, and join what is left with spaces
birth_dict_values = [
    " ".join(name_tokens[1:-1])
    for name_tokens in (ethnic_string.split("_") for ethnic_string in birth_dict.values())
]

# recreate the dict with the shortened labels
birth_dict = dict(zip(birth_dict_keys, birth_dict_values))

In [21]:
# Export configuration for the 2021 country-of-birth columns. There is no
# "add year" entry, so the output is written without a year column or suffix.
CSV_BIRTH = {
    # output file stem and prefix of the renamed columns
    "name": "birth",
    # census year (folder name) and text identifying the files to read
    "year": "2021",
    "code": "G09",
    # source column -> readable label, built from the metadata sheet
    "rename": birth_dict,
}

In [22]:
# Writes birth.csv into the ABS folder from every data file whose name
# contains "G09".
write_csv(CSV_BIRTH)

lost 0 out of 524 records
lost 0 out of 524 records
lost 0 out of 524 records
lost 0 out of 524 records
lost 0 out of 524 records
lost 0 out of 524 records
lost 0 out of 524 records


#### Studying

In [23]:
# Export configuration for the 2021 "studying" columns (files containing
# "G15"). There is no "add year" entry, so the output is written without a
# year column or year suffix.
CSV_STUDYING = {
    "year": "2021",
    "code": "G15",
    "name": "studying",
    # source column -> readable label
    "rename": {
        "Preschool_P": "preschool",
        "Primary_Government_P": "primary government",
        "Primary_Catholic_P": "primary catholic",
        "Primry_Othr_non_Govt_P": "primary other",
        "Primary_Tot_Primary_P": "primary total",
        "Secondary_Government_P": "secondary government",
        "Secondary_Catholic_P": "secondary catholic",
        "Secondary_Tot_Secondary_P": "secondary total",
        "Tert_Voc_edu_Tot_P": "tafe total",
        # The source columns cover ages 15-24 (see "_15_24_" in their names);
        # the labels previously read "14-25". Anything that refers to the old
        # labels in the written CSVs needs updating.
        "Tert_Uni_oth_h_edu_Ft_15_24_P": "tertiary FT 15-24",
        "Tert_Uni_oth_h_edu_Ft_25_ov_P": "tertiary FT 25+",
        "Tert_Uni_oth_h_edu_Pt_15_24_P": "tertiary PT 15-24",
        "Tert_Uni_oth_h_edu_Pt_25_ov_P": "tertiary PT 25+",
        "Tert_Uni_other_high_edu_Tot_P": "tertiary total"
    }
}

In [24]:
# Writes studying.csv into the ABS folder from every data file whose name
# contains "G15".
write_csv(CSV_STUDYING)

## Merge

In [26]:
# get all the csvs: every CSV in the ABS folder except the merge output itself
# and the year-specific files (those ending in _2016.csv or _2021.csv)
merge_path = RELATIVE_PATH + "ABS/"
# sorted(): os.listdir returns names in arbitrary order; sorting fixes the
# merge order, and with it the column order of the merged file
merging_files = [merge_path + file for file in sorted(os.listdir(merge_path))
                 if file.endswith(".csv")
                 and file != MERGE_FILE_NAME
                 and not file.endswith(("_2016.csv", "_2021.csv"))]

# get the list of dataframes
merging_frames = []
for file in merging_files:
    # A CSV that was saved together with its positional index comes back with
    # an "Unnamed: N" column; drop such columns so they do not end up in the
    # merged output.
    merging_frames.append(
        pd.read_csv(file).loc[:, lambda frame: ~frame.columns.str.startswith("Unnamed:")]
    )

get_merged_df(merging_frames).to_csv(merge_path + MERGE_FILE_NAME, index=False)

lost 0 out of 524 records
lost 0 out of 524 records
lost 0 out of 524 records
