In [21]:
import numpy as np
import pandas as pd
import os

## Functions

In [22]:
SA2_CODE_NAME = "SA2 code"
MERGE_COLUMNS = ["SA2_MAINCODE_2016", "SA2_CODE_2021", SA2_CODE_NAME, "year"]

RELATIVE_PATH_IN = "../../data/1. landing/ABS/2021/data"
RELATIVE_PATH_OUT = "../../data/2. raw/1. renamed/ABS"

### Merge

In [23]:
def get_next_merge(new_df, final_df):
    if (final_df.empty):
        return new_df

    # get the attributes used for the merge
    merge_columns = list(set(new_df.columns) & set(MERGE_COLUMNS) & set(final_df.columns))

    # get the columns not already in the data frame
    new_columns = list(set(new_df.columns) - set(final_df.columns) - set(merge_columns))

    # merge columns
    merged_df = pd.merge(final_df, new_df[merge_columns + new_columns], on=merge_columns, how="inner")

    # check if any rows lost
    print(f"lost {final_df.shape[0] - merged_df.shape[0]} out of {final_df.shape[0]} records")
    
    return merged_df

In [24]:
def get_merged_df(df_list):
    # keep on aggregating
    agg_df = df_list[0]
    for new_df in df_list[1:]:
        agg_df = get_next_merge(new_df, agg_df)

    return agg_df

### Renaming

In [25]:
""""Note: uses the actual index name"""
def rename_dict(df, csv_dict):
    # find the merge columns present in the dictionary
    merge_columns = list(set(df.columns) & set(MERGE_COLUMNS))

    if (len(merge_columns) != 1):
        print("more than one merge column in dictionary")
        return None

    # filter the dictionary with the old names
    df = df[merge_columns + list(csv_dict["rename"].keys())]

    # get the new names
    df.columns = [SA2_CODE_NAME] + [csv_dict["name"] + ": " + new_name for new_name in  csv_dict["rename"].values()]
    
    return df

### Reading Data

In [38]:
def write_csv(csv_dict):
    # get all the files from the directory with the code
    all_files = os.listdir(RELATIVE_PATH_IN)
    code_paths = [f"{RELATIVE_PATH_IN}/{file}" for file in all_files if csv_dict["code"] in file]

    # create a new dataframe for each path
    code_dataframes = []
    for code_path in code_paths:
        code_dataframes.append(pd.read_csv(code_path))

    # merge the dictionary
    out_df = get_merged_df(code_dataframes)

    # rename the columns
    out_df = rename_dict(out_df, csv_dict)

    print(out_df.dtypes)

    # add a year if necessary
    if (csv_dict.get("add year")):
        out_df["year"] = int(csv_dict["year"])

        # write the csv
        out_df.to_csv(f"{RELATIVE_PATH_OUT}/{csv_dict['name']}.csv")
    else:
        # write csv with year code
        out_df.to_csv(f"{RELATIVE_PATH_OUT}/{csv_dict['name']}.csv")

In [32]:
def get_columns(code, metadata, start=False, ending=False):
    # filter for the code
    metadata = metadata[metadata["Profiletable"].str.startswith(code)]

    # if table is empty, raise a warning
    if (metadata.empty):
        print(f"no corresponding code to {code}")
        return
    
    # find the mask for the values of interest
    if (ending):
        select_mask = metadata["Long"].apply(lambda x: x[-len(ending):] == ending)
        metadata = metadata[select_mask][["Short", "Long"]]
    if (start):
        select_mask = metadata["Long"].str.startswith(start)
        metadata = metadata[select_mask][["Short", "Long"]]

    # create the out dict, and convert to lower
    out_dict = {short: long.lower() for short, long in metadata.values}

    # create a dictionary
    return out_dict


## Execution

In [28]:
os.makedirs(RELATIVE_PATH_OUT, exist_ok=True)

#### Read metadata

In [15]:
metadata = pd.read_excel(f"{RELATIVE_PATH_IN}/Metadata/column_names.xlsx",
                         sheet_name="Cell Descriptors Information",
                         skiprows = list(range(10)),
                         header=0,
                         index_col=0)

print(metadata.shape)
print(metadata.dtypes)
metadata.head(5)

(16984, 5)
Short                                object
Long                                 object
DataPackfile                         object
Profiletable                         object
Columnheadingdescriptioninprofile    object
dtype: object


Unnamed: 0_level_0,Short,Long,DataPackfile,Profiletable,Columnheadingdescriptioninprofile
Sequential,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
G1,Tot_P_M,Total_Persons_Males,G01,G01,Males
G2,Tot_P_F,Total_Persons_Females,G01,G01,Females
G3,Tot_P_P,Total_Persons_Persons,G01,G01,Persons
G4,Age_0_4_yr_M,Age_groups_0_4_years_Males,G01,G01,Males
G5,Age_0_4_yr_F,Age_groups_0_4_years_Females,G01,G01,Females


#### Types of relationships

In [39]:
CSV_RELATIONSHIPS_2021 = {
    "year": "2021",
    "code": "G27",
    "name": "relationships",
    "rename": {
        "P_Ptn_in_RM_Tot": "married",
        "P_Ptn_in_DFM_Tot": "defacto",
        "P_LonePnt_Tot": "lone parents",
        "P_CU15_Tot": "child under 15",
        "P_DpStu_Tot": "dependent student",
        "P_NDpChl_Tot": "non dependent child",
        "P_OthRI_Tot": "other related individual",
        "P_GrpH_Mem_Tot": "group household",
        "P_LonePsn_Tot": "lone persons"
    },
    "add year": False
}
write_csv(CSV_RELATIONSHIPS_2021)

lost 0 out of 524 records
SA2 code                                   int64
relationships: married                     int64
relationships: defacto                     int64
relationships: lone parents                int64
relationships: child under 15              int64
relationships: dependent student           int64
relationships: non dependent child         int64
relationships: other related individual    int64
relationships: group household             int64
relationships: lone persons                int64
dtype: object


#### Overseas

In [40]:
CSV_OVERSEAS = {
    "year": "2021",
    "code": "G45",
    "name": "overseas",
    "rename": {"Difnt_Usl_add_5_yr_ago_OS_P": "5 years"}
}
write_csv(CSV_OVERSEAS)

SA2 code             int64
overseas: 5 years    int64
dtype: object


#### Birth country

In [41]:
birth_dict = get_columns("G09", metadata, start="PERSONS", ending="Total")

# get the keys and values
birth_dict_keys = birth_dict.keys()
birth_dict_values = birth_dict.values()

# remove the start and end
birth_dict_values = [" ".join(ethnic_string.split("_")[1:-1]) for ethnic_string in birth_dict_values]

# rerecreate the dict
birth_dict = {key: value for key, value in zip(birth_dict_keys, birth_dict_values)}
CSV_BIRTH = {
    "year": "2021",
    "code": "G09",
    "name": "birth",
    "rename": birth_dict
}
write_csv(CSV_BIRTH)

lost 0 out of 524 records
lost 0 out of 524 records
lost 0 out of 524 records
lost 0 out of 524 records
lost 0 out of 524 records
lost 0 out of 524 records
lost 0 out of 524 records
SA2 code                                 int64
birth: afghanistan age                   int64
birth: australia                         int64
birth: bangladesh                        int64
birth: bosnia and herzegovina            int64
birth: brazil                            int64
birth: cambodia                          int64
birth: canada                            int64
birth: chile                             int64
birth: china excludes sars and taiwan    int64
birth: croatia                           int64
birth: egypt                             int64
birth: england                           int64
birth: fiji                              int64
birth: france                            int64
birth: germany                           int64
birth: greece                            int64
birth: hong kong sa

#### Studying

In [42]:
CSV_STUDYING = {
    "year": "2021",
    "code": "G15",
    "name": "studying",
    "rename": {
        "Preschool_P": "preschool",
        "Primary_Government_P": "primary government",
        "Primary_Catholic_P": "primary catholic",
        "Primry_Othr_non_Govt_P": "primary other",
        "Primary_Tot_Primary_P": "primary total",
        "Secondary_Government_P": "secondary government",
        "Secondary_Catholic_P": "secondary catholic",
        "Secondary_Tot_Secondary_P": "secondary total",
        "Tert_Voc_edu_Tot_P": "tafe total",
        "Tert_Uni_oth_h_edu_Ft_15_24_P": "tertiary FT 14-25",
        "Tert_Uni_oth_h_edu_Ft_25_ov_P": "tertiary FT 25+",
        "Tert_Uni_oth_h_edu_Pt_15_24_P": "tertiary PT 14-25",
        "Tert_Uni_oth_h_edu_Pt_25_ov_P": "tertiary PT 25+",
        "Tert_Uni_other_high_edu_Tot_P": "tertiary total"
    }
}
write_csv(CSV_STUDYING)

SA2 code                          int64
studying: preschool               int64
studying: primary government      int64
studying: primary catholic        int64
studying: primary other           int64
studying: primary total           int64
studying: secondary government    int64
studying: secondary catholic      int64
studying: secondary total         int64
studying: tafe total              int64
studying: tertiary FT 14-25       int64
studying: tertiary FT 25+         int64
studying: tertiary PT 14-25       int64
studying: tertiary PT 25+         int64
studying: tertiary total          int64
dtype: object
