In [13]:
import pandas as pd
import numpy as np
import os

## Create File System

In [14]:

# get the relative output path
RELATIVE_PATH_IN = "../../data/1. landing/"
RELATIVE_PATH_OUT = "../../data/2. raw/1. renamed/"

# get the directory names
directory_names = ["housing", "population", "economic", "economic_by_region", "school"]

# create each new directory
for directory_name in directory_names:
    new_path = RELATIVE_PATH_OUT + directory_name

    # if path doesn't exist, create
    if (not os.path.exists(new_path)):
        os.mkdir(new_path)

## Functions

### Type Conversions

In [2]:
""""assumes all input columns are strings"""
def change_numeric(df, columns):
    for col in columns:
        df[col] = df[col].astype(str)
        df[col] = df[col].str.replace(",", "")
        df[col] = pd.to_numeric(df[col], errors="coerce")

    return df

### Renaming

In [15]:
def fill_previous(df, na="Unnamed"):
    # get the current columns
    curr_columns = list(df.columns)

    # basically make the columns by level and create a data frame
    flat_column_list = [[group[i] for group in curr_columns] for i in range(len(curr_columns[0]))]
    column_dataframe = pd.DataFrame(flat_column_list)

    # create the missing values then fill them again
    column_dataframe = column_dataframe.applymap(lambda x: None if (not x) or (na in str(x)) else x)
    column_dataframe = column_dataframe.T.ffill().T

    # get as a numpy array once again
    flat_column_list = column_dataframe.to_numpy()

    # get the 
    df.columns = pd.MultiIndex.from_arrays(flat_column_list)

    return df

In [82]:
import warnings

def rename_columns(df, index_dict={}, column_dict={}, keep_columns=False, multilevel=False, warn_me=True):
    # get the renamed dictionary using indices
    new_column_names = list(df.columns.copy())

    # get the column and index indices
    index_indices = list(index_dict.keys())
    column_indices = [new_column_names.index(col) for col in column_dict.keys() if col in new_column_names]

    # check no overlap
    if (index_indices and column_indices):
        overlap = set(column_indices) & set(index_indices)
        assert (not column_indices or not index_indices or not overlap), f"index_dict and column_dict both use {[new_column_names[x] for x in overlap]}"

    # check within suitable range
    if (index_indices):
        assert (min(index_indices) >= 1), "index_dict indices must be above 0"
        assert (max(index_indices) <= len(new_column_names)), f"index_dict indices must be below {len(new_column_names)}"

    # create a warning for columns not included
    columns_not_used = [col for col in column_dict.keys() if not col in new_column_names]
    if (warn_me and columns_not_used):
        warnings.warn(f"the following columns were not included: {columns_not_used}", UserWarning)

    # use indices to change the dictionary
    if (index_dict | column_dict):
        for col_num in range(1, len(new_column_names)+1):
            if (index_dict.get(col_num) != None):
                new_column_names[col_num-1] = index_dict[col_num]
            elif (column_dict.get(new_column_names[col_num-1]) != None):
                new_column_names[col_num-1] = column_dict[new_column_names[col_num-1]]
            elif (multilevel == True):
                new_column_names[col_num-1] = (new_column_names[col_num-1], None)

    # convert the rest of the indices necessary to multi index
    if (multilevel == True):
        new_column_names = pd.MultiIndex.from_tuples(new_column_names)

    # add the new columns
    df.columns = new_column_names

    # drop the columns not of interest
    if (not keep_columns):
        index_indices = [x-1 for x in index_indices]
        keep_columns = sorted(index_indices + column_indices)
    elif (keep_columns == "all"):
        keep_columns = list(range(len(new_column_names)))
    else:
        keep_columns = [x-1 for x in keep_columns]
    
    # filter the columns
    df = df.iloc[:, keep_columns]

    return df

### Excel

In [17]:
def get_new_columns(old_columns, missing_columns):
    new_columns = []
    for column in old_columns:
        if "Unnamed" in column:
            new_columns.append(missing_columns.pop(0))
        else:
            new_columns.append(column)

    return new_columns

In [18]:
def get_csv(src_path, sheet_dict, read_excel_dict):
    if (not read_excel_dict.get("csv")):
        # read the excel data
        df = pd.read_excel(src_path,
            sheet_name=sheet_dict["sheet"],
            skiprows=read_excel_dict["skiprows"],
            index_col=read_excel_dict["index col"], 
            header=read_excel_dict["header rows"],
            nrows=read_excel_dict.get("nrows")
        )
    else:
        # read csv data
        df = pd.read_csv(src_path, index_col=read_excel_dict["index col"], header=read_excel_dict["header rows"])

    # skip the final records if necessary
    if (read_excel_dict.get("skip tail")):
        df = df.iloc[:-read_excel_dict["skip tail"]]

    # if there are missing columns
    if (read_excel_dict.get("missing header")):
        df.columns = get_new_columns(df.columns, read_excel_dict.get("missing header"))

    return df

## Housing

In [168]:
# setting up the parameters for the reading data
read_excel_dict_houses = {
    "skiprows": [0, 2],
    "index col": None,
    "header rows": 0,
}

# get all the URLS and sheets to get
SHEET_NAMES_HOUSES = [{"sheet": f"{i} bedroom flat", "name": f"flat_{i}_bed"} for i in range(1, 4)] + \
                     [{"sheet": f"{i} bedroom house", "name": f"house_{i}_bed"} for i in range(2, 5)] + \
                     [{"sheet": "All properties", "name": "all_properties"}]

In [169]:
# get the measure columns
time_stamps = [str(year) + "-" + month 
               for year in range(2000, 2025) 
               for month in ["03", "06", "09", "12"]]

# remove the last 2 quarters (because don't exist in the dataframe)
time_stamps = time_stamps[:-2]

# get the columns as tuples and get the rename dict
measure_columns = [(time_stamp, measure) for time_stamp in time_stamps for measure in ["count", "median"]]
new_columns = [("suburbs", None)] + measure_columns
rename_dict_housing = {i+2: new_columns[i] for i in range(len(new_columns))}

In [1]:
housing_df_list = []
for sheet_dict in SHEET_NAMES_HOUSES:
    # get the paths
    src_path = RELATIVE_PATH_IN + "housing/moving quarterly rent.xlsx"
    dst_path = RELATIVE_PATH_OUT + "housing/" + sheet_dict["name"] + ".csv"

    # get the dataframe
    housing_df = get_csv(src_path, sheet_dict, read_excel_dict_houses)

    # rename the dataframe
    housing_df = rename_columns(housing_df, index_dict=rename_dict_housing, multilevel=True)

    # change to numeric
    housing_df = change_numeric(housing_df, housing_df.columns[1:])

    # write to csv
    housing_df.to_csv(dst_path)

NameError: name 'SHEET_NAMES_HOUSES' is not defined

## Economic

### Interest Rates

In [145]:
# setting up the parameters for the reading data
read_excel_dict_interest = {
    "skiprows": [x for x in range(11) if x != 1],
    "index col": None,
    "header rows": 0,
}

# get the renamed dict
rename_dict_interest = {1: "year", 4: "economic: variable interest rate"}

In [146]:
# get the dataframe
interest = get_csv(RELATIVE_PATH_IN + "economic/interest.xlsx", {"sheet": "Data"}, read_excel_dict_interest)

# rename the dataframe
interest = rename_columns(interest, index_dict=rename_dict_interest)

# split up the time stamp
interest[["year", "month", "day"]] = interest["year"].astype(str).str.split("-", expand=True).astype(int)
interest = interest.drop(columns="day")

# change the column types
interest = change_numeric(interest, interest.columns)
interest["year"] = interest["year"].astype(int)

# write to csv
interest.to_csv(RELATIVE_PATH_OUT + "economic/interest.csv")

interest.head(5)

Unnamed: 0,year,economic: variable interest rate,month
0,1959,5.0,1
1,1959,5.0,2
2,1959,5.0,3
3,1959,5.0,4
4,1959,5.0,5


### GDP

In [147]:
# setting up the parameters for the reading data
read_excel_dict_gdp = {
    "skiprows": list(range(9)),
    "index col": None,
    "header rows": 0,
}

rename_dict_gdp_index = {1: "year"}
rename_dict_gdp_columns = {"A2304370T": "economic: gdp quarterly", "A2304372W": "economic: gdp per capita quarterly"}

In [148]:
# get the dataframe
gdp = get_csv(RELATIVE_PATH_IN + "economic/gdp.xlsx", {"sheet": "Data1"}, read_excel_dict_gdp)

# rename the dataframe
gdp = rename_columns(gdp, index_dict=rename_dict_gdp_index, column_dict=rename_dict_gdp_columns)

# split up the month and the year
gdp[["year", "month", "day"]] = gdp["year"].astype(str).str.split("-", expand=True).astype(int)
gdp = gdp.drop(columns="day")

# change the column types
gdp = change_numeric(gdp, gdp.columns)
gdp["year"] = gdp["year"].astype(int)

# write to csv
gdp.to_csv(RELATIVE_PATH_OUT + "economic/gdp.csv")

### Inflation

In [149]:
# setting up the parameters for the reading data
read_excel_dict_inflation = {
    "skiprows": list(range(1, 10)),
    "index col": None,
    "header rows": 0,
}

rename_dict_inflation_columns = {"Percentage Change from Previous Period ;  Trimmed Mean ;  Australia ;": "economic: trimmed mean quarterly"}
rename_dict_inflation_index = {1: "year"}

In [150]:
# get the dataframe
inflation = get_csv(RELATIVE_PATH_IN + "economic/inflation.xlsx", {"sheet": "Data1"}, read_excel_dict_inflation)

# rename the dataframe
inflation = rename_columns(inflation, column_dict=rename_dict_inflation_columns, index_dict=rename_dict_inflation_index)

# get the columns
inflation[["year", "month", "day"]] = inflation.astype(str)["year"].str.split("-", expand=True)
inflation = inflation.drop(columns="day")

# change the column types
inflation = change_numeric(inflation, inflation.columns)
inflation["year"] = inflation["year"].astype(int)

# write to csv
inflation.to_csv(RELATIVE_PATH_OUT + "economic/inflation.csv")

## Population

### Age Demos

In [156]:
# setting up the parameters for the reading data
read_excel_dict_age = {
    "skiprows": list(range(6)),
    "index col": None,
    "header rows": 0,
    "skip tail": 1
}

# rename the columns
rename_dict_age = {1: "year", 3: "state", 10: "SA2 code"}
rename_dict_age.update({i+11+1: f"population: {i*5}-{i*5+4}" for i in range(17)})# need the +1 because it's the actual column position
rename_dict_age.update({29: "population: 85+", 30: "population: total"})

In [157]:
# get the dataframe
df = get_csv(RELATIVE_PATH_IN + "population/age demographics.xlsx", {"sheet": "Table 3"}, read_excel_dict_age)

# rename the dataframe
df = rename_columns(df, index_dict=rename_dict_age)

# change numeric columns
df = change_numeric(df, list(df.columns[:1]) + list(df.columns[3:]))
df["year"] = df["year"].astype(int)
df["SA2 code"] = df["SA2 code"].astype(float)

# write to csv
df.to_csv(RELATIVE_PATH_OUT + "population/age demographics.csv")

### Future Population

In [158]:
# setting up the parameters for the reading data
read_excel_dict_population = {
    "skiprows": [x for x in range(11) if x != 9],
    "index col": None,
    "header rows": 0,
    "skip tail": 5
}

# rename the columns
rename_dict_population = {year: f"population: {year}" for year in range(2026, 2036+1, 5)}
rename_dict_population.update({"SA2  code": "SA2 code"})

In [159]:
# get the dataframe
df = get_csv(RELATIVE_PATH_IN + "population/projections.xlsx", {"sheet": "Total_Population"}, read_excel_dict_population)

# rename the dataframe
df = rename_columns(df, column_dict=rename_dict_population, keep_columns=[4, 8, 9, 10])

df = change_numeric(df, df.columns[1:])
df["SA2 code"] = df["SA2 code"].astype(float)

# write to csv
df.to_csv(RELATIVE_PATH_OUT + "population/projections.csv")

## Economic by Region

### Geography

In [160]:
# setting up the parameters for the reading data
read_excel_dict_economic_by_region = {
    "skiprows": list(range(5)),
    "index col": None,
    "header rows": [0, 1],
    "skip tail": 5
}

# rename the columns
rename_dict_economic_geography = {1: ("SA2 code",None), 2: ("SA2 name",None)}

In [163]:
# get the dataframe
df = get_csv(RELATIVE_PATH_IN + "economic_by_region/income_by_geography.xlsx", {"sheet": "Table 1.4"}, read_excel_dict_economic_by_region)

# rename the dataframe
df = rename_columns(df, index_dict=rename_dict_economic_geography, keep_columns="all")

# will in missing attributes for the multi index
df = fill_previous(df)

# change to numeric
df = change_numeric(df, list(df.columns[:1]) + list(df.columns[2:]))
df["SA2 code"] = df["SA2 code"].astype(float)

# write to csv
df.to_csv(RELATIVE_PATH_OUT + "economic_by_region/income_by_geography.csv")

### Distribution

In [164]:
rename_dict_economic_distribution = {
    1: "SA2 code", 
    12: "economic: gini coefficient",
    13: "economic: top 1% suburb",
    14: "economic: top 5% suburb"
}

In [165]:
# get the dataframe
df = get_csv(RELATIVE_PATH_IN + "economic_by_region/income_distribution_by_geography.xlsx", {"sheet": "Table 2.4"}, read_excel_dict_economic_by_region)

# rename the dataframe
df = rename_columns(df, index_dict=rename_dict_economic_distribution)

# change to numeric
df = change_numeric(df, df.columns)
df["SA2 code"] = df["SA2 code"].astype(float)

# write to csv
df.to_csv(RELATIVE_PATH_OUT + "economic_by_region/income_distribution_by_geography.csv")

## Schools

- Not sure what to do here

## ABS