In [2]:
import pandas as pd

## Functions

#### Merging

In [90]:
MERGE_COLUMNS = ["year", "SA2 code"]

def get_merged_df(new_df, final_df):
    """Inner-join `new_df` onto `final_df` on the key columns they share.

    The join keys are the entries of `MERGE_COLUMNS` that appear in
    `new_df`, taken in `MERGE_COLUMNS` order. Building the list this way
    (instead of iterating a set intersection) keeps the key order identical
    on every run, since the iteration order of a set of strings can change
    between interpreter sessions.

    Parameters
    ----------
    new_df : pd.DataFrame
        Frame with the attributes to add.
    final_df : pd.DataFrame
        Frame the attributes are merged into (left side of the join).

    Returns
    -------
    pd.DataFrame
        The inner join of `final_df` and `new_df`. When the result has fewer
        rows than `final_df`, the number of lost rows is printed.
    """
    # keys used for the merge, in a deterministic order
    merge_columns = [column for column in MERGE_COLUMNS if column in new_df.columns]

    merged_df = pd.merge(final_df, new_df, on=merge_columns, how="inner")

    # an inner join drops rows of final_df without a match in new_df; report how many
    lost_records = final_df.shape[0] - merged_df.shape[0]
    if lost_records > 0:
        print(f"lost {lost_records} out of {final_df.shape[0]} records")

    return merged_df

### General

In [91]:
def rename_dict(df, rename_dict, keep_columns):
    """Rename columns by position and keep only the requested columns.

    Positions are 1-based: position `p` refers to `df.columns[p - 1]`.
    Because of that offset, position 0 maps to index -1, i.e. the LAST
    column (regular Python negative indexing).

    The input frame is left untouched; a new frame is returned.

    Parameters
    ----------
    df : pd.DataFrame
        Frame to rename and filter.
    rename_dict : dict
        Maps a 1-based column position to its new name.
    keep_columns : list of int
        1-based positions of the columns to keep, in output order.

    Returns
    -------
    pd.DataFrame
        Copy of the selected columns carrying the new names.

    Raises
    ------
    IndexError
        If a position lies outside the columns of `df`.
    """
    # work out the full list of new names first, so a bad position fails early
    new_column_names = list(df.columns)
    for position, new_name in rename_dict.items():
        new_column_names[position - 1] = new_name

    # convert the 1-based positions to the 0-based ones iloc expects
    keep_positions = [position - 1 for position in keep_columns]

    # select on a copy so the caller's frame keeps its original labels
    selected = df.iloc[:, keep_positions].copy()
    selected.columns = [new_column_names[position] for position in keep_positions]

    return selected

### Preparing suburb name

In [92]:
import numpy as np

DIRECTIONS = ["north", "south", "west", "east"]

def suburb_name_split(df, verbose=True):
    """Normalise the suburb names in the index, splitting "a - b" names.

    Each index label is cleaned ("(Vic.)" removed, brackets removed, lower
    case, surplus whitespace collapsed) and then handled as follows:

    - no " - " separator: the cleaned name is used as is;
    - "name - direction" (direction in `DIRECTIONS`): becomes
      "name direction";
    - "name a - name b": the row is duplicated, once per name.

    Parameters
    ----------
    df : pd.DataFrame
        Frame indexed by suburb name. The index must hold unique strings:
        rows are re-selected by label, so a duplicated label would return
        extra rows and the new index would no longer fit.
    verbose : bool, default True
        Print the progress figures (row count, share of split names,
        resulting shape and index length).

    Returns
    -------
    pd.DataFrame
        New frame with one row per resulting suburb name.

    Raises
    ------
    ValueError
        If a name contains more than one " - " separator.
    """
    indicies = list(df.index)

    if verbose:
        print(len(indicies))

    # nothing to clean, and the split ratio below would divide by zero
    if not indicies:
        return df.copy()

    final_indicies = []
    repeated_indicies = []
    counter = 0

    # for each label, record the cleaned name(s) and the source row to repeat
    for index in indicies:
        # remove "(Vic.)" if present, then the brackets around e.g. "(West)"
        clean_index = index.replace("(Vic.)", "")
        clean_index = clean_index.replace("(", "").replace(")", "")

        # lower case, and collapse the whitespace the removals leave behind
        # (otherwise "Ascot (Vic.)" would end with a trailing space)
        clean_index = " ".join(clean_index.lower().split())

        # only " - " separates two names; a plain hyphen is part of a name
        if " - " not in clean_index:
            final_indicies.append(clean_index)
            repeated_indicies.append(index)
            continue

        counter += 1

        index_split = clean_index.split(" - ")
        if len(index_split) != 2:
            raise ValueError(
                f"expected exactly two parts separated by ' - ', got {index!r}"
            )

        first_part, second_part = index_split

        if second_part in DIRECTIONS:
            # "name - north" is a single suburb: rejoin without the dash
            final_indicies.append(f"{first_part} {second_part}")
            repeated_indicies.append(index)
        else:
            # two suburbs share the row: emit it once per suburb
            final_indicies.extend(index_split)
            repeated_indicies.extend([index, index])

    if verbose:
        print(counter / len(indicies))

    # duplicate the rows that were split
    df = df.loc[repeated_indicies]

    if verbose:
        print(df.shape)
        print(len(final_indicies))

    # change the names accordingly
    df.index = final_indicies

    return df


### Column and imputation

In [93]:
def impute_previous(columns, na):
    """Forward-fill blank header labels, level by level.

    `columns` is a list of equally long tuples (one tuple per column, one
    entry per header level). A label counts as blank when it is falsy or
    when its string form contains `na`. Within each level, a blank label is
    replaced by the closest non-blank label to its left; leading blanks stay
    missing.

    Parameters
    ----------
    columns : list of tuple
        Column labels, e.g. `list(df.columns)` of a MultiIndex frame.
        Must not be empty (the number of levels is read from the first entry).
    na : str
        Marker identifying a blank label (substring match).

    Returns
    -------
    numpy.ndarray
        Array with one row per header level and one entry per column.
    """
    level_count = len(columns[0])

    def _blank_to_none(label):
        # same blank test for every label: falsy, or carrying the marker
        return None if (not label) or (na in str(label)) else label

    # regroup from "one tuple per column" to "one list per level";
    # blanking here avoids the deprecated DataFrame.applymap
    level_rows = [
        [_blank_to_none(group[level]) for group in columns]
        for level in range(level_count)
    ]

    column_dataframe = pd.DataFrame(level_rows)

    # ffill works down rows, so transpose to fill along each level
    column_dataframe = column_dataframe.T.ffill().T

    return column_dataframe.to_numpy()

In [94]:
def fill_columns(df, column_dict, na="Unnamed"):
    """Overwrite selected column labels and tidy a multi-level header.

    Labels are replaced by 0-based position using `column_dict`. For a frame
    with MultiIndex columns, the labels are additionally passed through
    `impute_previous` so blank header cells (marked by `na`) inherit the
    label to their left. The frame is relabelled in place and returned.

    Parameters
    ----------
    df : pd.DataFrame
        Frame whose columns are relabelled (modified in place).
    column_dict : dict
        Maps a 0-based column position to its replacement label.
    na : str, default "Unnamed"
        Marker for blank header cells, forwarded to `impute_previous`.

    Returns
    -------
    pd.DataFrame
        The same `df` object, with its new columns.
    """
    labels = list(df.columns)
    for position, replacement in column_dict.items():
        labels[position] = replacement

    # flat header: the patched labels can be used directly
    if not isinstance(df.columns, pd.MultiIndex):
        df.columns = labels
        return df

    # multi-level header: fill the blanks, then rebuild the MultiIndex
    filled_levels = impute_previous(labels, na)
    df.columns = pd.MultiIndex.from_arrays(filled_levels)
    return df

## Economic by Region

In [95]:
def filter_victoria(df, column_name):
    """Return the rows that sit under the "Victoria" header row.

    Rows whose `column_name` value is not numeric are treated as region
    header rows (e.g. a state name instead of an area code). The result is
    every row after the "Victoria" header up to, but excluding, the next
    header row. When "Victoria" is the last header, everything after it is
    returned.

    Parameters
    ----------
    df : pd.DataFrame
        Frame with an integer index (row labels are offset by one and used
        for label slicing) and string values in `column_name`.
    column_name : hashable
        Column holding the area codes / region names.

    Returns
    -------
    pd.DataFrame
        The slice of `df` belonging to Victoria.

    Raises
    ------
    ValueError
        If no header row is named "Victoria".
    """
    region_column = df[column_name]
    region_mask = ~region_column.str.isnumeric()

    # row labels and names of the region header rows
    region_rows = region_column[region_mask]
    regions_index = list(region_rows.index)
    regions_names = list(region_rows.values)

    # position of victoria among the header rows
    index_victoria = regions_names.index("Victoria")
    first_row = regions_index[index_victoria] + 1

    # victoria is the last header: there is no following header to stop at
    if index_victoria + 1 >= len(regions_index):
        return df.loc[first_row:]

    # .loc slices are inclusive, so stop one row before the next header
    last_row = regions_index[index_victoria + 1] - 1
    return df.loc[first_row:last_row]

### Geography summary

- Note: the income of a given year (say 2020) actually covers July of the previous year to June of that year (July 2019 to June 2020)
  - This is because the data is reported by financial year

In [96]:
# 0-based column position -> two-level label given to that column by fill_columns
COLUMNS_DICT_INCOME_GEOGRAPHY = {
    0: ("SA2 code", None),
    1: ("SA2 name", None),
}

In [104]:
# load the income table; header=[0, 1] makes pandas read the first two rows
# as a two-level column header (MultiIndex columns)
income_region = pd.read_csv("./data/landing/economic_by_region/income_by_geography_b2022.csv", header=[0, 1])

# label the first two columns (SA2 code / SA2 name) and fill the blank
# ("Unnamed") header cells so every column has a usable label
income_region = fill_columns(income_region, COLUMNS_DICT_INCOME_GEOGRAPHY)

# keep only the rows listed under the "Victoria" header row
income_region = filter_victoria(income_region, ("SA2 code", None))

# wide -> long: the first two columns stay as identifiers, every other column
# becomes one record whose two header levels are stored as "Measure" and "Year"
income_region = income_region.melt(id_vars=income_region.columns.to_list()[:2], 
                                   value_vars = income_region.columns.to_list()[2:],
                                   var_name=["Measure", "Year"],
                                   value_name="Value")

# long -> one row per (first id column, second id column, Year),
# with one column per measure
income_region = income_region.pivot(index=list(income_region.columns[:2]) + ["Year"], columns="Measure", values="Value")
income_region = income_region.reset_index()
# flatten the labels of the two id columns by keeping only their first element;
# the remaining labels are kept as they are
income_region.columns = [x[0] for x in income_region.columns[:2]] + list(income_region.columns[2:])

In [105]:
# new names for the columns of interest, keyed by 1-based column position
NEW_NAMES = {
    3: "year",
    4: "economic: number of earners",
    6: "economic: median income",
    7: "economic: median age of earners",
}
# the two identifier columns plus every renamed column
KEEP_COLUMNS = [1, 2, *NEW_NAMES]

income_region = rename_dict(income_region, NEW_NAMES, KEEP_COLUMNS)
print(len(income_region))
income_region.head(5)

2610


Unnamed: 0,SA2 code,SA2 name,year,economic: number of earners,economic: median income,economic: median age of earners
0,201011001,Alfredton,2016-17,7117,50596,42
1,201011001,Alfredton,2017-18,7558,52448,42
2,201011001,Alfredton,2018-19,7987,53932,42
3,201011001,Alfredton,2019-20,8665,55204,41
4,201011001,Alfredton,2020-21,9438,58036,41


In [106]:
import numpy as np

# everything after the three identifier columns is a measure
MEASURE_COLUMNS = income_region.columns[3:]

# turn the "np" placeholder into NaN, then drop every row missing a measure
income_region = (
    income_region
    .replace("np", np.nan)
    .dropna(subset=MEASURE_COLUMNS, how="any")
)

# keep the first four characters of the year label (e.g. "2016-17" -> 2016)
# NOTE(review): the markdown note under "Geography summary" describes a year
# such as 2020 as July 2019 to June 2020, whereas this keeps the starting
# year of the label - confirm which convention is intended.
income_region["year"] = income_region["year"].map(lambda label: label[:4]).astype(int)

# strip the thousands separators so the measures parse as numbers
for measure_column in MEASURE_COLUMNS:
    without_commas = income_region[measure_column].str.replace(',', '')
    income_region[measure_column] = pd.to_numeric(without_commas)

print(income_region.shape[0])
income_region.head(5)

2602


Unnamed: 0,SA2 code,SA2 name,year,economic: number of earners,economic: median income,economic: median age of earners
0,201011001,Alfredton,2016,7117,50596,42
1,201011001,Alfredton,2017,7558,52448,42
2,201011001,Alfredton,2018,7987,53932,42
3,201011001,Alfredton,2019,8665,55204,41
4,201011001,Alfredton,2020,9438,58036,41


In [107]:
# Keep only the SA2 regions that have exactly 5 yearly records

# number of year entries per SA2 region
SA2_year_counts = income_region.groupby("SA2 code")["year"].count()
SA2_regions = SA2_year_counts.loc[SA2_year_counts.eq(5)].index

# drop every record that belongs to any other region
income_region = income_region.loc[income_region["SA2 code"].isin(SA2_regions)]
len(income_region)

2595

In [108]:
# Replace the measures of each year by the mean of that year and the following
# year, per SA2 region. END_YEAR is exclusive (it is the stop value of range()).
# NOTE: this cell rewrites income_region in place, so running it a second time
# averages values that were already averaged.
START_YEAR = 2016
END_YEAR = 2021

for curr_year in range(START_YEAR, END_YEAR):
    # rows of the current year, and rows of the current + following year;
    # only current-year rows are overwritten below, so with ascending years
    # the following year's rows are still unmodified when they are read
    curr_year_df = income_region[(income_region["year"] == curr_year)]
    both_years_df = income_region[(income_region["year"] == curr_year) | (income_region["year"] == curr_year + 1)]

    # mean of each measure per SA2 code over the selected rows
    # (when no row exists for the following year this is the year's own value)
    avg_year_df = both_years_df[["SA2 code"] + list(MEASURE_COLUMNS)].groupby("SA2 code").mean()

    # reorder the averages to follow the SA2 codes of the current-year rows
    avg_year_df = avg_year_df.loc[curr_year_df["SA2 code"].unique()]

    # relabel with the row labels of the current-year rows so the assignment
    # below aligns; assumes one row per SA2 code and year - TODO confirm
    avg_year_df.index = curr_year_df.index

    # write the averaged measures back onto the current-year rows
    income_region.loc[curr_year_df.index, MEASURE_COLUMNS] = avg_year_df

income_region.head(5)


Unnamed: 0,SA2 code,SA2 name,year,economic: number of earners,economic: median income,economic: median age of earners
0,201011001,Alfredton,2016,7337.5,51522.0,42.0
1,201011001,Alfredton,2017,7772.5,53190.0,42.0
2,201011001,Alfredton,2018,8326.0,54568.0,41.5
3,201011001,Alfredton,2019,9051.5,56620.0,41.0
4,201011001,Alfredton,2020,9438.0,58036.0,41.0


### Distribution

In [110]:
# load the economic distribution and drop the row labelled 0
economic_distribution = pd.read_csv(
    "./data/landing/economic_by_region/income_distribution_by_geography_2021.csv",
    header=0,
).drop(0)

# new names by 1-based column position; column 12 is kept under its original name
NEW_NAMES = {
    1: "SA2 code",
    13: "economic: top 1% suburb",
    14: "economic: top 5% suburb",
}
KEEP_COLUMNS = [1, 12, 13, 14]
economic_distribution = rename_dict(economic_distribution, NEW_NAMES, KEEP_COLUMNS)

print(economic_distribution.dtypes)
economic_distribution.head()

SA2 code                   object
Gini coefficient           object
economic: top 1% suburb    object
economic: top 5% suburb    object
dtype: object


Unnamed: 0,SA2 code,Gini coefficient,economic: top 1% suburb,economic: top 5% suburb
1,Australia,0.483,9.8,22.5
2,New South Wales,0.497,11.2,24.3
3,101021007,0.615,21.2,32.6
4,101021008,0.365,4.4,14.5
5,101021009,0.368,4.6,14.7


In [115]:
# number of rows with at least one missing (NaN) value; expected to be 0
# NOTE(review): the columns are object dtype (see the dtypes printed above), so a
# text placeholder such as the "np" used in the income table would not be counted
# here - confirm none is present.
economic_distribution.isnull().any(axis="columns").sum()

0

### Economic

#### Inflation

- As this is the annual change, the December figure of one year is used as the inflation change for the following year
- In the end, the metric for a given year tracks the inflation over the previous year

In [36]:
# download inflation
inflation = pd.read_csv("./data/landing/economic/inflation.csv")

# rename the columns
# rename_dict takes 1-based positions. The selection used to be written as
# 0, 1, 2, where 0 silently wrapped around to the last column; the last column
# is now requested explicitly, which keeps the same columns in the same order.
# NOTE(review): if the intent was "the first three columns, in file order",
# this should be [1, 2, 3] - confirm against the CSV.
NEW_NAMES = {1: "year"}
KEEP_COLUMNS = [inflation.shape[1], 1, 2]
inflation = rename_dict(inflation, NEW_NAMES, KEEP_COLUMNS)

print(inflation.dtypes)
inflation.head(3)

Trimmed mean (%)      float64
year                   object
All groups CPI (%)    float64
dtype: object


Unnamed: 0,Trimmed mean (%),year,All groups CPI (%)
0,2.8,Jun-14,3.0
1,2.4,Sep-14,2.3
2,2.2,Dec-14,1.7


In [37]:
# filter for the december data; copy so the assignments below write to an
# independent frame instead of a slice of the original (SettingWithCopyWarning)
inflation = inflation[inflation["year"].str.contains("Dec")].copy()

# get the year in the right format: the labels end in a two-digit year
# (e.g. "Dec-14"), which is prefixed with "20"
inflation["year"] = "20" + inflation["year"].str[-2:]

# make sure an integer
inflation["year"] = pd.to_numeric(inflation["year"])

# Note: increasing the year here to ensure it tracks the change from the previous year
inflation["year"] = inflation["year"] + 1

inflation

Unnamed: 0,Trimmed mean (%),year,All groups CPI (%)
2,2.2,2015,1.7
6,2.1,2016,1.7
10,1.5,2017,1.5
14,1.7,2018,1.9
18,1.8,2019,1.8
22,1.5,2020,1.8
26,1.2,2021,0.9
30,2.6,2022,3.5
34,6.8,2023,7.8
38,4.1,2024,4.1


#### Interest rates

Have decided only to take the variable interest rate for standard owners (as this is most reflective of normal people)

In [84]:
# load the lending data
lending = pd.read_csv("data/landing/economic/lending.csv")

# new names by 1-based column position; only the renamed columns are kept
NEW_NAMES = {
    1: "year",
    4: "economic: variable interest rate",
}
KEEP_COLUMNS = list(NEW_NAMES)
lending = rename_dict(lending, NEW_NAMES, KEEP_COLUMNS)

print(lending.dtypes)
lending.tail(5)

year                                 object
economic: variable interest rate    float64
dtype: object


Unnamed: 0,year,economic: variable interest rate
783,2024-04-30,8.77
784,2024-05-31,8.77
785,2024-06-30,8.77
786,2024-07-31,8.77
787,2024-08-31,8.77


Basically want to take the average of all the records for each year

In [85]:
# get the year as an integer for each record: the first four characters of
# the date string (e.g. "2024-08-31" -> 2024)
lending["year"] = lending["year"].apply(lambda x: int(x[:4]))

# remove records before 2000
lending = lending[lending["year"] >= 2000]

# take the average of every column for each year
# NOTE(review): in the displayed result "year" ends up both as the index and as
# a (float) column; a later merge on "year" may treat that as ambiguous - confirm
# how downstream cells consume this frame before changing it.
lending = lending.groupby("year").apply(lambda x: x.mean())
lending.tail(5)

Unnamed: 0_level_0,year,economic: variable interest rate
year,Unnamed: 1_level_1,Unnamed: 2_level_1
2020,2020.0,4.566667
2021,2021.0,4.52
2022,2022.0,5.728333
2023,2023.0,8.311667
2024,2024.0,8.77


## Housing