In [3]:
import pandas as pd
import numpy as np
import os

## Functions

#### Merging

In [4]:
MERGE_COLUMNS = ["SA2 code", "year", "quarter"]

def get_merged_df(new_df, final_df):
    if (final_df.empty):
        return new_df

    # get the attributes used for the merge
    merge_columns = list(set(new_df.columns) & set(MERGE_COLUMNS) & set(final_df.columns))

    # get the columns not already in the data frame
    new_columns = list(set(new_df.columns) - set(final_df.columns) - set(merge_columns))

    # merge columns
    merged_df = pd.merge(final_df, new_df[merge_columns + new_columns], on=merge_columns, how="outer")

    # check if any rows lost
    print(f"lost {final_df.shape[0] - merged_df.shape[0]} out of {final_df.shape[0]} records")
    
    return merged_df

#### Renaming columns

In [5]:
def rename_dict(df, rename_dict, keep_columns):
    # get the renamed dictionary
    new_column_names = list(df.columns.copy())
    for index, new_name in rename_dict.items():
        new_column_names[index - 1] = new_name
    df.columns = new_column_names

    # drop the columns not of interest
    keep_columns = [x-1 for x in keep_columns]
    df = df.iloc[:, keep_columns]

    return df

#### Economic by region functions

In [6]:
def fill_previous(df, na="Unnamed"):
    # get the current columns
    curr_columns = list(df.columns)

    # basically make the columns by level and create a data frame
    flat_column_list = [[group[i] for group in curr_columns] for i in range(len(curr_columns[0]))]
    column_dataframe = pd.DataFrame(flat_column_list)

    # create the missing values then fill them again
    column_dataframe = column_dataframe.applymap(lambda x: None if (not x) or (na in str(x)) else x)
    column_dataframe = column_dataframe.T.ffill().T

    # get as a numpy array once again
    flat_column_list = column_dataframe.to_numpy()

    # get the 
    df.columns = pd.MultiIndex.from_arrays(flat_column_list)

    return df

In [7]:
def fill_columns(df, column_dict, na="Unnamed"):
    columns = list(df.columns)

    for index, new_column in column_dict.items():
        columns[index] = new_column

    if (type(df.columns) == pd.MultiIndex):
        flat_column_list = impute_previous(columns, na)

        df.columns = pd.MultiIndex.from_arrays(flat_column_list)
    
    else:
        df.columns = columns

    return df

In [8]:
"""will basically create a record for each entry of one of the columns, and a new column for another entry
Note: assumes a multi index
Note: index_stop is the number of columns in the original dataframe that don't need to be melted"""
def unpack_levels(df, index_stop, level_1_name, level_2_name, record_level_2=False):
    # make a massive dataframe, everything is a record
    df = df.melt(id_vars=df.columns.to_list()[:index_stop], 
                 value_vars = df.columns.to_list()[index_stop:],
                 var_name=[level_1_name, level_2_name],
                 value_name="value")

    if (not record_level_2):
        # basically extract the column data from the second level
        df = df.pivot(index=list(df.columns[:index_stop]) + [level_1_name], 
                      columns=level_2_name, values="value")
        
    else:
        # switch around the melted columns so the end on gets melted
        df = df.pivot(index=list(df.columns[:index_stop]) + [level_2_name], 
                      columns=level_1_name, values="value")

    # reset the indices and rename
    df = df.reset_index()
    #print([column[0] for column in df.columns[:index_stop]])
    df.columns = [column[0] for column in df.columns[:index_stop]] + list(df.columns[index_stop:])

    return df


## Document notes + set up

- Note that the final file produced, each timestamp will correspond to the growth from the previous time stamp
  - For example, if the population is 1000 in the 2022-03 quarter, then 1100 in the 2022-06 quater, then the value for the 2022-06 quarter population growth is 10%
  - Similarly for inflation, measures the change from previous quarter
- Year represents the start year

In [11]:
RELATIVE_PATH = "../../data/1. landing/"

## Economic by Region (check)

### Geography summary

- Note the income of a given year (say 2020) is acutally from july year before to jun next year (July 2019 to June 2020)
  - Because the data was given as financial years

In [12]:
COLUMNS_DICT_INCOME_GEOGRAPHY = {
    0: ("SA2 code",None),
    1: ("SA2 name",None)
}

# get the csv
income_region = pd.read_csv(RELATIVE_PATH + "economic_by_region/income_by_geography_2016_2021.csv", header=[0, 1])

# prepare the columns
income_region = fill_columns(income_region, COLUMNS_DICT_INCOME_GEOGRAPHY)

income_region

Unnamed: 0_level_0,SA2 code,SA2 name,Earners (persons),Earners (persons),Earners (persons),Earners (persons),Earners (persons),Median age of earners (years),Median age of earners (years),Median age of earners (years),...,Median ($),Median ($),Median ($),Median ($),Median ($),Mean ($),Mean ($),Mean ($),Mean ($),Mean ($)
Unnamed: 0_level_1,NaN,NaN,2016-17,2017-18,2018-19,2019-20,2020-21,2016-17,2017-18,2018-19,...,2016-17,2017-18,2018-19,2019-20,2020-21,2016-17,2017-18,2018-19,2019-20,2020-21
0,Australia,SA2 NAME,13675002,14069078,14425034,14619600,14760008,42,42,42,...,48083,49805,51389,52338,54890,63508,64247,65954,67255,70522
1,New South Wales,SA2 NAME,4344142,4466939,4569649,4614939,4603736,42,42,42,...,48394,50153,51818,52849,55854,66055,67195,68813,70114,74094
2,101021007,Braidwood,2261,2311,2362,2427,2467,50,51,51,...,40790,42003,41593,44246,46640,52068,51639,51192,61506,68904
3,101021008,Karabar,4989,5057,5099,5131,5103,42,42,42,...,57460,59295,61777,62946,65564,63294,63808,66381,67442,69672
4,101021009,Queanbeyan,6482,6594,6699,6773,7028,39,39,39,...,55033,57848,60119,61724,63528,61747,62878,65809,67298,69174
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2454,801101139,Wright,2028,2069,2117,2232,2382,33,34,35,...,74520,76754,79150,80498,82392,79922,82707,86052,87012,90720
2455,801101145,Molonglo - East,np,np,np,np,np,np,np,np,...,np,np,np,np,np,np,np,np,np,np
2456,801101146,Whitlam,np,np,np,np,15,np,np,np,...,np,np,np,np,67504,np,np,np,np,65967
2457,801111140,ACT - South West,352,370,359,354,348,39,39,40,...,55208,61096,64227,68987,71392,71098,67791,73401,76735,80675


In [13]:
# for filling columns
COLUMNS_DICT_INCOME_GEOGRAPHY = {
    0: ("SA2 code",None),
    1: ("SA2 name",None)
}

# get the csv
income_region = pd.read_csv(RELATIVE_PATH + "economic_by_region/income_by_geography_2016_2021.csv", header=[0, 1])

# prepare the columns
income_region = fill_columns(income_region, COLUMNS_DICT_INCOME_GEOGRAPHY)

# filter victoria columns
vic_mask = income_region[("SA2 code", np.nan)].str.startswith("2")
income_region = income_region[vic_mask]

# make a mega dataframe
income_region = unpack_levels(income_region, 2, "measure", "year", record_level_2=True)

# filter columns and rename
NEW_NAMES = {
    3: "year",
    4: "economic: number of earners",
    6: "economic: median income", 
    7: "economic: median age of earners"
}
KEEP_COLUMNS = [1, 2, 3, 4, 6, 7]

income_region = rename_dict(income_region, NEW_NAMES, KEEP_COLUMNS)
print(income_region.shape[0])
income_region.head(10)

2610


Unnamed: 0,SA2 code,SA2 name,year,economic: number of earners,economic: median income,economic: median age of earners
0,201011001,Alfredton,2016-17,7117,50596,42
1,201011001,Alfredton,2017-18,7558,52448,42
2,201011001,Alfredton,2018-19,7987,53932,42
3,201011001,Alfredton,2019-20,8665,55204,41
4,201011001,Alfredton,2020-21,9438,58036,41
5,201011002,Ballarat,2016-17,7465,50093,47
6,201011002,Ballarat,2017-18,7587,51736,47
7,201011002,Ballarat,2018-19,7592,53688,47
8,201011002,Ballarat,2019-20,7646,53784,47
9,201011002,Ballarat,2020-21,7522,55998,47


In [9]:
import numpy as np

MEASURE_COLUMNS = income_region.columns[3:]

# drop any na values in measure columns
income_region = income_region.replace("np", np.nan)
income_region = income_region.dropna(subset=MEASURE_COLUMNS, how="any")
income_region.shape[0]

# get numeric columns
for measure_column in MEASURE_COLUMNS:
    income_region[measure_column] = pd.to_numeric(income_region[measure_column].str.replace(',', ''))

print(income_region.shape[0])
income_region.head(5)

2602


Unnamed: 0,SA2 code,SA2 name,year,economic: number of earners,economic: median income,economic: median age of earners
0,201011001,Alfredton,2016-17,7117,50596,42
1,201011001,Alfredton,2017-18,7558,52448,42
2,201011001,Alfredton,2018-19,7987,53932,42
3,201011001,Alfredton,2019-20,8665,55204,41
4,201011001,Alfredton,2020-21,9438,58036,41


In [10]:
# get the financial quarters for the year
QUARTERS = 4

all_frames = []
for quarter in range(QUARTERS):
    curr_quarter_df = income_region.copy()

    # create the current quarter series
    curr_quarter_df["quarter"] = quarter + 1

    # aggregate the result
    all_frames.append(curr_quarter_df)

# combine the results
income_region = pd.concat(all_frames, ignore_index=True)

# change the year
income_region["year"] = income_region.apply(
    lambda x: int(x["year"][:4])
    if (x["quarter"] in [3, 4])  # this step is necessary as defining quarter as the part of the year, so this is the end results desired
    else 2000 + int(x["year"][5:]),
    axis=1
)

# sort the values
income_region = income_region.sort_values(by=["SA2 code", "year", "quarter"])

income_region.head(8)

Unnamed: 0,SA2 code,SA2 name,year,economic: number of earners,economic: median income,economic: median age of earners,quarter
5204,201011001,Alfredton,2016,7117,50596,42,3
7806,201011001,Alfredton,2016,7117,50596,42,4
0,201011001,Alfredton,2017,7117,50596,42,1
2602,201011001,Alfredton,2017,7117,50596,42,2
5205,201011001,Alfredton,2017,7558,52448,42,3
7807,201011001,Alfredton,2017,7558,52448,42,4
1,201011001,Alfredton,2018,7558,52448,42,1
2603,201011001,Alfredton,2018,7558,52448,42,2


### Distribution

In [11]:
# get the economic distribtuion
economic_distribution = pd.read_csv(RELATIVE_PATH + "/economic_by_region/income_distribution_by_geography_2021.csv", header=0)
economic_distribution = economic_distribution.drop(0)
economic_distribution.head(5)

# rename and filter
NEW_NAMES = {
    1: "SA2 code", 
    12: "economic: gini coefficient",
    13: "economic: top 1% suburb",
    14: "economic: top 5% suburb"
}
KEEP_COLUMNS = [1, 12, 13, 14]
economic_distribution = rename_dict(economic_distribution, NEW_NAMES, KEEP_COLUMNS)

# convert to numeric
for i in range(1, 4):
    economic_distribution.iloc[:, i] = pd.to_numeric(economic_distribution.iloc[:, i], errors="coerce")

print(economic_distribution.dtypes)
economic_distribution.head()

SA2 code                       object
economic: gini coefficient    float64
economic: top 1% suburb       float64
economic: top 5% suburb       float64
dtype: object


  economic_distribution.iloc[:, i] = pd.to_numeric(economic_distribution.iloc[:, i], errors="coerce")


Unnamed: 0,SA2 code,economic: gini coefficient,economic: top 1% suburb,economic: top 5% suburb
1,Australia,0.483,9.8,22.5
2,New South Wales,0.497,11.2,24.3
3,101021007,0.615,21.2,32.6
4,101021008,0.365,4.4,14.5
5,101021009,0.368,4.6,14.7


In [12]:
# check no Nan values
economic_distribution.isna().any(axis=1).sum()

142

## Economic (Check)

#### Inflation (Check)

- Basically, remember that it is annualized
- Will therefore calculate the average percent change from the previous year
  - Note December the year before, + march + Jun + Sep of the year will be used for the average

In [13]:
# donwload inflation
inflation = pd.read_csv(RELATIVE_PATH + "economic/inflation.csv")

# rename the columns
index = 1 + list(inflation.columns).index("Percentage Change from Previous Period ;  Trimmed Mean ;  Australia ;")
NEW_NAMES = {1: "year", index: "economic: trimmed mean quarterly"}
KEEP_COLUMNS = NEW_NAMES.keys()
inflation = rename_dict(inflation, NEW_NAMES, KEEP_COLUMNS)

print(inflation.dtypes)
inflation.tail(10)

year                                 object
economic: trimmed mean quarterly    float64
dtype: object


Unnamed: 0,year,economic: trimmed mean quarterly
294,2022-03-01,1.5
295,2022-06-01,1.5
296,2022-09-01,1.8
297,2022-12-01,1.7
298,2023-03-01,1.2
299,2023-06-01,0.9
300,2023-09-01,1.2
301,2023-12-01,0.8
302,2024-03-01,1.0
303,2024-06-01,0.8


In [14]:
# split up the month and the year
inflation[["year", "quarter", "day"]] = inflation["year"].str.split("-", expand=True)
inflation = inflation.drop(columns="day")

# get the actual year
inflation["year"] = inflation["year"].astype(int)
inflation["economic: trimmed mean quarterly"] = inflation["economic: trimmed mean quarterly"].astype(float)

# get the quarter of the year
inflation["quarter"] = inflation["quarter"].astype(int) // 3

# remove records before 2000
inflation = inflation[inflation["year"] >= 2000]

inflation.head(5)

Unnamed: 0,year,economic: trimmed mean quarterly,quarter
206,2000,0.7,1
207,2000,0.5,2
208,2000,0.6,3
209,2000,0.6,4
210,2001,1.0,1


#### Interest rates (Check)

Have decided only to take the variable interest rate for standard owners (as this is most reflective of normal people)

In [15]:
# download the lending data
lending = pd.read_csv(RELATIVE_PATH + "economic/lending.csv")

# rename the columns
NEW_NAMES = {1: "year", 4: "economic: variable interest rate"}
KEEP_COLUMNS = [1, 4]
lending = rename_dict(lending, NEW_NAMES, KEEP_COLUMNS)

print(lending.dtypes)
lending.tail(5)

year                                 object
economic: variable interest rate    float64
dtype: object


Unnamed: 0,year,economic: variable interest rate
783,2024-04-30,8.77
784,2024-05-31,8.77
785,2024-06-30,8.77
786,2024-07-31,8.77
787,2024-08-31,8.77


- Basically want to take the average of all the records for each year
- Seems to make most sense to use the actual percentage (above 100%) rather than say 1.02

In [16]:
# split up the time stamp
lending[["year", "quarter", "day"]] = lending["year"].str.split("-", expand=True).astype(int)
lending = lending.drop(columns="day")

# convert month to quarter
lending["quarter"] = ((lending["quarter"] - 1) // 3) + 1

# take the average over each financial quarter
lending = lending.groupby(["year", "quarter"])["economic: variable interest rate"].apply(lambda x: x.mean()).reset_index()

# remove records before 2000
lending = lending[lending["year"] >= 2000]

lending.tail(5)

Unnamed: 0,year,quarter,economic: variable interest rate
258,2023,3,8.52
259,2023,4,8.686667
260,2024,1,8.77
261,2024,2,8.77
262,2024,3,8.77


### GDP (Check)

- GDP is basically add from each year (even tho this misses out on a small amount of gdp growth)
- Same process as inflation, except this shows the quarterly growth instead

In [17]:
# download the lending data
gdp = pd.read_csv(RELATIVE_PATH + "economic/gdp.csv")

# rename the columns
index = 1 + list(gdp.columns).index("A2304370T")
NEW_NAMES = {1: "year", index: "economic: gdp quarterly", index+1: "economic: gdp per capita quarterly"}
KEEP_COLUMNS = [1, index, index+1]
gdp = rename_dict(gdp, NEW_NAMES, KEEP_COLUMNS)

print(gdp.dtypes)
gdp.tail(5)

year                                   object
economic: gdp quarterly               float64
economic: gdp per capita quarterly    float64
dtype: object


Unnamed: 0,year,economic: gdp quarterly,economic: gdp per capita quarterly
255,2023-06-01,0.5,-0.1
256,2023-09-01,0.3,-0.4
257,2023-12-01,0.2,-0.3
258,2024-03-01,0.2,-0.4
259,2024-06-01,0.2,-0.4


In [18]:
# split up the month and the year
gdp[["year", "quarter", "day"]] = gdp["year"].str.split("-", expand=True).astype(int)
gdp = gdp.drop(columns="day")

# get the actual year
gdp[gdp.columns[3:]] = gdp[gdp.columns[3:]].astype(float)

# get the quarter of the year
gdp["quarter"] = gdp["quarter"].astype(int) // 3

# remove records before 2000
gdp = gdp[gdp["year"] >= 2000]

gdp.tail(5)

Unnamed: 0,year,economic: gdp quarterly,economic: gdp per capita quarterly,quarter
255,2023,0.5,-0.1,2
256,2023,0.3,-0.4,3
257,2023,0.2,-0.3,4
258,2024,0.2,-0.4,1
259,2024,0.2,-0.4,2


## Population (Check)

Greater Capital City Statistical Area (GCCSA) code and Greater Capital City Statistical Area (GCCSA) name

### Getting data

In [1]:
NEW_NAMES

NameError: name 'NEW_NAMES' is not defined

In [24]:
# Use the absolute path to the file
vic_population_filepath = RELATIVE_PATH + 'population/age_demos.csv'

# Read the CSV file
vic_population_df = pd.read_csv(vic_population_filepath, header=[0, 1])

# drop the first level
first_level = vic_population_df.columns.get_level_values(0)
vic_population_df.columns = vic_population_df.columns.droplevel(0)

# rename the columns
NEW_NAMES = {1: "year", 3: "state", 10: "SA2 code"}
NEW_NAMES.update({i+1: "population: " + first_level[i].lower() for i in range(11, len(first_level))}) # need the +1 because it's the actual column position
KEEP_COLUMNS = list(NEW_NAMES.keys())
vic_population_df = rename_dict(vic_population_df, NEW_NAMES, KEEP_COLUMNS)

print(vic_population_df.dtypes)
vic_population_df.head(5)


year                           int64
state                         object
SA2 code                     float64
population: 0-4              float64
population: 5-9              float64
population: 10-14            float64
population: 15-19            float64
population: 20-24            float64
population: 25-29            float64
population: 30-34            float64
population: 35-39            float64
population: 40-44            float64
population: 45-49            float64
population: 50-54            float64
population: 55-59            float64
population: 60-64            float64
population: 65-69            float64
population: 70-74            float64
population: 75-79            float64
population: 80-84            float64
population: 85 and over      float64
population: total persons    float64
dtype: object


Unnamed: 0,year,state,SA2 code,population: 0-4,population: 5-9,population: 10-14,population: 15-19,population: 20-24,population: 25-29,population: 30-34,...,population: 45-49,population: 50-54,population: 55-59,population: 60-64,population: 65-69,population: 70-74,population: 75-79,population: 80-84,population: 85 and over,population: total persons
0,2001,New South Wales,101021007.0,154.0,165.0,173.0,157.0,94.0,129.0,153.0,...,221.0,262.0,233.0,197.0,125.0,108.0,79.0,57.0,37.0,2760.0
1,2001,New South Wales,101021008.0,720.0,749.0,741.0,688.0,623.0,657.0,735.0,...,675.0,585.0,491.0,300.0,223.0,201.0,144.0,64.0,32.0,9129.0
2,2001,New South Wales,101021009.0,592.0,509.0,514.0,624.0,799.0,757.0,767.0,...,633.0,620.0,520.0,444.0,384.0,394.0,320.0,214.0,226.0,9717.0
3,2001,New South Wales,101021010.0,247.0,242.0,237.0,269.0,338.0,336.0,327.0,...,323.0,297.0,199.0,152.0,97.0,94.0,82.0,54.0,20.0,3925.0
4,2001,New South Wales,101021012.0,969.0,923.0,777.0,602.0,451.0,640.0,1068.0,...,600.0,508.0,294.0,224.0,159.0,105.0,70.0,29.0,24.0,9425.0


In [25]:
# Filter for Victoria entries
vic_population_df = vic_population_df[vic_population_df['state'] == 'Victoria']

# sort the values
vic_population_df =  vic_population_df.sort_values(by=["SA2 code", "year"])

# convert to string for easy
vic_population_df["SA2 code"] = vic_population_df["SA2 code"].astype(int).astype(str)

# drop columns not of interest
vic_population_df = vic_population_df.drop(columns=["state"])

vic_population_df.head(5)

Unnamed: 0,year,SA2 code,population: 0-4,population: 5-9,population: 10-14,population: 15-19,population: 20-24,population: 25-29,population: 30-34,population: 35-39,...,population: 45-49,population: 50-54,population: 55-59,population: 60-64,population: 65-69,population: 70-74,population: 75-79,population: 80-84,population: 85 and over,population: total persons
642,2001,201011001,353.0,467.0,584.0,556.0,310.0,266.0,364.0,469.0,...,435.0,368.0,247.0,233.0,184.0,168.0,151.0,67.0,42.0,5756.0
3096,2002,201011001,385.0,473.0,623.0,556.0,339.0,279.0,383.0,494.0,...,452.0,384.0,297.0,244.0,188.0,179.0,148.0,73.0,42.0,6092.0
5550,2003,201011001,404.0,492.0,608.0,584.0,376.0,293.0,371.0,506.0,...,475.0,386.0,328.0,256.0,205.0,176.0,150.0,93.0,38.0,6293.0
8004,2004,201011001,403.0,499.0,613.0,601.0,398.0,290.0,391.0,530.0,...,494.0,394.0,362.0,255.0,214.0,187.0,154.0,91.0,46.0,6480.0
10458,2005,201011001,421.0,520.0,624.0,604.0,414.0,282.0,403.0,528.0,...,518.0,427.0,374.0,242.0,229.0,179.0,140.0,120.0,52.0,6648.0


In [26]:
# get the financial quarters for the year
QUARTERS = 4

all_frames = []
for quarter in range(QUARTERS):
    curr_quarter_df = vic_population_df.copy()

    # create the current quarter series
    curr_quarter_df["quarter"] = quarter + 1

    # aggregate the result
    all_frames.append(curr_quarter_df)

# combine the results
vic_population_df = pd.concat(all_frames, ignore_index=True)

# sort
vic_population_df = vic_population_df.sort_values(by=["SA2 code", "year", "quarter"])

vic_population_df.head(10)

Unnamed: 0,year,SA2 code,population: 0-4,population: 5-9,population: 10-14,population: 15-19,population: 20-24,population: 25-29,population: 30-34,population: 35-39,...,population: 50-54,population: 55-59,population: 60-64,population: 65-69,population: 70-74,population: 75-79,population: 80-84,population: 85 and over,population: total persons,quarter
0,2001,201011001,353.0,467.0,584.0,556.0,310.0,266.0,364.0,469.0,...,368.0,247.0,233.0,184.0,168.0,151.0,67.0,42.0,5756.0,1
12006,2001,201011001,353.0,467.0,584.0,556.0,310.0,266.0,364.0,469.0,...,368.0,247.0,233.0,184.0,168.0,151.0,67.0,42.0,5756.0,2
24012,2001,201011001,353.0,467.0,584.0,556.0,310.0,266.0,364.0,469.0,...,368.0,247.0,233.0,184.0,168.0,151.0,67.0,42.0,5756.0,3
36018,2001,201011001,353.0,467.0,584.0,556.0,310.0,266.0,364.0,469.0,...,368.0,247.0,233.0,184.0,168.0,151.0,67.0,42.0,5756.0,4
1,2002,201011001,385.0,473.0,623.0,556.0,339.0,279.0,383.0,494.0,...,384.0,297.0,244.0,188.0,179.0,148.0,73.0,42.0,6092.0,1
12007,2002,201011001,385.0,473.0,623.0,556.0,339.0,279.0,383.0,494.0,...,384.0,297.0,244.0,188.0,179.0,148.0,73.0,42.0,6092.0,2
24013,2002,201011001,385.0,473.0,623.0,556.0,339.0,279.0,383.0,494.0,...,384.0,297.0,244.0,188.0,179.0,148.0,73.0,42.0,6092.0,3
36019,2002,201011001,385.0,473.0,623.0,556.0,339.0,279.0,383.0,494.0,...,384.0,297.0,244.0,188.0,179.0,148.0,73.0,42.0,6092.0,4
2,2003,201011001,404.0,492.0,608.0,584.0,376.0,293.0,371.0,506.0,...,386.0,328.0,256.0,205.0,176.0,150.0,93.0,38.0,6293.0,1
12008,2003,201011001,404.0,492.0,608.0,584.0,376.0,293.0,371.0,506.0,...,386.0,328.0,256.0,205.0,176.0,150.0,93.0,38.0,6293.0,2


## ABS data

In [27]:
abs_data = pd.read_csv(RELATIVE_PATH + "ABS/merge.csv", index_col=0)
abs_data.columns = ["SA2 code"] + list(abs_data.columns[1:])
abs_data["SA2 code"] = abs_data["SA2 code"].astype(str)
abs_data.head(5)

Unnamed: 0_level_0,SA2 code,relationships: married,relationships: defacto,relationships: lone parents,relationships: child under 15,relationships: dependent student,relationships: non dependent child,relationships: other related individual,relationships: group household,relationships: lone persons,...,birth: turkey,birth: wales,birth: south africa,birth: philippines,birth: croatia,birth: cambodia,birth: greece,birth: japan,birth: nepal,birth: myanmar
Unnamed: 0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,201011001,6056,1328,772,3884,974,854,252,447,1096,...,0,12,74,117,14,0,21,15,16,0
1,201011002,4129,906,471,1669,604,464,132,538,1776,...,11,13,26,33,12,3,7,21,38,0
2,201011005,2935,514,248,1427,395,367,74,134,484,...,3,5,14,15,3,0,3,9,6,0
3,201011006,3302,1238,584,2150,411,599,243,373,826,...,0,6,8,75,9,0,4,9,53,0
4,201011007,1836,319,105,861,261,323,64,41,202,...,0,0,6,4,9,0,0,0,0,0


## Merging Together

In [28]:
len(vic_population_df["SA2 code"].unique())

522

In [29]:
for frame in all_frames:
    if ("year" in frame.columns):
        print(frame["year"].unique())
    else:
        print("nothing")

[2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014
 2015 2016 2017 2018 2019 2020 2021 2022 2023]
[2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014
 2015 2016 2017 2018 2019 2020 2021 2022 2023]
[2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014
 2015 2016 2017 2018 2019 2020 2021 2022 2023]
[2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014
 2015 2016 2017 2018 2019 2020 2021 2022 2023]


In [30]:
len(income_region["SA2 code"].unique())

521

In [31]:
# all the frames to merge
all_frames = [vic_population_df, income_region, economic_distribution, inflation, lending, gdp, abs_data]


merged_df = pd.DataFrame({})
for frame in all_frames:
    merged_df = get_merged_df(frame, merged_df)

# order the columns
indentifiers = ["SA2 code", "SA2 name", "year", "quarter"]
merged_df = merged_df[indentifiers + [x for x in merged_df.columns if not x in indentifiers]]

# sort the values
merged_df = merged_df.sort_values(by=["SA2 name", "year", "quarter"])

merged_df.head(5)

lost 0 out of 48024 records
lost -1937 out of 48024 records
lost -6 out of 49961 records
lost -1 out of 49967 records
lost 0 out of 49968 records
lost -2 out of 49968 records


Unnamed: 0,SA2 code,SA2 name,year,quarter,population: 0-4,population: 5-9,population: 10-14,population: 15-19,population: 20-24,population: 25-29,...,birth: zimbabwe,birth: myanmar,birth: total,birth: lebanon,relationships: married,birth: pakistan,birth: hong kong sar of china,overseas: 5 years,relationships: lone parents,birth: bosnia and herzegovina
14414,206071139,Abbotsford,2016.0,3.0,366.0,193.0,163.0,234.0,849.0,1844.0,...,12.0,3.0,9088.0,0.0,1784.0,11.0,45.0,1076.0,190.0,4.0
14415,206071139,Abbotsford,2016.0,4.0,366.0,193.0,163.0,234.0,849.0,1844.0,...,12.0,3.0,9088.0,0.0,1784.0,11.0,45.0,1076.0,190.0,4.0
14416,206071139,Abbotsford,2017.0,1.0,401.0,217.0,170.0,226.0,887.0,1906.0,...,12.0,3.0,9088.0,0.0,1784.0,11.0,45.0,1076.0,190.0,4.0
14417,206071139,Abbotsford,2017.0,2.0,401.0,217.0,170.0,226.0,887.0,1906.0,...,12.0,3.0,9088.0,0.0,1784.0,11.0,45.0,1076.0,190.0,4.0
14418,206071139,Abbotsford,2017.0,3.0,401.0,217.0,170.0,226.0,887.0,1906.0,...,12.0,3.0,9088.0,0.0,1784.0,11.0,45.0,1076.0,190.0,4.0


In [32]:
# the final out path
OUT_PATH = "../../data/raw/historic/"

# make directory necessary
if (not os.path.exists(OUT_PATH)):
    os.makedirs(OUT_PATH)

# save data frame
merged_df.to_csv(OUT_PATH + "merged.csv")

## Combining at Suburb Level

In [34]:
import ast

In [35]:
# don't edit this (except for regions_df location)

regions_df = pd.read_csv('../../data/raw/location/sa2_to_rental_suburb_groups.csv')

regions_df_list = regions_df.copy()
regions_df_list['code'] = regions_df_list['code'].apply(ast.literal_eval)

exploded_regions = regions_df_list.explode('code')
exploded_regions['code'] = exploded_regions['code'].apply(pd.to_numeric, errors='coerce')

def df_to_regions(df, sa2_col_name, aggregation_functions, year_col_name = None, quarter_col_name = None):
    df[sa2_col_name] = df[sa2_col_name].apply(pd.to_numeric, errors='coerce')
    regions_with_stats = pd.merge(exploded_regions, df, left_on='code', right_on=sa2_col_name, how='left')

    for col in aggregation_functions:
        regions_with_stats[col] = regions_with_stats[col].apply(pd.to_numeric, errors='coerce')

    if quarter_col_name:
        grouped_by = regions_with_stats.groupby(['suburbs', year_col_name, quarter_col_name]).agg(aggregation_functions)
    elif year_col_name:
        grouped_by = regions_with_stats.groupby(['suburbs', year_col_name]).agg(aggregation_functions)
    else:
        grouped_by = regions_with_stats.groupby('suburbs').agg(aggregation_functions)

    return grouped_by.reset_index()

exploded_regions

Unnamed: 0.1,Unnamed: 0,geometry,suburbs,regions,code
0,0,POLYGON ((144.97018001032677 -37.8606998481848...,Albert Park-Middle Park-West St Kilda,"['Albert Park', 'St Kilda - West']",206051128
0,0,POLYGON ((144.97018001032677 -37.8606998481848...,Albert Park-Middle Park-West St Kilda,"['Albert Park', 'St Kilda - West']",206051514
1,1,POLYGON ((144.79015492149858 -37.8621593959499...,Altona,"['Altona', 'Altona North']",213021341
1,1,POLYGON ((144.79015492149858 -37.8621593959499...,Altona,"['Altona', 'Altona North']",213021343
2,2,POLYGON ((145.01167433388778 -37.8535692509816...,Armadale,['Armadale'],206061135
...,...,...,...,...,...
141,141,POLYGON ((144.8869958358719 -37.85078700244943...,Williamstown,['Williamstown'],213021346
142,142,POLYGON ((146.77393701213975 -36.1280104970832...,Wodonga,"['Wodonga', 'West Wodonga']",204031073
142,142,POLYGON ((146.77393701213975 -36.1280104970832...,Wodonga,"['Wodonga', 'West Wodonga']",204031492
143,143,POLYGON ((144.85984002458792 -37.8138202803743...,Yarraville-Seddon,"['Seddon - Kingsville', 'Yarraville']",213031352


In [36]:
# EDIT THESE FIELDS
LOAD_PATH = '../../data/raw/historic/merged.csv'
OUTPUT_PATH = '../../data/raw/historic/merged_as_suburbs.csv'

df = pd.read_csv(LOAD_PATH, index_col=0)

  df = pd.read_csv(LOAD_PATH, index_col=0)


In [37]:
# This section is just to make the aggregation a bit easier
# is in the form { 'col_name': 'func', 'col2': 'func' }
# ---------------------------------------------------------

# list of columns to take the sum of the aggregation for
sum_list = [x for x in list(df.columns) if "population" in x] + \
           [x for x in list(df.columns) if "birth" in x] + \
           [x for x in list(df.columns) if "studying" in x] + \
           ['overseas: 5 years'] + \
           [x for x in list(df.columns) if "relationships" in x] + \
           ["economic: number of earners"]


# list of columns to take the avg of the aggregation for
avg_list = [x for x in df.columns if ("economic: " in x) and (not "number" in x)]

agg_functions = {col_name: 'mean' for col_name in avg_list}
agg_functions.update(
    {col_name: 'sum' for col_name in sum_list}
)

# ----------------------------------------------------------

# EDIT ME: Calls the actual join function
out = df_to_regions(df = df, sa2_col_name = 'SA2 code', aggregation_functions=agg_functions, year_col_name = 'year', quarter_col_name="quarter")
out.to_csv(OUTPUT_PATH)
out

Unnamed: 0,suburbs,year,quarter,economic: median income,economic: median age of earners,economic: top 1% suburb,economic: gini coefficient,economic: top 5% suburb,economic: trimmed mean quarterly,economic: variable interest rate,...,relationships: other related individual,relationships: group household,relationships: non dependent child,relationships: defacto,relationships: dependent student,relationships: child under 15,relationships: lone persons,relationships: married,relationships: lone parents,economic: number of earners
0,Albert Park-Middle Park-West St Kilda,2001.0,1.0,,,14.55,0.5565,30.75,1.0,7.653333,...,426.0,1798.0,654.0,4336.0,994.0,2914.0,6156.0,7732.0,761.0,0.0
1,Albert Park-Middle Park-West St Kilda,2001.0,2.0,,,14.55,0.5565,30.75,0.8,6.820000,...,426.0,1798.0,654.0,4336.0,994.0,2914.0,6156.0,7732.0,761.0,0.0
2,Albert Park-Middle Park-West St Kilda,2001.0,3.0,,,14.55,0.5565,30.75,0.7,6.736667,...,426.0,1798.0,654.0,4336.0,994.0,2914.0,6156.0,7732.0,761.0,0.0
3,Albert Park-Middle Park-West St Kilda,2001.0,4.0,,,14.55,0.5565,30.75,0.9,6.236667,...,426.0,1798.0,654.0,4336.0,994.0,2914.0,6156.0,7732.0,761.0,0.0
4,Albert Park-Middle Park-West St Kilda,2002.0,1.0,,,14.55,0.5565,30.75,0.7,6.070000,...,426.0,1798.0,654.0,4336.0,994.0,2914.0,6156.0,7732.0,761.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13243,Yarraville-Seddon,2022.0,4.0,,,5.85,0.4255,17.65,1.7,7.270000,...,495.0,1277.0,1012.0,3694.0,1013.0,4517.0,2566.0,7616.0,813.0,0.0
13244,Yarraville-Seddon,2023.0,1.0,,,5.85,0.4255,17.65,1.2,7.770000,...,495.0,1277.0,1012.0,3694.0,1013.0,4517.0,2566.0,7616.0,813.0,0.0
13245,Yarraville-Seddon,2023.0,2.0,,,5.85,0.4255,17.65,0.9,8.270000,...,495.0,1277.0,1012.0,3694.0,1013.0,4517.0,2566.0,7616.0,813.0,0.0
13246,Yarraville-Seddon,2023.0,3.0,,,5.85,0.4255,17.65,1.2,8.520000,...,495.0,1277.0,1012.0,3694.0,1013.0,4517.0,2566.0,7616.0,813.0,0.0
