In [758]:
import pandas as pd
import numpy as np
import os

## Functions

#### Merging

In [760]:
MERGE_COLUMNS = ["SA2 code", "year", "quarter"]

def get_merged_df(new_df, final_df):
    """Inner-join the not-yet-present columns of ``new_df`` onto ``final_df``.

    The join keys are the entries of ``MERGE_COLUMNS`` that exist in both
    frames. Only columns of ``new_df`` that ``final_df`` does not already have
    are brought across, in the order they appear in ``new_df`` (so the merged
    column order is the same on every run).

    Parameters:
        new_df: frame whose additional columns should be attached.
        final_df: frame accumulated so far; when empty, ``new_df`` is returned as is.

    Returns:
        The merged frame. The number of ``final_df`` rows lost by the inner
        join is printed as a side effect.
    """
    if final_df.empty:
        return new_df

    # get the attributes used for the merge, keeping the MERGE_COLUMNS order
    merge_columns = [
        column for column in MERGE_COLUMNS
        if column in new_df.columns and column in final_df.columns
    ]

    # get the columns not already in the data frame, keeping new_df's order
    # (dict.fromkeys drops repeated labels without reordering)
    existing_columns = set(final_df.columns)
    new_columns = [
        column for column in dict.fromkeys(new_df.columns)
        if column not in existing_columns
    ]

    # merge columns
    merged_df = pd.merge(final_df, new_df[merge_columns + new_columns], on=merge_columns, how="inner")

    # check if any rows lost
    print(f"lost {final_df.shape[0] - merged_df.shape[0]} out of {final_df.shape[0]} records")

    return merged_df

#### Renaming columns

In [761]:
def rename_dict(df, rename_dict, keep_columns):
    """Rename columns by position and keep only a chosen subset of them.

    Positions are 1-based: position ``1`` is the first column of ``df``.

    Parameters:
        df: source frame; it is left untouched.
        rename_dict: mapping of 1-based column position -> new column name.
        keep_columns: iterable of 1-based column positions to keep, in output order.

    Returns:
        A new, independent frame holding the kept columns under their new names.
    """
    # get the renamed column labels
    new_column_names = list(df.columns)
    for index, new_name in rename_dict.items():
        new_column_names[index - 1] = new_name

    # relabel a shallow copy so the caller's frame keeps its original labels
    renamed_df = df.copy(deep=False)
    renamed_df.columns = new_column_names

    # drop the columns not of interest; the final copy detaches the result from
    # `df`, so assigning new columns to it later does not warn about a slice
    keep_positions = [position - 1 for position in keep_columns]
    return renamed_df.iloc[:, keep_positions].copy()

#### MultiIndex functions

In [762]:
def impute_previous(columns, na):
    """Blank out placeholder header labels and forward-fill them per level.

    Parameters:
        columns: list of equal-length tuples, one tuple of level labels per column.
        na: substring marking a placeholder label (e.g. "Unnamed").

    Returns:
        A 2-D array with one row per header level and one entry per column.
        Falsy labels and labels containing ``na`` are replaced by the nearest
        earlier label of the same level, or stay missing when there is none.
    """
    # regroup the labels so there is one list per header level
    level_count = len(columns[0])
    flat_column_list = [[group[level] for group in columns] for level in range(level_count)]

    column_dataframe = pd.DataFrame(flat_column_list)

    def _blank_to_none(label):
        return None if (not label) or (na in str(label)) else label

    # DataFrame.applymap was deprecated in pandas 2.1 in favour of DataFrame.map;
    # fall back to applymap on older versions that do not have map yet
    if hasattr(pd.DataFrame, "map"):
        column_dataframe = column_dataframe.map(_blank_to_none)
    else:
        column_dataframe = column_dataframe.applymap(_blank_to_none)

    # ffill works down rows, so transpose to fill along the columns of each level
    column_dataframe = column_dataframe.T.ffill().T

    return column_dataframe.to_numpy()

In [763]:
def fill_columns(df, column_dict, na="Unnamed"):
    """Overwrite selected column labels of ``df`` and tidy a two-level header.

    Parameters:
        df: frame whose column labels are replaced in place.
        column_dict: mapping of 0-based column position -> replacement label.
        na: placeholder marker handed to ``impute_previous`` for MultiIndex headers.

    Returns:
        The same ``df`` object, with its columns reassigned.
    """
    labels = [*df.columns]
    for position in column_dict:
        labels[position] = column_dict[position]

    # flat header: the patched labels are used directly
    if type(df.columns) is not pd.MultiIndex:
        df.columns = labels
        return df

    # multi-level header: fill placeholder labels before rebuilding the index
    df.columns = pd.MultiIndex.from_arrays(impute_previous(labels, na))
    return df

## Document notes + set up

- Note that in the final file produced, the value at each timestamp is the growth since the previous timestamp
  - For example, if the population is 1000 in the 2022-03 quarter and 1100 in the 2022-06 quarter, then the population growth recorded for the 2022-06 quarter is 10%
  - Similarly for inflation, measures the change from previous quarter
- Year represents the start year

In [764]:
# folder holding the raw input files, relative to this notebook; it ends with "/"
# so file paths are built by plain string concatenation
RELATIVE_PATH = "../../data/landing/"

## Economic by Region (check)

### Geography summary

- Note the income of a given year (say 2020) actually covers July of the previous year to June of that year (July 2019 to June 2020)
  - Because the data was given as financial years

In [765]:
# replacement labels for fill_columns, keyed by 0-based column position;
# each value is a (first level, second level) label with the second level left empty
COLUMNS_DICT_INCOME_GEOGRAPHY = {
    0: ("SA2 code",None),
    1: ("SA2 name",None)
}

In [766]:
# get the csv; the first two rows of the file form a two-level column header
income_region = pd.read_csv(RELATIVE_PATH + "economic_by_region/income_by_geography_2016_2021.csv", header=[0, 1])

# prepare the columns: label the two identifier columns and fill the placeholder header cells
income_region = fill_columns(income_region, COLUMNS_DICT_INCOME_GEOGRAPHY)

# filter victoria rows: keep the SA2 codes starting with "2"
# (the empty second level from COLUMNS_DICT_INCOME_GEOGRAPHY is addressed as NaN here)
vic_mask = income_region[("SA2 code", np.nan)].str.startswith("2")
income_region = income_region[vic_mask]

# long format: one record per location, measure and year
# (the first two columns are the identifiers, every other column is a value column)
income_region = income_region.melt(id_vars=income_region.columns.to_list()[:2], 
                                   value_vars = income_region.columns.to_list()[2:],
                                   var_name=["Measure", "Year"],
                                   value_name="Value")

# back to wide: one record per location and year, one column per measure
income_region = income_region.pivot(index=list(income_region.columns[:2]) + ["Year"], columns="Measure", values="Value")
income_region = income_region.reset_index()
# the two identifier labels are still tuples; keep only their first element
income_region.columns = [x[0] for x in income_region.columns[:2]] + list(income_region.columns[2:])

In [767]:
# 1-based column positions -> final names; positions 1 and 2 (SA2 code / SA2 name)
# keep their labels, position 5 is neither renamed nor kept
NEW_NAMES = {
    3: "year",
    4: "economic: number of earners",
    6: "economic: median income", 
    7: "economic: median age of earners"
}
KEEP_COLUMNS = [1, 2] + list(NEW_NAMES)

income_region = rename_dict(income_region, NEW_NAMES, KEEP_COLUMNS)
print(len(income_region))
income_region.head(5)

2610


Unnamed: 0,SA2 code,SA2 name,year,economic: number of earners,economic: median income,economic: median age of earners
0,201011001,Alfredton,2016-17,7117,50596,42
1,201011001,Alfredton,2017-18,7558,52448,42
2,201011001,Alfredton,2018-19,7987,53932,42
3,201011001,Alfredton,2019-20,8665,55204,41
4,201011001,Alfredton,2020-21,9438,58036,41


In [768]:
import numpy as np

# everything after the three identifier columns (SA2 code, SA2 name, year) is a measure
MEASURE_COLUMNS = income_region.columns[3:]

# "np" is treated as a missing-value marker (presumably "not published" - TODO confirm);
# rows missing any measure are discarded
income_region = income_region.replace("np", np.nan).dropna(subset=MEASURE_COLUMNS, how="any")

# strip the thousands separators so the measures parse as numbers
for measure_column in MEASURE_COLUMNS:
    without_commas = income_region[measure_column].str.replace(',', '')
    income_region[measure_column] = pd.to_numeric(without_commas)

print(income_region.shape[0])
income_region.head(5)

2602


Unnamed: 0,SA2 code,SA2 name,year,economic: number of earners,economic: median income,economic: median age of earners
0,201011001,Alfredton,2016-17,7117,50596,42
1,201011001,Alfredton,2017-18,7558,52448,42
2,201011001,Alfredton,2018-19,7987,53932,42
3,201011001,Alfredton,2019-20,8665,55204,41
4,201011001,Alfredton,2020-21,9438,58036,41


In [769]:
# number of quarters each financial-year record is repeated over
QUARTERS = 4

# one copy of the yearly frame per quarter (1..4), stacked into a single frame
all_frames = [income_region.assign(quarter=quarter) for quarter in range(1, QUARTERS + 1)]
income_region = pd.concat(all_frames, ignore_index=True)

# change the year: "2016-17" covers quarters 3-4 of 2016 and quarters 1-2 of 2017.
# The second calendar year is taken as start year + 1 rather than 2000 + the
# two-digit suffix, so the century is no longer hard-coded.
start_year = income_region["year"].str[:4].astype("int64")
in_start_year = income_region["quarter"].isin([3, 4])
income_region["year"] = start_year.where(in_start_year, start_year + 1)

# sort the values
income_region = income_region.sort_values(by=["SA2 code", "year", "quarter"])

income_region.head(8)

Unnamed: 0,SA2 code,SA2 name,year,economic: number of earners,economic: median income,economic: median age of earners,quarter
5204,201011001,Alfredton,2016,7117,50596,42,3
7806,201011001,Alfredton,2016,7117,50596,42,4
0,201011001,Alfredton,2017,7117,50596,42,1
2602,201011001,Alfredton,2017,7117,50596,42,2
5205,201011001,Alfredton,2017,7558,52448,42,3
7807,201011001,Alfredton,2017,7558,52448,42,4
1,201011001,Alfredton,2018,7558,52448,42,1
2603,201011001,Alfredton,2018,7558,52448,42,2


### Distribution

In [770]:
# get the economic distribution
# (no leading "/" on the file part: RELATIVE_PATH already ends with one)
economic_distribution = pd.read_csv(RELATIVE_PATH + "economic_by_region/income_distribution_by_geography_2021.csv", header=0)
# drop the row labelled 0 (presumably a second header line - TODO confirm against the file)
economic_distribution = economic_distribution.drop(0)

# rename and filter (1-based column positions)
NEW_NAMES = {
    1: "SA2 code", 
    12: "economic: gini coefficient",
    13: "economic: top 1% suburb",
    14: "economic: top 5% suburb"
}
KEEP_COLUMNS = [1, 12, 13, 14]
economic_distribution = rename_dict(economic_distribution, NEW_NAMES, KEEP_COLUMNS)

# NOTE(review): the three measure columns print as object dtype, i.e. they are
# not numeric yet - convert them if they are meant to be used as numbers
print(economic_distribution.dtypes)
economic_distribution.head()

SA2 code                      object
economic: gini coefficient    object
economic: top 1% suburb       object
economic: top 5% suburb       object
dtype: object


Unnamed: 0,SA2 code,economic: gini coefficient,economic: top 1% suburb,economic: top 5% suburb
1,Australia,0.483,9.8,22.5
2,New South Wales,0.497,11.2,24.3
3,101021007,0.615,21.2,32.6
4,101021008,0.365,4.4,14.5
5,101021009,0.368,4.6,14.7


In [771]:
# check no NaN values: counts the rows that contain at least one missing value
# NOTE(review): the columns are object dtype, so a text placeholder (if the file
# has any) would not be counted here - confirm
economic_distribution.isna().any(axis=1).sum()

0

## Economic (Check)

#### Inflation (Check)

- Basically, remember that it is annualized
- Will therefore calculate the average percent change from the previous year
  - Note December the year before, + march + Jun + Sep of the year will be used for the average

In [772]:
# download inflation
inflation = pd.read_csv(RELATIVE_PATH + "economic/inflation.csv")

# locate the trimmed-mean column by its header; rename_dict uses 1-based positions
TRIMMED_MEAN_LABEL = "Percentage Change from Previous Period ;  Trimmed Mean ;  Australia ;"
index = list(inflation.columns).index(TRIMMED_MEAN_LABEL) + 1

# keep exactly the renamed columns
NEW_NAMES = {1: "year", index: "economic: trimmed mean quarterly"}
KEEP_COLUMNS = NEW_NAMES.keys()
inflation = rename_dict(inflation, NEW_NAMES, KEEP_COLUMNS)

print(inflation.dtypes)
inflation.tail(10)

year                                 object
economic: trimmed mean quarterly    float64
dtype: object


Unnamed: 0,year,economic: trimmed mean quarterly
294,2022-03-01,1.5
295,2022-06-01,1.5
296,2022-09-01,1.8
297,2022-12-01,1.7
298,2023-03-01,1.2
299,2023-06-01,0.9
300,2023-09-01,1.2
301,2023-12-01,0.8
302,2024-03-01,1.0
303,2024-06-01,0.8


In [773]:
# split the "YYYY-MM-DD" timestamp; the month is stored under "quarter" for now
inflation[["year", "quarter", "day"]] = inflation["year"].str.split("-", expand=True)
inflation = inflation.drop(columns="day")

# get the actual year
inflation["year"] = inflation["year"].astype(int)
inflation["economic: trimmed mean quarterly"] = inflation["economic: trimmed mean quarterly"].astype(float)

# convert month to quarter (months 1-3 -> 1, ..., 10-12 -> 4), the same formula the
# lending data uses; a plain `month // 3` is only right for months 3, 6, 9 and 12
inflation["quarter"] = ((inflation["quarter"].astype(int) - 1) // 3) + 1

# remove records before 2000
inflation = inflation[inflation["year"] >= 2000]

inflation.head(5)

Unnamed: 0,year,economic: trimmed mean quarterly,quarter
206,2000,0.7,1
207,2000,0.5,2
208,2000,0.6,3
209,2000,0.6,4
210,2001,1.0,1


#### Interest rates (Check)

Have decided only to take the variable interest rate for standard owners (as this is most reflective of normal people)

In [774]:
# download the lending data
lending = pd.read_csv(RELATIVE_PATH + "economic/lending.csv")

# 1-based column positions -> names; only the renamed columns are kept
NEW_NAMES = {1: "year", 4: "economic: variable interest rate"}
KEEP_COLUMNS = list(NEW_NAMES)
lending = rename_dict(lending, NEW_NAMES, KEEP_COLUMNS)

print(lending.dtypes)
lending.tail(5)

year                                 object
economic: variable interest rate    float64
dtype: object


Unnamed: 0,year,economic: variable interest rate
783,2024-04-30,8.77
784,2024-05-31,8.77
785,2024-06-30,8.77
786,2024-07-31,8.77
787,2024-08-31,8.77


- Basically want to take the average of all the records for each year
- Seems to make most sense to use the actual percentage (above 100%) rather than say 1.02

In [775]:
# split the "YYYY-MM-DD" timestamp; the month is stored under "quarter" for now
lending[["year", "quarter", "day"]] = lending["year"].str.split("-", expand=True).astype(int)
lending = lending.drop(columns="day")

# convert month to quarter (months 1-3 -> 1, ..., 10-12 -> 4)
lending["quarter"] = ((lending["quarter"] - 1) // 3) + 1

# take the average of the records in each calendar quarter
# (built-in groupby mean instead of apply with a lambda)
lending = lending.groupby(["year", "quarter"])["economic: variable interest rate"].mean().reset_index()

# remove records before 2000
lending = lending[lending["year"] >= 2000]

lending.tail(5)

Unnamed: 0,year,quarter,economic: variable interest rate
258,2023,3,8.52
259,2023,4,8.686667
260,2024,1,8.77
261,2024,2,8.77
262,2024,3,8.77


### GDP (Check)

- GDP growth is basically added up from each period (even though this misses out on a small amount of GDP growth)
- Same process as inflation, except this shows the quarterly growth instead

In [776]:
# download the gdp data
gdp = pd.read_csv(RELATIVE_PATH + "economic/gdp.csv")

# locate the column headed "A2304370T" (1-based for rename_dict); it and the
# column right after it are the two measures that get renamed and kept
index = list(gdp.columns).index("A2304370T") + 1
NEW_NAMES = {1: "year", index: "economic: gdp quarterly", index+1: "economic: gdp per capita quarterly"}
KEEP_COLUMNS = list(NEW_NAMES)
gdp = rename_dict(gdp, NEW_NAMES, KEEP_COLUMNS)

print(gdp.dtypes)
gdp.tail(5)

year                                   object
economic: gdp quarterly               float64
economic: gdp per capita quarterly    float64
dtype: object


Unnamed: 0,year,economic: gdp quarterly,economic: gdp per capita quarterly
255,2023-06-01,0.5,-0.1
256,2023-09-01,0.3,-0.4
257,2023-12-01,0.2,-0.3
258,2024-03-01,0.2,-0.4
259,2024-06-01,0.2,-0.4


In [777]:
# split the "YYYY-MM-DD" timestamp; the month is stored under "quarter" for now
gdp[["year", "quarter", "day"]] = gdp["year"].str.split("-", expand=True).astype(int)
gdp = gdp.drop(columns="day")

# make sure the two measures are floats; they are selected by name because
# `gdp.columns[3:]` is the "quarter" column at this point, not the measures
measure_columns = ["economic: gdp quarterly", "economic: gdp per capita quarterly"]
gdp[measure_columns] = gdp[measure_columns].astype(float)

# convert month to quarter (months 1-3 -> 1, ..., 10-12 -> 4), the same formula the
# lending data uses; a plain `month // 3` is only right for months 3, 6, 9 and 12
gdp["quarter"] = ((gdp["quarter"] - 1) // 3) + 1

# remove records before 2000
gdp = gdp[gdp["year"] >= 2000]

gdp.tail(5)

Unnamed: 0,year,economic: gdp quarterly,economic: gdp per capita quarterly,quarter
255,2023,0.5,-0.1,2
256,2023,0.3,-0.4,3
257,2023,0.2,-0.3,4
258,2024,0.2,-0.4,1
259,2024,0.2,-0.4,2


## Population (Check)

Greater Capital City Statistical Area (GCCSA) code and Greater Capital City Statistical Area (GCCSA) name

### Getting data

In [778]:
# path of the age demographics file (relative, built from RELATIVE_PATH)
vic_population_filepath = RELATIVE_PATH + 'population/age_demos.csv'

# the file has a two-row header; only the second row is kept as the column labels
vic_population_df = pd.read_csv(vic_population_filepath, header=[0, 1])
vic_population_df = vic_population_df.droplevel(0, axis=1)

# 1-based column positions -> names; the last column becomes "population: total"
NEW_NAMES = {1: "year", 3: "state", 10: "SA2 code", vic_population_df.shape[1]: "population: total"}
KEEP_COLUMNS = list(NEW_NAMES)
vic_population_df = rename_dict(vic_population_df, NEW_NAMES, KEEP_COLUMNS)

print(vic_population_df.dtypes)
vic_population_df.head(5)


year                   int64
state                 object
SA2 code             float64
population: total    float64
dtype: object


Unnamed: 0,year,state,SA2 code,population: total
0,2001,New South Wales,101021007.0,2760.0
1,2001,New South Wales,101021008.0,9129.0
2,2001,New South Wales,101021009.0,9717.0
3,2001,New South Wales,101021010.0,3925.0
4,2001,New South Wales,101021012.0,9425.0


In [779]:
# Filter for Victoria entries and sort so each SA2's years are consecutive
vic_population_df = vic_population_df[vic_population_df['state'] == 'Victoria']
vic_population_df = vic_population_df.sort_values(by=["SA2 code", "year"])

# calculate the percentage change from the previous year within each SA2
vic_population_df["population: growth"] = vic_population_df.groupby("SA2 code")["population: total"] \
                                        .pct_change() * 100

# remove the rows without a growth value (e.g. the first year of each SA2);
# the explicit copy keeps the assignment below from writing into a filtered slice
vic_population_df = vic_population_df.dropna(subset=["population: growth"]).copy()

# convert the code to a string (via int, which removes the trailing ".0")
vic_population_df["SA2 code"] = vic_population_df["SA2 code"].astype(int).astype(str)

# drop columns not of interest
vic_population_df = vic_population_df.drop(columns=["state"])

vic_population_df.head(5)

Unnamed: 0,year,SA2 code,population: total,population: growth
3096,2002,201011001,6092.0,5.837387
5550,2003,201011001,6293.0,3.299409
8004,2004,201011001,6480.0,2.971556
10458,2005,201011001,6648.0,2.592593
12912,2006,201011001,6761.0,1.699759


In [780]:
# number of quarters each yearly record is repeated over
QUARTERS = 4

# one copy of the yearly frame per quarter (1..4), stacked into a single frame
all_frames = [vic_population_df.assign(quarter=quarter + 1) for quarter in range(QUARTERS)]
vic_population_df = pd.concat(all_frames, ignore_index=True)

# spread the yearly growth evenly over the four quarters (dodgy, but should work)
vic_population_df["population: growth"] = vic_population_df["population: growth"] / 4

# sort
vic_population_df = vic_population_df.sort_values(by=["SA2 code", "year", "quarter"])

vic_population_df.head(10)

Unnamed: 0,year,SA2 code,population: total,population: growth,quarter
0,2002,201011001,6092.0,1.459347,1
11372,2002,201011001,6092.0,1.459347,2
22744,2002,201011001,6092.0,1.459347,3
34116,2002,201011001,6092.0,1.459347,4
1,2003,201011001,6293.0,0.824852,1
11373,2003,201011001,6293.0,0.824852,2
22745,2003,201011001,6293.0,0.824852,3
34117,2003,201011001,6293.0,0.824852,4
2,2004,201011001,6480.0,0.742889,1
11374,2004,201011001,6480.0,0.742889,2


### Population future

### Nigel other stuff

In [781]:
# Folder for the curated population output; created if it does not exist yet
curated_folder = '../../data/curated/population/'
os.makedirs(curated_folder, exist_ok=True)

# Save the quarterly population dataframe to a CSV file, without the row index
curated_csv_path = os.path.join(curated_folder, 'vic_population_df.csv')
vic_population_df.to_csv(curated_csv_path, index=False)

In [782]:
#sa2_to_rental_suburb_groups_filepath = '../../data/raw/location/sa2_to_rental_suburb_groups.csv'
#sa2_to_rental_suburb_groups_df = pd.read_csv(sa2_to_rental_suburb_groups_filepath)

## ABS data

In [796]:
# load the ABS table, using the first column of the file as the row index
abs_data = pd.read_csv(RELATIVE_PATH + "ABS/merge.csv", index_col=0)

# the first remaining column is the SA2 code: name it and store it as a string
abs_data.columns = ["SA2 code", *abs_data.columns[1:]]
abs_data = abs_data.astype({"SA2 code": str})
abs_data.head(5)

Unnamed: 0_level_0,SA2 code,ages: 15-24,ages: 25-34,ages: 35-44,ages: 45-54,ages: 55-64,ages: 65-74,ages: 75-84,ages: 0-14,relationships: other related individual,...,birth: sri lanka,birth: taiwan,birth: united states of america,birth: chile,birth: cambodia,birth: australia,birth: japan,birth: nepal,birth: philippines,birth: india
Unnamed: 0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,201011001,2224,1994,2361,2043,1477,1210,634,3907,252,...,91,4,27,15,0,13677,15,16,117,681
1,201011002,1325,1237,1241,1547,1515,1326,808,1691,132,...,21,14,67,3,3,9931,21,38,33,180
2,201011005,785,528,822,993,952,804,326,1437,74,...,7,0,19,0,0,6225,9,6,15,40
3,201011006,1352,1882,1212,1074,970,819,414,2175,243,...,18,0,19,4,0,9041,9,53,75,265
4,201011007,539,330,501,670,618,411,141,878,64,...,9,0,12,3,0,3806,0,0,4,7


## Merging Together

In [797]:
# all the frames to merge, in merge order
all_frames = [income_region, economic_distribution, inflation, lending, gdp, vic_population_df, abs_data]

# fold the frames together one at a time, starting from an empty frame
merged_df = pd.DataFrame({})
for frame in all_frames:
    merged_df = get_merged_df(frame, merged_df)

# put the identifier columns first, then every other column in its current order
indentifiers = ["SA2 code", "SA2 name", "year", "quarter"]
other_columns = [column for column in merged_df.columns if column not in indentifiers]
merged_df = merged_df[indentifiers + other_columns]

# sort the values
merged_df = merged_df.sort_values(by=["SA2 name", "year", "quarter"])

merged_df.head(5)

lost 0 out of 10408 records
lost 0 out of 10408 records
lost 0 out of 10408 records
lost 0 out of 10408 records
lost 18 out of 10408 records
lost 0 out of 10390 records


Unnamed: 0,SA2 code,SA2 name,year,quarter,economic: number of earners,economic: median income,economic: median age of earners,economic: top 1% suburb,economic: top 5% suburb,economic: gini coefficient,...,relationships: other related individual,birth: united states of america,ages: 75-84,birth: lebanon,birth: bosnia and herzegovina,birth: ireland,birth: cambodia,relationships: lone parents,birth: australia,birth: hong kong sar of china
3078,206071139,Abbotsford,2016,3,6366,58219,32,7.4,19.7,0.44,...,173,108,213,0,4,53,12,190,5507,45
3079,206071139,Abbotsford,2016,4,6366,58219,32,7.4,19.7,0.44,...,173,108,213,0,4,53,12,190,5507,45
3080,206071139,Abbotsford,2017,1,6366,58219,32,7.4,19.7,0.44,...,173,108,213,0,4,53,12,190,5507,45
3081,206071139,Abbotsford,2017,2,6366,58219,32,7.4,19.7,0.44,...,173,108,213,0,4,53,12,190,5507,45
3082,206071139,Abbotsford,2017,3,6433,61476,33,7.4,19.7,0.44,...,173,108,213,0,4,53,12,190,5507,45


In [798]:
# the final out path
OUT_PATH = "../../data/raw/historic/"

# make the directory if necessary; exist_ok replaces the separate existence check
# (no gap between checking and creating) and matches the curated-folder cell
os.makedirs(OUT_PATH, exist_ok=True)

# save data frame (the row index is written as the first, unnamed column)
merged_df.to_csv(OUT_PATH + "merged.csv")

## Merging to Housing

### Joining SA2 and vic population

In [None]:
import os
import pandas as pd
import numpy as np

# Input locations (relative to this notebook)
vic_population_filepath = '../../data/landing/population/age_demos.csv'
sa2_to_rental_suburb_groups_filepath = '../../data/raw/location/sa2_to_rental_suburb_groups.csv'

# Load both tables; the population file has a two-row header
# NOTE(review): this overwrites the vic_population_df built earlier in the notebook
vic_population_df = pd.read_csv(vic_population_filepath, header=[0, 1])
sa2_to_rental_suburb_groups_df = pd.read_csv(sa2_to_rental_suburb_groups_filepath)

# Keep the second header row only, then name all 30 columns explicitly:
# 11 identifier columns, the 18 age bands and the total
vic_population_df.columns = vic_population_df.columns.droplevel(0)
age_columns = ['0-4', '5-9', '10-14', '15-19', '20-24', '25-29', '30-34', '35-39', '40-44', '45-49', '50-54', '55-59', '60-64', '65-69', '70-74', '75-79', '80-84', '85 and over']
vic_population_df.columns = (
    ['Year', 'S/T code', 'S/T name', 'GCCSA code', 'GCCSA name', 'SA4 code', 'SA4 name', 'SA3 code', 'SA3 name', 'SA2 code', 'SA2 name']
    + age_columns
    + ['Total persons']
)

# Victoria only, without the SA4/SA3 identifiers and the age bands
is_victoria = vic_population_df['S/T name'] == 'Victoria'
dropped_columns = ['SA4 code', 'SA4 name', 'SA3 code', 'SA3 name'] + age_columns
vic_population_df = vic_population_df[is_victoria].drop(columns=dropped_columns)

# Left-join the yearly totals onto the suburb groups ('code' on the left, 'SA2 code' on the right)
# NOTE(review): assumes both key columns hold the same dtype - confirm
# NOTE(review): this overwrites the merged_df saved by the previous section
population_totals = vic_population_df[['SA2 code', 'Year', 'Total persons']]
merged_df = sa2_to_rental_suburb_groups_df.merge(population_totals, left_on='code', right_on='SA2 code', how='left')

# Display the merged DataFrame
print(merged_df.head())

# Save the merged dataframe to a CSV file in the curated folder
curated_folder = '../../data/curated/population/'
os.makedirs(curated_folder, exist_ok=True)
curated_csv_path = os.path.join(curated_folder, 'merged_population_df.csv')
merged_df.to_csv(curated_csv_path, index=False)

### Population forecast by suburbs

In [None]:
# SA2 population / household / dwelling projections workbook (going by the file name)
forecast_population_filepath = '../../data/landing/forecast_population_sa2/VIF2023_SA2_Pop_Hhold_Dwelling_Projections_to_2036_Release_2.xlsx'
# read_excel with default arguments loads only the first sheet of the workbook
forecast_population_df = pd.read_excel(forecast_population_filepath)

# quick look at the first rows
print(forecast_population_df.head())