In [86]:
import pandas as pd
import numpy as np
import os

## Functions

#### Merging

In [87]:
MERGE_COLUMNS = ["SA2 code", "year", "quarter"]

def get_merged_df(new_df, final_df):
    if (final_df.empty):
        return new_df

    # get the attributes used for the merge
    merge_columns = list(set(new_df.columns) & set(MERGE_COLUMNS) & set(final_df.columns))

    # get the columns not already in the data frame
    new_columns = list(set(new_df.columns) - set(final_df.columns) - set(merge_columns))

    # merge columns
    merged_df = pd.merge(final_df, new_df[merge_columns + new_columns], on=merge_columns, how="inner")

    # check if any rows lost
    print(f"lost {final_df.shape[0] - merged_df.shape[0]} out of {final_df.shape[0]} records")
    
    return merged_df

#### Renaming columns

In [88]:
def rename_dict(df, rename_dict, keep_columns):
    # get the renamed dictionary
    new_column_names = list(df.columns.copy())
    for index, new_name in rename_dict.items():
        new_column_names[index - 1] = new_name
    df.columns = new_column_names

    # drop the columns not of interest
    keep_columns = [x-1 for x in keep_columns]
    df = df.iloc[:, keep_columns]

    return df

#### Multindex functions

In [89]:
def impute_previous(columns, na):
    flat_column_list = [[group[i] for group in columns] for i in range(len(columns[0]))]

    column_dataframe = pd.DataFrame(flat_column_list)

    column_dataframe = column_dataframe.applymap(lambda x: None if (not x) or (na in str(x)) else x)

    column_dataframe = column_dataframe.T.ffill().T

    flat_column_list = column_dataframe.to_numpy()

    return flat_column_list

In [90]:
def fill_columns(df, column_dict, na="Unnamed"):
    columns = list(df.columns)

    for index, new_column in column_dict.items():
        columns[index] = new_column

    if (type(df.columns) == pd.MultiIndex):
        flat_column_list = impute_previous(columns, na)

        df.columns = pd.MultiIndex.from_arrays(flat_column_list)
    
    else:
        df.columns = columns

    return df

## Document notes + set up

- Note that the final file produced, each timestamp will correspond to the growth from the previous time stamp
  - For example, if the population is 1000 in the 2022-03 quarter, then 1100 in the 2022-06 quater, then the value for the 2022-06 quarter population growth is 10%
  - Similarly for inflation, measures the change from previous quarter
- Year represents the start year

In [91]:
RELATIVE_PATH = "../../data/landing/"

## Economic by Region (check)

### Geography summary

- Note the income of a given year (say 2020) is acutally from july year before to jun next year (July 2019 to June 2020)
  - Because the data was given as financial years

In [92]:
COLUMNS_DICT_INCOME_GEOGRAPHY = {
    0: ("SA2 code",None),
    1: ("SA2 name",None)
}

In [93]:
# get the csv
income_region = pd.read_csv(RELATIVE_PATH + "economic_by_region/income_by_geography_2016_2021.csv", header=[0, 1])

# prepare the columns
income_region = fill_columns(income_region, COLUMNS_DICT_INCOME_GEOGRAPHY)

# filter victoria columns
vic_mask = income_region[("SA2 code", np.nan)].str.startswith("2")
income_region = income_region[vic_mask]

# create a massive list of stuff, each record has a unique year, location and measure
income_region = income_region.melt(id_vars=income_region.columns.to_list()[:2], 
                                   value_vars = income_region.columns.to_list()[2:],
                                   var_name=["Measure", "Year"],
                                   value_name="Value")

# make it so each record only has a unique year and location
income_region = income_region.pivot(index=list(income_region.columns[:2]) + ["Year"], columns="Measure", values="Value")
income_region = income_region.reset_index()
income_region.columns = [x[0] for x in income_region.columns[:2]] + list(income_region.columns[2:])

In [94]:
# filter columns and rename
NEW_NAMES = {
    3: "year",
    4: "economic: number of earners",
    6: "economic: median income", 
    7: "economic: median age of earners"
}
KEEP_COLUMNS = [1, 2, 3, 4, 6, 7]

income_region = rename_dict(income_region, NEW_NAMES, KEEP_COLUMNS)
print(income_region.shape[0])
income_region.head(5)

2610


Unnamed: 0,SA2 code,SA2 name,year,economic: number of earners,economic: median income,economic: median age of earners
0,201011001,Alfredton,2016-17,7117,50596,42
1,201011001,Alfredton,2017-18,7558,52448,42
2,201011001,Alfredton,2018-19,7987,53932,42
3,201011001,Alfredton,2019-20,8665,55204,41
4,201011001,Alfredton,2020-21,9438,58036,41


In [95]:
import numpy as np

MEASURE_COLUMNS = income_region.columns[3:]

# drop any na values in measure columns
income_region = income_region.replace("np", np.nan)
income_region = income_region.dropna(subset=MEASURE_COLUMNS, how="any")
income_region.shape[0]

# get numeric columns
for measure_column in MEASURE_COLUMNS:
    income_region[measure_column] = pd.to_numeric(income_region[measure_column].str.replace(',', ''))

print(income_region.shape[0])
income_region.head(5)

2602


Unnamed: 0,SA2 code,SA2 name,year,economic: number of earners,economic: median income,economic: median age of earners
0,201011001,Alfredton,2016-17,7117,50596,42
1,201011001,Alfredton,2017-18,7558,52448,42
2,201011001,Alfredton,2018-19,7987,53932,42
3,201011001,Alfredton,2019-20,8665,55204,41
4,201011001,Alfredton,2020-21,9438,58036,41


In [96]:
# get the financial quarters for the year
QUARTERS = 4

all_frames = []
for quarter in range(QUARTERS):
    curr_quarter_df = income_region.copy()

    # create the current quarter series
    curr_quarter_df["quarter"] = quarter + 1

    # aggregate the result
    all_frames.append(curr_quarter_df)

# combine the results
income_region = pd.concat(all_frames, ignore_index=True)

# change the year
income_region["year"] = income_region.apply(
    lambda x: int(x["year"][:4])
    if (x["quarter"] in [3, 4])  # this step is necessary as defining quarter as the part of the year, so this is the end results desired
    else 2000 + int(x["year"][5:]),
    axis=1
)

# sort the values
income_region = income_region.sort_values(by=["SA2 code", "year", "quarter"])

income_region.head(8)

Unnamed: 0,SA2 code,SA2 name,year,economic: number of earners,economic: median income,economic: median age of earners,quarter
5204,201011001,Alfredton,2016,7117,50596,42,3
7806,201011001,Alfredton,2016,7117,50596,42,4
0,201011001,Alfredton,2017,7117,50596,42,1
2602,201011001,Alfredton,2017,7117,50596,42,2
5205,201011001,Alfredton,2017,7558,52448,42,3
7807,201011001,Alfredton,2017,7558,52448,42,4
1,201011001,Alfredton,2018,7558,52448,42,1
2603,201011001,Alfredton,2018,7558,52448,42,2


### Distribution

In [128]:
# get the economic distribtuion
economic_distribution = pd.read_csv(RELATIVE_PATH + "/economic_by_region/income_distribution_by_geography_2021.csv", header=0)
economic_distribution = economic_distribution.drop(0)
economic_distribution.head(5)

# rename and filter
NEW_NAMES = {
    1: "SA2 code", 
    12: "economic: gini coefficient",
    13: "economic: top 1% suburb",
    14: "economic: top 5% suburb"
}
KEEP_COLUMNS = [1, 12, 13, 14]
economic_distribution = rename_dict(economic_distribution, NEW_NAMES, KEEP_COLUMNS)

# convert to numeric
for i in range(1, 4):
    economic_distribution.iloc[:, i] = pd.to_numeric(economic_distribution.iloc[:, i], errors="coerce")

print(economic_distribution.dtypes)
economic_distribution.head()

SA2 code                       object
economic: gini coefficient    float64
economic: top 1% suburb       float64
economic: top 5% suburb       float64
dtype: object


  economic_distribution.iloc[:, i] = pd.to_numeric(economic_distribution.iloc[:, i], errors="coerce")


Unnamed: 0,SA2 code,economic: gini coefficient,economic: top 1% suburb,economic: top 5% suburb
1,Australia,0.483,9.8,22.5
2,New South Wales,0.497,11.2,24.3
3,101021007,0.615,21.2,32.6
4,101021008,0.365,4.4,14.5
5,101021009,0.368,4.6,14.7


In [129]:
# check no Nan values
economic_distribution.isna().any(axis=1).sum()

142

## Economic (Check)

#### Inflation (Check)

- Basically, remember that it is annualized
- Will therefore calculate the average percent change from the previous year
  - Note December the year before, + march + Jun + Sep of the year will be used for the average

In [99]:
# donwload inflation
inflation = pd.read_csv(RELATIVE_PATH + "economic/inflation.csv")

# rename the columns
index = 1 + list(inflation.columns).index("Percentage Change from Previous Period ;  Trimmed Mean ;  Australia ;")
NEW_NAMES = {1: "year", index: "economic: trimmed mean quarterly"}
KEEP_COLUMNS = NEW_NAMES.keys()
inflation = rename_dict(inflation, NEW_NAMES, KEEP_COLUMNS)

print(inflation.dtypes)
inflation.tail(10)

year                                 object
economic: trimmed mean quarterly    float64
dtype: object


Unnamed: 0,year,economic: trimmed mean quarterly
294,2022-03-01,1.5
295,2022-06-01,1.5
296,2022-09-01,1.8
297,2022-12-01,1.7
298,2023-03-01,1.2
299,2023-06-01,0.9
300,2023-09-01,1.2
301,2023-12-01,0.8
302,2024-03-01,1.0
303,2024-06-01,0.8


In [100]:
# split up the month and the year
inflation[["year", "quarter", "day"]] = inflation["year"].str.split("-", expand=True)
inflation = inflation.drop(columns="day")

# get the actual year
inflation["year"] = inflation["year"].astype(int)
inflation["economic: trimmed mean quarterly"] = inflation["economic: trimmed mean quarterly"].astype(float)

# get the quarter of the year
inflation["quarter"] = inflation["quarter"].astype(int) // 3

# remove records before 2000
inflation = inflation[inflation["year"] >= 2000]

inflation.head(5)

Unnamed: 0,year,economic: trimmed mean quarterly,quarter
206,2000,0.7,1
207,2000,0.5,2
208,2000,0.6,3
209,2000,0.6,4
210,2001,1.0,1


#### Interest rates (Check)

Have decided only to take the variable interest rate for standard owners (as this is most reflective of normal people)

In [101]:
# download the lending data
lending = pd.read_csv(RELATIVE_PATH + "economic/lending.csv")

# rename the columns
NEW_NAMES = {1: "year", 4: "economic: variable interest rate"}
KEEP_COLUMNS = [1, 4]
lending = rename_dict(lending, NEW_NAMES, KEEP_COLUMNS)

print(lending.dtypes)
lending.tail(5)

year                                 object
economic: variable interest rate    float64
dtype: object


Unnamed: 0,year,economic: variable interest rate
783,2024-04-30,8.77
784,2024-05-31,8.77
785,2024-06-30,8.77
786,2024-07-31,8.77
787,2024-08-31,8.77


- Basically want to take the average of all the records for each year
- Seems to make most sense to use the actual percentage (above 100%) rather than say 1.02

In [102]:
# split up the time stamp
lending[["year", "quarter", "day"]] = lending["year"].str.split("-", expand=True).astype(int)
lending = lending.drop(columns="day")

# convert month to quarter
lending["quarter"] = ((lending["quarter"] - 1) // 3) + 1

# take the average over each financial quarter
lending = lending.groupby(["year", "quarter"])["economic: variable interest rate"].apply(lambda x: x.mean()).reset_index()

# remove records before 2000
lending = lending[lending["year"] >= 2000]

lending.tail(5)

Unnamed: 0,year,quarter,economic: variable interest rate
258,2023,3,8.52
259,2023,4,8.686667
260,2024,1,8.77
261,2024,2,8.77
262,2024,3,8.77


### GDP (Check)

- GDP is basically add from each year (even tho this misses out on a small amount of gdp growth)
- Same process as inflation, except this shows the quarterly growth instead

In [103]:
# download the lending data
gdp = pd.read_csv(RELATIVE_PATH + "economic/gdp.csv")

# rename the columns
index = 1 + list(gdp.columns).index("A2304370T")
NEW_NAMES = {1: "year", index: "economic: gdp quarterly", index+1: "economic: gdp per capita quarterly"}
KEEP_COLUMNS = [1, index, index+1]
gdp = rename_dict(gdp, NEW_NAMES, KEEP_COLUMNS)

print(gdp.dtypes)
gdp.tail(5)

year                                   object
economic: gdp quarterly               float64
economic: gdp per capita quarterly    float64
dtype: object


Unnamed: 0,year,economic: gdp quarterly,economic: gdp per capita quarterly
255,2023-06-01,0.5,-0.1
256,2023-09-01,0.3,-0.4
257,2023-12-01,0.2,-0.3
258,2024-03-01,0.2,-0.4
259,2024-06-01,0.2,-0.4


In [104]:
# split up the month and the year
gdp[["year", "quarter", "day"]] = gdp["year"].str.split("-", expand=True).astype(int)
gdp = gdp.drop(columns="day")

# get the actual year
gdp[gdp.columns[3:]] = gdp[gdp.columns[3:]].astype(float)

# get the quarter of the year
gdp["quarter"] = gdp["quarter"].astype(int) // 3

# remove records before 2000
gdp = gdp[gdp["year"] >= 2000]

gdp.tail(5)

Unnamed: 0,year,economic: gdp quarterly,economic: gdp per capita quarterly,quarter
255,2023,0.5,-0.1,2
256,2023,0.3,-0.4,3
257,2023,0.2,-0.3,4
258,2024,0.2,-0.4,1
259,2024,0.2,-0.4,2


## Population (Check)

Greater Capital City Statistical Area (GCCSA) code and Greater Capital City Statistical Area (GCCSA) name

### Getting data

In [120]:
# Use the absolute path to the file
vic_population_filepath = RELATIVE_PATH + 'population/age_demos.csv'

# Read the CSV file
vic_population_df = pd.read_csv(vic_population_filepath, header=[0, 1])

# drop the first level
first_level = vic_population_df.columns.get_level_values(0)
vic_population_df.columns = vic_population_df.columns.droplevel(0)

# rename the columns
NEW_NAMES = {1: "year", 3: "state", 10: "SA2 code"}
NEW_NAMES.update({i: "population: " + first_level[i].lower() for i in range(11, len(first_level))})
KEEP_COLUMNS = list(NEW_NAMES.keys())
vic_population_df = rename_dict(vic_population_df, NEW_NAMES, KEEP_COLUMNS)

print(vic_population_df.dtypes)
vic_population_df.head(5)


year                           int64
state                         object
SA2 code                     float64
population: 0-4               object
population: 5-9              float64
population: 10-14            float64
population: 15-19            float64
population: 20-24            float64
population: 25-29            float64
population: 30-34            float64
population: 35-39            float64
population: 40-44            float64
population: 45-49            float64
population: 50-54            float64
population: 55-59            float64
population: 60-64            float64
population: 65-69            float64
population: 70-74            float64
population: 75-79            float64
population: 80-84            float64
population: 85 and over      float64
population: total persons    float64
dtype: object


Unnamed: 0,year,state,SA2 code,population: 0-4,population: 5-9,population: 10-14,population: 15-19,population: 20-24,population: 25-29,population: 30-34,...,population: 45-49,population: 50-54,population: 55-59,population: 60-64,population: 65-69,population: 70-74,population: 75-79,population: 80-84,population: 85 and over,population: total persons
0,2001,New South Wales,101021007.0,Braidwood,154.0,165.0,173.0,157.0,94.0,129.0,...,218.0,221.0,262.0,233.0,197.0,125.0,108.0,79.0,57.0,37.0
1,2001,New South Wales,101021008.0,Karabar,720.0,749.0,741.0,688.0,623.0,657.0,...,755.0,675.0,585.0,491.0,300.0,223.0,201.0,144.0,64.0,32.0
2,2001,New South Wales,101021009.0,Queanbeyan,592.0,509.0,514.0,624.0,799.0,757.0,...,692.0,633.0,620.0,520.0,444.0,384.0,394.0,320.0,214.0,226.0
3,2001,New South Wales,101021010.0,Queanbeyan - East,247.0,242.0,237.0,269.0,338.0,336.0,...,299.0,323.0,297.0,199.0,152.0,97.0,94.0,82.0,54.0,20.0
4,2001,New South Wales,101021012.0,Queanbeyan West - Jerrabomberra,969.0,923.0,777.0,602.0,451.0,640.0,...,808.0,600.0,508.0,294.0,224.0,159.0,105.0,70.0,29.0,24.0


In [121]:
# Filter for Victoria entries
vic_population_df = vic_population_df[vic_population_df['state'] == 'Victoria']

# sort the values
vic_population_df =  vic_population_df.sort_values(by=["SA2 code", "year"])

# calculate the percentage change
vic_population_df["population: growth"] = vic_population_df.groupby("SA2 code")["population: total persons"] \
                                        .pct_change() * 100

# remove missing values
vic_population_df = vic_population_df[~vic_population_df["population: growth"].isna()]

# convert to string for easy
vic_population_df["SA2 code"] = vic_population_df["SA2 code"].astype(int).astype(str)

# drop columns not of interest
vic_population_df = vic_population_df.drop(columns=["state"])

vic_population_df.head(5)

Unnamed: 0,year,SA2 code,population: 0-4,population: 5-9,population: 10-14,population: 15-19,population: 20-24,population: 25-29,population: 30-34,population: 35-39,...,population: 50-54,population: 55-59,population: 60-64,population: 65-69,population: 70-74,population: 75-79,population: 80-84,population: 85 and over,population: total persons,population: growth
3096,2002,201011001,Alfredton,385.0,473.0,623.0,556.0,339.0,279.0,383.0,...,452.0,384.0,297.0,244.0,188.0,179.0,148.0,73.0,42.0,0.0
5550,2003,201011001,Alfredton,404.0,492.0,608.0,584.0,376.0,293.0,371.0,...,475.0,386.0,328.0,256.0,205.0,176.0,150.0,93.0,38.0,-9.52381
8004,2004,201011001,Alfredton,403.0,499.0,613.0,601.0,398.0,290.0,391.0,...,494.0,394.0,362.0,255.0,214.0,187.0,154.0,91.0,46.0,21.052632
10458,2005,201011001,Alfredton,421.0,520.0,624.0,604.0,414.0,282.0,403.0,...,518.0,427.0,374.0,242.0,229.0,179.0,140.0,120.0,52.0,13.043478
12912,2006,201011001,Alfredton,433.0,525.0,617.0,641.0,416.0,281.0,404.0,...,501.0,451.0,393.0,239.0,241.0,187.0,138.0,118.0,48.0,-7.692308


In [122]:
# get the financial quarters for the year
QUARTERS = 4

all_frames = []
for quarter in range(QUARTERS):
    curr_quarter_df = vic_population_df.copy()

    # create the current quarter series
    curr_quarter_df["quarter"] = quarter + 1

    # aggregate the result
    all_frames.append(curr_quarter_df)

# combine the results
vic_population_df = pd.concat(all_frames, ignore_index=True)

# divide the population growth by 4, to account for the fact that it changes (dodegy, but should work)
vic_population_df["population: growth"] = vic_population_df["population: growth"] / 4

# sort
vic_population_df = vic_population_df.sort_values(by=["SA2 code", "year", "quarter"])

vic_population_df.head(10)

Unnamed: 0,year,SA2 code,population: 0-4,population: 5-9,population: 10-14,population: 15-19,population: 20-24,population: 25-29,population: 30-34,population: 35-39,...,population: 55-59,population: 60-64,population: 65-69,population: 70-74,population: 75-79,population: 80-84,population: 85 and over,population: total persons,population: growth,quarter
0,2002,201011001,Alfredton,385.0,473.0,623.0,556.0,339.0,279.0,383.0,...,384.0,297.0,244.0,188.0,179.0,148.0,73.0,42.0,0.0,1
11112,2002,201011001,Alfredton,385.0,473.0,623.0,556.0,339.0,279.0,383.0,...,384.0,297.0,244.0,188.0,179.0,148.0,73.0,42.0,0.0,2
22224,2002,201011001,Alfredton,385.0,473.0,623.0,556.0,339.0,279.0,383.0,...,384.0,297.0,244.0,188.0,179.0,148.0,73.0,42.0,0.0,3
33336,2002,201011001,Alfredton,385.0,473.0,623.0,556.0,339.0,279.0,383.0,...,384.0,297.0,244.0,188.0,179.0,148.0,73.0,42.0,0.0,4
1,2003,201011001,Alfredton,404.0,492.0,608.0,584.0,376.0,293.0,371.0,...,386.0,328.0,256.0,205.0,176.0,150.0,93.0,38.0,-2.380952,1
11113,2003,201011001,Alfredton,404.0,492.0,608.0,584.0,376.0,293.0,371.0,...,386.0,328.0,256.0,205.0,176.0,150.0,93.0,38.0,-2.380952,2
22225,2003,201011001,Alfredton,404.0,492.0,608.0,584.0,376.0,293.0,371.0,...,386.0,328.0,256.0,205.0,176.0,150.0,93.0,38.0,-2.380952,3
33337,2003,201011001,Alfredton,404.0,492.0,608.0,584.0,376.0,293.0,371.0,...,386.0,328.0,256.0,205.0,176.0,150.0,93.0,38.0,-2.380952,4
2,2004,201011001,Alfredton,403.0,499.0,613.0,601.0,398.0,290.0,391.0,...,394.0,362.0,255.0,214.0,187.0,154.0,91.0,46.0,5.263158,1
11114,2004,201011001,Alfredton,403.0,499.0,613.0,601.0,398.0,290.0,391.0,...,394.0,362.0,255.0,214.0,187.0,154.0,91.0,46.0,5.263158,2


### Nigel other stuff

In [123]:
# Save the new dataframe to a CSV file in the curated folder
curated_folder = '../../data/curated/population/'
os.makedirs(curated_folder, exist_ok=True)

# Save the dataframe to a CSV file
curated_csv_path = os.path.join(curated_folder, 'vic_population_df.csv')
vic_population_df.to_csv(curated_csv_path, index=False)

In [110]:
#sa2_to_rental_suburb_groups_filepath = '../../data/raw/location/sa2_to_rental_suburb_groups.csv'
#sa2_to_rental_suburb_groups_df = pd.read_csv(sa2_to_rental_suburb_groups_filepath)

## ABS data

In [124]:
abs_data = pd.read_csv(RELATIVE_PATH + "ABS/merge.csv", index_col=0)
abs_data.columns = ["SA2 code"] + list(abs_data.columns[1:])
abs_data["SA2 code"] = abs_data["SA2 code"].astype(str)
abs_data.head(5)

Unnamed: 0_level_0,SA2 code,relationships: married,relationships: defacto,relationships: lone parents,relationships: child under 15,relationships: dependent student,relationships: non dependent child,relationships: other related individual,relationships: group household,relationships: lone persons,...,birth: turkey,birth: wales,birth: south africa,birth: philippines,birth: croatia,birth: cambodia,birth: greece,birth: japan,birth: nepal,birth: myanmar
Unnamed: 0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,201011001,6056,1328,772,3884,974,854,252,447,1096,...,0,12,74,117,14,0,21,15,16,0
1,201011002,4129,906,471,1669,604,464,132,538,1776,...,11,13,26,33,12,3,7,21,38,0
2,201011005,2935,514,248,1427,395,367,74,134,484,...,3,5,14,15,3,0,3,9,6,0
3,201011006,3302,1238,584,2150,411,599,243,373,826,...,0,6,8,75,9,0,4,9,53,0
4,201011007,1836,319,105,861,261,323,64,41,202,...,0,0,6,4,9,0,0,0,0,0


## Merging Together

In [125]:
for frame in all_frames:
    print(frame.dtypes)

year                           int64
SA2 code                      object
population: 0-4               object
population: 5-9              float64
population: 10-14            float64
population: 15-19            float64
population: 20-24            float64
population: 25-29            float64
population: 30-34            float64
population: 35-39            float64
population: 40-44            float64
population: 45-49            float64
population: 50-54            float64
population: 55-59            float64
population: 60-64            float64
population: 65-69            float64
population: 70-74            float64
population: 75-79            float64
population: 80-84            float64
population: 85 and over      float64
population: total persons    float64
population: growth           float64
quarter                        int64
dtype: object
year                           int64
SA2 code                      object
population: 0-4               object
population: 5-9         

In [130]:
# all the frames to merge
all_frames = [income_region, economic_distribution, inflation, lending, gdp, vic_population_df, abs_data]


merged_df = pd.DataFrame({})
for frame in all_frames:
    merged_df = get_merged_df(frame, merged_df)

# order the columns
indentifiers = ["SA2 code", "SA2 name", "year", "quarter"]
merged_df = merged_df[indentifiers + [x for x in merged_df.columns if not x in indentifiers]]

# sort the values
merged_df = merged_df.sort_values(by=["SA2 name", "year", "quarter"])

merged_df.head(5)

lost 0 out of 10408 records
lost 0 out of 10408 records
lost 0 out of 10408 records
lost 0 out of 10408 records
lost 208 out of 10408 records
lost 0 out of 10200 records


Unnamed: 0,SA2 code,SA2 name,year,quarter,economic: number of earners,economic: median income,economic: median age of earners,economic: top 5% suburb,economic: top 1% suburb,economic: gini coefficient,...,birth: canada,relationships: dependent student,birth: korea republic of south,birth: cambodia,birth: born elsewhere,birth: italy,birth: croatia,relationships: lone parents,birth: germany,studying: tertiary total
2958,206071139,Abbotsford,2016,3,6366,58219,32,19.7,7.4,0.44,...,46,165,44,12,360,50,10,190,46,827
2959,206071139,Abbotsford,2016,4,6366,58219,32,19.7,7.4,0.44,...,46,165,44,12,360,50,10,190,46,827
2960,206071139,Abbotsford,2017,1,6366,58219,32,19.7,7.4,0.44,...,46,165,44,12,360,50,10,190,46,827
2961,206071139,Abbotsford,2017,2,6366,58219,32,19.7,7.4,0.44,...,46,165,44,12,360,50,10,190,46,827
2962,206071139,Abbotsford,2017,3,6433,61476,33,19.7,7.4,0.44,...,46,165,44,12,360,50,10,190,46,827


In [131]:
# the final out path
OUT_PATH = "../../data/raw/historic/"

# make directory necessary
if (not os.path.exists(OUT_PATH)):
    os.makedirs(OUT_PATH)

# save data frame
merged_df.to_csv(OUT_PATH + "merged.csv")

## Mering to Housing

### Joining SA2 and vic population

In [114]:
import os
import pandas as pd
import numpy as np

# Use the absolute path to the file
vic_population_filepath = '../../data/landing/population/age_demos.csv'
sa2_to_rental_suburb_groups_filepath = '../../data/raw/location/sa2_to_rental_suburb_groups.csv'

# Read the CSV files
vic_population_df = pd.read_csv(vic_population_filepath, header=[0, 1])
sa2_to_rental_suburb_groups_df = pd.read_csv(sa2_to_rental_suburb_groups_filepath)

# Preprocess vic_population_df
vic_population_df.columns = vic_population_df.columns.droplevel(0)
vic_population_df.columns = ['Year', 'S/T code', 'S/T name', 'GCCSA code', 'GCCSA name', 'SA4 code', 'SA4 name', 'SA3 code', 'SA3 name', 'SA2 code', 'SA2 name', '0-4', '5-9', '10-14', '15-19', '20-24', '25-29', '30-34', '35-39', '40-44', '45-49', '50-54', '55-59', '60-64', '65-69', '70-74', '75-79', '80-84', '85 and over', 'Total persons']
vic_population_df = vic_population_df[vic_population_df['S/T name'] == 'Victoria']
vic_population_df = vic_population_df.drop(columns=['SA4 code', 'SA4 name', 'SA3 code', 'SA3 name'])
age_columns = ['0-4', '5-9', '10-14', '15-19', '20-24', '25-29', '30-34', '35-39', '40-44', '45-49', '50-54', '55-59', '60-64', '65-69', '70-74', '75-79', '80-84', '85 and over']
vic_population_df = vic_population_df.drop(columns=age_columns)

# Merge the dataframes on 'SA2 code'
merged_df = pd.merge(sa2_to_rental_suburb_groups_df, vic_population_df[['SA2 code', 'Year', 'Total persons']], left_on='code', right_on='SA2 code', how='left')

# Display the merged DataFrame
print(merged_df.head())

# Save the merged dataframe to a CSV file in the curated folder
curated_folder = '../../data/curated/population/'
os.makedirs(curated_folder, exist_ok=True)
curated_csv_path = os.path.join(curated_folder, 'merged_population_df.csv')
merged_df.to_csv(curated_csv_path, index=False)

ValueError: You are trying to merge on object and float64 columns. If you wish to proceed you should use pd.concat

### Population forecast by suburbs

In [None]:
forecast_population_filepath = '../../data/landing/forecast_population_sa2/VIF2023_SA2_Pop_Hhold_Dwelling_Projections_to_2036_Release_2.xlsx'
forecast_population_df = pd.read_excel(forecast_population_filepath)

print(forecast_population_df.head())