In [14]:
import numpy as np
import pandas as pd
import os
import ast

In [15]:
RELATIVE_PATH_IN_RAW = "../../../data/2. raw"
RELATIVE_PATH_IN_CURATED = "../../../data/3. curated"
RELATIVE_PATH_OUT = "../../../data/4. modelling"

In [None]:
os.makedirs(RELATIVE_PATH_OUT, exist_ok=True)

## Execution

In [16]:
SELECT_FEATURES = ["housing: previous price", "housing: previous count growth", "housing: previous growth", "housing: previous count log"] + \
                  ["population: total persons log", "economic: trimmed mean quarterly"] + \
                  ["interaction: (studying: tertiary total (%) log) & (population: total persons log)", "interaction: (economic: median income) & (economic: trimmed mean quarterly)"]
SELECT_FEATURES

['housing: previous price',
 'housing: previous count growth',
 'housing: previous growth',
 'housing: previous count log',
 'population: total persons log',
 'economic: trimmed mean quarterly',
 'interaction: (studying: tertiary total (%) log) & (population: total persons log)',
 'interaction: (economic: median income) & (economic: trimmed mean quarterly)']

### Get Population Data

In [17]:
def is_vic(value):
    # filter out na
    if (pd.isna(value)):
        return False
    
    # if have a numeric type
    return np.floor(value / 1e8) == 2

# get the df
future_population_sa2 = pd.read_csv(f"{RELATIVE_PATH_IN_RAW}/1. renamed/population/projections.csv", header=0, index_col=0)

# filter victoria columns
vic_mask = future_population_sa2["SA2 code"].apply(is_vic)
future_population_sa2 = future_population_sa2[vic_mask]

# get the right year and filter
future_population_sa2 = future_population_sa2[["SA2 code", "population: 2026"]]
future_population_sa2.rename(columns={"population: 2026": "population: total"}, inplace=True)

future_population_sa2.head(5)

Unnamed: 0,SA2 code,population: total
0,201011001.0,20756.256163
1,201011002.0,11698.293593
2,201011005.0,7372.079773
3,201011006.0,15915.186041
4,201011007.0,4312.09853


In [18]:
# merging stuff
regions_df = pd.read_csv(f'{RELATIVE_PATH_IN_RAW}/location/sa2_to_rental_suburb_groups.csv')

regions_df_list = regions_df.copy()
regions_df_list['code'] = regions_df_list['code'].apply(ast.literal_eval)

exploded_regions = regions_df_list.explode('code')
exploded_regions['code'] = exploded_regions['code'].apply(pd.to_numeric, errors='coerce')

def df_to_regions(df, sa2_col_name, aggregation_functions):
    df[sa2_col_name] = df[sa2_col_name].apply(pd.to_numeric, errors='coerce')
    regions_with_stats = pd.merge(exploded_regions, df, left_on='code', right_on=sa2_col_name, how='left')

    for col in aggregation_functions:
        regions_with_stats[col] = regions_with_stats[col].apply(pd.to_numeric, errors='coerce')

    grouped_by = regions_with_stats.groupby('suburbs').agg(aggregation_functions)

    return grouped_by.reset_index()

In [19]:
# do the merge
agg_functions = {"population: total": "sum"}

# EDIT ME: Calls the actual join function
future_population_suburbs = df_to_regions(df = future_population_sa2, sa2_col_name = 'SA2 code', aggregation_functions=agg_functions)
future_population_suburbs.head(5)

Unnamed: 0,suburbs,population: total
0,Albert Park-Middle Park-West St Kilda,32306.2679
1,Altona,30915.396427
2,Armadale,10096.580596
3,Aspendale-Chelsea-Carrum,43084.95812
4,Bairnsdale,16557.084499


### Get Previous Data Frame

The below has
- Previous count log
- Previous growth
- Previous count growth
- Previous Price

In [20]:
# get the curated and filter out noise
curated = pd.read_csv(f"{RELATIVE_PATH_IN_CURATED}/external prices.csv")
curated = curated[curated["housing: count"] >= 200]

# get the log transforms
for feature in ["housing: count", "studying: tertiary total (%)"]:
    curated.loc[:, feature + " log"] = curated[feature].apply(np.log)

In [21]:
# filter for the most recent year
predictions_df = curated[curated["year groups"] == curated["year groups"].max()]

# select columns and rename
predictions_df = predictions_df[["suburbs", "housing: type",
                                 "housing: count growth", "housing: median growth", "housing: median", "housing: count log", 
                                 "economic: median income", "studying: tertiary total (%) log"]]
predictions_df.rename(columns={"housing: median": "housing: previous price",
                               "housing: count growth": "housing: previous count growth",
                               "housing: median growth": "housing: previous growth",
                               "housing: count log": "housing: previous count log"}, inplace=True)

predictions_df.head(5)

Unnamed: 0,suburbs,housing: type,housing: previous count growth,housing: previous growth,housing: previous price,housing: previous count log,economic: median income,studying: tertiary total (%) log
21,Albert Park-Middle Park-West St Kilda,flat,-0.1391,0.158489,542.0,7.443078,63415.3,-2.753571
43,Albert Park-Middle Park-West St Kilda,house,-0.0901,0.124129,1077.5,6.656727,63415.3,-2.753571
65,Altona,flat,-0.1434,0.101183,390.0,8.251664,58053.2,-3.254503
87,Altona,house,-0.1079,0.120374,440.0,9.455402,58053.2,-3.254503
109,Armadale,flat,-0.2242,0.160793,523.25,7.682482,68858.0,-2.574394


Need to get
- Interaction teriary and population log
- Interaction economic median income and trimmend mean
- population count log
Done
- trimmed mean quarterly

In [22]:
# https://www.rba.gov.au/publications/smp/2023/feb/forecasts.html
new_inflation = 2.9/4

### Join the stuff together

In [23]:
def get_interaction(df, interaction_pairs):
    # get all the interaction pairs
    for column_1, column_2 in interaction_pairs:
        new_name = f"interaction: ({column_1}) & ({column_2})"
        df[new_name] = df[column_1] * df[column_2]
    
    return df

In [24]:
# merge together
out_df = pd.merge(future_population_suburbs, predictions_df, on="suburbs")
out_df["economic: trimmed mean quarterly"] = new_inflation
out_df["population: total log"] = out_df["population: total"].apply(np.log)

# get interaction
interaction_pairs = [("studying: tertiary total (%) log", "population: total log"), 
                     ("economic: median income", "economic: trimmed mean quarterly")]
out_df = get_interaction(out_df, interaction_pairs)

out_df

Unnamed: 0,suburbs,population: total,housing: type,housing: previous count growth,housing: previous growth,housing: previous price,housing: previous count log,economic: median income,studying: tertiary total (%) log,economic: trimmed mean quarterly,population: total log,interaction: (studying: tertiary total (%) log) & (population: total log),interaction: (economic: median income) & (economic: trimmed mean quarterly)
0,Albert Park-Middle Park-West St Kilda,32306.267900,flat,-0.1391,0.158489,542.00,7.443078,63415.3,-2.753571,0.725,10.383017,-28.590370,45976.0925
1,Albert Park-Middle Park-West St Kilda,32306.267900,house,-0.0901,0.124129,1077.50,6.656727,63415.3,-2.753571,0.725,10.383017,-28.590370,45976.0925
2,Altona,30915.396427,flat,-0.1434,0.101183,390.00,8.251664,58053.2,-3.254503,0.725,10.339010,-33.648338,42088.5700
3,Altona,30915.396427,house,-0.1079,0.120374,440.00,9.455402,58053.2,-3.254503,0.725,10.339010,-33.648338,42088.5700
4,Armadale,10096.580596,flat,-0.2242,0.160793,523.25,7.682482,68858.0,-2.574394,0.725,9.219952,-23.735788,49922.0500
...,...,...,...,...,...,...,...,...,...,...,...,...,...
274,Williamstown,16900.023858,house,-0.0925,0.074245,689.50,6.783325,65977.6,-2.859455,0.725,9.735070,-27.836992,47833.7600
275,Wodonga,30341.376641,flat,0.0929,0.084905,322.50,7.192934,49470.0,-3.816713,0.725,10.320268,-39.389498,35865.7500
276,Wodonga,30341.376641,house,-0.0779,0.083492,427.50,8.039802,49470.0,-3.816713,0.725,10.320268,-39.389498,35865.7500
277,Yarraville-Seddon,27318.890274,flat,-0.1562,0.053708,425.75,7.621685,68030.2,-2.852498,0.725,10.215334,-29.139220,49321.8950


### Save

In [25]:
out_df.to_csv(f"{RELATIVE_PATH_OUT}/forecast_test.csv")