## Converts SA2 granular dataframe to suburbs

In [2]:
import pandas as pd
import ast

In [4]:
# Don't edit this cell (except for the CSV location passed to read_csv below).

# Lookup table linking each rental suburb group to its SA2 codes.
regions_df = pd.read_csv('../../data/2. raw/location/sa2_to_rental_suburb_groups.csv')

# The 'code' column is stored as text holding a Python literal (presumably a list
# of SA2 codes); parse it into real Python objects. The raw frame is left untouched.
regions_df_list = regions_df.assign(code=regions_df['code'].map(ast.literal_eval))

# One row per (suburb group, SA2 code); anything that is not numeric becomes NaN.
exploded_regions = (
    regions_df_list
    .explode('code')
    .assign(code=lambda frame: frame['code'].apply(pd.to_numeric, errors='coerce'))
)

def df_to_regions(df, sa2_col_name, aggregation_functions, year_col_name = None, quarter_col_name = None, regions = None):
    """Aggregate an SA2-level dataframe up to rental suburb groups.

    Parameters
    ----------
    df : pandas.DataFrame
        Data keyed by SA2 code. It is not modified; a copy is used internally.
    sa2_col_name : str
        Name of the column in ``df`` holding the SA2 code.
    aggregation_functions : dict
        Mapping of ``{column name: aggregation}`` (e.g. ``{'pop': 'sum'}``),
        passed to ``groupby(...).agg``. Each listed column is coerced to
        numeric first; values that cannot be parsed become NaN.
    year_col_name : str, optional
        If given, results are additionally grouped by this column.
    quarter_col_name : str, optional
        If given, results are additionally grouped by this column.
    regions : pandas.DataFrame, optional
        Lookup with at least a ``'suburbs'`` and a ``'code'`` column, one row
        per SA2 code. Defaults to the module-level ``exploded_regions``.

    Returns
    -------
    pandas.DataFrame
        One row per group, with the grouping keys as ordinary columns
        followed by the aggregated columns.
    """
    if regions is None:
        regions = exploded_regions

    # Work on a copy so the caller's dataframe keeps its original SA2 column.
    stats = df.copy()
    stats[sa2_col_name] = pd.to_numeric(stats[sa2_col_name], errors='coerce')

    # Left join: every row of the lookup is kept, matched or not.
    regions_with_stats = pd.merge(regions, stats, left_on='code', right_on=sa2_col_name, how='left')

    for col in aggregation_functions:
        regions_with_stats[col] = pd.to_numeric(regions_with_stats[col], errors='coerce')

    # Only include the time keys that were actually supplied, so passing a
    # quarter column without a year column no longer puts None in the key list.
    # Note: pandas' groupby drops rows whose key is NaN by default, so suburbs
    # without matching data disappear when a year/quarter key is used.
    group_keys = ['suburbs']
    if year_col_name:
        group_keys.append(year_col_name)
    if quarter_col_name:
        group_keys.append(quarter_col_name)

    grouped_by = regions_with_stats.groupby(group_keys).agg(aggregation_functions)

    return grouped_by.reset_index()

# Preview the exploded lookup: the 'code' list column has been expanded to one row per entry.
exploded_regions

Unnamed: 0.1,Unnamed: 0,geometry,suburbs,regions,code
0,0,POLYGON ((144.97018001032677 -37.8606998481848...,Albert Park-Middle Park-West St Kilda,"['Albert Park', 'St Kilda - West']",206051128
0,0,POLYGON ((144.97018001032677 -37.8606998481848...,Albert Park-Middle Park-West St Kilda,"['Albert Park', 'St Kilda - West']",206051514
1,1,POLYGON ((144.79015492149858 -37.8621593959499...,Altona,"['Altona', 'Altona North']",213021341
1,1,POLYGON ((144.79015492149858 -37.8621593959499...,Altona,"['Altona', 'Altona North']",213021343
2,2,POLYGON ((145.01167433388778 -37.8535692509816...,Armadale,['Armadale'],206061135
...,...,...,...,...,...
141,141,POLYGON ((144.8869958358719 -37.85078700244943...,Williamstown,['Williamstown'],213021346
142,142,POLYGON ((146.77393701213975 -36.1280104970832...,Wodonga,"['Wodonga', 'West Wodonga']",204031073
142,142,POLYGON ((146.77393701213975 -36.1280104970832...,Wodonga,"['Wodonga', 'West Wodonga']",204031492
143,143,POLYGON ((144.85984002458792 -37.8138202803743...,Yarraville-Seddon,"['Seddon - Kingsville', 'Yarraville']",213031352


Need to edit:
1. The load path of the dataframe to aggregate
2. The output path to save the output
3. Create the aggregation function dictionary
4. Call the aggregation function and save the output

In [5]:
# EDIT THESE FIELDS
# LOAD_PATH: CSV containing the dataframe to aggregate.
LOAD_PATH = '../../data/2. raw/historic/prediction.csv'
# OUTPUT_PATH: CSV the aggregated result is written to by the cells below.
# NOTE(review): "predicitons" looks like a misspelling of "predictions"; left as-is
# in case other code reads this exact filename — confirm before renaming.
OUTPUT_PATH = '../../data/2. raw/historic/predicitons_suburbs.csv'

# index_col=0: the first CSV column is used as the row index instead of a data column.
df = pd.read_csv(LOAD_PATH, index_col=0)

In [None]:
# Helper section: builds the aggregation mapping passed to df_to_regions,
# shaped like { 'col_name': 'func', 'col2': 'func' }
# ---------------------------------------------------------

# Columns whose values are summed across the SA2s of a suburb group.
# The order here determines the column order of the output.
sum_list = (
    [col for keyword in ("population", "birth", "studying") for col in df.columns if keyword in col]
    + ['overseas: 5 years']
    + [col for col in df.columns if "relationships" in col]
    + ["economic: number of earners"]
)

# Columns whose values are averaged: every "economic: " column except the "number" ones.
avg_list = [col for col in df.columns if "economic: " in col and "number" not in col]

# Averages first, then sums; a column present in both lists ends up as 'sum'.
agg_functions = {
    **dict.fromkeys(avg_list, 'mean'),
    **dict.fromkeys(sum_list, 'sum'),
}

# ----------------------------------------------------------

# EDIT ME: runs the join + aggregation, split by year and quarter, and saves the result
out = df_to_regions(
    df=df,
    sa2_col_name='SA2 code',
    aggregation_functions=agg_functions,
    year_col_name='year',
    quarter_col_name="quarter",
)
out.to_csv(OUTPUT_PATH)
out

In [8]:
# This section is just to make the aggregation a bit easier
# is in the form { 'col_name': 'func', 'col2': 'func' }
# ---------------------------------------------------------

# list of columns to take the avg of the aggregation for.
# Computed in this cell (rather than reused from another cell) so the cell
# also runs on a fresh kernel without relying on leftover kernel state.
avg_list = [x for x in df.columns if ("economic: " in x) and ("number" not in x)]

# list of columns to take the sum of the aggregation for
sum_list = ["population: 2026 total"]

agg_functions = {col_name: 'mean' for col_name in avg_list}
agg_functions.update(
    {col_name: 'sum' for col_name in sum_list}
)

# ----------------------------------------------------------

# EDIT ME: Calls the actual join function (no year/quarter split: one row per suburb group)
out = df_to_regions(df = df, sa2_col_name = 'SA2 code', aggregation_functions=agg_functions)
# NOTE: writes to OUTPUT_PATH, replacing any file already saved there.
out.to_csv(OUTPUT_PATH)
out

Unnamed: 0,suburbs,population: 2026 total
0,Albert Park-Middle Park-West St Kilda,32306.267900
1,Altona,30915.396427
2,Armadale,10096.580596
3,Aspendale-Chelsea-Carrum,43084.958120
4,Bairnsdale,16557.084499
...,...,...
139,West Footscray,12478.346448
140,Whittlesea,22798.760413
141,Williamstown,16900.023858
142,Wodonga,30341.376641
