In [71]:
import numpy as np
import pandas as pd
import ast
import os

## Functions

### Renaming

In [79]:
import warnings

def rename_columns(df, index_dict={}, column_dict={}, keep_columns=False, multilevel=False, warn_me=True):
    # get the renamed dictionary using indices
    new_column_names = list(df.columns.copy())

    # get the column and index indices
    index_indices = list(index_dict.keys())
    column_indices = [new_column_names.index(col) for col in column_dict.keys() if col in new_column_names]

    # check no overlap
    if (index_indices and column_indices):
        overlap = set(column_indices) & set(index_indices)
        assert (not column_indices or not index_indices or not overlap), f"index_dict and column_dict both use {[new_column_names[x] for x in overlap]}"

    # check within suitable range
    if (index_indices):
        assert (min(index_indices) >= 1), "index_dict indices must be above 0"
        assert (max(index_indices) <= len(new_column_names)), f"index_dict indices must be below {len(new_column_names)}"

    # create a warning for columns not included
    columns_not_used = [col for col in column_dict.keys() if not col in new_column_names]
    if (warn_me and columns_not_used):
        warnings.warn(f"the following columns were not included: {columns_not_used}", UserWarning)

    # use indices to change the dictionary
    if (index_dict | column_dict):
        for col_num in range(1, len(new_column_names)+1):
            if (index_dict.get(col_num) != None):
                new_column_names[col_num-1] = index_dict[col_num]
            elif (column_dict.get(new_column_names[col_num-1]) != None):
                new_column_names[col_num-1] = column_dict[new_column_names[col_num-1]]
            elif (multilevel == True):
                new_column_names[col_num-1] = (new_column_names[col_num-1], None)

    # convert the rest of the indices necessary to multi index
    if (multilevel == True):
        new_column_names = pd.MultiIndex.from_tuples(new_column_names)

    # add the new columns
    df.columns = new_column_names

    # drop the columns not of interest
    if (not keep_columns):
        index_indices = [x-1 for x in index_indices]
        keep_columns = sorted(index_indices + column_indices)
    elif (keep_columns == "all"):
        keep_columns = list(range(len(new_column_names)))
    else:
        keep_columns = [x-1 for x in keep_columns]
    
    # filter the columns
    df = df.iloc[:, keep_columns]

    return df

### Preparing

In [36]:
"""will basically create a record for each entry of one of the columns, and a new column for another entry
Note: assumes a multi index
Note: index_stop is the number of columns in the original dataframe that don't need to be melted"""
def unpack_levels(df, index_stop, level_1_name, level_2_name, index_level=0):
    # make a massive dataframe, everything is a record
    df = df.melt(id_vars=df.columns.to_list()[:index_stop], 
                 value_vars = df.columns.to_list()[index_stop:],
                 var_name=[level_1_name, level_2_name],
                 value_name="value")

    if (index_level != 0):
        # basically extract the column data from the second level
        df = df.pivot(index=list(df.columns[:index_stop]) + [level_1_name], 
                      columns=level_2_name, values="value")
        
    else:
        # switch around the melted columns so the end on gets melted
        df = df.pivot(index=list(df.columns[:index_stop]) + [level_2_name], 
                      columns=level_1_name, values="value")

    # reset the indices and rename
    df = df.reset_index()
    #print([column[0] for column in df.columns[:index_stop]])
    df.columns = [column[0] for column in df.columns[:index_stop]] + list(df.columns[index_stop:])

    return df


In [37]:
QUARTERS = 4

def get_quarters(df, year_col="year", quarter_col="quarter", financial_year=False):
    all_frames = []
    for quarter in range(QUARTERS):
        curr_quarter_df = df.copy()

        # create the current quarter series
        curr_quarter_df[quarter_col] = quarter + 1

        # aggregate the result
        all_frames.append(curr_quarter_df)

    # combine the results
    df = pd.concat(all_frames, ignore_index=True)

    # change the year column if necessary
    if (financial_year):
        df[year_col] = df.apply(
            lambda x: int(x[year_col][:4])
            if (x[quarter_col] in [3, 4])  # this step is necessary as defining quarter as the part of the year, so this is the end results desired
            else 2000 + int(x[year_col][5:]),
            axis=1
        )
    
    # sort the values
    df = df.sort_values(by=["SA2 code", year_col, quarter_col])

    return df

In [38]:
def is_vic(value):
    # filter out na
    if (pd.isna(value)):
        return False
    
    # if have a numeric type
    return np.floor(value / 1e8) == 2

### Merging

In [57]:
MERGE_COLUMNS = ["SA2 code", "year", "quarter"]

def get_next_merged_df(new_df, final_df):
    if (final_df.empty):
        return new_df

    # get the attributes used for the merge
    merge_columns = list(set(new_df.columns) & set(MERGE_COLUMNS) & set(final_df.columns))

    # get the columns not already in the data frame
    new_columns = list(set(new_df.columns) - set(final_df.columns) - set(merge_columns))

    # merge columns
    merged_df = pd.merge(final_df, new_df[merge_columns + new_columns], on=merge_columns, how="outer")

    # check if any rows lost
    print(f"lost {final_df.shape[0] - merged_df.shape[0]} out of {final_df.shape[0]} records")
    
    return merged_df

In [58]:
def get_merged_df(df_list):
    final_df = df_list[0]

    for df in df_list:
        final_df = get_next_merged_df(df, final_df)
    
    return final_df

## Prepare for Merge

In [39]:
RELATIVE_PATH_IN = "../../data/2. raw/1. renamed"
RELATIVE_PATH_OUT = "../../data/2. raw/2. merged"

### Housing

In [96]:
# get a list of dataframes to merge
housing_df_array = []
for file_name in os.listdir(f"{RELATIVE_PATH_IN}/housing"):
    if (file_name != "all_properties.csv"):
        # get the information about the file
        curr_type = file_name.split("_")[0]
        curr_beds = file_name.split("_")[1]
    else:
        curr_type = "all"
        curr_beds = "all"

    # download the frame and add information
    curr_df = pd.read_csv(f"{RELATIVE_PATH_IN}/housing/{file_name}", header=[0, 1])
    curr_df["type"] = curr_type
    curr_df["beds"] = curr_beds

    # drop the first column
    curr_df = curr_df.drop(curr_df.columns[0], axis=1)

    # add the dataframe to the list
    housing_df_array.append(curr_df)

# merge the dataframes
housing_df_raw = pd.concat(housing_df_array)

In [97]:
# reorder columns
housing_df_raw = housing_df_raw.iloc[:, [0, -2, -1] + list(range(1, len(housing_df_raw.columns) - 2))]

# remove records with group total
housing_df_raw = housing_df_raw[~housing_df_raw[('suburbs', "nan")].str.contains('Group Total')]

# Spelling correction
housing_df_raw[('suburbs', "nan")] = housing_df_raw[('suburbs', "nan")].replace('Wanagaratta', 'Wangaratta')
housing_df_raw[('suburbs', "nan")] = housing_df_raw[('suburbs', "nan")].replace('Newcombe', 'Newcomb')

In [98]:
# turn everything into records
housing_df = unpack_levels(housing_df_raw, 3, "time stamp", "measure", index_level=1)

# fix the time stamp
housing_df[["year", "month"]] = housing_df["time stamp"].str.split("-", expand=True).astype(int)
housing_df.drop(columns="time stamp", inplace=True)

# get the actual quarter
housing_df["quarter"] = housing_df["month"] // 3
housing_df.drop(columns="month", inplace=True)

housing_df.head(5)

Unnamed: 0,suburbs,type,beds,count,median,year,quarter
0,Albert Park-Middle Park-West St Kilda,all,all,1143.0,260.0,2000,1
1,Albert Park-Middle Park-West St Kilda,all,all,1134.0,260.0,2000,2
2,Albert Park-Middle Park-West St Kilda,all,all,1177.0,270.0,2000,3
3,Albert Park-Middle Park-West St Kilda,all,all,1178.0,275.0,2000,4
4,Albert Park-Middle Park-West St Kilda,all,all,1208.0,275.0,2001,1


### Economic by Geography

In [81]:
# get the df
income_region = pd.read_csv(f"{RELATIVE_PATH_IN}/economic_by_region/income_by_geography.csv", header=[0, 1], index_col=0)

# filter victoria columns
vic_mask = income_region[("SA2 code", "nan")].apply(is_vic)
income_region = income_region[vic_mask]

# make a mega dataframe
income_region = unpack_levels(income_region, 2, "measure", "year", index_level=0)

# get quarters and change the financial year
income_region = get_quarters(income_region, financial_year=True)

# do some renaming
income_region_rename = {
    4: "economic: number of earners",
    6: "economic: median income",
    7: "economic: median age of earners"
}
income_region_keep_columns = [1, 3, 4, 6, 7]
income_region = rename_columns(income_region, index_dict=income_region_rename, keep_columns=income_region_keep_columns)

income_region.head(4)

Unnamed: 0,SA2 code,year,economic: number of earners,economic: median income,economic: median age of earners
5220,201011001.0,2016,7117.0,50596.0,42.0
7830,201011001.0,2016,7117.0,50596.0,42.0
0,201011001.0,2017,7117.0,50596.0,42.0
2610,201011001.0,2017,7117.0,50596.0,42.0


In [82]:
# get the economic distribtuion
economic_distribution = pd.read_csv(f"{RELATIVE_PATH_IN}/economic_by_region/income_distribution_by_geography.csv",
                                    header=0, index_col=0)

# filter victoria columns
vic_mask = economic_distribution["SA2 code"].apply(is_vic)
economic_distribution = economic_distribution[vic_mask]

### Economic

In [63]:
def month_to_quarter(df, month_col="month", quarter_col="quarter"):
    # get the quarter
    df["quarter"] = ((df[month_col] - 1) // 3)
    df = df.drop(columns="month")
    return df

In [64]:
# read in data
interest = pd.read_csv(f"{RELATIVE_PATH_IN}/economic/interest.csv", index_col=0)

# get the quarter
interest = month_to_quarter(interest)
interest = interest.groupby(["year", "quarter"]).mean().reset_index()

interest = interest[interest["year"] >= 2000]

In [65]:
# read in data
gdp = pd.read_csv(f"{RELATIVE_PATH_IN}/economic/gdp.csv", index_col=0)

# get the quarter
gdp = month_to_quarter(gdp)

gdp = gdp[gdp["year"] >= 2000]

In [66]:
# read in data
inflation = pd.read_csv(f"{RELATIVE_PATH_IN}/economic/inflation.csv", index_col=0)

# get the quarter
inflation = month_to_quarter(inflation)

inflation = inflation[inflation["year"] >= 2000]

### Population

In [45]:
# get the df
vic_population_df = pd.read_csv(f"{RELATIVE_PATH_IN}/population/age demographics.csv", header=0, index_col=0)

# filter victoria columns
vic_mask = vic_population_df["SA2 code"].apply(is_vic)
vic_population_df = vic_population_df[vic_mask]

# get the quarters
vic_population_df = get_quarters(vic_population_df)

vic_population_df.head(5)

Unnamed: 0,year,state,SA2 code,population: 0-4,population: 5-9,population: 10-14,population: 15-19,population: 20-24,population: 25-29,population: 30-34,...,population: 50-54,population: 55-59,population: 60-64,population: 65-69,population: 70-74,population: 75-79,population: 80-84,population: 85+,population: total,quarter
0,2001,Victoria,201011001.0,353.0,467.0,584.0,556.0,310.0,266.0,364.0,...,368.0,247.0,233.0,184.0,168.0,151.0,67.0,42.0,5756.0,1
12006,2001,Victoria,201011001.0,353.0,467.0,584.0,556.0,310.0,266.0,364.0,...,368.0,247.0,233.0,184.0,168.0,151.0,67.0,42.0,5756.0,2
24012,2001,Victoria,201011001.0,353.0,467.0,584.0,556.0,310.0,266.0,364.0,...,368.0,247.0,233.0,184.0,168.0,151.0,67.0,42.0,5756.0,3
36018,2001,Victoria,201011001.0,353.0,467.0,584.0,556.0,310.0,266.0,364.0,...,368.0,247.0,233.0,184.0,168.0,151.0,67.0,42.0,5756.0,4
522,2002,Victoria,201011001.0,385.0,473.0,623.0,556.0,339.0,279.0,383.0,...,384.0,297.0,244.0,188.0,179.0,148.0,73.0,42.0,6092.0,1


In [78]:
# get the df
future_population = pd.read_csv(f"{RELATIVE_PATH_IN}/population/projections.csv", header=0, index_col=0)

# filter victoria columns
vic_mask = future_population["SA2 code"].apply(is_vic)
future_population = future_population[vic_mask]

# get the right year and filter
future_population = future_population[["SA2 code", "population: 2026"]]
future_population.rename(columns={"population: 2026": "population: total"}, inplace=True)
future_population["year"] = 2026

# get the quarters
future_population = get_quarters(future_population)

future_population.head(5)

Unnamed: 0,SA2 code,population: total,year,quarter
0,201011001.0,20756.256163,2026,1
522,201011001.0,20756.256163,2026,2
1044,201011001.0,20756.256163,2026,3
1566,201011001.0,20756.256163,2026,4
1,201011002.0,11698.293593,2026,1


### ABS

In [115]:
# get all the csvs
abs_files_path = f"{RELATIVE_PATH_IN}/ABS"
abs_files_names = [f"{abs_files_path}/{file}" for file in os.listdir(abs_files_path)]

# get the list of dataframes
abs_frames = []
for abs_file in abs_files_names:
    abs_frames.append(pd.read_csv(abs_file))

merged_df_abs = get_merged_df(abs_frames)

lost 0 out of 524 records
lost 0 out of 524 records
lost 0 out of 524 records
lost 0 out of 524 records


## Merge Together

In [108]:
MERGED_EXTERNAL_PATH_OUT = f"{RELATIVE_PATH_OUT}/merged external.csv"
MERGED_EXTERNAL_HOUSING_PATH_OUT = f"{RELATIVE_PATH_OUT}/forecast data.csv"

### Merging all data

In [101]:
MERGE_COLUMNS = ["SA2 code", "year", "quarter"]

In [116]:
all_frames = [vic_population_df, future_population,
              income_region, economic_distribution, 
              interest, gdp, inflation,
              merged_df_abs]

merged_df_sa2 = get_merged_df(all_frames)

merged_df_sa2.head(5)

lost 0 out of 48024 records
lost -2088 out of 48024 records
lost -29232 out of 50112 records
lost 0 out of 79344 records
lost -30 out of 79344 records
lost 0 out of 79374 records
lost 0 out of 79374 records
lost -2 out of 79374 records


Unnamed: 0,year,state,SA2 code,population: 0-4,population: 5-9,population: 10-14,population: 15-19,population: 20-24,population: 25-29,population: 30-34,...,relationships: non dependent child,birth: south africa,studying: tertiary FT 14-25,birth: iraq,birth: croatia,relationships: group household,birth: mauritius,birth: papua new guinea,birth: pakistan,birth: total
0,2001.0,Victoria,201011001.0,353.0,467.0,584.0,556.0,310.0,266.0,364.0,...,854.0,74.0,393.0,28.0,14.0,447.0,4.0,0.0,48.0,16835.0
1,2001.0,Victoria,201011001.0,353.0,467.0,584.0,556.0,310.0,266.0,364.0,...,854.0,74.0,393.0,28.0,14.0,447.0,4.0,0.0,48.0,16835.0
2,2001.0,Victoria,201011001.0,353.0,467.0,584.0,556.0,310.0,266.0,364.0,...,854.0,74.0,393.0,28.0,14.0,447.0,4.0,0.0,48.0,16835.0
3,2001.0,Victoria,201011001.0,353.0,467.0,584.0,556.0,310.0,266.0,364.0,...,854.0,74.0,393.0,28.0,14.0,447.0,4.0,0.0,48.0,16835.0
4,2002.0,Victoria,201011001.0,385.0,473.0,623.0,556.0,339.0,279.0,383.0,...,854.0,74.0,393.0,28.0,14.0,447.0,4.0,0.0,48.0,16835.0


### Convert to Suburb Level

In [117]:
# merging stuff
regions_df = pd.read_csv('../../data/2. raw/location/sa2_to_rental_suburb_groups.csv')

regions_df_list = regions_df.copy()
regions_df_list['code'] = regions_df_list['code'].apply(ast.literal_eval)

exploded_regions = regions_df_list.explode('code')
exploded_regions['code'] = exploded_regions['code'].apply(pd.to_numeric, errors='coerce')

def df_to_regions(df, sa2_col_name, aggregation_functions, year_col_name = None, quarter_col_name = None):
    df[sa2_col_name] = df[sa2_col_name].apply(pd.to_numeric, errors='coerce')
    regions_with_stats = pd.merge(exploded_regions, df, left_on='code', right_on=sa2_col_name, how='left')

    for col in aggregation_functions:
        regions_with_stats[col] = regions_with_stats[col].apply(pd.to_numeric, errors='coerce')

    if quarter_col_name:
        grouped_by = regions_with_stats.groupby(['suburbs', year_col_name, quarter_col_name]).agg(aggregation_functions)
    elif year_col_name:
        grouped_by = regions_with_stats.groupby(['suburbs', year_col_name]).agg(aggregation_functions)
    else:
        grouped_by = regions_with_stats.groupby('suburbs').agg(aggregation_functions)

    return grouped_by.reset_index()

exploded_regions

Unnamed: 0.1,Unnamed: 0,geometry,suburbs,regions,code
0,0,POLYGON ((144.97018001032677 -37.8606998481848...,Albert Park-Middle Park-West St Kilda,"['Albert Park', 'St Kilda - West']",206051128
0,0,POLYGON ((144.97018001032677 -37.8606998481848...,Albert Park-Middle Park-West St Kilda,"['Albert Park', 'St Kilda - West']",206051514
1,1,POLYGON ((144.79015492149858 -37.8621593959499...,Altona,"['Altona', 'Altona North']",213021341
1,1,POLYGON ((144.79015492149858 -37.8621593959499...,Altona,"['Altona', 'Altona North']",213021343
2,2,POLYGON ((145.01167433388778 -37.8535692509816...,Armadale,['Armadale'],206061135
...,...,...,...,...,...
141,141,POLYGON ((144.8869958358719 -37.85078700244943...,Williamstown,['Williamstown'],213021346
142,142,POLYGON ((146.77393701213975 -36.1280104970832...,Wodonga,"['Wodonga', 'West Wodonga']",204031073
142,142,POLYGON ((146.77393701213975 -36.1280104970832...,Wodonga,"['Wodonga', 'West Wodonga']",204031492
143,143,POLYGON ((144.85984002458792 -37.8138202803743...,Yarraville-Seddon,"['Seddon - Kingsville', 'Yarraville']",213031352


In [118]:
# This section is just to make the aggregation a bit easier
# is in the form { 'col_name': 'func', 'col2': 'func' }
# ---------------------------------------------------------

           #['overseas: 5 years'] + \

# list of columns to take the sum of the aggregation for
sum_list = [x for x in list(merged_df_sa2.columns) if "population" in x] + \
           [x for x in list(merged_df_sa2.columns) if "birth" in x] + \
           [x for x in list(merged_df_sa2.columns) if "studying" in x] + \
           [x for x in list(merged_df_sa2.columns) if "relationships" in x] + \
           ["economic: number of earners"]


# list of columns to take the avg of the aggregation for
avg_list = [x for x in merged_df_sa2.columns if ("economic: " in x) and (not "number" in x)]

agg_functions = {col_name: 'mean' for col_name in avg_list}
agg_functions.update(
    {col_name: 'sum' for col_name in sum_list}
)

# ----------------------------------------------------------

# EDIT ME: Calls the actual join function
merged_df_suburbs = df_to_regions(df = merged_df_sa2, sa2_col_name = 'SA2 code', aggregation_functions=agg_functions, year_col_name = 'year', quarter_col_name="quarter")
merged_df_suburbs.to_csv(MERGED_EXTERNAL_PATH_OUT)
merged_df_suburbs.head(5)

### Merging to housing

In [109]:
merged_df_external_housing = pd.merge(merged_df_suburbs, housing_df, on=["suburbs", "year", "quarter"], how="outer")
merged_df_external_housing.to_csv(MERGED_EXTERNAL_HOUSING_PATH_OUT)