In [2]:
import numpy as np
import pandas as pd
import os

## Functions

In [3]:
MERGE_COLUMNS = ["SA2 code", "year", "quarter"]

def get_merged_df(new_df, final_df):
    """Outer-merge the columns of ``new_df`` that ``final_df`` lacks into ``final_df``.

    Parameters
    ----------
    new_df : pd.DataFrame
        Frame holding the columns to add.
    final_df : pd.DataFrame
        Frame accumulated so far. When it is empty, ``new_df`` is returned as is.

    Returns
    -------
    pd.DataFrame
        ``final_df`` outer-joined with the new columns on every entry of
        ``MERGE_COLUMNS`` that both frames contain.
    """
    # nothing accumulated yet, so the new frame becomes the starting point
    if final_df.empty:
        return new_df

    # keys shared by both frames; built as a list (not a set) so the key order
    # follows MERGE_COLUMNS and is the same on every run
    merge_columns = [
        column for column in MERGE_COLUMNS
        if column in new_df.columns and column in final_df.columns
    ]

    # columns only the new frame has, kept in the order they appear in new_df
    new_columns = [column for column in new_df.columns if column not in final_df.columns]

    merged_df = pd.merge(final_df, new_df[merge_columns + new_columns], on=merge_columns, how="outer")

    # report the change in row count relative to final_df (negative means rows were added)
    print(f"lost {final_df.shape[0] - merged_df.shape[0]} out of {final_df.shape[0]} records")

    return merged_df

In [4]:
"""will basically create a record for each entry of one of the columns, and a new column for another entry
Note: assumes a multi index
Note: index_stop is the number of columns in the original dataframe that don't need to be melted"""
def unpack_levels(df, index_stop, level_1_name, level_2_name, index_level=0):
    """Turn one level of a two-level column header into rows, the other into columns.

    The first ``index_stop`` columns are kept as identifiers; every remaining
    column is melted into ``(level_1_name, level_2_name, "value")`` records and
    then pivoted back on one of the two levels.

    Parameters
    ----------
    df : pd.DataFrame
        Frame whose column labels can be indexed with ``[0]`` (the original
        note says a multi index is assumed).
    index_stop : int
        Number of leading columns that are identifiers and are not melted.
    level_1_name, level_2_name : str
        Names given to the first and second level of the melted column header.
    index_level : int
        ``0`` keeps ``level_2_name`` as a row column and spreads
        ``level_1_name`` into columns; any other value does the opposite.

    Returns
    -------
    pd.DataFrame
        Flat-column frame: identifier columns, the retained level, then one
        column per value of the spread level.
    """
    source_columns = df.columns.to_list()

    # long form: one record per (identifier row, level 1, level 2)
    long_df = df.melt(
        id_vars=source_columns[:index_stop],
        value_vars=source_columns[index_stop:],
        var_name=[level_1_name, level_2_name],
        value_name="value",
    )

    # pick which level stays in the rows and which one becomes the columns
    if index_level != 0:
        row_level, spread_level = level_1_name, level_2_name
    else:
        row_level, spread_level = level_2_name, level_1_name

    row_keys = list(long_df.columns[:index_stop]) + [row_level]
    wide_df = long_df.pivot(index=row_keys, columns=spread_level, values="value")

    # bring the row keys back as columns; identifier labels keep only their first element
    wide_df = wide_df.reset_index()
    identifier_names = [label[0] for label in wide_df.columns[:index_stop]]
    wide_df.columns = identifier_names + list(wide_df.columns[index_stop:])

    return wide_df


In [20]:
QUARTERS = 4

def get_quarters(df, year_col="year", quarter_col="quarter", financial_year=False):
    """Repeat every row once per quarter, optionally converting financial years.

    Parameters
    ----------
    df : pd.DataFrame
        Yearly records. Must contain a ``"SA2 code"`` column, because the
        result is sorted on it. The input frame is not modified.
    year_col : str
        Name of the year column.
    quarter_col : str
        Name of the quarter column to create (values 1..QUARTERS).
    financial_year : bool
        When True, ``year_col`` holds labels that start with the four-digit
        first calendar year (e.g. ``"2016-17"``). Quarters 3 and 4 are given
        that first year and quarters 1 and 2 the following year.

    Returns
    -------
    pd.DataFrame
        ``QUARTERS`` copies of every input row, sorted by
        ``"SA2 code"``, ``year_col`` and ``quarter_col``.
    """
    quarter_frames = []
    for quarter in range(1, QUARTERS + 1):
        quarter_frame = df.copy()
        quarter_frame[quarter_col] = quarter
        quarter_frames.append(quarter_frame)

    df = pd.concat(quarter_frames, ignore_index=True)

    if financial_year:
        # the first four characters of the label are the year the period starts in
        start_year = df[year_col].astype(str).str[:4].astype(int)

        # quarters 3 and 4 fall in the starting year, quarters 1 and 2 in the next one;
        # using start + 1 (instead of 2000 + the two-digit suffix) also handles
        # labels before 2000 such as "1998-99"
        in_start_year = df[quarter_col].isin([3, 4])
        df[year_col] = start_year.where(in_start_year, start_year + 1)

    df = df.sort_values(by=["SA2 code", year_col, quarter_col])

    return df

In [5]:
def is_vic(value):
    """Return whether a numeric code has 2 as its leading digit group.

    Missing values (anything ``pd.isna`` flags) give ``False``. Otherwise the
    value is divided by 1e8 and floored, and the result is compared with 2.
    """
    # missing codes never match
    if pd.isna(value):
        return False

    # for a numeric code, floor(value / 1e8) is everything above the last eight digits
    leading_part = np.floor(value / 1e8)
    return leading_part == 2

## Prepare for Merge

In [6]:
# directory prefix used by the pd.read_csv / os.listdir calls in this notebook
RELATIVE_PATH_IN = "../../data/2. raw/1. renamed"
# NOTE(review): presumably where the merged output is written; not used in the cells visible here, so confirm
RELATIVE_PATH_OUT = "../../data/2. raw/2. merged"

### Housing

In [13]:
# build one frame per housing file, tagged with the property type and bed count
housing_df_array = []

# os.listdir returns entries in arbitrary order, so sort for a reproducible row order
for file_name in sorted(os.listdir(f"{RELATIVE_PATH_IN}/housing")):
    # skip stray entries (checkpoint folders, hidden files) that read_csv cannot parse
    if not file_name.lower().endswith(".csv"):
        continue

    if (file_name != "all_properties.csv"):
        # the first two "_"-separated parts of the file name are the type and the beds
        name_parts = file_name.split("_")
        curr_type = name_parts[0]
        curr_beds = name_parts[1]
    else:
        # the combined file covers every type and bed count
        curr_type = "all"
        curr_beds = "all"

    # the CSV has a two-row header, so the columns come back as two levels
    curr_df = pd.read_csv(f"{RELATIVE_PATH_IN}/housing/{file_name}", header=[0, 1])
    curr_df["type"] = curr_type
    curr_df["beds"] = curr_beds

    # drop the first column
    curr_df = curr_df.drop(curr_df.columns[0], axis=1)

    housing_df_array.append(curr_df)

# stack the per-file frames into one raw housing frame
housing_df_raw = pd.concat(housing_df_array)

In [14]:
# reorder columns: keep the first column, then move the last two columns
# ("type" and "beds" were appended last when the files were loaded) next to it
# NOTE: re-running this cell on its own reorders the columns again
column_count = len(housing_df_raw.columns)
housing_df_raw = housing_df_raw.iloc[:, [0, -2, -1] + list(range(1, column_count - 2))]

suburb_column = ('suburbs', "nan")

# remove records with group total; na=False keeps rows with a missing suburb
# instead of failing on the negation, and .copy() gives an independent frame so
# the assignment below does not raise SettingWithCopyWarning
is_group_total = housing_df_raw[suburb_column].str.contains('Group Total', na=False)
housing_df_raw = housing_df_raw[~is_group_total].copy()

# spelling corrections (exact whole-value matches only)
housing_df_raw[suburb_column] = housing_df_raw[suburb_column].replace(
    {'Wanagaratta': 'Wangaratta', 'Newcombe': 'Newcomb'}
)

In [15]:
# turn everything into records: one row per suburb/type/beds/time stamp,
# with one column per measure
housing_df = unpack_levels(housing_df_raw, 3, "time stamp", "measure", index_level=1)

# split the "YYYY-MM" time stamp into integer year and month columns
housing_df[["year", "month"]] = housing_df["time stamp"].str.split("-", expand=True).astype(int)

# 1-based quarter: months 1-3 -> 1, 4-6 -> 2, 7-9 -> 3, 10-12 -> 4
# (gives the same result as month // 3 for the quarter-end months 3, 6, 9, 12
# and is also correct for every other month)
housing_df["quarter"] = (housing_df["month"] - 1) // 3 + 1
housing_df = housing_df.drop(columns="month")

housing_df.head(5)

Unnamed: 0,suburbs,type,beds,time stamp,count,median,year,quarter
0,Albert Park-Middle Park-West St Kilda,all,all,2000-03,1143.0,260.0,2000,1
1,Albert Park-Middle Park-West St Kilda,all,all,2000-06,1134.0,260.0,2000,2
2,Albert Park-Middle Park-West St Kilda,all,all,2000-09,1177.0,270.0,2000,3
3,Albert Park-Middle Park-West St Kilda,all,all,2000-12,1178.0,275.0,2000,4
4,Albert Park-Middle Park-West St Kilda,all,all,2001-03,1208.0,275.0,2001,1


### Economic by Geography

In [23]:
# load the income table (two header rows, first CSV column used as the index)
income_region = pd.read_csv(
    f"{RELATIVE_PATH_IN}/economic_by_region/income_by_geography.csv",
    header=[0, 1],
    index_col=0,
)

# keep only the rows whose SA2 code passes is_vic
vic_mask = income_region[("SA2 code", "nan")].apply(is_vic)
income_region = income_region[vic_mask]

# one record per SA2 and year with a column per measure, then one copy per
# quarter with the financial-year label converted to a calendar year
income_region = get_quarters(
    unpack_levels(income_region, 2, "measure", "year", index_level=0),
    financial_year=True,
)

income_region.head(4)

Unnamed: 0,SA2 code,SA2 name,year,Earners (persons),Mean ($),Median ($),Median age of earners (years),Sum ($),quarter
5220,201011001.0,Alfredton,2016,7117.0,60937.0,50596.0,42.0,433690088.0,3
7830,201011001.0,Alfredton,2016,7117.0,60937.0,50596.0,42.0,433690088.0,4
0,201011001.0,Alfredton,2017,7117.0,60937.0,50596.0,42.0,433690088.0,1
2610,201011001.0,Alfredton,2017,7117.0,60937.0,50596.0,42.0,433690088.0,2


In [98]:
# load the economic distribution table (single header row, first CSV column as index)
economic_distribution = pd.read_csv(
    f"{RELATIVE_PATH_IN}/economic_by_region/income_distribution_by_geography.csv",
    header=0,
    index_col=0,
)

# keep only the rows whose SA2 code passes is_vic
vic_mask = economic_distribution["SA2 code"].apply(is_vic)
economic_distribution = economic_distribution[vic_mask]

### Economic

In [55]:
def month_to_quarter(df, month_col="month", quarter_col="quarter"):
    """Replace a month column with a 1-based quarter column.

    Parameters
    ----------
    df : pd.DataFrame
        Frame with an integer month column (1-12). It is not modified.
    month_col : str
        Name of the month column; it is removed from the result.
    quarter_col : str
        Name of the quarter column to create.

    Returns
    -------
    pd.DataFrame
        Copy of ``df`` without ``month_col`` and with ``quarter_col`` holding
        1 for months 1-3, 2 for 4-6, 3 for 7-9 and 4 for 10-12. This matches
        the 1..4 numbering the other quarter columns in this notebook use.
    """
    # drop returns a new frame, so the caller's frame is left untouched
    result = df.drop(columns=month_col)

    # shift to 0-based months before dividing, then back to 1-based quarters
    result[quarter_col] = (df[month_col] - 1) // 3 + 1
    return result

In [75]:
# load the interest series, convert its month to a quarter, then average the
# values within each (year, quarter)
interest = (
    pd.read_csv(f"{RELATIVE_PATH_IN}/economic/interest.csv", index_col=0)
    .pipe(month_to_quarter)
    .groupby(["year", "quarter"])
    .mean()
    .reset_index()
)

In [76]:
# load the GDP series and swap its month column for a quarter column
gdp = pd.read_csv(f"{RELATIVE_PATH_IN}/economic/gdp.csv", index_col=0).pipe(month_to_quarter)

In [77]:
# load the inflation series and swap its month column for a quarter column
inflation = pd.read_csv(f"{RELATIVE_PATH_IN}/economic/inflation.csv", index_col=0).pipe(month_to_quarter)

### Population

In [33]:
# load the age demographics table (single header row, first CSV column as index)
vic_population_df = pd.read_csv(
    f"{RELATIVE_PATH_IN}/population/age demographics.csv",
    header=0,
    index_col=0,
)

# keep only the rows whose SA2 code passes is_vic
vic_mask = vic_population_df["SA2 code"].apply(is_vic)
vic_population_df = vic_population_df[vic_mask]

# repeat every yearly record once per quarter
vic_population_df = vic_population_df.pipe(get_quarters)

vic_population_df.head(5)

Unnamed: 0,year,state,SA2 code,population: 0-4,population: 5-9,population: 10-14,population: 15-19,population: 20-24,population: 25-29,population: 30-34,...,population: 50-54,population: 55-59,population: 60-64,population: 65-69,population: 70-74,population: 75-79,population: 80-84,population: 85+,population: total,quarter
0,2001,Victoria,201011001.0,353.0,467.0,584.0,556.0,310.0,266.0,364.0,...,368.0,247.0,233.0,184.0,168.0,151.0,67.0,42.0,5756.0,1
12006,2001,Victoria,201011001.0,353.0,467.0,584.0,556.0,310.0,266.0,364.0,...,368.0,247.0,233.0,184.0,168.0,151.0,67.0,42.0,5756.0,2
24012,2001,Victoria,201011001.0,353.0,467.0,584.0,556.0,310.0,266.0,364.0,...,368.0,247.0,233.0,184.0,168.0,151.0,67.0,42.0,5756.0,3
36018,2001,Victoria,201011001.0,353.0,467.0,584.0,556.0,310.0,266.0,364.0,...,368.0,247.0,233.0,184.0,168.0,151.0,67.0,42.0,5756.0,4
522,2002,Victoria,201011001.0,385.0,473.0,623.0,556.0,339.0,279.0,383.0,...,384.0,297.0,244.0,188.0,179.0,148.0,73.0,42.0,6092.0,1


In [32]:
# load the population projections (single header row, first CSV column as index)
future_population = pd.read_csv(f"{RELATIVE_PATH_IN}/population/projections.csv", header=0, index_col=0)

# keep only the rows whose SA2 code passes is_vic
vic_mask = future_population["SA2 code"].apply(is_vic)

# select the 2026 projection, rename it and stamp the year; chaining on a fresh
# frame (rather than renaming and assigning in place on a slice) avoids
# SettingWithCopyWarning
# NOTE(review): the new column name is spelled "populaation: total", while the
# age demographics table shows "population: total" — confirm whether the
# spelling is intended before relying on this column name
future_population = (
    future_population.loc[vic_mask, ["SA2 code", "population: 2026"]]
    .rename(columns={"population: 2026": "populaation: total"})
    .assign(year=2026)
)

# repeat every record once per quarter
future_population = get_quarters(future_population)

future_population.head(5)

Unnamed: 0,SA2 code,populaation: total,year,quarter
0,201011001.0,20756.256163,2026,1
522,201011001.0,20756.256163,2026,2
1044,201011001.0,20756.256163,2026,3
1566,201011001.0,20756.256163,2026,4
1,201011002.0,11698.293593,2026,1


## Merge Together

### Merging all data