In [119]:
import numpy as np
import pandas as pd
import ast
import os

## Functions

In [120]:
def filter_columns(df, find_sub=[], avoid_sub=[]):
    """Return the column names of ``df`` selected by substring rules.

    Args:
        df: frame whose ``columns`` are inspected.
        find_sub: substrings to look for; when non-empty a column is kept
            only if it contains at least one of them.
        avoid_sub: substrings to reject; a column containing any of them
            is dropped.

    Returns:
        list of the matching column names, in their original order.
    """
    must_match = bool(find_sub)

    selected = []
    for column in df.columns:
        # inclusion rule (skipped entirely when no find_sub was supplied)
        if must_match and not any(sub in column for sub in find_sub):
            continue

        # exclusion rule
        if any(sub in column for sub in avoid_sub):
            continue

        selected.append(column)

    return selected

In [121]:
def filter_frame(df, type=False, beds=False, suburb=False, start_year=False, end_year=False):
    """Return the rows of ``df`` that satisfy every filter that was supplied.

    A filter left at its default (or given any other falsy value) is not
    applied. Previously, omitting both ``type`` and ``beds`` compared the
    columns against ``False`` and therefore always produced an empty frame;
    now it simply leaves those two columns unconstrained.

    Args:
        df: frame containing the columns used by the supplied filters.
        type: value required in ``"housing: type"``.
        beds: value required in ``"housing: beds"``.
        suburb: value required in ``"suburbs"``.
        start_year: inclusive lower bound on ``"year groups"``.
        end_year: inclusive upper bound on ``"year groups"``.

    Returns:
        The subset of ``df`` matching all supplied filters.
    """
    # start by keeping every row, then narrow down per supplied filter
    df_mask = pd.Series(data=True, index=df.index)

    if type:
        df_mask = df_mask & (df["housing: type"] == type)
    if beds:
        df_mask = df_mask & (df["housing: beds"] == beds)
    if suburb:
        df_mask = df_mask & (df["suburbs"] == suburb)
    if start_year:
        df_mask = df_mask & (df["year groups"] >= start_year)
    if end_year:
        df_mask = df_mask & (df["year groups"] <= end_year)

    return df[df_mask]

## Execution

In [123]:
# Relative folder that holds the merged input data (read in the next cell)
RELATIVE_PATH_IN = "../../data/2. raw/2. merged"
# Relative folder for curated output -- NOTE(review): not referenced by the visible cells
RELATIVE_PATH_OUT = "../../data/3. curated"

# File name of the merged dataset inside RELATIVE_PATH_IN
MERGED_NAME = "forecast data.csv"

In [124]:
# load the merged dataset; the first CSV column is used as the index
merged_path = f"{RELATIVE_PATH_IN}/{MERGED_NAME}"
historic_raw = pd.read_csv(merged_path, index_col=0)

print(historic_raw.shape)
historic_raw.head(3)

(92736, 115)


Unnamed: 0,suburbs,year,quarter,economic: median income,economic: median age of earners,economic: top 1% suburb,economic: gini coefficient,economic: top 5% suburb,economic: variable interest rate,economic: gdp quarterly,...,relationships: group household,relationships: married,overseas: 5 years,economic: number of earners,housing: type,housing: beds,housing: count,housing: median,housing: previous 2 years,housing: avg 2 years
0,Albert Park-Middle Park-West St Kilda,2001.0,1.0,,,14.55,0.5565,30.75,7.653333,0.9,...,1798.0,7732.0,3646.0,0.0,all,all,1208.0,275.0,"[260.0, 260.0, 270.0, 275.0]",266.25
1,Albert Park-Middle Park-West St Kilda,2001.0,1.0,,,14.55,0.5565,30.75,7.653333,0.9,...,1798.0,7732.0,3646.0,0.0,flat,1,395.0,180.0,"[600.0, 660.0, 680.0, 165.0, 165.0, 170.0, 175.0]",373.571429
2,Albert Park-Middle Park-West St Kilda,2001.0,1.0,,,14.55,0.5565,30.75,7.653333,0.9,...,1798.0,7732.0,3646.0,0.0,flat,2,318.0,260.0,"[400.0, 425.0, 426.0, 250.0, 250.0, 250.0, 255.0]",322.285714


### Dealing with missing values

In [125]:
# columns summarised per suburb (the first entry is the grouping key)
avg_economic_columns = ["suburbs", "economic: median age of earners", "economic: median income", "economic: number of earners"]

# keep only rows where all of these columns are present and the earner count is non-zero
economic_subset = historic_raw[avg_economic_columns]
is_complete = economic_subset.notna().all(axis=1)
has_earners = economic_subset["economic: number of earners"] != 0
avg_economic_df = economic_subset[is_complete & has_earners]

# per-suburb mean of every column, shaped as {column: {suburb: mean}}
avg_economic_values = avg_economic_df.groupby("suburbs").mean().to_dict()

# write the suburb mean into EVERY row of each column -- observed values are
# overwritten as well, and suburbs without a mean end up with a missing value
# NOTE(review): confirm that replacing observed values (not only gaps) is intended
for column, suburb_means in avg_economic_values.items():
    historic_raw[column] = historic_raw["suburbs"].apply(lambda suburb: suburb_means.get(suburb))

historic_raw.head(3)

Unnamed: 0,suburbs,year,quarter,economic: median income,economic: median age of earners,economic: top 1% suburb,economic: gini coefficient,economic: top 5% suburb,economic: variable interest rate,economic: gdp quarterly,...,relationships: group household,relationships: married,overseas: 5 years,economic: number of earners,housing: type,housing: beds,housing: count,housing: median,housing: previous 2 years,housing: avg 2 years
0,Albert Park-Middle Park-West St Kilda,2001.0,1.0,63415.3,39.8,14.55,0.5565,30.75,7.653333,0.9,...,1798.0,7732.0,3646.0,21731.8,all,all,1208.0,275.0,"[260.0, 260.0, 270.0, 275.0]",266.25
1,Albert Park-Middle Park-West St Kilda,2001.0,1.0,63415.3,39.8,14.55,0.5565,30.75,7.653333,0.9,...,1798.0,7732.0,3646.0,21731.8,flat,1,395.0,180.0,"[600.0, 660.0, 680.0, 165.0, 165.0, 170.0, 175.0]",373.571429
2,Albert Park-Middle Park-West St Kilda,2001.0,1.0,63415.3,39.8,14.55,0.5565,30.75,7.653333,0.9,...,1798.0,7732.0,3646.0,21731.8,flat,2,318.0,260.0,"[400.0, 425.0, 426.0, 250.0, 250.0, 250.0, 255.0]",322.285714


In [126]:
# discard every row that still has a missing value in any column
historic_raw = historic_raw.dropna(axis=0, how="any")
historic_raw.shape

(87494, 115)

### Net inflation (not as useful anymore)

In [127]:
# distinct (year, quarter, quarterly trimmed-mean inflation) triples, with the
# rate rounded to 3 decimals, in chronological order
inflation_triples = historic_raw[["year", "quarter", "economic: trimmed mean quarterly"]].value_counts().index
inflation_values = sorted({(year, quarter, round(rate, 3)) for year, quarter, rate in inflation_triples})

# walk through the quarters, recording the multiplier accumulated BEFORE each
# quarter's own inflation is applied (so the earliest quarter maps to 1)
curr_multiplier = 1
net_inflation_by_quarter = {}
for year, quarter, rate in inflation_values:
    net_inflation_by_quarter[(year, quarter)] = curr_multiplier

    # compound this quarter's percentage rate into the running multiplier
    curr_multiplier *= 1 + (rate/100)

# lookup table: (year, quarter) -> cumulative inflation multiplier
inflation_values = net_inflation_by_quarter

In [128]:
# look up the cumulative inflation multiplier for each row's (year, quarter)
net_inflation_series = pd.Series(
    data=[inflation_values[tuple(year_quarter)] for year_quarter in historic_raw[["year", "quarter"]].to_numpy()],
    index=historic_raw.index,
)

historic_raw["economic: net inflation"] = net_inflation_series
historic_raw.head(3)

Unnamed: 0,suburbs,year,quarter,economic: median income,economic: median age of earners,economic: top 1% suburb,economic: gini coefficient,economic: top 5% suburb,economic: variable interest rate,economic: gdp quarterly,...,relationships: married,overseas: 5 years,economic: number of earners,housing: type,housing: beds,housing: count,housing: median,housing: previous 2 years,housing: avg 2 years,economic: net inflation
0,Albert Park-Middle Park-West St Kilda,2001.0,1.0,63415.3,39.8,14.55,0.5565,30.75,7.653333,0.9,...,7732.0,3646.0,21731.8,all,all,1208.0,275.0,"[260.0, 260.0, 270.0, 275.0]",266.25,1.0
1,Albert Park-Middle Park-West St Kilda,2001.0,1.0,63415.3,39.8,14.55,0.5565,30.75,7.653333,0.9,...,7732.0,3646.0,21731.8,flat,1,395.0,180.0,"[600.0, 660.0, 680.0, 165.0, 165.0, 170.0, 175.0]",373.571429,1.0
2,Albert Park-Middle Park-West St Kilda,2001.0,1.0,63415.3,39.8,14.55,0.5565,30.75,7.653333,0.9,...,7732.0,3646.0,21731.8,flat,2,318.0,260.0,"[400.0, 425.0, 426.0, 250.0, 250.0, 250.0, 255.0]",322.285714,1.0


### Combining Population

In [129]:
# population columns to merge pairwise: first those without "8"/"total" in the
# name, then the ones starting with "population: 8" appended at the end
# NOTE(review): the new columns also contain "population:", so re-running this
# cell without reloading the data would pick them up as inputs
combine_columns = filter_columns(historic_raw, ["population:"], ["8", "total"]) + filter_columns(historic_raw, ["population: 8"])

# add consecutive pairs of columns; the label is derived from the pair's position
for pair_start in range(0, len(combine_columns), 2):
    first_band = combine_columns[pair_start]
    second_band = combine_columns[pair_start + 1]
    band_start = pair_start * 5
    historic_raw[f"population: {band_start}-{band_start + 9}"] = historic_raw[first_band] + historic_raw[second_band]

### Averaging years

- The averaging should apply to all columns except the listing count and the previous-price columns.
- Values are first averaged per year; the different bedroom counts are then combined with a count-weighted average.

In [130]:
GROUP_NAME = "year groups"                   # name of the derived year-bucket column
CONSTANT_COLS = ["suburbs", "housing: type"]  # keys shared by every aggregation
YEAR_AVG = 1                                 # number of years per bucket

def get_avg(input_df, avg_list, sum_list):
    """Aggregate ``input_df`` per suburb, housing type, beds and year bucket.

    Years are bucketed as ``(year - earliest year) // YEAR_AVG + 1``.

    Args:
        input_df: frame with a ``"year"`` column, the ``CONSTANT_COLS`` and
            ``"housing: beds"``; it is not modified.
        avg_list: columns to average within each group.
        sum_list: columns to sum within each group.

    Returns:
        One row per group: the grouping keys, the averaged columns, then
        the summed columns.
    """
    group_keys = [*CONSTANT_COLS, "housing: beds", GROUP_NAME]

    # work on a copy so the caller's frame does not gain the bucket column
    frame = input_df.copy()

    # bucket the years relative to the earliest year present
    first_year = frame["year"].min()
    frame.loc[:, GROUP_NAME] = (frame["year"] - first_year) // YEAR_AVG + 1

    grouped = frame.groupby(group_keys)

    # means first ...
    out_df = grouped[avg_list].mean().reset_index()

    # ... then the sums; both results list the groups in the same order
    out_df[sum_list] = grouped[sum_list].sum().reset_index()[sum_list]

    return out_df

In [131]:
# everything is averaged except the count, the previous-price columns,
# the grouping keys and the raw time columns
excluded_from_avg = ["housing: count", "housing: previous 2 years", "housing: avg 2 years"] + \
                    CONSTANT_COLS + ["year", "quarter", "housing: beds"]
avg_list = [column for column in historic_raw.columns if column not in excluded_from_avg]

# the listing count is summed instead
sum_list = ["housing: count"]

# aggregate per suburb / type / beds / year group
avg_df = get_avg(historic_raw, avg_list, sum_list)

print(avg_df.shape)
avg_df.head(2)

(21994, 122)


Unnamed: 0,suburbs,housing: type,housing: beds,year groups,economic: median income,economic: median age of earners,economic: top 1% suburb,economic: gini coefficient,economic: top 5% suburb,economic: variable interest rate,...,population: 0-9,population: 10-19,population: 20-29,population: 30-39,population: 40-49,population: 50-59,population: 60-69,population: 70-79,population: 80-89,housing: count
0,Albert Park-Middle Park-West St Kilda,all,all,1.0,63415.3,39.8,14.55,0.5565,30.75,6.861667,...,1554.0,1452.0,5256.0,5562.0,3234.0,2697.0,1624.0,1267.0,972.0,4990.0
1,Albert Park-Middle Park-West St Kilda,all,all,2.0,63415.3,39.8,14.55,0.5565,30.75,6.3825,...,1623.0,1494.0,5338.0,5744.0,3277.0,2791.0,1655.0,1224.0,977.0,5528.0


### Growth Features

In [132]:
# features that get a period-on-period growth rate:
# - housing: median
# - every population column (age demographics)
# - economic: variable interest rate
GROWTH_FEATURES = [
    "housing: median",
    *filter_columns(avg_df, find_sub=["population"]),
    "economic: variable interest rate",
]

# the growth columns carry the source name plus a " growth" suffix
NEW_NAMES = [f"{feature} growth" for feature in GROWTH_FEATURES]

# work on a copy so avg_df stays untouched
final_df = avg_df.copy()

# percentage change between consecutive rows of each suburb / type / beds
# series, rounded to 3 decimals
growth_keys = CONSTANT_COLS + ["housing: beds"]
final_df[NEW_NAMES] = final_df.groupby(growth_keys)[GROWTH_FEATURES].pct_change().round(3)

# drop the first year group: it has no earlier period to compare against
# NOTE(review): a series whose first row is in a later year group keeps that
# row with missing growth values -- confirm whether those should be dropped too
final_df = final_df[final_df["year groups"] != 1]

print(final_df.shape)
final_df[filter_columns(final_df, find_sub=["housing", "year"])].head(5)

(21058, 152)


Unnamed: 0,housing: type,housing: beds,year groups,overseas: 5 years,housing: median,housing: count,housing: median growth
1,all,all,2.0,3646.0,300.0,5528.0,0.067
2,all,all,3.0,3646.0,293.75,5413.0,-0.021
3,all,all,4.0,3646.0,300.0,4963.0,0.021
4,all,all,5.0,3646.0,302.5,5048.0,0.008
5,all,all,6.0,3646.0,317.0,4101.0,0.048


### Percentage features

- Percentage features are computed only for columns that are not age demographics, since the age demographics are already in the model (there we are only interested in growth rates).
- Number of earners should not be used here: it is not updated yearly, so it would always be decreasing. The same applies to other metrics that are not updated yearly.

In [136]:
# NOTE: not sure these will actually be useful alongside the age demographics
PERCENTAGE_FEATURES = filter_columns(
    avg_df,
    find_sub=["overseas", "relationships", "studying"],
    avoid_sub=["PT", "FT", "15"],
)

# one reference row per suburb, taken from the second-to-last year group
reference_year_group = final_df["year groups"].max() - 1
filt_population = final_df[final_df["year groups"] == reference_year_group].drop_duplicates(subset="suburbs")

# {suburb: {feature: share of "population: total", rounded to 4 decimals}}
percent_dict = {
    row["suburbs"]: {
        feature: round(row[feature] / row["population: total"], 4)
        for feature in PERCENTAGE_FEATURES
    }
    for _, row in filt_population.iterrows()
}

# every row of a suburb receives that suburb's reference share
for feature in PERCENTAGE_FEATURES:
    final_df[feature + " (%)"] = final_df["suburbs"].map(lambda suburb: percent_dict[suburb][feature])

final_df.head(5)

Unnamed: 0,suburbs,housing: type,housing: beds,year groups,economic: median income,economic: median age of earners,economic: top 1% suburb,economic: gini coefficient,economic: top 5% suburb,economic: variable interest rate,...,studying: primary total (%),relationships: other related individual (%),relationships: dependent student (%),relationships: non dependent child (%),relationships: lone parents (%),relationships: defacto (%),relationships: lone persons (%),relationships: group household (%),relationships: married (%),overseas: 5 years (%)
1,Albert Park-Middle Park-West St Kilda,all,all,2.0,63415.3,39.8,14.55,0.5565,30.75,6.3825,...,0.0436,0.014,0.0326,0.0215,0.025,0.1424,0.2021,0.059,0.2538,0.1197
2,Albert Park-Middle Park-West St Kilda,all,all,3.0,63415.3,39.8,14.55,0.5565,30.75,6.6325,...,0.0436,0.014,0.0326,0.0215,0.025,0.1424,0.2021,0.059,0.2538,0.1197
3,Albert Park-Middle Park-West St Kilda,all,all,4.0,63415.3,39.8,14.55,0.5565,30.75,7.07,...,0.0436,0.014,0.0326,0.0215,0.025,0.1424,0.2021,0.059,0.2538,0.1197
4,Albert Park-Middle Park-West St Kilda,all,all,5.0,63415.3,39.8,14.55,0.5565,30.75,7.278333,...,0.0436,0.014,0.0326,0.0215,0.025,0.1424,0.2021,0.059,0.2538,0.1197
5,Albert Park-Middle Park-West St Kilda,all,all,6.0,63415.3,39.8,14.55,0.5565,30.75,7.6325,...,0.0436,0.014,0.0326,0.0215,0.025,0.1424,0.2021,0.059,0.2538,0.1197


### Combining beds

In [137]:
WEIGHT_COLUMN = 'housing: count'  # Column name for weights
EXCLUDE_COLUMN = "housing: beds"

def get_weight_avg(input_df, avg_list, sum_list):
    """Collapse the bedroom dimension into one row per suburb / type / year group.

    Columns in ``avg_list`` become averages weighted by ``WEIGHT_COLUMN``;
    columns in ``sum_list`` and ``WEIGHT_COLUMN`` itself are summed. All other
    columns are copied from a representative row of each housing type
    (2-bed flats and 3-bed houses), so housing types without such a row do
    not appear in the result.

    Rows whose value is missing are excluded from both the numerator and the
    denominator of the weighted average. Previously their weight still
    counted in the denominator, which pulled the average toward zero and
    turned an all-missing group into 0.0; such a group now yields NaN.

    Args:
        input_df: frame with ``CONSTANT_COLS``, ``GROUP_NAME``,
            ``EXCLUDE_COLUMN`` and ``WEIGHT_COLUMN``; it is not modified.
        avg_list: columns to combine as weighted averages.
        sum_list: columns to combine as plain sums.

    Returns:
        Frame without ``EXCLUDE_COLUMN``: the carried-over columns followed
        by the weighted averages and the sums.
    """
    group_cols = CONSTANT_COLS + [GROUP_NAME]

    # create a copy
    use_df = input_df.copy()

    # per averaged column: the weight that applies only where a value exists,
    # and the value multiplied by its weight
    effective_weight_cols = {}
    for col in avg_list:
        effective_weight_cols[col] = f"__effective_weight__{col}"
        use_df[effective_weight_cols[col]] = use_df[WEIGHT_COLUMN].where(use_df[col].notna())
        use_df[col] = use_df[col] * use_df[WEIGHT_COLUMN]

    grouped = use_df.groupby(group_cols)

    # weighted sums; min_count=1 keeps a group with no usable value as NaN
    weighted_avg_df = grouped[avg_list].sum(min_count=1)

    # divide by the weight of the rows that actually contributed
    for col in avg_list:
        weighted_avg_df[col] = weighted_avg_df[col] / grouped[effective_weight_cols[col]].sum(min_count=1)
    weighted_avg_df = weighted_avg_df.reset_index()

    # plain sums (the weight column is always summed as well)
    sum_df = grouped[sum_list + [WEIGHT_COLUMN]].sum().reset_index()

    # weighted averages and sums side by side
    final_merge_df = pd.merge(weighted_avg_df, sum_df, on=group_cols)

    # remaining columns come from the representative rows of each housing type
    remaining_columns = [x for x in input_df.columns if x not in avg_list + sum_list + [WEIGHT_COLUMN]]
    remaining_df = pd.concat([
        filter_frame(input_df[remaining_columns], type="flat", beds="2"),
        filter_frame(input_df[remaining_columns], type="house", beds="3"),
    ])
    remaining_df = remaining_df.drop(columns=EXCLUDE_COLUMN)

    # merge and return
    final_df = pd.merge(remaining_df, final_merge_df, on=group_cols)

    return final_df

In [138]:
# columns to combine across bedroom counts (can change)
avg_list = ["housing: median growth"]
sum_list = [] # will already sum the weighting column

# collapse the bedroom dimension using count-weighted averages
weighted_avg = get_weight_avg(input_df=final_df, avg_list=avg_list, sum_list=sum_list)

# growth of the combined listing count, per suburb and housing type,
# rounded element by element to 4 decimals
count_change = weighted_avg.groupby(CONSTANT_COLS)["housing: count"].pct_change()
weighted_avg["housing: count growth"] = count_change.apply(lambda change: round(change, 4))

weighted_avg.head(5)

Unnamed: 0,suburbs,housing: type,year groups,economic: median income,economic: median age of earners,economic: top 1% suburb,economic: gini coefficient,economic: top 5% suburb,economic: variable interest rate,economic: gdp quarterly,...,relationships: non dependent child (%),relationships: lone parents (%),relationships: defacto (%),relationships: lone persons (%),relationships: group household (%),relationships: married (%),overseas: 5 years (%),housing: median growth,housing: count,housing: count growth
0,Albert Park-Middle Park-West St Kilda,flat,2.0,63415.3,39.8,14.55,0.5565,30.75,6.3825,0.875,...,0.0215,0.025,0.1424,0.2021,0.059,0.2538,0.1197,0.096561,3295.0,
1,Albert Park-Middle Park-West St Kilda,flat,3.0,63415.3,39.8,14.55,0.5565,30.75,6.6325,1.025,...,0.0215,0.025,0.1424,0.2021,0.059,0.2538,0.1197,0.000478,3319.0,0.0073
2,Albert Park-Middle Park-West St Kilda,flat,4.0,63415.3,39.8,14.55,0.5565,30.75,7.07,0.725,...,0.0215,0.025,0.1424,0.2021,0.059,0.2538,0.1197,0.001083,2877.0,-0.1332
3,Albert Park-Middle Park-West St Kilda,flat,5.0,63415.3,39.8,14.55,0.5565,30.75,7.278333,0.8,...,0.0215,0.025,0.1424,0.2021,0.059,0.2538,0.1197,0.059849,3101.0,0.0779
4,Albert Park-Middle Park-West St Kilda,flat,6.0,63415.3,39.8,14.55,0.5565,30.75,7.6325,0.775,...,0.0215,0.025,0.1424,0.2021,0.059,0.2538,0.1197,0.00382,2400.0,-0.2261


### Moving average

In [139]:
# smoothing weights for the previous, current and next period
kernel = np.array([1/4, 1/2, 1/4])

def apply_convolution(group):
    """Add ``"housing: median growth avg"``: the kernel-smoothed median growth.

    The series is treated as zero-padded at both ends, so the first and last
    values are smoothed with fewer neighbours. Groups shorter than the kernel
    are handled too (``mode='same'`` alone would return ``len(kernel)`` values
    and fail on assignment).

    Args:
        group: frame with a ``"housing: median growth"`` column, ordered in
            time; it is not modified.

    Returns:
        A copy of ``group`` with the smoothed column added.
    """
    values = group["housing: median growth"].to_numpy(dtype=float)

    if len(values) == 0:
        # np.convolve rejects empty input; nothing to smooth
        smoothed = values
    elif len(values) >= len(kernel):
        smoothed = np.convolve(values, kernel, mode='same')
    else:
        # take the centred window of the full convolution so the output has
        # exactly one value per input row
        offset = (len(kernel) - 1) // 2
        smoothed = np.convolve(values, kernel, mode='full')[offset:offset + len(values)]

    # return a new frame instead of mutating the object handed in by groupby.apply
    group = group.copy()
    group["housing: median growth avg"] = smoothed
    return group

# order each suburb / housing-type series by year group, then smooth it
weighted_avg = (
    weighted_avg
    .sort_values(by=["suburbs", "housing: type", "year groups"])
    .groupby(["suburbs", "housing: type"], group_keys=False)
    .apply(apply_convolution)
)

# compare the raw and the smoothed growth
for preview_column in ("housing: median growth", "housing: median growth avg"):
    print(weighted_avg[preview_column].head(5))

0    0.096561
1    0.000478
2    0.001083
3    0.059849
4    0.003820
Name: housing: median growth, dtype: float64
0    0.048400
1    0.024650
2    0.015623
3    0.031150
4    0.036345
Name: housing: median growth avg, dtype: float64


### Housing attributes

Should do
- Include distance to city (merge other dataset)

Want to do
- Apartments as well as a factor (potentially divided by growth or something)
- Density as well

Done
- Price of houses
- Previous growth

In [140]:
# (new column, source column): each new column holds the source value of the
# preceding row within the same suburb / housing type
lagged_features = [
    ("housing: previous growth", "housing: median growth"),
    ("housing: previous price", "housing: median"),
    ("housing: previous count", "housing: count"),
    ("housing: previous count growth", "housing: count growth"),
    ("housing: previous growth avg", "housing: median growth avg"),
]
for lagged_name, source_name in lagged_features:
    weighted_avg[lagged_name] = weighted_avg.groupby(CONSTANT_COLS)[source_name].shift(1)

print(weighted_avg.shape)
weighted_avg.head(5)

(6228, 177)


Unnamed: 0,suburbs,housing: type,year groups,economic: median income,economic: median age of earners,economic: top 1% suburb,economic: gini coefficient,economic: top 5% suburb,economic: variable interest rate,economic: gdp quarterly,...,overseas: 5 years (%),housing: median growth,housing: count,housing: count growth,housing: median growth avg,housing: previous growth,housing: previous price,housing: previous count,housing: previous count growth,housing: previous growth avg
0,Albert Park-Middle Park-West St Kilda,flat,2.0,63415.3,39.8,14.55,0.5565,30.75,6.3825,0.875,...,0.1197,0.096561,3295.0,,0.0484,,,,,
1,Albert Park-Middle Park-West St Kilda,flat,3.0,63415.3,39.8,14.55,0.5565,30.75,6.6325,1.025,...,0.1197,0.000478,3319.0,0.0073,0.02465,0.096561,296.25,3295.0,,0.0484
2,Albert Park-Middle Park-West St Kilda,flat,4.0,63415.3,39.8,14.55,0.5565,30.75,7.07,0.725,...,0.1197,0.001083,2877.0,-0.1332,0.015623,0.000478,295.0,3319.0,0.0073,0.02465
3,Albert Park-Middle Park-West St Kilda,flat,5.0,63415.3,39.8,14.55,0.5565,30.75,7.278333,0.8,...,0.1197,0.059849,3101.0,0.0779,0.03115,0.001083,295.75,2877.0,-0.1332,0.015623
4,Albert Park-Middle Park-West St Kilda,flat,6.0,63415.3,39.8,14.55,0.5565,30.75,7.6325,0.775,...,0.1197,0.00382,2400.0,-0.2261,0.036345,0.059849,315.0,3101.0,0.0779,0.03115


### Getting the distances

In [141]:
# read the distances and keep only the columns of interest; the explicit copy
# lets new columns be added without a SettingWithCopyWarning
distances = pd.read_csv("../../data/2. raw/distances.csv")
distances = distances[["suburbs", "crow_distance_to_cbd", "distance_to_cbd"]].copy()

# inverse distance, scaled so that the largest inverse equals 1
# NOTE(review): a distance of 0 would give an infinite inverse and break the
# scaling -- confirm the file contains no zero distances
for distance_metric in ["crow_distance_to_cbd", "distance_to_cbd"]:
    inverse_distance = 1 / distances[distance_metric]
    distances[distance_metric + " inv"] = inverse_distance / inverse_distance.max()

# prefix every distance column with "distance: " and replace underscores by spaces
distances = distances.rename(columns={x: "distance: " + x.replace("_", " ") for x in distances.columns if x != "suburbs"})

# inner merge: suburbs absent from the distances file are dropped
merged_df = pd.merge(weighted_avg, distances, on="suburbs")

### Saving

In [142]:
# write the curated dataset to the configured output folder
merged_df.to_csv(f"{RELATIVE_PATH_OUT}/external prices.csv")

### Don't use atm (outliers)

In [208]:
def remove_outliers(df, column):
    """Drop rows whose ``column`` value lies outside widened IQR fences.

    The fences sit ``IQR * (sqrt(log(n)) - 0.5)`` below the first and above
    the third quartile, where ``n`` is the number of rows in ``df``. The
    cutoffs and the number of removed rows are printed.

    Args:
        df: frame to filter; it is not modified.
        column: name of the column to test.

    Returns:
        The rows of ``df`` whose value lies within the cutoffs (inclusive).
    """
    values = df[column]
    n_rows = df.shape[0]

    # quartiles and the interquartile range
    lower_quartile = values.quantile(0.25)
    upper_quartile = values.quantile(0.75)
    spread = upper_quartile - lower_quartile

    # the fence width grows with the number of rows
    fence_factor = np.sqrt(np.log(n_rows)) - 0.5
    cutoff_lwr = lower_quartile - spread * fence_factor
    cutoff_upr = upper_quartile + spread * fence_factor

    # print information
    print(f"cutoffs are {cutoff_lwr} and {cutoff_upr}")

    # keep the rows inside the fences (both ends inclusive)
    new_df = df[values.between(cutoff_lwr, cutoff_upr)]

    print(f"removed {n_rows - new_df.shape[0]} records from {n_rows} records")

    return new_df

In [None]:
#weighted_avg = remove_outliers(weighted_avg, "housing: median growth")

#remove_outliers(final_df, "population: total persons growth")