In [1]:
import pandas as pd
import numpy as np
import os
from collections import Counter

### Functions

In [76]:
"""will basically create a record for each entry of one of the columns, and a new column for another entry
Note: assumes a multi index
Note: index_stop is the number of columns in the original dataframe that don't need to be melted"""
def unpack_levels(df, index_stop, level_1_name, level_2_name, record_level_2=False):
    # make a massive dataframe, everything is a record
    df = df.melt(id_vars=df.columns.to_list()[:index_stop], 
                 value_vars = df.columns.to_list()[index_stop:],
                 var_name=[level_1_name, level_2_name],
                 value_name="value")

    if (not record_level_2):
        # basically extract the column data from the second level
        df = df.pivot(index=list(df.columns[:index_stop]) + [level_1_name], 
                      columns=level_2_name, values="value")
        
    else:
        # switch around the melted columns so the end on gets melted
        df = df.pivot(index=list(df.columns[:index_stop]) + [level_2_name], 
                      columns=level_1_name, values="value")

    # reset the indices and rename
    df = df.reset_index()
    #print([column[0] for column in df.columns[:index_stop]])
    df.columns = [column[0] for column in df.columns[:index_stop]] + list(df.columns[index_stop:])

    return df


### Download

In [77]:
RELATIVE_PATH = "../../data/landing/housing/"

# get a list of dataframes to merge
housing_df_array = []
for file_name in os.listdir(RELATIVE_PATH):
    # get the information about the file
    curr_type = file_name.split("_")[0]
    curr_beds = file_name.split("_")[1]

    # download the frame and add information
    curr_df = pd.read_csv(RELATIVE_PATH + file_name)
    curr_df["type"] = curr_type
    curr_df["beds"] = int(curr_beds)

    # drop the first column
    curr_df = curr_df.drop(curr_df.columns[0], axis=1)

    # add the dataframe to the list
    housing_df_array.append(curr_df)

# merge the dataframes
housing_df = pd.concat(housing_df_array)

print(housing_df.shape)
print(housing_df.dtypes)
housing_df.head(5)

(906, 189)
Unnamed: 1      object
Mar 2000        object
Unnamed: 3      object
Jun 2000        object
Unnamed: 5      object
                 ...  
Unnamed: 185    object
Mar 2023        object
Unnamed: 187    object
type            object
beds             int64
Length: 189, dtype: object


Unnamed: 0,Unnamed: 1,Mar 2000,Unnamed: 3,Jun 2000,Unnamed: 5,Sep 2000,Unnamed: 7,Dec 2000,Unnamed: 9,Mar 2001,...,Jun 2022,Unnamed: 181,Sep 2022,Unnamed: 183,Dec 2022,Unnamed: 185,Mar 2023,Unnamed: 187,type,beds
0,East St Kilda,62,255,62,263,60,270,64,268,56,...,41,595,50,618,48,625,42,650,house,2
1,Elwood,39,315,38,318,36,333,42,328,40,...,23,670,21,650,16,600,18,600,house,2
2,Fitzroy,125,250,137,250,131,260,143,260,132,...,88,625,92,630,93,640,79,650,house,2
3,Fitzroy North-Clifton Hill,272,250,298,250,290,250,292,260,272,...,204,620,208,620,204,630,189,650,house,2
4,Flemington-Kensington,188,220,184,220,205,220,206,230,219,...,209,520,213,520,206,510,204,523,house,2


- Manually checked each column followed this pattern
- They actually made a mistake on the dataset, 2003 dec replaced 2002 dec

### Renaming, Conversion and Missing values

In [78]:
# get the measure columns
time_stamps = [str(year) + "-" + month 
               for year in range(2000, 2024) 
               for month in ["03", "06", "09", "12"]]

# remove the last 3 quarters
time_stamps = time_stamps[:-3]

measure_columns = [(time_stamp, measure) for time_stamp in time_stamps for measure in ["count", "median"]]

# rename the columns
housing_df.columns = pd.MultiIndex.from_tuples([("region", None)] + measure_columns + [("type", None), ("beds", None)])

# reverse sort the column names
housing_df = housing_df.iloc[:, [0, 187, 188] + list(range(1, 187))]

In [79]:
# remove records with group total
housing_df = housing_df[~housing_df[('region', None)].str.contains('Group Total')]

# Spelling correction
housing_df['region'] = housing_df['region'].replace('Wanagaratta', 'Wangaratta')
housing_df['region'] = housing_df['region'].replace('Newcombe', 'Newcomb')

# find the na values
housing_df = housing_df.replace("-", np.nan)

In [80]:
# convert all values to integers
for measure_column in housing_df.columns[3:]:
    housing_df[measure_column] = housing_df[measure_column].astype(float)

In [81]:
# getting the median value only
housing_median = housing_df[[column for column in housing_df.columns if column[1] != "count"]]
housing_median.columns = [x[0] for x in housing_median.columns]

# getting the median average over the year
def conditional_row_mean(x):
    # Only calculate the mean if there are at least 3 non-NaN values
    return x.mean(axis=1).where(x.count(axis=1) >= 4, np.nan)

# only get the full years
housing_median_avg = housing_median.loc[:, "2000-03":"2022-12"]

# get the column labels to group together the months into years
year_labels = np.arange(len(housing_median_avg.columns)) // 4

# calculate the average
housing_median_avg = housing_median_avg.groupby(year_labels, axis=1).apply(conditional_row_mean)

In [82]:
# add the former columns
for column in housing_median.columns[:3]:
    housing_median_avg[column] = housing_median[column]

# rearrange
housing_median_avg = housing_median_avg[list(housing_median_avg.columns[-3:]) + list(housing_median_avg.columns[:-3])]

# rename the year columns
housing_median_avg.columns = list(housing_median_avg.columns[:3]) + list([str(x) for x in range(2000,2023)])

In [83]:
# Compute the yearly growth rate of the median rent for each suburb
housing_growth = housing_median_avg.copy()
housing_growth.iloc[:, 3:] = housing_growth.iloc[:, 3:].pct_change(axis=1) * 100

# make sure to drop the
housing_growth.head(5)

Unnamed: 0,region,type,beds,2000,2001,2002,2003,2004,2005,2006,...,2013,2014,2015,2016,2017,2018,2019,2020,2021,2022
0,East St Kilda,house,2,,3.503788,7.959744,-1.779661,3.796376,-0.831255,11.483655,...,1.64861,2.548656,0.54225,-1.123596,4.545455,2.826087,2.536998,-2.268041,-4.725738,7.307352
1,Elwood,house,2,,-4.327666,9.04685,2.222222,6.376812,1.839237,-3.67893,...,1.319149,-1.259975,2.50957,2.282158,-1.460446,3.540552,0.795229,0.631164,2.822423,-1.258101
2,Fitzroy,house,2,,7.156863,5.215005,0.434783,2.597403,3.881857,10.39805,...,0.961538,5.47619,5.914221,2.30179,-2.041667,6.890685,-2.307998,4.276986,-5.46875,4.256198
3,Fitzroy North-Clifton Hill,house,2,,4.950495,5.660377,1.785714,1.754386,2.844828,7.29254,...,3.740648,0.480769,1.913876,3.755869,2.941176,3.296703,3.829787,-0.614754,-1.525773,3.433836
4,Flemington-Kensington,house,2,,7.865169,4.166667,4.0,0.0,2.211538,5.174036,...,4.345238,0.969766,3.107345,3.287671,2.122016,3.376623,3.015075,1.463415,-3.846154,2.5


### Melting

In [28]:
housing_df = unpack_levels(housing_df, 3, "time stamp", "measure")
housing_df

Unnamed: 0,region,type,beds,time stamp,count,median
0,Altona,flat,1,2000-03,87.0,95.0
1,Altona,flat,1,2000-06,94.0,100.0
2,Altona,flat,1,2000-09,97.0,105.0
3,Altona,flat,1,2000-12,98.0,105.0
4,Altona,flat,1,2001-03,89.0,105.0
...,...,...,...,...,...,...
76999,Yarraville-Seddon,house,4,2022-03,55.0,660.0
77000,Yarraville-Seddon,house,4,2022-06,65.0,680.0
77001,Yarraville-Seddon,house,4,2022-09,74.0,700.0
77002,Yarraville-Seddon,house,4,2022-12,70.0,700.0


In [53]:
(housing_growth["median"] == 0).sum()

31496

In [85]:
housing_growth = housing_growth.melt(id_vars=housing_growth.columns[:3], 
                                     value_vars = housing_growth[3:],
                                    var_name=["median growth"]
                                    value_name="value")
housing_growth

IndexError: Too many levels: Index has only 1 level, not 2

## Exploration and Checking stuff

### Checking how many null values will be deleted

So the below basically tests how many growth rate records will be deleted when calculating the yearly growth rate, only accepting years will MAX_THRES years or less

In [59]:
START = 3
MAX_THRES = 0


print(housing_median.shape[0] * 23)

# split into batches
for max_thres in range(3):
    sum_lost = 0

    for batch_i in range(int((len(housing_median.columns) - 1 - START) / 4)):
        subset = housing_median.iloc[:, START + 4*batch_i: START + 4*(batch_i+2)]
        # checking the amoun     t of times the amount of na's is exceeded for the previous or current year
        sum_lost += subset.isna().apply(lambda x: (x.iloc[:4].sum() > max_thres) | 
                                        (x.iloc[4:].sum() > max_thres), axis=1).sum()

    print(sum_lost)

19044
1061
860
708


### Blah