In [1]:
import pandas as pd
import numpy as np
import os
from collections import Counter

In [152]:
# Directory scanned for the input housing files (relative to the notebook's working directory).
RELATIVE_PATH = "../../data/landing/housing/"
# Directory the processed table is written to as "housing.csv" (also relative to the working directory).
RELATIVE_OUT = "../../data/raw/historic/"

### Functions

In [165]:
"""will basically create a record for each entry of one of the columns, and a new column for another entry
Note: assumes a multi index
Note: index_stop is the number of columns in the original dataframe that don't need to be melted"""
def unpack_levels(df, index_stop, level_1_name, level_2_name, record_level_2=False):
    """Reshape a frame with two-level column labels into one row per level entry.

    The frame is first melted completely (one row per identifier/level-1/level-2
    combination) and then pivoted so that one of the two column levels becomes
    regular columns again while the other stays as a row-level field.

    Parameters
    ----------
    df : DataFrame whose columns are a two-level MultiIndex.
    index_stop : number of leading columns that are identifiers and must not be melted.
    level_1_name : name given to the column holding the first-level labels.
    level_2_name : name given to the column holding the second-level labels.
    record_level_2 : when False (default) the second-level labels become columns and
        the first level is kept per row; when True the roles are swapped.

    Returns
    -------
    DataFrame with the identifier columns (renamed to the first element of their
    original label), the row-level column, and one column per pivoted label.
    """
    original_labels = df.columns.to_list()

    # long format: one row per (identifiers, level 1 label, level 2 label)
    long_df = df.melt(id_vars=original_labels[:index_stop],
                      value_vars=original_labels[index_stop:],
                      var_name=[level_1_name, level_2_name],
                      value_name="value")

    # decide which level stays on the rows and which is spread back out into columns
    if record_level_2:
        row_level, spread_level = level_2_name, level_1_name
    else:
        row_level, spread_level = level_1_name, level_2_name

    wide_df = long_df.pivot(index=list(long_df.columns[:index_stop]) + [row_level],
                            columns=spread_level, values="value")

    # flatten the index back into columns; identifier labels are still tuples here,
    # so keep only their first element as the column name
    wide_df = wide_df.reset_index()
    identifier_names = [label[0] for label in wide_df.columns[:index_stop]]
    wide_df.columns = identifier_names + list(wide_df.columns[index_stop:])

    return wide_df


### Download

In [172]:
# Build one dataframe per landing file, then stack them into housing_df_raw.
# File names are split on "_": the first field is stored as "type" and the
# second (which must parse as an integer) as "beds".
housing_df_array = []

# os.listdir() returns entries in arbitrary order; sort them so the row order of
# the concatenated frame is the same on every run
for file_name in sorted(os.listdir(RELATIVE_PATH)):
    file_path = os.path.join(RELATIVE_PATH, file_name)

    # skip hidden entries (e.g. .DS_Store, .ipynb_checkpoints) and sub-directories:
    # they are not data files and would break the name parsing or the CSV read
    if file_name.startswith(".") or not os.path.isfile(file_path):
        continue

    # get the information about the file
    name_fields = file_name.split("_")
    curr_type = name_fields[0]
    curr_beds = name_fields[1]

    # read the frame and tag every row with the information from the file name
    curr_df = pd.read_csv(file_path)
    curr_df["type"] = curr_type
    curr_df["beds"] = int(curr_beds)

    # drop the first column (presumably a saved row index - TODO confirm)
    curr_df = curr_df.drop(curr_df.columns[0], axis=1)

    housing_df_array.append(curr_df)

# stack the per-file frames on top of each other
housing_df_raw = pd.concat(housing_df_array)

print(housing_df_raw.shape)
print(housing_df_raw.dtypes)
housing_df_raw.head(5)

(906, 189)
Unnamed: 1      object
Mar 2000        object
Unnamed: 3      object
Jun 2000        object
Unnamed: 5      object
                 ...  
Unnamed: 185    object
Mar 2023        object
Unnamed: 187    object
type            object
beds             int64
Length: 189, dtype: object


Unnamed: 0,Unnamed: 1,Mar 2000,Unnamed: 3,Jun 2000,Unnamed: 5,Sep 2000,Unnamed: 7,Dec 2000,Unnamed: 9,Mar 2001,...,Jun 2022,Unnamed: 181,Sep 2022,Unnamed: 183,Dec 2022,Unnamed: 185,Mar 2023,Unnamed: 187,type,beds
0,East St Kilda,62,255,62,263,60,270,64,268,56,...,41,595,50,618,48,625,42,650,house,2
1,Elwood,39,315,38,318,36,333,42,328,40,...,23,670,21,650,16,600,18,600,house,2
2,Fitzroy,125,250,137,250,131,260,143,260,132,...,88,625,92,630,93,640,79,650,house,2
3,Fitzroy North-Clifton Hill,272,250,298,250,290,250,292,260,272,...,204,620,208,620,204,630,189,650,house,2
4,Flemington-Kensington,188,220,184,220,205,220,206,230,219,...,209,520,213,520,206,510,204,523,house,2


- Manually checked that every column follows this pattern: each labelled quarter column is followed by one unnamed column.
- The source dataset contains a labelling mistake: "Dec 2003" appears where "Dec 2002" should be.

### Renaming, Conversion and Missing values

In [173]:
# One "YYYY-MM" label per quarter (named by the quarter's last month), 2000 to 2023
time_stamps = [f"{year}-{month}"
               for year in range(2000, 2024)
               for month in ["03", "06", "09", "12"]]

# remove the last 3 quarters (because they don't exist in the dataframe:
# the data stops at Mar 2023)
time_stamps = time_stamps[:-3]

# every quarter contributes two columns, in this order
measure_columns = [(time_stamp, measure) for time_stamp in time_stamps for measure in ["count", "median"]]

# Labels are assigned purely by position, which also overwrites any mislabelled
# header in the source files. Check the width first so a frame with a different
# number of quarters fails with a clear message instead of being mislabelled.
new_columns = [("region", None)] + measure_columns + [("type", None), ("beds", None)]
n_columns = housing_df_raw.shape[1]
if len(new_columns) != n_columns:
    raise ValueError(
        f"expected {len(new_columns)} columns (region + count/median per quarter + type + beds), "
        f"found {n_columns}"
    )

# rename the columns
housing_df_raw.columns = pd.MultiIndex.from_tuples(new_columns)

# move the two trailing identifier columns (type, beds) directly after region,
# leaving the measure columns in their original chronological order
identifier_positions = [0, n_columns - 2, n_columns - 1]
measure_positions = list(range(1, n_columns - 2))
housing_df_raw = housing_df_raw.iloc[:, identifier_positions + measure_positions]

In [175]:
# remove records with group total
# The mask has to be built from housing_df_raw itself: housing_df is only created
# in the melting step further down, so referencing it here fails on a fresh kernel.
is_group_total = housing_df_raw[('region', None)].str.contains('Group Total')
# .copy() so the column assignments below act on an independent frame, not a slice
housing_df_raw = housing_df_raw[~is_group_total].copy()

# Spelling correction
housing_df_raw['region'] = housing_df_raw['region'].replace('Wanagaratta', 'Wangaratta')
housing_df_raw['region'] = housing_df_raw['region'].replace('Newcombe', 'Newcomb')

# cells holding exactly "-" mark missing values in the source; turn them into NaN
housing_df_raw = housing_df_raw.replace("-", np.nan)

# convert all measure columns (everything after region, type, beds) to floats;
# float rather than int because the columns may now contain NaN
for measure_column in housing_df_raw.columns[3:]:
    housing_df_raw[measure_column] = housing_df_raw[measure_column].astype(float)

housing_df_raw.head(5)

Unnamed: 0_level_0,region,type,beds,2000-03,2000-03,2000-06,2000-06,2000-09,2000-09,2000-12,...,2022-03,2022-03,2022-06,2022-06,2022-09,2022-09,2022-12,2022-12,2023-03,2023-03
Unnamed: 0_level_1,NaN,NaN,NaN,count,median,count,median,count,median,count,...,count,median,count,median,count,median,count,median,count,median
0,East St Kilda,house,2,62.0,255.0,62.0,263.0,60.0,270.0,64.0,...,44.0,585.0,41.0,595.0,50.0,618.0,48.0,625.0,42.0,650.0
1,Elwood,house,2,39.0,315.0,38.0,318.0,36.0,333.0,42.0,...,21.0,670.0,23.0,670.0,21.0,650.0,16.0,600.0,18.0,600.0
2,Fitzroy,house,2,125.0,250.0,137.0,250.0,131.0,260.0,143.0,...,80.0,628.0,88.0,625.0,92.0,630.0,93.0,640.0,79.0,650.0
3,Fitzroy North-Clifton Hill,house,2,272.0,250.0,298.0,250.0,290.0,250.0,292.0,...,184.0,600.0,204.0,620.0,208.0,620.0,204.0,630.0,189.0,650.0
4,Flemington-Kensington,house,2,188.0,220.0,184.0,220.0,205.0,220.0,206.0,...,216.0,500.0,209.0,520.0,213.0,520.0,206.0,510.0,204.0,523.0


### Melting

In [176]:
# maximum number of earlier values kept for each entry
N_VALUES = 7

def get_previous_periods(series):
    """Return, for every entry of `series`, the list of values that precede it.

    Each list holds at most N_VALUES values, oldest first, and never includes the
    entry's own value; the first entry therefore gets an empty list. The result
    is a Series of lists carrying the same index as `series`.
    """
    observed = list(series)

    # window for position p is the slice ending just before p, at most N_VALUES long
    trailing_windows = [
        observed[max(0, position - N_VALUES):position]
        for position in range(len(observed))
    ]

    return pd.Series(trailing_windows, index=series.index)

In [177]:
# turn everything into records: one row per (region, type, beds, time stamp)
# with the two measures ("count", "median") as columns
housing_df = unpack_levels(housing_df_raw, 3, "time stamp", "measure")

# split the "YYYY-MM" time stamp into integer year and month parts
housing_df[["year", "quarter"]] = housing_df["time stamp"].str.split("-", expand=True).astype(int)

# get the actual quarter: months 3, 6, 9, 12 -> quarters 1, 2, 3, 4
housing_df["quarter"] = housing_df["quarter"] // 3

# For every record, collect the medians of the preceding periods (up to the limit
# set inside get_previous_periods). The window must stay inside a single series,
# i.e. one region / property type / bedroom count combination; grouping by region
# alone would let values from a different type or bedroom count leak into the window.
housing_df["previous 2 years"] = (
    housing_df
    .groupby(["region", "type", "beds"], group_keys=False)["median"]
    .apply(get_previous_periods)
)

# average over the collected window; None when there is no earlier value.
# Note: a NaN anywhere in the window makes the average NaN as well.
housing_df["avg 2 years"] = housing_df["previous 2 years"].apply(lambda x: sum(x) / len(x) if len(x) > 0 else None)

# to csv (create the output directory first so a fresh checkout does not fail here)
os.makedirs(RELATIVE_OUT, exist_ok=True)
housing_df.to_csv(os.path.join(RELATIVE_OUT, "housing.csv"), index=False)

## Exploration and Checking stuff

### Checking how many null values will be deleted

The cell below counts how many growth-rate records would be dropped when calculating the yearly growth rate, if a year is only accepted when it has MAX_THRES or fewer missing values.

In [59]:
# NOTE(review): housing_median is not defined in any of the cells visible here;
# it has to exist in the kernel before this cell can run - TODO confirm its source.
START = 3
MAX_THRES = 0


# reference figure printed first: number of rows times 23
# (presumably the number of year-on-year comparisons per row - TODO confirm)
print(housing_median.shape[0] * 23)

# number of 4-column batches after the first START columns
n_batches = int((len(housing_median.columns) - 1 - START) / 4)

# repeat the count for each allowed number of missing values (0, 1, 2)
for max_thres in range(3):

    def exceeds_threshold(na_row):
        # True when either the first four columns of the window or the remaining
        # ones contain more than max_thres missing values
        first_part_missing = na_row.iloc[:4].sum()
        second_part_missing = na_row.iloc[4:].sum()
        return (first_part_missing > max_thres) | (second_part_missing > max_thres)

    sum_lost = 0

    for batch_i in range(n_batches):
        # window of up to 8 columns: two consecutive 4-column batches
        window_start = START + 4 * batch_i
        na_flags = housing_median.iloc[:, window_start: window_start + 8].isna()

        # count the rows where the number of NaNs is exceeded for the previous or current year
        sum_lost += na_flags.apply(exceeds_threshold, axis=1).sum()

    print(sum_lost)

19044
1061
860
708


### Blah