**PREPROCESS**
-----------

In this notebook, we preprocess the historical rental Excel file and engineer features from it.

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

Reading the Excel file with historical rent data

In [None]:
# Workbook holding the historical rent figures. Each sheet listed below is later
# tagged as its own Property_Type.
file_path = '../data/landing/Moving annual rent by suburb - March quarter 2024.xlsx'

sheets_to_load = [
    '1 bedroom flat',
    '2 bedroom flat',
    '3 bedroom flat',
    '2 bedroom house',
    '3 bedroom house',
    '4 bedroom house',
    'All properties',
]

# Passing a list of sheet names makes read_excel return a dict keyed by sheet name.
# header=None keeps every spreadsheet row as data, so the cells that follow can
# address the header rows by position.
dfs = pd.read_excel(file_path, sheet_name=sheets_to_load, header=None)

# Accumulates one long-format frame per sheet.
df_list = []

Iterating through all the Excel sheets to process and clean the data

In [2]:

# Start from an empty list every time this cell runs, so that re-running it does
# not append a second copy of every sheet (which would duplicate rows in
# combined_df).
df_list = []

# 0-based positions inside each sheet (sheets were read with header=None).
start_row = 3  # first row that holds suburb records
date_row = 1   # header row that holds the period labels

for sheet_name, df in dfs.items():
    print(f"Processing sheet: {sheet_name}")

    # Suburb names live in column 1. Period labels are read from every second
    # column starting at column 2, and the rent values from every second column
    # starting at column 3, so the i-th label is paired with the i-th value column.
    df_suburbs = df.iloc[start_row:, 1].reset_index(drop=True)
    df_dates = df.iloc[date_row, 2::2].reset_index(drop=True)
    df_medians = df.iloc[start_row:, 3::2].reset_index(drop=True)

    # Combining the important features into a new dataframe: one row per suburb,
    # one column per period. Raises ValueError if the number of labels and value
    # columns differ.
    df_values = pd.DataFrame(df_medians.values, columns=df_dates)
    df_values.insert(0, 'Suburb', df_suburbs)

    # Converting to long format: one row per (Suburb, Date)
    df_long = pd.melt(df_values, id_vars=['Suburb'], var_name='Date', value_name='Rent')

    # Converting date to datetime; labels that do not match '%b %Y' become NaT
    df_long['Date'] = pd.to_datetime(df_long['Date'], format='%b %Y', errors='coerce')

    # Adding property type as a feature
    df_long['Property_Type'] = sheet_name

    # Appending to a list
    df_list.append(df_long)

    print(f"Processed data for sheet: {sheet_name}")
    print(df_long.head())

# Concatenating all dataframes; ignore_index gives combined_df a fresh unique index
combined_df = pd.concat(df_list, ignore_index=True)
print("Combined DataFrame:")
print(combined_df.head())


Processing sheet: 1 bedroom flat
Processed data for sheet: 1 bedroom flat
                                  Suburb       Date Rent   Property_Type
0  Albert Park-Middle Park-West St Kilda 2000-03-01  165  1 bedroom flat
1                               Armadale 2000-03-01  150  1 bedroom flat
2                          Carlton North 2000-03-01  150  1 bedroom flat
3                      Carlton-Parkville 2000-03-01  165  1 bedroom flat
4                        CBD-St Kilda Rd 2000-03-01  250  1 bedroom flat
Processing sheet: 2 bedroom flat
Processed data for sheet: 2 bedroom flat
                                  Suburb       Date Rent   Property_Type
0  Albert Park-Middle Park-West St Kilda 2000-03-01  250  2 bedroom flat
1                               Armadale 2000-03-01  200  2 bedroom flat
2                          Carlton North 2000-03-01  215  2 bedroom flat
3                      Carlton-Parkville 2000-03-01  260  2 bedroom flat
4                        CBD-St Kilda Rd 2000-03-

Cleaning up missing values

In [4]:

# Drop the 'Group Total' rows. The explicit .copy() makes the result an
# independent frame, so the column assignments below do not trigger pandas'
# SettingWithCopyWarning.
combined_df = combined_df[combined_df['Suburb'] != 'Group Total'].copy()

print("Instances before cleaning: ", len(combined_df))

# Anything that cannot be parsed as a number becomes NaN ...
combined_df['Rent'] = pd.to_numeric(combined_df['Rent'], errors='coerce')

# ... and rows without a usable rent value are removed.
combined_df = combined_df.dropna(subset=['Rent']).copy()

print("Instances after cleaning: ", len(combined_df))

# to_numeric may yield an integer dtype when every value is a whole number;
# force float so the column dtype does not depend on the data.
combined_df['Rent'] = combined_df['Rent'].astype(float)

Instances before cleaning:  99134
Instances after cleaning:  95483


Engineering features

In [5]:
# Remove rows where 'Suburb' is 'Group Total'. The .copy() makes the filtered
# frame independent so the column assignments below do not trigger pandas'
# SettingWithCopyWarning.
combined_df = combined_df[combined_df['Suburb'] != 'Group Total'].copy()

# Extract time-based features
combined_df['Year'] = combined_df['Date'].dt.year
combined_df['Month'] = combined_df['Date'].dt.month
combined_df['Quarter'] = combined_df['Date'].dt.quarter

# Create season indicator features (December-February is flagged as summer)
combined_df['Is_Summer'] = combined_df['Month'].isin([12, 1, 2]).astype(int)
combined_df['Is_Autumn'] = combined_df['Month'].isin([3, 4, 5]).astype(int)
combined_df['Is_Winter'] = combined_df['Month'].isin([6, 7, 8]).astype(int)
combined_df['Is_Spring'] = combined_df['Month'].isin([9, 10, 11]).astype(int)

# Calculate time since the earliest date in the data, in days
combined_df['Time_Since_Start'] = (combined_df['Date'] - combined_df['Date'].min()).dt.days

# Lag and moving-average features are computed per (Suburb, Property_Type) series.
# Sort chronologically inside each series first so that "previous row" really
# means "previous observation" rather than relying on the row order left by the
# earlier concatenation. The results keep the original index labels, so assigning
# them back aligns by index and leaves the row order of combined_df untouched.
rent_by_series = (
    combined_df
    .sort_values(['Suburb', 'Property_Type', 'Date'])
    .groupby(['Suburb', 'Property_Type'])['Rent']
)

# Create lag features: the value 1, 3 and 12 observations earlier in the series.
# The offsets count rows, not calendar months, and rows with a missing rent were
# dropped earlier, so a lag can span a gap in the series.
# NOTE(review): the workbook name ("March quarter 2024") suggests quarterly data,
# in which case Rent_Lag_12 is 12 quarters back, not 12 months - confirm.
combined_df['Rent_Lag_1'] = rent_by_series.shift(1)
combined_df['Rent_Lag_3'] = rent_by_series.shift(3)
combined_df['Rent_Lag_12'] = rent_by_series.shift(12)

# Create moving average features over the latest 3 and 12 observations
# (NaN until a series has that many observations)
combined_df['Rent_MA_3'] = rent_by_series.transform(lambda x: x.rolling(window=3).mean())
combined_df['Rent_MA_12'] = rent_by_series.transform(lambda x: x.rolling(window=12).mean())

# Apply log transformation to Rent (log(1 + Rent))
combined_df['Log_Rent'] = np.log1p(combined_df['Rent'])

print("Data after feature engineering:")
combined_df


Data after feature engineering:


Unnamed: 0,Suburb,Date,Rent,Property_Type,Year,Month,Quarter,Is_Summer,Is_Autumn,Is_Winter,Is_Spring,Time_Since_Start,Rent_Lag_1,Rent_Lag_3,Rent_Lag_12,Rent_MA_3,Rent_MA_12,Log_Rent
0,Albert Park-Middle Park-West St Kilda,2000-03-01,165.0,1 bedroom flat,2000,3,1,0,1,0,0,0,,,,,,5.111988
1,Armadale,2000-03-01,150.0,1 bedroom flat,2000,3,1,0,1,0,0,0,,,,,,5.017280
2,Carlton North,2000-03-01,150.0,1 bedroom flat,2000,3,1,0,1,0,0,0,,,,,,5.017280
3,Carlton-Parkville,2000-03-01,165.0,1 bedroom flat,2000,3,1,0,1,0,0,0,,,,,,5.111988
4,CBD-St Kilda Rd,2000-03-01,250.0,1 bedroom flat,2000,3,1,0,1,0,0,0,,,,,,5.525453
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
107955,Traralgon,2024-03-01,410.0,All properties,2024,3,1,0,1,0,0,8766,395.0,390.0,330.0,398.333333,376.583333,6.018593
107956,Wanagaratta,2024-03-01,400.0,All properties,2024,3,1,0,1,0,0,8766,395.0,390.0,320.0,395.000000,375.416667,5.993961
107957,Warragul,2024-03-01,470.0,All properties,2024,3,1,0,1,0,0,8766,460.0,450.0,375.0,460.000000,423.333333,6.154858
107958,Warrnambool,2024-03-01,460.0,All properties,2024,3,1,0,1,0,0,8766,460.0,430.0,350.0,456.666667,414.166667,6.133398


Saving cleaned and feature engineered data

In [6]:
# Persist the cleaned, feature-engineered data. The DataFrame index is written as
# the first (unnamed) column because to_csv is called with its default index=True.
output_path = "../data/curated/historical_rental_data.csv"
combined_df.to_csv(output_path)