**PREPROCESS**
-----------

In this notebook, we preprocess the historical rental Excel file and engineer features from it.

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

In [2]:
# Load the Excel file with multiple sheets
file_path = '../data/landing/Moving annual rent by suburb - March quarter 2023.xlsx'

sheets_to_load = ['1 bedroom flat', '2 bedroom flat', '3 bedroom flat', '2 bedroom house', '3 bedroom house', '4 bedroom house', 'All properties']

# Sheet layout, as 0-based positions (the sheets are read with header=None, so every
# spreadsheet row, including the header rows, is an ordinary data row).
DATE_ROW = 1          # row containing the date label of each column pair
FIRST_DATA_ROW = 3    # first suburb row; row 2 holds the "Median" labels and is skipped
SUBURB_COL = 1        # column containing the suburb names
FIRST_DATE_COL = 2    # date labels are read from columns 2, 4, 6, ...
FIRST_MEDIAN_COL = 3  # median values are read from columns 3, 5, 7, ...


def reshape_rent_sheet(sheet_df, property_type):
    """Convert one wide, header-less rent sheet into a long table.

    Parameters
    ----------
    sheet_df : pandas.DataFrame
        One sheet as returned by ``pd.read_excel(..., header=None)``.
    property_type : str
        Value stored in the ``Property_Type`` column (the sheet name).

    Returns
    -------
    pandas.DataFrame
        Columns ``Suburb``, ``Date``, ``Rent`` and ``Property_Type``, with one row per
        (suburb, date) pair. ``Rent`` is left exactly as read from the sheet; labels in
        the date row that do not match ``'%b %Y'`` become ``NaT``.
    """
    suburbs = sheet_df.iloc[FIRST_DATA_ROW:, SUBURB_COL].reset_index(drop=True)
    dates = sheet_df.iloc[DATE_ROW, FIRST_DATE_COL::2].reset_index(drop=True)
    medians = sheet_df.iloc[FIRST_DATA_ROW:, FIRST_MEDIAN_COL::2].reset_index(drop=True)

    # One column per date, one row per suburb
    wide = pd.DataFrame(medians.values, columns=dates)
    wide.insert(0, 'Suburb', suburbs)

    # Melt to long format: one row per (suburb, date)
    long_df = pd.melt(wide, id_vars=['Suburb'], var_name='Date', value_name='Rent')
    long_df['Date'] = pd.to_datetime(long_df['Date'], format='%b %Y', errors='coerce')
    long_df['Property_Type'] = property_type
    return long_df


# Read the Excel file into a dictionary of DataFrames without headers
dfs = pd.read_excel(file_path, sheet_name=sheets_to_load, header=None)

# Reshape every sheet and collect the long-format results
df_list = []
for sheet_name, df in dfs.items():
    print(f"Processing sheet: {sheet_name}")

    df_long = reshape_rent_sheet(df, sheet_name)
    df_list.append(df_long)

    print(f"Processed data for sheet: {sheet_name}")
    print(df_long.head())  # Print the processed data to check the output

# Concatenate all DataFrames into a single DataFrame
combined_df = pd.concat(df_list, ignore_index=True)

# Display the combined DataFrame
print("Combined DataFrame:")
print(combined_df.head())


Processing sheet: 1 bedroom flat
Processed data for sheet: 1 bedroom flat
                                  Suburb       Date Rent   Property_Type
0  Albert Park-Middle Park-West St Kilda 2000-03-01  165  1 bedroom flat
1                               Armadale 2000-03-01  150  1 bedroom flat
2                          Carlton North 2000-03-01  150  1 bedroom flat
3                      Carlton-Parkville 2000-03-01  165  1 bedroom flat
4                        CBD-St Kilda Rd 2000-03-01  250  1 bedroom flat
Processing sheet: 2 bedroom flat
Processed data for sheet: 2 bedroom flat
                                  Suburb       Date Rent   Property_Type
0  Albert Park-Middle Park-West St Kilda 2000-03-01  250  2 bedroom flat
1                               Armadale 2000-03-01  200  2 bedroom flat
2                          Carlton North 2000-03-01  215  2 bedroom flat
3                      Carlton-Parkville 2000-03-01  270  2 bedroom flat
4                        CBD-St Kilda Rd 2000-03-

In [3]:
# Preview the first five rows of the stacked long-format table
combined_df.head(n=5)

Unnamed: 0,Suburb,Date,Rent,Property_Type
0,Albert Park-Middle Park-West St Kilda,2000-03-01,165,1 bedroom flat
1,Armadale,2000-03-01,150,1 bedroom flat
2,Carlton North,2000-03-01,150,1 bedroom flat
3,Carlton-Parkville,2000-03-01,165,1 bedroom flat
4,CBD-St Kilda Rd,2000-03-01,250,1 bedroom flat


Cleaning up missing values

In [4]:

# Drop the rows labelled 'Group Total' (presumably subtotals rather than suburbs -- confirm
# against the workbook). The explicit .copy() makes the result an independent frame, so
# the column assignment below does not write into a slice of the previous frame and does
# not raise pandas' SettingWithCopyWarning.
combined_df = combined_df[combined_df['Suburb'] != 'Group Total'].copy()

print("Instances before cleaning: ", len(combined_df))

# Values that cannot be parsed as numbers become NaN ...
combined_df['Rent'] = pd.to_numeric(combined_df['Rent'], errors='coerce')

# ... and rows without a usable rent are removed
combined_df = combined_df.dropna(subset=['Rent'])

print("Instances after cleaning: ", len(combined_df))

# to_numeric may yield an integer dtype when no value needed coercing; force float
combined_df['Rent'] = combined_df['Rent'].astype(float)

Instances before cleaning:  81468
Instances after cleaning:  78545


Adding feature engineering

In [5]:
# Remove rows where 'Suburb' is 'Group Total'. The explicit .copy() gives an independent
# frame, so the column assignments below never write into a slice of the previous frame
# (avoids pandas' SettingWithCopyWarning).
combined_df = combined_df[combined_df['Suburb'] != 'Group Total'].copy()

# Extract time-based features
combined_df['Year'] = combined_df['Date'].dt.year
combined_df['Month'] = combined_df['Date'].dt.month
combined_df['Quarter'] = combined_df['Date'].dt.quarter

# Create seasonal indicators (Summer: December to February, and so on)
combined_df['Is_Summer'] = combined_df['Month'].isin([12, 1, 2]).astype(int)
combined_df['Is_Autumn'] = combined_df['Month'].isin([3, 4, 5]).astype(int)
combined_df['Is_Winter'] = combined_df['Month'].isin([6, 7, 8]).astype(int)
combined_df['Is_Spring'] = combined_df['Month'].isin([9, 10, 11]).astype(int)

# Days elapsed since the earliest date in the dataset
combined_df['Time_Since_Start'] = (combined_df['Date'] - combined_df['Date'].min()).dt.days

# One rent series per (Suburb, Property_Type), visited in chronological order.
# Sorting only the view that feeds the groupby keeps the row order of combined_df
# unchanged (results are aligned back on the index) while making the lag and rolling
# features independent of whatever order the rows happen to be in.
rent_by_series = (
    combined_df
    .sort_values('Date', kind='stable')
    .groupby(['Suburb', 'Property_Type'])['Rent']
)

# Lag features: the rent 1, 3 and 12 observations earlier in the same series.
# NOTE(review): the shifts count rows, not calendar months. The source workbook appears
# to be quarterly, so these are period lags, and a gap in a series (a date with no row)
# makes the lag reach further back in time -- confirm this is intended.
combined_df['Rent_Lag_1'] = rent_by_series.shift(1)
combined_df['Rent_Lag_3'] = rent_by_series.shift(3)
combined_df['Rent_Lag_12'] = rent_by_series.shift(12)

# Moving averages over the current and previous 2 / 11 observations of the same series
# (NaN until a full window is available)
combined_df['Rent_MA_3'] = rent_by_series.transform(lambda x: x.rolling(window=3).mean())
combined_df['Rent_MA_12'] = rent_by_series.transform(lambda x: x.rolling(window=12).mean())

# Encode categorical variables using one-hot encoding (currently disabled)
#combined_df = pd.get_dummies(combined_df, columns=['Property_Type'], drop_first=True)

# log(1 + Rent), applied unconditionally
combined_df['Log_Rent'] = np.log1p(combined_df['Rent'])

print("Data after feature engineering:")
combined_df


Data after feature engineering:


Unnamed: 0,Suburb,Date,Rent,Property_Type,Year,Month,Quarter,Is_Summer,Is_Autumn,Is_Winter,Is_Spring,Time_Since_Start,Rent_Lag_1,Rent_Lag_3,Rent_Lag_12,Rent_MA_3,Rent_MA_12,Log_Rent
0,Albert Park-Middle Park-West St Kilda,2000-03-01,165.0,1 bedroom flat,2000,3,1,0,1,0,0,0,,,,,,5.111988
1,Armadale,2000-03-01,150.0,1 bedroom flat,2000,3,1,0,1,0,0,0,,,,,,5.017280
2,Carlton North,2000-03-01,150.0,1 bedroom flat,2000,3,1,0,1,0,0,0,,,,,,5.017280
3,Carlton-Parkville,2000-03-01,165.0,1 bedroom flat,2000,3,1,0,1,0,0,0,,,,,,5.111988
4,CBD-St Kilda Rd,2000-03-01,250.0,1 bedroom flat,2000,3,1,0,1,0,0,0,,,,,,5.525453
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
88716,Traralgon,2023-03-01,385.0,All properties,2023,3,1,0,1,0,0,8400,380.0,380.0,300.0,381.666667,349.583333,5.955837
88717,Wanagaratta,2023-03-01,380.0,All properties,2023,3,1,0,1,0,0,8400,380.0,380.0,290.0,380.000000,346.500000,5.942799
88718,Warragul,2023-03-01,440.0,All properties,2023,3,1,0,1,0,0,8400,430.0,400.0,350.0,430.000000,392.916667,6.089045
88719,Warrnambool,2023-03-01,420.0,All properties,2023,3,1,0,1,0,0,8400,420.0,400.0,340.0,420.000000,379.166667,6.042633


In [6]:
# Persist the engineered dataset. With to_csv's defaults the DataFrame index is written
# as an unnamed first column.
output_path = "../data/curated/historical_rental_data.csv"
combined_df.to_csv(output_path)

In [7]:
# Rent history of a single series: 1 bedroom flats in the suburb "Altona"
is_altona_1br = (combined_df['Suburb'] == 'Altona') & (combined_df['Property_Type'] == '1 bedroom flat')
altona_df = combined_df[is_altona_1br]

# Keep only 'Date' and 'Rent', ordered by date, with a fresh 0..n-1 index
altona_rent_df = (
    altona_df[['Date', 'Rent']]
    .sort_values(by='Date')
    .reset_index(drop=True)
)

# Show every row of this one table. option_context restores the previous
# 'display.max_rows' setting on exit, so later cells are not affected the way a global
# pd.set_option call would affect them.
with pd.option_context('display.max_rows', None):
    display(altona_rent_df)


Unnamed: 0,Date,Rent
0,2000-03-01,95.0
1,2000-06-01,100.0
2,2000-09-01,105.0
3,2000-12-01,105.0
4,2001-03-01,105.0
5,2001-06-01,105.0
6,2001-09-01,110.0
7,2001-12-01,110.0
8,2002-03-01,115.0
9,2002-06-01,115.0
