### Import Libraries and Data

In [1]:
import pandas as pd
import os
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Lasso
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
import numpy as np

In [2]:
# Rental History Data: one CSV per property type, read from the raw rental_history folder
one_bed_flat = pd.read_csv('../data/raw/rental_history/one_bed_flat.csv')
two_bed_flat = pd.read_csv('../data/raw/rental_history/two_bed_flat.csv')
three_bed_flat = pd.read_csv('../data/raw/rental_history/three_bed_flat.csv')
two_bed_house = pd.read_csv('../data/raw/rental_history/two_bed_house.csv')
three_bed_house = pd.read_csv('../data/raw/rental_history/three_bed_house.csv')
four_bed_house = pd.read_csv('../data/raw/rental_history/four_bed_house.csv')
all_properties = pd.read_csv('../data/raw/rental_history/all_properties.csv')

# Domain Rental Data: one curated CSV per property type
domain_one_bed_flat = pd.read_csv('../data/curated/domain_one_bed_flat_rent.csv')
domain_two_bed_flat = pd.read_csv('../data/curated/domain_two_bed_flat_rent.csv')
domain_three_bed_flat = pd.read_csv('../data/curated/domain_three_bed_flat_rent.csv')
domain_two_bed_house = pd.read_csv('../data/curated/domain_two_bed_house_rent.csv')
domain_three_bed_house = pd.read_csv('../data/curated/domain_three_bed_house_rent.csv')
# NOTE(review): this file name has no `_rent` suffix, unlike the six sibling files - confirm it is intentional
domain_four_bed_house = pd.read_csv('../data/curated/domain_four_bed_house.csv')
domain_all_properties = pd.read_csv('../data/curated/domain_all_properties_rent.csv')

# Other engineered feature sets
crimes = pd.read_csv('../data/curated/crimes.csv')
population = pd.read_csv('../data/curated/final_population.csv')
education = pd.read_csv('../data/curated/education_df.csv')
# NOTE(review): read from the raw folder while the other feature sets come from curated - confirm
urban_landmarks = pd.read_csv('../data/raw/urban_landmarks_features.csv')
pt_distances = pd.read_csv('../data/curated/suburb_transport_distances.csv')
income = pd.read_csv('../data/curated/income.csv')

# Livability Index data: one curated CSV per property type
livability_one_bed_flat = pd.read_csv('../data/curated/livability/livability_one_bed_flat.csv')
livability_two_bed_flat = pd.read_csv('../data/curated/livability/livability_two_bed_flat.csv')
livability_three_bed_flat = pd.read_csv('../data/curated/livability/livability_three_bed_flat.csv')
livability_two_bed_house = pd.read_csv('../data/curated/livability/livability_two_bed_house.csv')
livability_three_bed_house = pd.read_csv('../data/curated/livability/livability_three_bed_house.csv')
livability_four_bed_house = pd.read_csv('../data/curated/livability/livability_four_bed_house.csv')
livability_all_properties = pd.read_csv('../data/curated/livability/livability_all_properties.csv')
# Bare last expression: renders the frame as the cell output
livability_all_properties

Unnamed: 0,year,suburb,livability
0,2016,abbotsford,0.325155
1,2017,abbotsford,0.322438
2,2018,abbotsford,0.320943
3,2019,abbotsford,0.314829
4,2020,abbotsford,0.313954
...,...,...,...
2359,2023,yarraville,0.506413
2360,2024,yarraville,0.506005
2361,2025,yarraville,0.505578
2362,2026,yarraville,0.505217


### Formatting Rental Dataframes

In [3]:
def clean_domain_df(df, year=2024):
    """
    Standardise a Domain dataframe so it lines up with the rental
    history dataframes.

    Steps:
    - drop every column whose name starts with 'Unnamed:'
    - rename 'median_rent' to 'sep_median' (when present)
    - add a constant 'year' column and move it to the second position

    Parameters:
    df: Domain dataframe to clean. It is not modified.
    year: value written into the 'year' column (defaults to 2024).

    Returns:
    A new dataframe with the cleaned columns.
    """
    # Work on an explicit copy: assigning a column onto the result of a
    # .loc column selection can otherwise raise SettingWithCopyWarning.
    cleaned = df.loc[:, ~df.columns.str.contains('^Unnamed:')].copy()

    # Rename the 'median_rent' column to 'sep_median'
    if 'median_rent' in cleaned.columns:
        cleaned = cleaned.rename(columns={'median_rent': 'sep_median'})

    # Add (or overwrite) the 'year' column with the same value on every row
    cleaned['year'] = year

    # Reorder columns to make 'year' the second column
    cols = list(cleaned.columns)
    cols.insert(1, cols.pop(cols.index('year')))

    return cleaned[cols]

# Apply the clean_domain_df function to all the domain dataframes.
# Each name is rebound to the cleaned result, so the frames as loaded from CSV
# are no longer reachable after this cell.
domain_one_bed_flat = clean_domain_df(domain_one_bed_flat)
domain_two_bed_flat = clean_domain_df(domain_two_bed_flat)
domain_three_bed_flat = clean_domain_df(domain_three_bed_flat)
domain_two_bed_house = clean_domain_df(domain_two_bed_house)
domain_three_bed_house = clean_domain_df(domain_three_bed_house)
domain_four_bed_house = clean_domain_df(domain_four_bed_house)
domain_all_properties = clean_domain_df(domain_all_properties)


Impute the Sep median price from scraped properties into the rental history dataframes

In [4]:
def impute_sep_2024_rental_data(rental_history_df, domain_df, min_records=9, impute_year=2024):
    """
    Overwrite the September median rent of one year in a rental history
    dataframe with the value from the matching Domain dataframe.

    Parameters:
    rental_history_df: dataframe with 'suburb', 'year' and 'sep_median' columns.
    domain_df: dataframe with 'suburb', 'year' and 'sep_median' columns.
    min_records: suburbs with fewer rows than this in the merged result
        are dropped (defaults to 9).
    impute_year: the year whose 'sep_median' is overwritten (defaults to 2024).

    Returns:
    merged_df: the rental history rows with the imputed values, restricted
        to suburbs that have at least `min_records` rows.
    domain_df: a copy of the Domain dataframe without the 'year',
        'sep_median' and 'num_properties' columns.

    Neither input dataframe is modified.
    """

    # Left-merge on 'suburb' only, so every year of a suburb gets the Domain value attached;
    # overlapping column names from the Domain side receive the '_domain' suffix.
    # NOTE(review): a suburb listed more than once in domain_df would duplicate its
    # rental rows here - confirm the Domain frames hold one row per suburb.
    domain_lookup = domain_df[['suburb', 'year', 'sep_median']]
    merged_df = pd.merge(rental_history_df, domain_lookup,
                         on=['suburb'], how='left', suffixes=('', '_domain'))

    # Replace sep_median only on rows of the imputation year that have a Domain value
    condition = (merged_df['year'] == impute_year) & merged_df['sep_median_domain'].notna()
    merged_df.loc[condition, 'sep_median'] = merged_df.loc[condition, 'sep_median_domain']

    # Drop the domain-specific columns used for imputation
    merged_df = merged_df.drop(columns=['sep_median_domain', 'year_domain'])

    # Keep only the suburbs that appear at least `min_records` times
    suburb_counts = merged_df['suburb'].value_counts()
    suburbs_to_keep = suburb_counts[suburb_counts >= min_records].index
    merged_df = merged_df[merged_df['suburb'].isin(suburbs_to_keep)]

    # Remove the columns that are no longer needed from the Domain dataframe;
    # errors='ignore' tolerates frames that lack some of them.
    domain_df = domain_df.drop(columns=['year', 'sep_median', 'num_properties'], errors='ignore')

    return merged_df, domain_df

# Apply the function to each dataset.
# Both names of each pair are rebound. The returned domain frames no longer carry
# 'year' / 'sep_median', which the function selects from its domain argument, so
# re-running this cell without re-running the cells above raises a KeyError.
one_bed_flat, domain_one_bed_flat = impute_sep_2024_rental_data(one_bed_flat, domain_one_bed_flat)
two_bed_flat, domain_two_bed_flat = impute_sep_2024_rental_data(two_bed_flat, domain_two_bed_flat)
three_bed_flat, domain_three_bed_flat = impute_sep_2024_rental_data(three_bed_flat, domain_three_bed_flat)
two_bed_house, domain_two_bed_house = impute_sep_2024_rental_data(two_bed_house, domain_two_bed_house)
three_bed_house, domain_three_bed_house = impute_sep_2024_rental_data(three_bed_house, domain_three_bed_house)
four_bed_house, domain_four_bed_house = impute_sep_2024_rental_data(four_bed_house, domain_four_bed_house)
all_properties, domain_all_properties = impute_sep_2024_rental_data(all_properties, domain_all_properties)

### Combining All Feature Sets

In [5]:
def merge_feature_sets(rental_df, livability_df, domain_df):
    """
    Combine one property type's rental history with every feature set.

    The shared feature frames (crimes, income, population, education,
    urban_landmarks, pt_distances) are read from the notebook namespace;
    only the three property-type-specific frames are passed in.

    Parameters:
    rental_df: rental history dataframe for the property type.
    livability_df: livability dataframe for the property type.
    domain_df: Domain dataframe for the property type.

    Returns:
    The merged dataframe, restricted to suburbs present in
    population['sa2_name'] and with exact duplicate rows removed.
    """
    # Suburb/year feature sets: outer merges keep rows found on either side
    merged = pd.merge(rental_df, crimes, on=['suburb', 'year'], how='outer')
    merged = pd.merge(merged, income, on=['suburb', 'year'], how='outer')
    merged = pd.merge(merged, population, left_on=['suburb', 'year'], right_on=['sa2_name', 'year'], how='outer')

    # From here on inner merges: a row survives only if every remaining source has a match
    merged = pd.merge(merged, livability_df, on=['suburb', 'year'], how='inner')
    merged = pd.merge(merged, education, on='suburb', how='inner')
    merged = pd.merge(merged, urban_landmarks, left_on='suburb', right_on='gazetted_locality', how='inner')
    merged = pd.merge(merged, pt_distances, on='suburb', how='inner')
    merged = pd.merge(merged, domain_df, on='suburb', how='inner')

    # Retain only the suburbs that also appear in the population dataframe
    merged = merged[merged['suburb'].isin(population['sa2_name'])]
    return merged.drop_duplicates()


# Build the merged feature set for each property type
one_bed_flat_merged = merge_feature_sets(one_bed_flat, livability_one_bed_flat, domain_one_bed_flat)
two_bed_flat_merged = merge_feature_sets(two_bed_flat, livability_two_bed_flat, domain_two_bed_flat)
three_bed_flat_merged = merge_feature_sets(three_bed_flat, livability_three_bed_flat, domain_three_bed_flat)
two_bed_house_merged = merge_feature_sets(two_bed_house, livability_two_bed_house, domain_two_bed_house)
three_bed_house_merged = merge_feature_sets(three_bed_house, livability_three_bed_house, domain_three_bed_house)
four_bed_house_merged = merge_feature_sets(four_bed_house, livability_four_bed_house, domain_four_bed_house)
all_properties_merged = merge_feature_sets(all_properties, livability_all_properties, domain_all_properties)

In [6]:
# Keep only the first suburb column; discard its aliases
def clean_suburb_cols(df):
    """
    Return the dataframe without the columns that duplicate 'suburb'
    after merging: any column whose name contains 'Unnamed', plus
    'sa2_name' and 'gazetted_locality' when they are present.
    """
    is_unnamed = df.columns.str.contains('Unnamed')
    is_alias = df.columns.isin(['sa2_name', 'gazetted_locality'])

    # A column is kept when it is neither unnamed nor a suburb alias
    return df.loc[:, ~(is_unnamed | is_alias)]

# Clean the column names: each merged frame is rebound to its copy without
# the duplicate suburb columns
one_bed_flat_merged = clean_suburb_cols(one_bed_flat_merged)
two_bed_flat_merged = clean_suburb_cols(two_bed_flat_merged)
three_bed_flat_merged = clean_suburb_cols(three_bed_flat_merged)
two_bed_house_merged = clean_suburb_cols(two_bed_house_merged)
three_bed_house_merged = clean_suburb_cols(three_bed_house_merged)
four_bed_house_merged = clean_suburb_cols(four_bed_house_merged)
all_properties_merged = clean_suburb_cols(all_properties_merged)

In [7]:
def remove_nan_before_2025(df, median_columns):
    """
    Drop the rows whose year is below 2025 and that have a missing value
    in at least one of `median_columns`; every other row is kept.
    """
    before_2025 = df['year'] < 2025
    all_medians_present = df[median_columns].notnull().all(axis=1)

    # Keep a row when it is not before 2025, or when none of its medians is missing
    return df[~before_2025 | all_medians_present]

# Define the median columns to check (the same four names that
# train_val_test_sets later uses as its target columns)
median_columns = ['dec_median', 'jun_median', 'mar_median', 'sep_median']

# Call the function for each dataframe and reassign the cleaned data.
# Rows from 2025 onwards are kept even when their medians are missing.
one_bed_flat_merged = remove_nan_before_2025(one_bed_flat_merged, median_columns)
two_bed_flat_merged = remove_nan_before_2025(two_bed_flat_merged, median_columns)
three_bed_flat_merged = remove_nan_before_2025(three_bed_flat_merged, median_columns)
two_bed_house_merged = remove_nan_before_2025(two_bed_house_merged, median_columns)
three_bed_house_merged = remove_nan_before_2025(three_bed_house_merged, median_columns)
four_bed_house_merged = remove_nan_before_2025(four_bed_house_merged, median_columns)
all_properties_merged = remove_nan_before_2025(all_properties_merged, median_columns)

### Save the Merged Dataframes for Visualisation Purposes

In [8]:
def save_merged_dataframes():
    """
    Write the seven merged dataframes from the notebook namespace to
    '../data/curated/merged_feature_set', one '<name>.csv' file each,
    without the index column.
    """
    # Define the base path
    base_path = '../data/curated/merged_feature_set'

    # Ensure the directory exists; exist_ok avoids the separate check-then-create step
    os.makedirs(base_path, exist_ok=True)

    # File stem -> dataframe; the stem matches the variable name
    merged_frames = {
        'one_bed_flat_merged': one_bed_flat_merged,
        'two_bed_flat_merged': two_bed_flat_merged,
        'three_bed_flat_merged': three_bed_flat_merged,
        'two_bed_house_merged': two_bed_house_merged,
        'three_bed_house_merged': three_bed_house_merged,
        'four_bed_house_merged': four_bed_house_merged,
        'all_properties_merged': all_properties_merged,
    }

    # Save each dataframe to a CSV file
    for file_stem, frame in merged_frames.items():
        frame.to_csv(os.path.join(base_path, f'{file_stem}.csv'), index=False)

# Call the function
save_merged_dataframes()

### Create Train & Test Sets

In [9]:
def train_val_test_sets(df):
    """
    Split a merged dataframe into feature and target sets by 'year'.

    Year windows:
    - Training set: 2016-2021
    - Validation set: 2022-2024
    - Testing set: 2025-2027

    The feature sets hold every column except the four median columns,
    so 'suburb' and 'year' stay in them. The target sets hold 'suburb',
    'year' and the four median columns.

    The function returns:
    - X_train, X_val, X_test: Feature sets
    - y_train, y_val, y_test: Target sets
    """
    target_columns = ['dec_median', 'jun_median', 'mar_median', 'sep_median']

    # (train, validation, test) windows; the upper bound of each range is exclusive
    year_windows = (range(2016, 2022), range(2022, 2025), range(2025, 2028))

    features = df.drop(columns=target_columns)
    targets = df[['suburb', 'year'] + target_columns]

    # One slice per window, in train / validation / test order
    feature_splits = [features[features['year'].isin(window)] for window in year_windows]
    target_splits = [targets[targets['year'].isin(window)] for window in year_windows]

    return (*feature_splits, *target_splits)


# Create training, validation, and test sets for each property type.
# The '_labels' suffix marks frames that still carry the 'suburb' and 'year' columns.
X_train_one_bed_labels, X_val_one_bed_labels, X_test_one_bed_labels, y_train_one_bed_labels, y_val_one_bed_labels, y_test_one_bed_labels = train_val_test_sets(one_bed_flat_merged)
X_train_two_bed_labels, X_val_two_bed_labels, X_test_two_bed_labels, y_train_two_bed_labels, y_val_two_bed_labels, y_test_two_bed_labels = train_val_test_sets(two_bed_flat_merged)
X_train_three_bed_labels, X_val_three_bed_labels, X_test_three_bed_labels, y_train_three_bed_labels, y_val_three_bed_labels, y_test_three_bed_labels = train_val_test_sets(three_bed_flat_merged)
X_train_two_bed_house_labels, X_val_two_bed_house_labels, X_test_two_bed_house_labels, y_train_two_bed_house_labels, y_val_two_bed_house_labels, y_test_two_bed_house_labels = train_val_test_sets(two_bed_house_merged)
X_train_three_bed_house_labels, X_val_three_bed_house_labels, X_test_three_bed_house_labels, y_train_three_bed_house_labels, y_val_three_bed_house_labels, y_test_three_bed_house_labels = train_val_test_sets(three_bed_house_merged)
X_train_four_bed_house_labels, X_val_four_bed_house_labels, X_test_four_bed_house_labels, y_train_four_bed_house_labels, y_val_four_bed_house_labels, y_test_four_bed_house_labels = train_val_test_sets(four_bed_house_merged)
X_train_all_properties_labels, X_val_all_properties_labels, X_test_all_properties_labels, y_train_all_properties_labels, y_val_all_properties_labels, y_test_all_properties_labels = train_val_test_sets(all_properties_merged)

In [10]:
def drop_labels(X_train, X_val, X_test, y_train, y_val, y_test):
    """
    Return the six sets, in the order they were passed, each without
    its 'suburb' and 'year' columns.
    """
    label_columns = ['suburb', 'year']
    splits = (X_train, X_val, X_test, y_train, y_val, y_test)

    # drop() returns a new frame, so the '_labels' inputs stay intact
    return tuple(split.drop(columns=label_columns) for split in splits)

# For each dataset, apply the function.
# The results use the same names without the '_labels' suffix; the '_labels'
# frames themselves stay available.
X_train_one_bed, X_val_one_bed, X_test_one_bed, y_train_one_bed, y_val_one_bed, y_test_one_bed = drop_labels(
    X_train_one_bed_labels, X_val_one_bed_labels, X_test_one_bed_labels, y_train_one_bed_labels, y_val_one_bed_labels, y_test_one_bed_labels
)

X_train_two_bed, X_val_two_bed, X_test_two_bed, y_train_two_bed, y_val_two_bed, y_test_two_bed = drop_labels(
    X_train_two_bed_labels, X_val_two_bed_labels, X_test_two_bed_labels, y_train_two_bed_labels, y_val_two_bed_labels, y_test_two_bed_labels
)

X_train_three_bed, X_val_three_bed, X_test_three_bed, y_train_three_bed, y_val_three_bed, y_test_three_bed = drop_labels(
    X_train_three_bed_labels, X_val_three_bed_labels, X_test_three_bed_labels, y_train_three_bed_labels, y_val_three_bed_labels, y_test_three_bed_labels
)

X_train_two_bed_house, X_val_two_bed_house, X_test_two_bed_house, y_train_two_bed_house, y_val_two_bed_house, y_test_two_bed_house = drop_labels(
    X_train_two_bed_house_labels, X_val_two_bed_house_labels, X_test_two_bed_house_labels, y_train_two_bed_house_labels, y_val_two_bed_house_labels, y_test_two_bed_house_labels
)

X_train_three_bed_house, X_val_three_bed_house, X_test_three_bed_house, y_train_three_bed_house, y_val_three_bed_house, y_test_three_bed_house = drop_labels(
    X_train_three_bed_house_labels, X_val_three_bed_house_labels, X_test_three_bed_house_labels, y_train_three_bed_house_labels, y_val_three_bed_house_labels, y_test_three_bed_house_labels
)

X_train_four_bed_house, X_val_four_bed_house, X_test_four_bed_house, y_train_four_bed_house, y_val_four_bed_house, y_test_four_bed_house = drop_labels(
    X_train_four_bed_house_labels, X_val_four_bed_house_labels, X_test_four_bed_house_labels, y_train_four_bed_house_labels, y_val_four_bed_house_labels, y_test_four_bed_house_labels
)

X_train_all_properties, X_val_all_properties, X_test_all_properties, y_train_all_properties, y_val_all_properties, y_test_all_properties = drop_labels(
    X_train_all_properties_labels, X_val_all_properties_labels, X_test_all_properties_labels, y_train_all_properties_labels, y_val_all_properties_labels, y_test_all_properties_labels
)

Check that the training, validation and test feature sets (X) all have the same columns

In [11]:
def compare_feature_columns(train, val, test):
    """
    Compare the columns of the training, validation and testing feature
    dataframes (X).

    Returns a fixed message string when all three column indexes are
    equal (same names in the same order). Otherwise returns a dictionary
    that lists, for each ordered pair of sets, the columns present in one
    set but absent from the other.
    """
    pairs = ((train, val), (train, test), (val, test))
    if all(left.columns.equals(right.columns) for left, right in pairs):
        return "Columns are the same in all three sets (train, validation, test)."

    def _only_in(present, absent):
        # Columns that `present` has and `absent` lacks
        return list(set(present.columns) - set(absent.columns))

    return {
        "Columns missing in validation set compared to train": _only_in(train, val),
        "Columns missing in train set compared to validation": _only_in(val, train),
        "Columns missing in test set compared to train": _only_in(train, test),
        "Columns missing in train set compared to test": _only_in(test, train),
        "Columns missing in test set compared to validation": _only_in(val, test),
        "Columns missing in validation set compared to test": _only_in(test, val),
    }

# Display name -> (train, validation, test) feature dataframes to compare
feature_dfs = {
    "One Bed": (X_train_one_bed, X_val_one_bed, X_test_one_bed),
    "Two Bed": (X_train_two_bed, X_val_two_bed, X_test_two_bed),
    "Three Bed": (X_train_three_bed, X_val_three_bed, X_test_three_bed),
    "Two Bed House": (X_train_two_bed_house, X_val_two_bed_house, X_test_two_bed_house),
    "Three Bed House": (X_train_three_bed_house, X_val_three_bed_house, X_test_three_bed_house),
    "Four Bed House": (X_train_four_bed_house, X_val_four_bed_house, X_test_four_bed_house),
    "All Properties": (X_train_all_properties, X_val_all_properties, X_test_all_properties)
}

# Compare columns for each triplet of training, validation, and testing sets;
# each value is either the "same columns" message or a dictionary of differences
comparison_results = {name: compare_feature_columns(train, val, test) for name, (train, val, test) in feature_dfs.items()}

# Bare last expression: shows the results as the cell output
comparison_results 

{'One Bed': 'Columns are the same in all three sets (train, validation, test).',
 'Two Bed': 'Columns are the same in all three sets (train, validation, test).',
 'Three Bed': 'Columns are the same in all three sets (train, validation, test).',
 'Two Bed House': 'Columns are the same in all three sets (train, validation, test).',
 'Three Bed House': 'Columns are the same in all three sets (train, validation, test).',
 'Four Bed House': 'Columns are the same in all three sets (train, validation, test).',
 'All Properties': 'Columns are the same in all three sets (train, validation, test).'}

### Check Null Values

In [12]:
# Variable name -> dataframe, for every feature (X) and target (y) split of every
# property type; the next cell iterates over this mapping to look for missing values
dataframes = {
    'X_train_one_bed': X_train_one_bed,
    'X_val_one_bed': X_val_one_bed,
    'X_test_one_bed': X_test_one_bed,
    'y_train_one_bed': y_train_one_bed,
    'y_val_one_bed': y_val_one_bed,
    'y_test_one_bed': y_test_one_bed,
    
    'X_train_two_bed': X_train_two_bed,
    'X_val_two_bed': X_val_two_bed,
    'X_test_two_bed': X_test_two_bed,
    'y_train_two_bed': y_train_two_bed,
    'y_val_two_bed': y_val_two_bed,
    'y_test_two_bed': y_test_two_bed,
    
    'X_train_three_bed': X_train_three_bed,
    'X_val_three_bed': X_val_three_bed,
    'X_test_three_bed': X_test_three_bed,
    'y_train_three_bed': y_train_three_bed,
    'y_val_three_bed': y_val_three_bed,
    'y_test_three_bed': y_test_three_bed,
    
    'X_train_two_bed_house': X_train_two_bed_house,
    'X_val_two_bed_house': X_val_two_bed_house,
    'X_test_two_bed_house': X_test_two_bed_house,
    'y_train_two_bed_house': y_train_two_bed_house,
    'y_val_two_bed_house': y_val_two_bed_house,
    'y_test_two_bed_house': y_test_two_bed_house,
    
    'X_train_three_bed_house': X_train_three_bed_house,
    'X_val_three_bed_house': X_val_three_bed_house,
    'X_test_three_bed_house': X_test_three_bed_house,
    'y_train_three_bed_house': y_train_three_bed_house,
    'y_val_three_bed_house': y_val_three_bed_house,
    'y_test_three_bed_house': y_test_three_bed_house,
    
    'X_train_four_bed_house': X_train_four_bed_house,
    'X_val_four_bed_house': X_val_four_bed_house,
    'X_test_four_bed_house': X_test_four_bed_house,
    'y_train_four_bed_house': y_train_four_bed_house,
    'y_val_four_bed_house': y_val_four_bed_house,
    'y_test_four_bed_house': y_test_four_bed_house,
    
    'X_train_all_properties': X_train_all_properties,
    'X_val_all_properties': X_val_all_properties,
    'X_test_all_properties': X_test_all_properties,
    'y_train_all_properties': y_train_all_properties,
    'y_val_all_properties': y_val_all_properties,
    'y_test_all_properties': y_test_all_properties,
}

In [13]:
# One pass over every split, collecting two views of its missing values:
# - missing_rows_summary: the rows that contain at least one null
# - missing_values_summary: the null count of each column that has any
missing_rows_summary = {}
missing_values_summary = {}

for name, df in dataframes.items():
    null_mask = df.isnull()

    incomplete_rows = df[null_mask.any(axis=1)]
    if not incomplete_rows.empty:
        missing_rows_summary[name] = incomplete_rows

    null_counts = null_mask.sum()
    affected_columns = null_counts[null_counts > 0]
    if not affected_columns.empty:
        missing_values_summary[name] = affected_columns

# Splits without any missing value do not appear in either summary
print(missing_rows_summary)
print(missing_values_summary)

{'y_test_one_bed':       dec_median  jun_median  mar_median  sep_median
9            NaN         NaN         NaN         NaN
10           NaN         NaN         NaN         NaN
11           NaN         NaN         NaN         NaN
21           NaN         NaN         NaN         NaN
22           NaN         NaN         NaN         NaN
...          ...         ...         ...         ...
1531         NaN         NaN         NaN         NaN
1532         NaN         NaN         NaN         NaN
1542         NaN         NaN         NaN         NaN
1543         NaN         NaN         NaN         NaN
1544         NaN         NaN         NaN         NaN

[393 rows x 4 columns], 'y_test_two_bed':       dec_median  jun_median  mar_median  sep_median
9            NaN         NaN         NaN         NaN
10           NaN         NaN         NaN         NaN
11           NaN         NaN         NaN         NaN
21           NaN         NaN         NaN         NaN
22           NaN         NaN         

### Feature Selection

In [14]:
# Define a list of all training, validation, and test sets.
# One tuple per property type, ordered (X_train, X_val, X_test, y_train, y_val, y_test).
ML_dfs = [
    (X_train_one_bed, X_val_one_bed, X_test_one_bed, y_train_one_bed, y_val_one_bed, y_test_one_bed),
    (X_train_two_bed, X_val_two_bed, X_test_two_bed, y_train_two_bed, y_val_two_bed, y_test_two_bed),
    (X_train_three_bed, X_val_three_bed, X_test_three_bed, y_train_three_bed, y_val_three_bed, y_test_three_bed),
    (X_train_two_bed_house, X_val_two_bed_house, X_test_two_bed_house, y_train_two_bed_house, y_val_two_bed_house, y_test_two_bed_house),
    (X_train_three_bed_house, X_val_three_bed_house, X_test_three_bed_house, y_train_three_bed_house, y_val_three_bed_house, y_test_three_bed_house),
    (X_train_four_bed_house, X_val_four_bed_house, X_test_four_bed_house, y_train_four_bed_house, y_val_four_bed_house, y_test_four_bed_house),
    (X_train_all_properties, X_val_all_properties, X_test_all_properties, y_train_all_properties, y_val_all_properties, y_test_all_properties),
]

In [15]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_selection import RFECV
from sklearn.model_selection import GridSearchCV, KFold
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
import numpy as np

# Hyperparameter search space for the Random Forest regressor;
# each key is a RandomForestRegressor constructor argument.
param_grid = dict(
    n_estimators=[50, 100, 200],     # number of trees in the forest
    max_depth=[5, 10, 15, None],     # maximum depth of each tree (None = no explicit limit)
    min_samples_split=[2, 5, 10],    # minimum number of samples required to split an internal node
)

def rfecv_with_random_forest(X_train, X_val, X_test, y_train):
    """
    Select features with Recursive Feature Elimination with Cross-Validation
    (RFECV) using a Random Forest estimator, then fine-tune a Random Forest on
    the reduced feature set with GridSearchCV.

    The scaler and the feature selector are fitted on the training set only and
    then applied to the validation and test sets. The hyperparameter search
    space is read from the module-level `param_grid`. A summary of the selected
    features, sorted by importance, is printed as a side effect.

    Parameters:
    X_train: Training feature set; must expose `.columns` (e.g. a DataFrame),
        which is used to recover the names of the selected features
    X_val: Validation feature set (transformed with the scaler/selector fitted on X_train)
    X_test: Test feature set (transformed with the scaler/selector fitted on X_train)
    y_train: Training labels (target)

    Returns:
    X_train_rfecv, X_val_rfecv, X_test_rfecv: Standardized datasets reduced to the selected features
    best_rf: Best tuned Random Forest model after feature selection
    selected_feature_names: Names of the selected features, in column order
    """

    # Standardize the data using statistics from the training set only
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_val_scaled = scaler.transform(X_val)
    X_test_scaled = scaler.transform(X_test)

    # Use RFECV to automatically select the optimal number of features,
    # dropping one feature per elimination step
    rfecv = RFECV(
        estimator=RandomForestRegressor(random_state=42),
        step=1,
        cv=KFold(5),
        scoring='neg_mean_squared_error',
        n_jobs=-1,
    )
    rfecv.fit(X_train_scaled, y_train)

    # Transform the datasets based on the selected features
    X_train_rfecv = rfecv.transform(X_train_scaled)
    X_val_rfecv = rfecv.transform(X_val_scaled)
    X_test_rfecv = rfecv.transform(X_test_scaled)

    # Perform hyperparameter tuning using GridSearchCV on the reduced feature set
    grid_search = GridSearchCV(
        RandomForestRegressor(random_state=42),
        param_grid,
        scoring='neg_mean_squared_error',
        cv=5,
        n_jobs=-1,  # run candidate fits in parallel, as RFECV above already does
    )
    grid_search.fit(X_train_rfecv, y_train)

    # Select the best Random Forest model based on GridSearchCV
    best_rf = grid_search.best_estimator_

    # Get the selected feature indices and map them back to the X_train column names
    selected_features = rfecv.get_support(indices=True)
    selected_feature_names = list(X_train.columns[selected_features])

    # Pair each selected feature with its importance in the tuned model and
    # sort by importance in descending order
    sorted_features = sorted(
        zip(selected_feature_names, best_rf.feature_importances_),
        key=lambda pair: pair[1],
        reverse=True,
    )

    # Print the sorted features with their importance values
    print(f"Optimal number of features selected: {rfecv.n_features_}")
    print(f"Selected feature indices: {selected_features}")
    print("Selected features sorted by importance:")
    for feature, importance in sorted_features:
        print(f"{feature}: {importance:.4f}")

    return X_train_rfecv, X_val_rfecv, X_test_rfecv, best_rf, selected_feature_names


### Train & Fit Model

Random Forest

In [16]:
# Initialise the predictions dictionary
# Keys are 'X_test_<k>_predictions', where k is the 1-based position of the dataset in ML_dfs
predictions_dict = {}

# Loop through each set, perform RFE, and predict using the tuned Random Forest model
# NOTE: this loop runs at cell top level, so every name assigned below
# (X_train_rfecv, X_val_rfecv, best_rf, y_test_pred, ...) remains in the kernel
# namespace afterwards, holding the values from the LAST dataset in ML_dfs.
# y_test is unpacked here but is not used inside the loop.
for i, (X_train, X_val, X_test, y_train, y_val, y_test) in enumerate(ML_dfs):
    # Perform feature selection and get the best model
    # (selected_feature_names is returned but not used in this loop)
    X_train_rfecv, X_val_rfecv, X_test_rfecv, best_rf, selected_feature_names = rfecv_with_random_forest(
        X_train, X_val, X_test, y_train
    )
    
    # Evaluate on the validation set (model at this point is fitted on the training set only)
    y_val_pred = best_rf.predict(X_val_rfecv)
    val_mse = mean_squared_error(y_val, y_val_pred)
    val_mae = mean_absolute_error(y_val, y_val_pred)
    val_r2 = r2_score(y_val, y_val_pred)
    
    # Print the validation results
    # NOTE(review): min_samples_split is also tuned in param_grid but is not reported here
    print(f"Property Type {i+1}:")
    print(f"Best n_estimators: {best_rf.n_estimators}, Best max_depth: {best_rf.max_depth}")
    print(f"Validation MSE: {val_mse:.4f}, R^2: {val_r2:.4f}, Validation MAE: {val_mae:.4f}")
    
    # Combine the training and validation sets for final model training
    # (rows are stacked in train-then-validation order for both features and targets)
    X_train_val_rfecv = np.vstack((X_train_rfecv, X_val_rfecv))
    y_train_val = np.concatenate((y_train, y_val))
    
    # Retrain the model using the combined training and validation sets
    # (fit() refits the same best_rf object, so the validation metrics above
    # describe the earlier train-only fit, not this final model)
    best_rf.fit(X_train_val_rfecv, y_train_val)
    
    # Make predictions on the test set
    y_test_pred = best_rf.predict(X_test_rfecv)
    
    # Store predictions in the dictionary with dataset index as key
    predictions_dict[f'X_test_{i+1}_predictions'] = y_test_pred
    
    # Print the predictions for the test set
    # (the label says 2025-2027; presumably the test rows are those forecast periods — TODO confirm)
    print(f"Predictions for 2025-2027: {y_test_pred}\n")

Optimal number of features selected: 53
Selected features sorted by importance:
distance_to_cbd: 0.7248
hi_4000_more_tot: 0.0702
distance_to_hotel: 0.0096
offence_division_4: 0.0088
main_roads: 0.0082
pets_allowed: 0.0080
hi_1500_1749_tot: 0.0080
offence_division_6: 0.0072
industrial_areas: 0.0070
offence_division_3: 0.0068
livability: 0.0067
retail_areas: 0.0065
hi_3500_3999_tot: 0.0059
hi_3000_3499_tot: 0.0058
offence_division_2: 0.0053
hi_150_299_tot: 0.0052
erp: 0.0049
hi_500_649_tot: 0.0049
motorway: 0.0049
avg_secondary_school_rank: 0.0049
nearest_transport_avg_distance: 0.0048
hi_1_149_tot: 0.0043
walking_paths: 0.0040
residential_areas: 0.0039
offence_division_5: 0.0038
tourism_and_attractions: 0.0037
hi_300_399_tot: 0.0037
offence_division_1: 0.0034
hi_400_499_tot: 0.0034
distance_to_mall: 0.0033
accommodation: 0.0033
hi_2500_2999_tot: 0.0028
distance_to_supermarket: 0.0027
financial_institutions: 0.0025
hi_1750_1999_tot: 0.0024
culture_and_leisure: 0.0024
forest: 0.0024
avg_p

NameError: name 'predictions_dict' is not defined

In [None]:
# Add predictions back to the training data 
# NOTE(review): work-in-progress cell — see the notes below before running it.
# Bare expression: it is not the last line of the cell, so its value is neither displayed nor stored.
predictions_dict['X_test_1_predictions']
# X_train_rfecv / X_val_rfecv are leftovers of the modelling loop, so they hold the
# values from the last dataset in ML_dfs. X_predictions is not used again in this cell.
X_predictions = np.vstack((X_train_rfecv, X_val_rfecv))
# NOTE(review): `flattened_predictions` is not defined in this cell or in the cells
# shown before it; unless it is defined elsewhere this line raises NameError.
# Presumably it is meant to be derived from predictions_dict['X_test_1_predictions'] — TODO confirm.
# X_test_one_bed_labels is also defined outside this section.
X_test_one_bed_labels['predictions'] = flattened_predictions
# Remaining property types (keys follow the 1-based order of ML_dfs); still disabled:
# X_test_two_bed_labels['predictions'] = predictions_dict['X_test_2_predictions']
# X_test_three_bed_labels['predictions'] = predictions_dict['X_test_3_predictions']
# X_test_two_bed_house_labels['predictions'] = predictions_dict['X_test_4_predictions']
# X_test_three_bed_house_labels['predictions'] = predictions_dict['X_test_5_predictions']
# X_test_four_bed_house_labels['predictions'] = predictions_dict['X_test_6_predictions']
# X_test_all_properties_labels['predictions'] = predictions_dict['X_test_7_predictions']
#X_test_one_bed.shape

- Growth rate map for each suburb over 2024-2027 (March 2024 and December 2027)
- Get the feature importance rankings
- Report the model errors