### Import Libraries and Data

In [1]:
import os

import numpy as np
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import Lasso
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [2]:
# Rental history data (raw): one CSV per property type, plus an all-properties file
one_bed_flat = pd.read_csv('../data/raw/rental_history/one_bed_flat.csv')
two_bed_flat = pd.read_csv('../data/raw/rental_history/two_bed_flat.csv')
three_bed_flat = pd.read_csv('../data/raw/rental_history/three_bed_flat.csv')
two_bed_house = pd.read_csv('../data/raw/rental_history/two_bed_house.csv')
three_bed_house = pd.read_csv('../data/raw/rental_history/three_bed_house.csv')
four_bed_house = pd.read_csv('../data/raw/rental_history/four_bed_house.csv')
all_properties = pd.read_csv('../data/raw/rental_history/all_properties.csv')

# Domain rental data (curated): one CSV per property type, mirroring the rental history sets above
domain_one_bed_flat = pd.read_csv('../data/curated/domain_one_bed_flat_rent.csv')
domain_two_bed_flat = pd.read_csv('../data/curated/domain_two_bed_flat_rent.csv')
domain_three_bed_flat = pd.read_csv('../data/curated/domain_three_bed_flat_rent.csv')
domain_two_bed_house = pd.read_csv('../data/curated/domain_two_bed_house_rent.csv')
domain_three_bed_house = pd.read_csv('../data/curated/domain_three_bed_house_rent.csv')
# NOTE(review): this filename has no `_rent` suffix, unlike the other Domain files - confirm it is correct
domain_four_bed_house = pd.read_csv('../data/curated/domain_four_bed_house.csv')
domain_all_properties = pd.read_csv('../data/curated/domain_all_properties_rent.csv')

# Other feature sets, merged onto the rental data further down the notebook
crimes = pd.read_csv('../data/curated/crimes.csv')
population = pd.read_csv('../data/curated/final_population.csv')
education = pd.read_csv('../data/curated/education_df.csv')
# NOTE(review): read from data/raw while the other feature sets come from data/curated - confirm intended
urban_landmarks = pd.read_csv('../data/raw/urban_landmarks_features.csv')
pt_distances = pd.read_csv('../data/curated/suburb_transport_distances.csv')
income = pd.read_csv('../data/curated/income.csv')

### Formatting Rental Dataframes

In [3]:
def clean_domain_df(df):
    """
    Clean a Domain rental dataframe so it lines up with the rental
    history dataframes.

    The function:
    - drops every column whose name starts with 'Unnamed:',
    - renames 'median_rent' to 'sep_median' when that column is present,
    - sets a 'year' column to 2024 on every row and places it second.

    Parameters
    ----------
    df : pandas.DataFrame
        A Domain dataframe. It is not modified.

    Returns
    -------
    pandas.DataFrame
        A new dataframe with the cleaned columns.
    """
    # Work on an explicit copy: assigning a column onto a `.loc` selection
    # can raise SettingWithCopyWarning and leaves ownership of the data unclear
    cleaned = df.loc[:, ~df.columns.str.contains('^Unnamed:')].copy()

    # `rename` ignores labels that are absent, so no membership check is needed
    cleaned = cleaned.rename(columns={'median_rent': 'sep_median'})

    # Add (or overwrite) the 'year' column with 2024 for each row
    cleaned['year'] = 2024

    # Reorder columns to make 'year' the second column
    ordered = [col for col in cleaned.columns if col != 'year']
    ordered.insert(1, 'year')

    return cleaned[ordered]

# Clean every Domain dataframe in one pass, rebinding each name to its cleaned version
(
    domain_one_bed_flat,
    domain_two_bed_flat,
    domain_three_bed_flat,
    domain_two_bed_house,
    domain_three_bed_house,
    domain_four_bed_house,
    domain_all_properties,
) = map(
    clean_domain_df,
    (
        domain_one_bed_flat,
        domain_two_bed_flat,
        domain_three_bed_flat,
        domain_two_bed_house,
        domain_three_bed_house,
        domain_four_bed_house,
        domain_all_properties,
    ),
)


Impute the September 2024 median rent from the scraped Domain properties into the rental history dataframes

In [4]:
def impute_sep_2024_rental_data(rental_history_df, domain_df):
    """
    Overwrite the 2024 'sep_median' values of a rental history dataframe
    with the 'sep_median' values of the matching Domain dataframe.

    Rows are matched on 'suburb'. Only rows whose year is 2024 and that
    have a non-null Domain value are overwritten. Afterwards only suburbs
    that appear in at least 9 rows are kept.

    Returns a tuple of:
    - the imputed and filtered rental history dataframe, and
    - the Domain dataframe without its 'year', 'sep_median' and
      'num_properties' columns (whichever of them exist).
    """
    # Left join so every rental history row (all years) survives the merge
    domain_lookup = domain_df[['suburb', 'year', 'sep_median']]
    combined = rental_history_df.merge(
        domain_lookup, how='left', on=['suburb'], suffixes=('', '_domain')
    )

    # Overwrite only 2024 rows for which Domain actually supplies a value
    use_domain_value = combined['sep_median_domain'].notna() & (combined['year'] == 2024)
    combined.loc[use_domain_value, 'sep_median'] = combined.loc[use_domain_value, 'sep_median_domain']

    # The helper columns brought in by the merge are no longer needed
    combined = combined.drop(columns=['sep_median_domain', 'year_domain'])

    # Keep only the suburbs that appear 9 or more times
    occurrences = combined['suburb'].value_counts()
    frequent_suburbs = occurrences.loc[occurrences.ge(9)].index
    combined = combined[combined['suburb'].isin(frequent_suburbs)]

    # Strip the columns already consumed above from the Domain dataframe
    trimmed_domain = domain_df.drop(columns=['year', 'sep_median', 'num_properties'], errors='ignore')

    return combined, trimmed_domain

# Impute every rental history dataframe from its Domain counterpart,
# rebinding both names of each pair to the returned dataframes
(
    (one_bed_flat, domain_one_bed_flat),
    (two_bed_flat, domain_two_bed_flat),
    (three_bed_flat, domain_three_bed_flat),
    (two_bed_house, domain_two_bed_house),
    (three_bed_house, domain_three_bed_house),
    (four_bed_house, domain_four_bed_house),
    (all_properties, domain_all_properties),
) = (
    impute_sep_2024_rental_data(history, domain)
    for history, domain in (
        (one_bed_flat, domain_one_bed_flat),
        (two_bed_flat, domain_two_bed_flat),
        (three_bed_flat, domain_three_bed_flat),
        (two_bed_house, domain_two_bed_house),
        (three_bed_house, domain_three_bed_house),
        (four_bed_house, domain_four_bed_house),
        (all_properties, domain_all_properties),
    )
)

### Combining All Feature Sets

In [5]:
def merge_feature_sets(rental_df, domain_df):
    """
    Combine one rental history dataframe with every other feature set.

    The year-specific feature sets (crimes, income, population) are
    outer-joined on suburb and year, so suburb-year rows that exist only
    in those sets are kept even when there is no rental history for them.
    The suburb-level feature sets (education, urban landmarks, public
    transport distances and the Domain dataframe) are inner-joined on
    suburb. Finally, only rows whose suburb appears in
    population['sa2_name'] are retained.

    Parameters
    ----------
    rental_df : pandas.DataFrame
        Rental history for one property type.
    domain_df : pandas.DataFrame
        Domain features for the same property type.

    Returns
    -------
    pandas.DataFrame
        The merged and filtered dataframe.
    """
    # Year-specific features: outer joins on suburb and year
    merged = pd.merge(rental_df, crimes, on=['suburb', 'year'], how='outer')
    merged = pd.merge(merged, income, on=['suburb', 'year'], how='outer')
    merged = pd.merge(merged, population, left_on=['suburb', 'year'], right_on=['sa2_name', 'year'], how='outer')

    # Suburb-level features: inner joins on suburb
    merged = pd.merge(merged, education, on='suburb', how='inner')
    merged = pd.merge(merged, urban_landmarks, left_on='suburb', right_on='gazetted_locality', how='inner')
    merged = pd.merge(merged, pt_distances, on='suburb', how='inner')
    merged = pd.merge(merged, domain_df, on='suburb', how='inner')

    # Retain only suburbs that also appear in the population dataframe
    return merged[merged['suburb'].isin(population['sa2_name'])]


# Build the merged dataframe for each property type
one_bed_flat_merged = merge_feature_sets(one_bed_flat, domain_one_bed_flat)
two_bed_flat_merged = merge_feature_sets(two_bed_flat, domain_two_bed_flat)
three_bed_flat_merged = merge_feature_sets(three_bed_flat, domain_three_bed_flat)
two_bed_house_merged = merge_feature_sets(two_bed_house, domain_two_bed_house)
three_bed_house_merged = merge_feature_sets(three_bed_house, domain_three_bed_house)
four_bed_house_merged = merge_feature_sets(four_bed_house, domain_four_bed_house)
all_properties_merged = merge_feature_sets(all_properties, domain_all_properties)


one_bed_flat_merged

Unnamed: 0,suburb,year,dec_median,jun_median,mar_median,sep_median,offence_division_1,offence_division_2,offence_division_3,offence_division_4,...,distance_to_restaurant,distance_to_supermarket,nearest_transport_avg_distance,distance_to_cbd,median_bath,median_parkings,furnished_count,unfurnished_count,pets_allowed,pets_not_allowed
0,abbotsford,2016,380.0,380.0,380.0,380.0,107.0,1065.0,76.0,59.0,...,1.141601,1.083238,1.110000,5.55,1,0,2,10,4,8
1,abbotsford,2017,400.0,390.0,390.0,395.0,138.0,1019.0,64.0,69.0,...,1.141601,1.083238,1.110000,5.55,1,0,2,10,4,8
2,abbotsford,2018,410.0,400.0,400.0,400.0,100.0,1162.0,88.0,84.0,...,1.141601,1.083238,1.110000,5.55,1,0,2,10,4,8
3,abbotsford,2019,420.0,420.0,410.0,420.0,175.0,1053.0,178.0,114.0,...,1.141601,1.083238,1.110000,5.55,1,0,2,10,4,8
4,abbotsford,2020,390.0,418.0,420.0,410.0,145.0,985.0,151.0,89.0,...,1.141601,1.083238,1.110000,5.55,1,0,2,10,4,8
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1615,yarraville,2023,295.0,270.0,275.0,280.0,96.0,633.0,68.0,31.0,...,1.032864,0.937960,1.826667,10.21,1,1,0,1,0,1
1616,yarraville,2024,325.0,325.0,325.0,320.0,101.0,607.0,60.0,40.0,...,1.032864,0.937960,1.826667,10.21,1,1,0,1,0,1
1617,yarraville,2025,,,,,102.0,604.0,63.0,40.0,...,1.032864,0.937960,1.826667,10.21,1,1,0,1,0,1
1618,yarraville,2026,,,,,103.0,600.0,65.0,39.0,...,1.032864,0.937960,1.826667,10.21,1,1,0,1,0,1


In [6]:
# Drop the redundant suburb-identifier columns left over from the merges; keep only 'suburb'
def clean_suburb_cols(df):
    """
    Remove the redundant suburb-identifier columns from a merged dataframe.

    Any column whose name contains 'Unnamed' is dropped, as are the
    'sa2_name' and 'gazetted_locality' columns when they are present.
    A new dataframe is returned; the input is left untouched.
    """
    redundant_labels = ['sa2_name', 'gazetted_locality']

    # Columns whose name contains 'Unnamed' are filtered out first
    is_named = ~df.columns.str.contains('Unnamed')
    trimmed = df.loc[:, is_named]

    # errors='ignore' skips whichever of the labels is not in the dataframe
    return trimmed.drop(columns=redundant_labels, errors='ignore')

# Strip the redundant suburb columns from every merged dataframe in one pass
(
    one_bed_flat_merged,
    two_bed_flat_merged,
    three_bed_flat_merged,
    two_bed_house_merged,
    three_bed_house_merged,
    four_bed_house_merged,
    all_properties_merged,
) = map(
    clean_suburb_cols,
    (
        one_bed_flat_merged,
        two_bed_flat_merged,
        three_bed_flat_merged,
        two_bed_house_merged,
        three_bed_house_merged,
        four_bed_house_merged,
        all_properties_merged,
    ),
)

In [7]:
def remove_nan_before_2025(df, median_columns):
    """
    Drop the rows with a year below 2025 that are missing a value in any
    of `median_columns`. Every other row is kept, including rows from
    2025 onwards whose medians are all missing.
    """
    not_before_2025 = ~(df['year'] < 2025)
    medians_complete = df[median_columns].notnull().all(axis=1)

    # A row survives unless it is both before 2025 and incomplete
    return df[not_before_2025 | medians_complete]

# Median rent columns that must all be present on rows before 2025
median_columns = ['dec_median', 'jun_median', 'mar_median', 'sep_median']

# Filter every merged dataframe and rebind each name to the filtered result
(
    one_bed_flat_merged,
    two_bed_flat_merged,
    three_bed_flat_merged,
    two_bed_house_merged,
    three_bed_house_merged,
    four_bed_house_merged,
    all_properties_merged,
) = (
    remove_nan_before_2025(merged_frame, median_columns)
    for merged_frame in (
        one_bed_flat_merged,
        two_bed_flat_merged,
        three_bed_flat_merged,
        two_bed_house_merged,
        three_bed_house_merged,
        four_bed_house_merged,
        all_properties_merged,
    )
)

### Save All Properties Dataframe for Visualisation Purposes

In [8]:
# Write the merged all-properties dataframe to the curated data folder, without the index column
all_properties_merged.to_csv('../data/curated/all_properties_merged.csv', index=False)

### Create Train & Test Sets

In [9]:
def train_val_test_sets(df):
    """
    Split a merged dataframe into training, validation and testing
    sets based on the 'year' column:
    - Training set includes data from the years 2016-2021.
    - Validation set includes data from the years 2022-2024.
    - Testing set includes data from the years 2025-2027.

    The year-specific feature columns (offence divisions, 'erp' and the
    income columns matching 'hi_*_tot') are split by year. Every
    remaining non-target column is then attached to each split by
    matching on 'suburb', irrespective of the year. 'suburb' and 'year'
    are dropped from the returned feature sets.

    Notes
    -----
    - The remaining columns are de-duplicated before the merge, so the
      merge does not multiply every row by the number of years held for
      its suburb.
    - The feature sets are indexed by position after the merge, while the
      target sets keep the index labels of `df`.
    - Feature and target rows correspond one-to-one only when the
      remaining columns hold a single distinct combination of values per
      suburb; otherwise a feature set has more rows than its target set.

    Parameters
    ----------
    df : pandas.DataFrame
        Merged dataframe containing 'suburb', 'year', the year-specific
        feature columns and the four median target columns.

    Returns
    -------
    tuple
        (X_train, X_val, X_test, y_train, y_val, y_test) as dataframes.
    """
    # Define the year ranges for training, validation, and testing sets
    train_years = range(2016, 2022)
    val_years = range(2022, 2025)
    test_years = range(2025, 2028)

    # Columns to include in X (specific to the years) and y splits
    feature_year_cols = ['suburb', 'year', 'offence_division_1', 'offence_division_2', 'offence_division_3', 'offence_division_4', 'offence_division_5', 'offence_division_6', 'erp']
    target_columns = ['dec_median', 'jun_median', 'mar_median', 'sep_median']

    # Add the income columns that start with 'hi_' and end with '_tot'
    feature_year_cols += df.filter(regex=r'^hi_.*_tot$').columns.tolist()

    # Split features (specific to the years) and target data
    X = df[feature_year_cols]
    y = df[target_columns]

    # Remaining columns, keyed by suburb. De-duplicating here keeps the
    # merges below from producing one copy of each row per stored year.
    other_columns = df.drop(columns=feature_year_cols + target_columns).columns
    other_data = df[other_columns].copy()
    other_data['suburb'] = df['suburb']  # Ensure 'suburb' is included
    other_data = other_data.drop_duplicates()

    splits = []
    for years in (train_years, val_years, test_years):
        X_part = X[X['year'].isin(years)]

        # Align the targets with the rows selected for this split
        y_part = y.loc[X_part.index]

        # Attach the remaining columns by suburb, irrespective of the year,
        # and drop any duplicate rows
        X_part = X_part.merge(other_data, on='suburb', how='left').drop_duplicates()

        # 'suburb' and 'year' were only needed for splitting and merging
        X_part = X_part.drop(columns=['suburb', 'year'])

        splits.append((X_part, y_part))

    (X_train, y_train), (X_val, y_val), (X_test, y_test) = splits

    return X_train, X_val, X_test, y_train, y_val, y_test

# Build the train / validation / test feature and target sets for each property type
(
    X_train_one_bed, X_val_one_bed, X_test_one_bed,
    y_train_one_bed, y_val_one_bed, y_test_one_bed,
) = train_val_test_sets(one_bed_flat_merged)

(
    X_train_two_bed, X_val_two_bed, X_test_two_bed,
    y_train_two_bed, y_val_two_bed, y_test_two_bed,
) = train_val_test_sets(two_bed_flat_merged)

(
    X_train_three_bed, X_val_three_bed, X_test_three_bed,
    y_train_three_bed, y_val_three_bed, y_test_three_bed,
) = train_val_test_sets(three_bed_flat_merged)

(
    X_train_two_bed_house, X_val_two_bed_house, X_test_two_bed_house,
    y_train_two_bed_house, y_val_two_bed_house, y_test_two_bed_house,
) = train_val_test_sets(two_bed_house_merged)

(
    X_train_three_bed_house, X_val_three_bed_house, X_test_three_bed_house,
    y_train_three_bed_house, y_val_three_bed_house, y_test_three_bed_house,
) = train_val_test_sets(three_bed_house_merged)

(
    X_train_four_bed_house, X_val_four_bed_house, X_test_four_bed_house,
    y_train_four_bed_house, y_val_four_bed_house, y_test_four_bed_house,
) = train_val_test_sets(four_bed_house_merged)

(
    X_train_all_properties, X_val_all_properties, X_test_all_properties,
    y_train_all_properties, y_val_all_properties, y_test_all_properties,
) = train_val_test_sets(all_properties_merged)

Check that the X columns are the same across the training, validation and test sets

In [10]:
def compare_feature_columns(train, val, test):
    """
    Compare the columns of the training, validation and testing
    feature dataframes (X).

    Returns a confirmation string when all three column indexes are
    equal. Otherwise returns a dictionary that lists, for each ordered
    pair of sets, the columns present in one set but missing in the other.
    """
    columns_agree = (
        train.columns.equals(val.columns)
        and train.columns.equals(test.columns)
        and val.columns.equals(test.columns)
    )
    if columns_agree:
        return "Columns are the same in all three sets (train, validation, test)."

    # Work with plain sets so the differences can be taken directly
    train_cols, val_cols, test_cols = (set(frame.columns) for frame in (train, val, test))

    return {
        "Columns missing in validation set compared to train": list(train_cols - val_cols),
        "Columns missing in train set compared to validation": list(val_cols - train_cols),
        "Columns missing in test set compared to train": list(train_cols - test_cols),
        "Columns missing in train set compared to test": list(test_cols - train_cols),
        "Columns missing in test set compared to validation": list(val_cols - test_cols),
        "Columns missing in validation set compared to test": list(test_cols - val_cols),
    }

# (train, validation, test) feature sets to compare, keyed by property type
feature_dfs = {
    "One Bed": (X_train_one_bed, X_val_one_bed, X_test_one_bed),
    "Two Bed": (X_train_two_bed, X_val_two_bed, X_test_two_bed),
    "Three Bed": (X_train_three_bed, X_val_three_bed, X_test_three_bed),
    "Two Bed House": (X_train_two_bed_house, X_val_two_bed_house, X_test_two_bed_house),
    "Three Bed House": (X_train_three_bed_house, X_val_three_bed_house, X_test_three_bed_house),
    "Four Bed House": (X_train_four_bed_house, X_val_four_bed_house, X_test_four_bed_house),
    "All Properties": (X_train_all_properties, X_val_all_properties, X_test_all_properties),
}

# Run the column comparison on every triplet
comparison_results = {
    label: compare_feature_columns(*splits)
    for label, splits in feature_dfs.items()
}

comparison_results

{'One Bed': 'Columns are the same in all three sets (train, validation, test).',
 'Two Bed': 'Columns are the same in all three sets (train, validation, test).',
 'Three Bed': 'Columns are the same in all three sets (train, validation, test).',
 'Two Bed House': 'Columns are the same in all three sets (train, validation, test).',
 'Three Bed House': 'Columns are the same in all three sets (train, validation, test).',
 'Four Bed House': 'Columns are the same in all three sets (train, validation, test).',
 'All Properties': 'Columns are the same in all three sets (train, validation, test).'}

### Check Null Values

In [11]:
# Every feature (X) and target (y) split, keyed by its variable name, so the
# missing-value checks can report which split a gap belongs to
dataframes = {
    # One-bedroom flats
    'X_train_one_bed': X_train_one_bed,
    'X_val_one_bed': X_val_one_bed,
    'X_test_one_bed': X_test_one_bed,
    'y_train_one_bed': y_train_one_bed,
    'y_val_one_bed': y_val_one_bed,
    'y_test_one_bed': y_test_one_bed,
    
    # Two-bedroom flats
    'X_train_two_bed': X_train_two_bed,
    'X_val_two_bed': X_val_two_bed,
    'X_test_two_bed': X_test_two_bed,
    'y_train_two_bed': y_train_two_bed,
    'y_val_two_bed': y_val_two_bed,
    'y_test_two_bed': y_test_two_bed,
    
    # Three-bedroom flats
    'X_train_three_bed': X_train_three_bed,
    'X_val_three_bed': X_val_three_bed,
    'X_test_three_bed': X_test_three_bed,
    'y_train_three_bed': y_train_three_bed,
    'y_val_three_bed': y_val_three_bed,
    'y_test_three_bed': y_test_three_bed,
    
    # Two-bedroom houses
    'X_train_two_bed_house': X_train_two_bed_house,
    'X_val_two_bed_house': X_val_two_bed_house,
    'X_test_two_bed_house': X_test_two_bed_house,
    'y_train_two_bed_house': y_train_two_bed_house,
    'y_val_two_bed_house': y_val_two_bed_house,
    'y_test_two_bed_house': y_test_two_bed_house,
    
    # Three-bedroom houses
    'X_train_three_bed_house': X_train_three_bed_house,
    'X_val_three_bed_house': X_val_three_bed_house,
    'X_test_three_bed_house': X_test_three_bed_house,
    'y_train_three_bed_house': y_train_three_bed_house,
    'y_val_three_bed_house': y_val_three_bed_house,
    'y_test_three_bed_house': y_test_three_bed_house,
    
    # Four-bedroom houses
    'X_train_four_bed_house': X_train_four_bed_house,
    'X_val_four_bed_house': X_val_four_bed_house,
    'X_test_four_bed_house': X_test_four_bed_house,
    'y_train_four_bed_house': y_train_four_bed_house,
    'y_val_four_bed_house': y_val_four_bed_house,
    'y_test_four_bed_house': y_test_four_bed_house,
    
    # All property types combined
    'X_train_all_properties': X_train_all_properties,
    'X_val_all_properties': X_val_all_properties,
    'X_test_all_properties': X_test_all_properties,
    'y_train_all_properties': y_train_all_properties,
    'y_val_all_properties': y_val_all_properties,
    'y_test_all_properties': y_test_all_properties,
}

In [12]:
# Rows that contain at least one missing value, keyed by dataframe name
missing_rows_summary = {}

for name, df in dataframes.items():
    rows_with_missing = df[df.isna().any(axis=1)]
    if rows_with_missing.empty:
        continue
    missing_rows_summary[name] = rows_with_missing

print(missing_rows_summary)

# Per-column counts of missing values, keyed by dataframe name
missing_values_summary = {}
for name, df in dataframes.items():
    missing_values = df.isna().sum()
    columns_with_missing = missing_values.loc[missing_values.gt(0)]
    if columns_with_missing.empty:
        continue
    missing_values_summary[name] = columns_with_missing

print(missing_values_summary)

{'y_test_one_bed':       dec_median  jun_median  mar_median  sep_median
9            NaN         NaN         NaN         NaN
10           NaN         NaN         NaN         NaN
11           NaN         NaN         NaN         NaN
21           NaN         NaN         NaN         NaN
22           NaN         NaN         NaN         NaN
...          ...         ...         ...         ...
1606         NaN         NaN         NaN         NaN
1607         NaN         NaN         NaN         NaN
1617         NaN         NaN         NaN         NaN
1618         NaN         NaN         NaN         NaN
1619         NaN         NaN         NaN         NaN

[396 rows x 4 columns], 'y_test_two_bed':       dec_median  jun_median  mar_median  sep_median
9            NaN         NaN         NaN         NaN
10           NaN         NaN         NaN         NaN
11           NaN         NaN         NaN         NaN
21           NaN         NaN         NaN         NaN
22           NaN         NaN         

### Feature Selection

In [13]:
def pca_feature_selection(X_train, X_val, X_test, variance_threshold=0.9):
    """
    Standardise the features and reduce their dimensionality with PCA.

    The scaler and the PCA are fitted on the training set only and are
    then applied to the training, validation and test sets. The smallest
    number of leading components whose cumulative explained-variance
    ratio reaches `variance_threshold` is kept. If rounding keeps the
    cumulative ratio below the threshold, all components are kept.

    Parameters
    ----------
    X_train, X_val, X_test : array-like of shape (n_samples, n_features)
        Feature sets sharing the same columns.
    variance_threshold : float, default 0.9
        Cumulative explained-variance ratio to reach.

    Returns
    -------
    tuple of numpy.ndarray
        (X_train_pca, X_val_pca, X_test_pca), each with one column per
        selected component.
    """

    # Standardise the data using statistics learned from the training set only
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_val_scaled = scaler.transform(X_val)
    X_test_scaled = scaler.transform(X_test)

    # Fit one exact PCA on the training set. The 'full' solver is
    # deterministic, so repeated runs give the same components.
    pca = PCA(svd_solver='full').fit(X_train_scaled)
    cumulative_variance = np.cumsum(pca.explained_variance_ratio_)

    # First position where the cumulative ratio is >= the threshold,
    # capped at the number of available components
    n_components = int(np.searchsorted(cumulative_variance, variance_threshold, side='left')) + 1
    n_components = min(n_components, len(cumulative_variance))

    # The leading columns of the full projection are the projection onto
    # the leading components, so a second fit is not needed
    X_train_pca = pca.transform(X_train_scaled)[:, :n_components]
    X_val_pca = pca.transform(X_val_scaled)[:, :n_components]
    X_test_pca = pca.transform(X_test_scaled)[:, :n_components]

    print(f"Number of components selected: {n_components}")
    print(f"Explained variance by selected components: {cumulative_variance[n_components - 1]:.2f}")

    return X_train_pca, X_val_pca, X_test_pca

# Reduce every feature set with PCA.
# NOTE: the X_* names are rebound from dataframes to the PCA outputs here, so
# re-running this cell without re-running the split cell feeds PCA output back into PCA.
(
    X_train_one_bed, X_val_one_bed, X_test_one_bed,
) = pca_feature_selection(X_train_one_bed, X_val_one_bed, X_test_one_bed)

(
    X_train_two_bed, X_val_two_bed, X_test_two_bed,
) = pca_feature_selection(X_train_two_bed, X_val_two_bed, X_test_two_bed)

(
    X_train_three_bed, X_val_three_bed, X_test_three_bed,
) = pca_feature_selection(X_train_three_bed, X_val_three_bed, X_test_three_bed)

(
    X_train_two_bed_house, X_val_two_bed_house, X_test_two_bed_house,
) = pca_feature_selection(X_train_two_bed_house, X_val_two_bed_house, X_test_two_bed_house)

(
    X_train_three_bed_house, X_val_three_bed_house, X_test_three_bed_house,
) = pca_feature_selection(X_train_three_bed_house, X_val_three_bed_house, X_test_three_bed_house)

(
    X_train_four_bed_house, X_val_four_bed_house, X_test_four_bed_house,
) = pca_feature_selection(X_train_four_bed_house, X_val_four_bed_house, X_test_four_bed_house)

(
    X_train_all_properties, X_val_all_properties, X_test_all_properties,
) = pca_feature_selection(X_train_all_properties, X_val_all_properties, X_test_all_properties)

Number of components selected: 25
Explained variance by selected components: 0.91
Number of components selected: 27
Explained variance by selected components: 0.90
Number of components selected: 26
Explained variance by selected components: 0.90
Number of components selected: 29
Explained variance by selected components: 0.90
Number of components selected: 31
Explained variance by selected components: 0.90
Number of components selected: 30
Explained variance by selected components: 0.90
Number of components selected: 28
Explained variance by selected components: 0.90


### Train & Fit Model

In [14]:
# One tuple per property type, each ordered as
# (X_train, X_val, X_test, y_train, y_val, y_test).
# List order: one-bed flat, two-bed flat, three-bed flat, two-bed house,
# three-bed house, four-bed house, all properties.
ML_dfs = [
    (X_train_one_bed, X_val_one_bed, X_test_one_bed, y_train_one_bed, y_val_one_bed, y_test_one_bed),
    (X_train_two_bed, X_val_two_bed, X_test_two_bed, y_train_two_bed, y_val_two_bed, y_test_two_bed),
    (X_train_three_bed, X_val_three_bed, X_test_three_bed, y_train_three_bed, y_val_three_bed, y_test_three_bed),
    (X_train_two_bed_house, X_val_two_bed_house, X_test_two_bed_house, y_train_two_bed_house, y_val_two_bed_house, y_test_two_bed_house),
    (X_train_three_bed_house, X_val_three_bed_house, X_test_three_bed_house, y_train_three_bed_house, y_val_three_bed_house, y_test_three_bed_house),
    (X_train_four_bed_house, X_val_four_bed_house, X_test_four_bed_house, y_train_four_bed_house, y_val_four_bed_house, y_test_four_bed_house),
    (X_train_all_properties, X_val_all_properties, X_test_all_properties, y_train_all_properties, y_val_all_properties, y_test_all_properties),
]

Lasso Regression

In [15]:
# Candidate 'alpha' values for Lasso: 20 log-spaced values from 1e-6 to 1e2
alpha_range = np.logspace(-6, 2, 20) 
param_grid = {'alpha': alpha_range}

# For each property type: tune 'alpha' with 5-fold cross-validation on the
# training set, evaluate the chosen model on the validation set, then predict
# on the test set. The validation set plays no part in the tuning.
for i, (X_train, X_val, X_test, y_train, y_val, y_test) in enumerate(ML_dfs):
    # Initialise the Lasso model
    lasso = Lasso()
    
    # Set up the GridSearchCV to tune the 'alpha' hyperparameter (cv=5, scored by negative MSE)
    grid_search = GridSearchCV(lasso, param_grid, scoring='neg_mean_squared_error', cv=5)
    
    # Run the cross-validated search; only the training set is passed in
    grid_search.fit(X_train, y_train)
    
    # Take the estimator with the best cross-validated score
    best_lasso = grid_search.best_estimator_
    
    # Evaluate on the held-out validation set
    y_val_pred = best_lasso.predict(X_val)
    val_mse = mean_squared_error(y_val, y_val_pred)
    val_mae = mean_absolute_error(y_val, y_val_pred)
    val_r2 = r2_score(y_val, y_val_pred)
    
    # Make predictions on the test set (y_test is not used for scoring here)
    y_test_pred = best_lasso.predict(X_test)
    
    # Print the results
    print(f"Property Type {i+1}:")
    print(f"Best alpha: {best_lasso.alpha}")
    print(f"Validation MSE: {val_mse:.4f}, R^2: {val_r2:.4f}, Validation MAE: {val_mae:.4f}")
    print(f"Predictions for 2025-2027: {y_test_pred}\n")


Dataset 1:
Best alpha: 14.384498882876601
Validation MSE: 5009.8348, R^2: 0.1062, Validation MAE: 50.9748
Predictions for 2025-2027: [[323.14012052 321.97013048 321.45366707 322.19933826]
 [324.09682223 323.02173817 322.54618006 323.2211289 ]
 [325.06802744 324.08878311 323.65467291 324.25810648]
 ...
 [331.76980198 331.91121442 332.04567406 331.73020899]
 [333.47896818 333.7738687  333.99678321 333.54874198]
 [335.12084015 335.56527067 335.8732931  335.29665727]]

Dataset 2:
Best alpha: 14.384498882876601
Validation MSE: 7408.4227, R^2: 0.1357, Validation MAE: 64.4800
Predictions for 2025-2027: [[437.40793109 436.9361802  436.56650009 437.45511115]
 [438.55238422 438.12690419 437.78043313 438.62108054]
 [439.71468483 439.33601035 439.01310014 439.80511486]
 ...
 [441.17116341 440.14605227 439.82039554 440.80790341]
 [443.56383461 442.62586383 442.34628355 443.24403381]
 [445.86638691 445.01370889 444.7784666  445.58950236]]

Dataset 3:
Best alpha: 2.06913808111479
Validation MSE: 1617

Random Forest

In [18]:
from sklearn.ensemble import RandomForestRegressor

# GridSearchCV, mean_squared_error, mean_absolute_error and r2_score are already
# imported in the first cell of the notebook, so they are not re-imported here.

# Hyperparameter grid searched for every dataset in ML_dfs.
# TODO: extend the grid (e.g. min_samples_leaf, max_features).
param_grid = {
    'n_estimators': [50, 100, 200],  # Number of trees
    'max_depth': [5, 10, 15, None],  # Maximum depth of each tree (None = no depth limit)
}

# One summary dict per dataset, so the tuned models, scores and predictions
# are still available after the loop instead of being overwritten each iteration.
rf_results = []

# For each dataset: tune with cross-validation on the training split, score the
# tuned model on the held-out validation split, then predict on the test split.
# y_test is unpacked only to match the tuple layout; it is not used in this cell.
for i, (X_train, X_val, X_test, y_train, y_val, y_test) in enumerate(ML_dfs):
    # Fixed seed so the fitted forests are reproducible across runs
    rf = RandomForestRegressor(random_state=42)

    # Hyperparameters are selected by 5-fold cross-validation *within the training
    # set* (lowest mean squared error); the validation split plays no part in the
    # selection. n_jobs=-1 runs the independent fits in parallel on all cores.
    grid_search = GridSearchCV(
        rf,
        param_grid,
        scoring='neg_mean_squared_error',
        cv=5,
        n_jobs=-1,
    )
    grid_search.fit(X_train, y_train)

    # With the default refit=True this is the best parameter combination
    # refitted on the whole training set.
    best_rf = grid_search.best_estimator_

    # Held-out check of the tuned model on the validation split
    y_val_pred = best_rf.predict(X_val)
    val_mse = mean_squared_error(y_val, y_val_pred)
    val_mae = mean_absolute_error(y_val, y_val_pred)
    val_r2 = r2_score(y_val, y_val_pred)

    # Make predictions on the test set
    y_test_pred = best_rf.predict(X_test)

    rf_results.append({
        'dataset': i + 1,
        'best_params': grid_search.best_params_,
        'val_mse': val_mse,
        'val_mae': val_mae,
        'val_r2': val_r2,
        'model': best_rf,
        'test_predictions': y_test_pred,
    })

    # Print the results
    print(f"Dataset {i+1}:")
    print(f"Best n_estimators: {best_rf.n_estimators}, Best max_depth: {best_rf.max_depth}")
    print(f"Validation MSE: {val_mse:.4f}, R^2: {val_r2:.4f}, Validation MAE: {val_mae:.4f}")
    print(f"Predictions for 2025-2027: {y_test_pred}\n")


Dataset 1:
Best n_estimators: 100, Best max_depth: None
Validation MSE: 4237.1439, R^2: 0.2528, Validation MAE: 46.2835
Predictions for 2025-2027: [[287.785 286.02  285.13  286.405]
 [286.485 284.31  283.61  285.005]
 [286.685 284.71  284.11  285.255]
 ...
 [298.68  304.69  307.61  299.27 ]
 [306.43  310.69  313.66  306.77 ]
 [320.84  323.46  325.56  320.64 ]]

Dataset 2:
Best n_estimators: 100, Best max_depth: None
Validation MSE: 5965.4465, R^2: 0.3082, Validation MAE: 58.7633
Predictions for 2025-2027: [[486.91 489.95 499.15 486.4 ]
 [486.08 489.95 499.15 486.1 ]
 [482.97 487.15 496.38 483.25]
 ...
 [395.24 395.68 397.85 396.53]
 [397.37 397.78 399.5  398.78]
 [399.7  400.18 401.95 401.33]]

Dataset 3:
Best n_estimators: 200, Best max_depth: None
Validation MSE: 13930.0568, R^2: 0.5607, Validation MAE: 80.6894
Predictions for 2025-2027: [[764.765 762.35  756.155 762.655]
 [760.45  758.66  753.445 759.495]
 [758.435 756.63  752.45  757.425]
 ...
 [691.345 674.335 652.72  689.31 ]
 [6