### Import Libraries and Data

In [3]:
import pandas as pd
import os
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Lasso
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_selection import RFECV
from sklearn.model_selection import GridSearchCV, KFold

In [4]:
# Import all merged feature sets
# Each CSV under ../data/curated/merged_feature_set/ is loaded into its own
# DataFrame; the variable name mirrors the file name.

# Flat datasets (one, two and three bedrooms, going by the file names)
one_bed_flat_merged = pd.read_csv('../data/curated/merged_feature_set/one_bed_flat_merged.csv')
two_bed_flat_merged = pd.read_csv('../data/curated/merged_feature_set/two_bed_flat_merged.csv')
three_bed_flat_merged = pd.read_csv('../data/curated/merged_feature_set/three_bed_flat_merged.csv')

# House datasets (two, three and four bedrooms, going by the file names)
two_bed_house_merged = pd.read_csv('../data/curated/merged_feature_set/two_bed_house_merged.csv')
three_bed_house_merged = pd.read_csv('../data/curated/merged_feature_set/three_bed_house_merged.csv')
four_bed_house_merged = pd.read_csv('../data/curated/merged_feature_set/four_bed_house_merged.csv')

# Dataset covering all property types (presumably an aggregate; confirm against the curation step)
all_properties_merged = pd.read_csv('../data/curated/merged_feature_set/all_properties_merged.csv')

### Create Train & Test Sets

In [5]:
def train_val_test_sets(df):
    """
    Split a merged feature dataframe into train, validation and test
    partitions using the value of its 'year' column:
    - Training set: 2016-2021
    - Validation set: 2022-2024
    - Testing set: 2025-2027

    Rows whose year falls outside all three ranges are not returned.

    The 'suburb' and 'year' columns are kept in every returned dataframe
    (features and targets alike), which is why callers name the results
    with a '_labels' suffix.

    Parameters:
    df: DataFrame containing 'suburb', 'year', the four quarterly median
        target columns, and any number of feature columns

    Returns:
    - X_train, X_val, X_test: df without the target columns
    - y_train, y_val, y_test: 'suburb', 'year' and the target columns
    """
    # Quarterly median columns that make up the prediction target
    targets = ['dec_median', 'jun_median', 'mar_median', 'sep_median']

    # Features: everything except the targets ('suburb' and 'year' stay in)
    features = df.drop(columns=targets)
    # Targets: the four medians plus the identifying 'suburb' and 'year'
    labelled_targets = df[['suburb', 'year'] + targets]

    # Year ranges in (train, validation, test) order; range ends are exclusive
    periods = (range(2016, 2022), range(2022, 2025), range(2025, 2028))

    def _split_by_period(frame):
        # One filtered frame per period, in the same order as `periods`
        return [frame[frame['year'].isin(years)] for years in periods]

    X_train, X_val, X_test = _split_by_period(features)
    y_train, y_val, y_test = _split_by_period(labelled_targets)

    return X_train, X_val, X_test, y_train, y_val, y_test


# Build the labelled (suburb/year retained) train, validation and test
# splits for every property type. Each call returns six dataframes in the
# order X_train, X_val, X_test, y_train, y_val, y_test.
(X_train_one_bed_labels, X_val_one_bed_labels, X_test_one_bed_labels,
 y_train_one_bed_labels, y_val_one_bed_labels, y_test_one_bed_labels) = train_val_test_sets(one_bed_flat_merged)

(X_train_two_bed_labels, X_val_two_bed_labels, X_test_two_bed_labels,
 y_train_two_bed_labels, y_val_two_bed_labels, y_test_two_bed_labels) = train_val_test_sets(two_bed_flat_merged)

(X_train_three_bed_labels, X_val_three_bed_labels, X_test_three_bed_labels,
 y_train_three_bed_labels, y_val_three_bed_labels, y_test_three_bed_labels) = train_val_test_sets(three_bed_flat_merged)

(X_train_two_bed_house_labels, X_val_two_bed_house_labels, X_test_two_bed_house_labels,
 y_train_two_bed_house_labels, y_val_two_bed_house_labels, y_test_two_bed_house_labels) = train_val_test_sets(two_bed_house_merged)

(X_train_three_bed_house_labels, X_val_three_bed_house_labels, X_test_three_bed_house_labels,
 y_train_three_bed_house_labels, y_val_three_bed_house_labels, y_test_three_bed_house_labels) = train_val_test_sets(three_bed_house_merged)

(X_train_four_bed_house_labels, X_val_four_bed_house_labels, X_test_four_bed_house_labels,
 y_train_four_bed_house_labels, y_val_four_bed_house_labels, y_test_four_bed_house_labels) = train_val_test_sets(four_bed_house_merged)

(X_train_all_properties_labels, X_val_all_properties_labels, X_test_all_properties_labels,
 y_train_all_properties_labels, y_val_all_properties_labels, y_test_all_properties_labels) = train_val_test_sets(all_properties_merged)

In [6]:
def drop_labels(X_train, X_val, X_test, y_train, y_val, y_test):
    """
    Strip the identifying 'suburb' and 'year' columns from all six
    train/validation/test dataframes.

    The inputs are not modified; new dataframes are returned in the same
    order as the arguments: X_train, X_val, X_test, y_train, y_val, y_test.
    """
    label_columns = ['suburb', 'year']
    splits = (X_train, X_val, X_test, y_train, y_val, y_test)

    # DataFrame.drop returns a copy, so the '_labels' originals stay intact
    return tuple(split.drop(columns=label_columns) for split in splits)


# Produce the model-ready splits (no 'suburb'/'year' columns) for every
# property type from the corresponding '_labels' dataframes.
(X_train_one_bed, X_val_one_bed, X_test_one_bed,
 y_train_one_bed, y_val_one_bed, y_test_one_bed) = drop_labels(
    X_train_one_bed_labels, X_val_one_bed_labels, X_test_one_bed_labels,
    y_train_one_bed_labels, y_val_one_bed_labels, y_test_one_bed_labels,
)

(X_train_two_bed, X_val_two_bed, X_test_two_bed,
 y_train_two_bed, y_val_two_bed, y_test_two_bed) = drop_labels(
    X_train_two_bed_labels, X_val_two_bed_labels, X_test_two_bed_labels,
    y_train_two_bed_labels, y_val_two_bed_labels, y_test_two_bed_labels,
)

(X_train_three_bed, X_val_three_bed, X_test_three_bed,
 y_train_three_bed, y_val_three_bed, y_test_three_bed) = drop_labels(
    X_train_three_bed_labels, X_val_three_bed_labels, X_test_three_bed_labels,
    y_train_three_bed_labels, y_val_three_bed_labels, y_test_three_bed_labels,
)

(X_train_two_bed_house, X_val_two_bed_house, X_test_two_bed_house,
 y_train_two_bed_house, y_val_two_bed_house, y_test_two_bed_house) = drop_labels(
    X_train_two_bed_house_labels, X_val_two_bed_house_labels, X_test_two_bed_house_labels,
    y_train_two_bed_house_labels, y_val_two_bed_house_labels, y_test_two_bed_house_labels,
)

(X_train_three_bed_house, X_val_three_bed_house, X_test_three_bed_house,
 y_train_three_bed_house, y_val_three_bed_house, y_test_three_bed_house) = drop_labels(
    X_train_three_bed_house_labels, X_val_three_bed_house_labels, X_test_three_bed_house_labels,
    y_train_three_bed_house_labels, y_val_three_bed_house_labels, y_test_three_bed_house_labels,
)

(X_train_four_bed_house, X_val_four_bed_house, X_test_four_bed_house,
 y_train_four_bed_house, y_val_four_bed_house, y_test_four_bed_house) = drop_labels(
    X_train_four_bed_house_labels, X_val_four_bed_house_labels, X_test_four_bed_house_labels,
    y_train_four_bed_house_labels, y_val_four_bed_house_labels, y_test_four_bed_house_labels,
)

(X_train_all_properties, X_val_all_properties, X_test_all_properties,
 y_train_all_properties, y_val_all_properties, y_test_all_properties) = drop_labels(
    X_train_all_properties_labels, X_val_all_properties_labels, X_test_all_properties_labels,
    y_train_all_properties_labels, y_val_all_properties_labels, y_test_all_properties_labels,
)

Check that the feature (X) columns are identical across the train, validation, and test sets.

In [7]:
def compare_feature_columns(train, val, test):
    """
    Compare the columns of the training, validation and testing feature
    dataframes (X).

    Returns:
    - a confirmation string when all three column indexes are equal
      (same labels in the same order), or
    - a dictionary that lists, for each ordered pair of sets, the columns
      present in one set but absent from the other.
    """
    train_cols, val_cols, test_cols = train.columns, val.columns, test.columns

    all_equal = (
        train_cols.equals(val_cols)
        and train_cols.equals(test_cols)
        and val_cols.equals(test_cols)
    )
    if all_equal:
        return "Columns are the same in all three sets (train, validation, test)."

    def _absent_from(other, reference):
        # Columns that `reference` has but `other` lacks
        return list(set(reference) - set(other))

    return {
        "Columns missing in validation set compared to train": _absent_from(val_cols, train_cols),
        "Columns missing in train set compared to validation": _absent_from(train_cols, val_cols),
        "Columns missing in test set compared to train": _absent_from(test_cols, train_cols),
        "Columns missing in train set compared to test": _absent_from(train_cols, test_cols),
        "Columns missing in test set compared to validation": _absent_from(test_cols, val_cols),
        "Columns missing in validation set compared to test": _absent_from(val_cols, test_cols),
    }

# (train, validation, test) feature dataframes for each property type,
# keyed by a display name
feature_dfs = {
    "One Bed": (X_train_one_bed, X_val_one_bed, X_test_one_bed),
    "Two Bed": (X_train_two_bed, X_val_two_bed, X_test_two_bed),
    "Three Bed": (X_train_three_bed, X_val_three_bed, X_test_three_bed),
    "Two Bed House": (X_train_two_bed_house, X_val_two_bed_house, X_test_two_bed_house),
    "Three Bed House": (X_train_three_bed_house, X_val_three_bed_house, X_test_three_bed_house),
    "Four Bed House": (X_train_four_bed_house, X_val_four_bed_house, X_test_four_bed_house),
    "All Properties": (X_train_all_properties, X_val_all_properties, X_test_all_properties),
}

# Run the column comparison once per property type; each triplet is
# unpacked straight into the (train, val, test) arguments
comparison_results = {
    label: compare_feature_columns(*splits)
    for label, splits in feature_dfs.items()
}

# Bare expression so the notebook displays the result
comparison_results

{'One Bed': 'Columns are the same in all three sets (train, validation, test).',
 'Two Bed': 'Columns are the same in all three sets (train, validation, test).',
 'Three Bed': 'Columns are the same in all three sets (train, validation, test).',
 'Two Bed House': 'Columns are the same in all three sets (train, validation, test).',
 'Three Bed House': 'Columns are the same in all three sets (train, validation, test).',
 'Four Bed House': 'Columns are the same in all three sets (train, validation, test).',
 'All Properties': 'Columns are the same in all three sets (train, validation, test).'}

### Check Null Values

In [8]:
# Dataframes to check for missing values.
# Maps a display name (identical to the variable name) to the corresponding
# dataframe: six entries (X/y for train, val and test) per property type.
dataframes = {
    # One-bed
    'X_train_one_bed': X_train_one_bed,
    'X_val_one_bed': X_val_one_bed,
    'X_test_one_bed': X_test_one_bed,
    'y_train_one_bed': y_train_one_bed,
    'y_val_one_bed': y_val_one_bed,
    'y_test_one_bed': y_test_one_bed,
    
    # Two-bed
    'X_train_two_bed': X_train_two_bed,
    'X_val_two_bed': X_val_two_bed,
    'X_test_two_bed': X_test_two_bed,
    'y_train_two_bed': y_train_two_bed,
    'y_val_two_bed': y_val_two_bed,
    'y_test_two_bed': y_test_two_bed,
    
    # Three-bed
    'X_train_three_bed': X_train_three_bed,
    'X_val_three_bed': X_val_three_bed,
    'X_test_three_bed': X_test_three_bed,
    'y_train_three_bed': y_train_three_bed,
    'y_val_three_bed': y_val_three_bed,
    'y_test_three_bed': y_test_three_bed,
    
    # Two-bed house
    'X_train_two_bed_house': X_train_two_bed_house,
    'X_val_two_bed_house': X_val_two_bed_house,
    'X_test_two_bed_house': X_test_two_bed_house,
    'y_train_two_bed_house': y_train_two_bed_house,
    'y_val_two_bed_house': y_val_two_bed_house,
    'y_test_two_bed_house': y_test_two_bed_house,
    
    # Three-bed house
    'X_train_three_bed_house': X_train_three_bed_house,
    'X_val_three_bed_house': X_val_three_bed_house,
    'X_test_three_bed_house': X_test_three_bed_house,
    'y_train_three_bed_house': y_train_three_bed_house,
    'y_val_three_bed_house': y_val_three_bed_house,
    'y_test_three_bed_house': y_test_three_bed_house,
    
    # Four-bed house
    'X_train_four_bed_house': X_train_four_bed_house,
    'X_val_four_bed_house': X_val_four_bed_house,
    'X_test_four_bed_house': X_test_four_bed_house,
    'y_train_four_bed_house': y_train_four_bed_house,
    'y_val_four_bed_house': y_val_four_bed_house,
    'y_test_four_bed_house': y_test_four_bed_house,
    
    # All properties
    'X_train_all_properties': X_train_all_properties,
    'X_val_all_properties': X_val_all_properties,
    'X_test_all_properties': X_test_all_properties,
    'y_train_all_properties': y_train_all_properties,
    'y_val_all_properties': y_val_all_properties,
    'y_test_all_properties': y_test_all_properties,
}

In [9]:
# Row-level view: for every dataframe that has any, keep the rows that
# contain at least one null value
missing_rows_summary = {}

for name, df in dataframes.items():
    has_null = df.isnull().any(axis=1)
    if has_null.any():
        missing_rows_summary[name] = df[has_null]

print(missing_rows_summary)

# Column-level view: for every dataframe that has any, keep the null count
# of each column with at least one missing value
missing_values_summary = {}

for name, df in dataframes.items():
    null_counts = df.isnull().sum()
    nonzero_counts = null_counts[null_counts > 0]
    if len(nonzero_counts) > 0:
        missing_values_summary[name] = nonzero_counts

print(missing_values_summary)

{'y_test_one_bed':       dec_median  jun_median  mar_median  sep_median
9            NaN         NaN         NaN         NaN
10           NaN         NaN         NaN         NaN
11           NaN         NaN         NaN         NaN
21           NaN         NaN         NaN         NaN
22           NaN         NaN         NaN         NaN
...          ...         ...         ...         ...
1531         NaN         NaN         NaN         NaN
1532         NaN         NaN         NaN         NaN
1542         NaN         NaN         NaN         NaN
1543         NaN         NaN         NaN         NaN
1544         NaN         NaN         NaN         NaN

[393 rows x 4 columns], 'y_test_two_bed':       dec_median  jun_median  mar_median  sep_median
9            NaN         NaN         NaN         NaN
10           NaN         NaN         NaN         NaN
11           NaN         NaN         NaN         NaN
21           NaN         NaN         NaN         NaN
22           NaN         NaN         

### Feature Selection

In [10]:
# Define a list of all training, validation, and test sets.
# One tuple per property type, each ordered as
# (X_train, X_val, X_test, y_train, y_val, y_test).
# Tuple order: one-bed, two-bed, three-bed, two-bed house, three-bed house,
# four-bed house, all properties.
ML_dfs = [
    (X_train_one_bed, X_val_one_bed, X_test_one_bed, y_train_one_bed, y_val_one_bed, y_test_one_bed),
    (X_train_two_bed, X_val_two_bed, X_test_two_bed, y_train_two_bed, y_val_two_bed, y_test_two_bed),
    (X_train_three_bed, X_val_three_bed, X_test_three_bed, y_train_three_bed, y_val_three_bed, y_test_three_bed),
    (X_train_two_bed_house, X_val_two_bed_house, X_test_two_bed_house, y_train_two_bed_house, y_val_two_bed_house, y_test_two_bed_house),
    (X_train_three_bed_house, X_val_three_bed_house, X_test_three_bed_house, y_train_three_bed_house, y_val_three_bed_house, y_test_three_bed_house),
    (X_train_four_bed_house, X_val_four_bed_house, X_test_four_bed_house, y_train_four_bed_house, y_val_four_bed_house, y_test_four_bed_house),
    (X_train_all_properties, X_val_all_properties, X_test_all_properties, y_train_all_properties, y_val_all_properties, y_test_all_properties),
]

In [11]:
# Define the hyperparameters for Random Forest.
# Read as a module-level name by rfecv_with_random_forest, which passes it to
# GridSearchCV; every combination below is tried (3 * 4 * 3 = 36 candidates).
param_grid = {
    'n_estimators': [50, 100, 200],  # Number of trees
    'max_depth': [5, 10, 15, None],  # Maximum depth of each tree
    'min_samples_split': [2, 5, 10],  # Minimum number of samples required to split an internal node
}

def rfecv_with_random_forest(X_train, X_val, X_test, y_train):
    """
    Apply Recursive Feature Elimination with Cross-Validation (RFECV) using
    Random Forest as the estimator to automatically select the optimal number
    of features, then use GridSearchCV to fine-tune hyperparameters on the
    reduced feature set.

    The hyperparameter grid is taken from the module-level `param_grid`.
    A summary of the selected features and their importances is printed.

    Parameters:
    X_train: Training feature set (a DataFrame; its column names are used to
             label the selected features)
    X_val: Validation feature set
    X_test: Test feature set
    y_train: Training labels (target)

    Returns:
    X_train_rfecv, X_val_rfecv, X_test_rfecv: Standardized datasets reduced
        to the selected features
    best_rf: Best tuned Random Forest model after feature selection
    selected_feature_names: Names of the selected features, in column order
    """

    # Standardize the data; the scaler is fitted on the training set only and
    # then applied unchanged to the validation and test sets
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_val_scaled = scaler.transform(X_val)
    X_test_scaled = scaler.transform(X_test)

    # Initialize the Random Forest model used to rank features
    rf_model = RandomForestRegressor(random_state=42)

    # Use RFECV to automatically select the optimal number of features
    rfecv = RFECV(estimator=rf_model, step=1, cv=KFold(5), scoring='neg_mean_squared_error', n_jobs=-1)
    rfecv.fit(X_train_scaled, y_train)

    # Transform the datasets based on the selected features
    X_train_rfecv = rfecv.transform(X_train_scaled)
    X_val_rfecv = rfecv.transform(X_val_scaled)
    X_test_rfecv = rfecv.transform(X_test_scaled)

    # Perform hyperparameter tuning using GridSearchCV on the reduced feature set.
    # `scoring` must be passed by keyword; the same metric as the RFECV step is
    # used so that both stages optimise the same objective.
    rf_after_rfecv = RandomForestRegressor(random_state=42)
    grid_search = GridSearchCV(rf_after_rfecv, param_grid, scoring='neg_mean_squared_error', cv=5)
    grid_search.fit(X_train_rfecv, y_train)

    # Select the best Random Forest model based on GridSearchCV
    best_rf = grid_search.best_estimator_

    # Get the selected feature indices and map them back to column names
    selected_features = rfecv.get_support(indices=True)
    selected_feature_names = [X_train.columns[i] for i in selected_features]

    # Feature importances of the tuned model; they are in the same order as
    # the selected features because the model was fitted on the reduced set
    importances = best_rf.feature_importances_

    # Pair selected features with their importance and sort, most important first
    feature_importance_pairs = list(zip(selected_feature_names, importances))
    sorted_features = sorted(feature_importance_pairs, key=lambda x: x[1], reverse=True)

    # Print the sorted features with their importance values
    print(f"Optimal number of features selected: {rfecv.n_features_}")
    print(f"Selected feature names: {selected_feature_names}")
    print("Selected features sorted by importance:")
    for feature, importance in sorted_features:
        print(f"{feature}: {importance:.4f}")

    return X_train_rfecv, X_val_rfecv, X_test_rfecv, best_rf, selected_feature_names


----

### Train & Fit Model

Random Forest

In [12]:
# Containers filled in by the loop below: test-set predictions keyed by
# dataset position, and one validation MAE per property type
predictions_dict = {}
mae_list = []

# Property type names, in the same order as the tuples in ML_dfs
property_types = [
    'one_bed_flat', 'two_bed_flat', 'three_bed_flat',
    'two_bed_house', 'three_bed_house', 'four_bed_house', 'all_properties',
]

# For each property type: select features and tune, score on validation,
# refit on train + validation, then predict the test years
for i, (X_train, X_val, X_test, y_train, y_val, y_test) in enumerate(ML_dfs):
    print(f"Property Type: {property_types[i]}")

    # Feature selection and hyperparameter search (y_val / y_test are not passed in)
    (X_train_rfecv, X_val_rfecv, X_test_rfecv,
     best_rf, selected_feature_names) = rfecv_with_random_forest(X_train, X_val, X_test, y_train)

    # Validation metrics for the tuned model
    y_val_pred = best_rf.predict(X_val_rfecv)
    val_mae = mean_absolute_error(y_val, y_val_pred)
    val_mse = mean_squared_error(y_val, y_val_pred)
    val_r2 = r2_score(y_val, y_val_pred)

    # Keep the MAE so it can be saved later
    mae_list.append(val_mae)

    # Report the chosen hyperparameters and validation scores
    print(f"Best n_estimators: {best_rf.n_estimators}, Best max_depth: {best_rf.max_depth}")
    print(f"Validation MSE: {val_mse:.4f}, R^2: {val_r2:.4f}, Validation MAE: {val_mae:.4f}")

    # Stack training and validation rows (targets first, then features)
    y_train_val = np.concatenate((y_train, y_val))
    X_train_val_rfecv = np.vstack((X_train_rfecv, X_val_rfecv))

    # Refit the tuned model in place on the combined data
    best_rf.fit(X_train_val_rfecv, y_train_val)

    # Predict the test set and store it under a 1-based dataset key
    y_test_pred = best_rf.predict(X_test_rfecv)
    predictions_dict[f'X_test_{i+1}_predictions'] = y_test_pred

    # Show the predictions for the test set
    print(f"Predictions for 2025-2027: {y_test_pred}\n")
    print()

Property Type: one_bed_flat


KeyboardInterrupt: 

### Save Predictions

In [None]:
# Define the base path where prediction outputs are written
base_path = '../data/curated/predictions'

# Ensure the directory exists (including any missing parents).
# exist_ok=True makes this a no-op when the directory is already there and
# avoids the race between a separate existence check and the creation.
os.makedirs(base_path, exist_ok=True)

In [None]:
# Names given to the prediction columns, in the order of the model's outputs
column_names = ['dec_median', 'jun_median', 'mar_median', 'sep_median']

# Labelled test dataframes (they still carry 'suburb' and 'year'), one per
# property type and in the same order as property_types
labelled_dfs = [
    X_test_one_bed_labels,
    X_test_two_bed_labels,
    X_test_three_bed_labels,
    X_test_two_bed_house_labels,
    X_test_three_bed_house_labels,
    X_test_four_bed_house_labels,
    X_test_all_properties_labels,
]

# Predictions are matched to their labelled dataframe by position
for i, value in enumerate(predictions_dict.values()):
    # Identifying columns only, re-indexed from 0 so they line up with the predictions
    labelled_df_subset = labelled_dfs[i][['suburb', 'year']].reset_index(drop=True)

    # value is a 2D array (n_samples, n_targets); name each column of it
    predictions_df = pd.DataFrame(
        {column_names[j]: value[:, j] for j in range(value.shape[1])}
    ).reset_index(drop=True)

    # Place 'suburb' and 'year' side by side with the predicted medians
    labelled_with_predictions = pd.concat([labelled_df_subset, predictions_df], axis=1)

    # Write one CSV per property type
    labelled_with_predictions.to_csv(f"../data/curated/predictions/{property_types[i]}_predictions.csv", index=False)

    # Print confirmation
    print(f"Saved {property_types[i]} with 'year', 'suburb', and predictions.")


Saved one_bed_flat with 'year', 'suburb', and predictions.
Saved two_bed_flat with 'year', 'suburb', and predictions.
Saved three_bed_flat with 'year', 'suburb', and predictions.
Saved two_bed_house with 'year', 'suburb', and predictions.
Saved three_bed_house with 'year', 'suburb', and predictions.
Saved four_bed_house with 'year', 'suburb', and predictions.
Saved all_properties with 'year', 'suburb', and predictions.


### Save Error

In [None]:
# Put the validation MAEs (one per property type, in loop order) into a
# single-column DataFrame
mae_df = pd.DataFrame({'MAE': mae_list})

# Write the MAE table next to the prediction files
mae_df.to_csv("../data/curated/predictions/mae.csv", index=False)

# Print confirmation
print("MAE list has been saved to '../data/curated/predictions/mae.csv'")

MAE list has been saved to '../data/curated/predictions/mae.csv'
