# Using a Random Forest Regressor to Find the Most Influential Features

In [27]:
import pandas as pd

# Load the curated feature table that every later cell works on.
domain_dataset = pd.read_parquet(path="../data/curated/features_domain.parquet")

In [28]:
# Preview the first ten rows as a quick sanity check of the load.
domain_dataset.head(n=10)

Unnamed: 0,url,price,address,property_type,latitude,longitude,Beds,Baths,Parking,bond,...,P20/P50,P10/P50,Gini coefficient,Top 1%,Top 5%,Top 10%,Lowest Quartile,Second Quartile,Third Quartile,Highest Quartile
0,https://www.domain.com.au/10-allara-court-donv...,"$1,400.00","10 Allara Court, Donvale VIC 3111",Townhouse,-37.774273,145.181126,4.0,3.0,2.0,9125.0,...,0.32,0.11,0.555,13.1,27.9,39.7,28.5,22.1,19.0,30.4
1,https://www.domain.com.au/7-pine-ridge-donvale...,$750 per week,"7 Pine Ridge, Donvale VIC 3111",House,-37.791251,145.175649,4.0,2.0,0.0,3259.0,...,0.32,0.11,0.555,13.1,27.9,39.7,28.5,22.1,19.0,30.4
2,https://www.domain.com.au/20-mulsanne-way-donv...,$1300 per week,"20 Mulsanne Way, Donvale VIC 3111",House,-37.797232,145.181264,5.0,2.0,2.0,5649.0,...,0.32,0.11,0.555,13.1,27.9,39.7,28.5,22.1,19.0,30.4
3,https://www.domain.com.au/3-monterey-crescent-...,$825pw / $3585pcm,"3 Monterey Crescent, Donvale VIC 3111",House,-37.792402,145.174323,3.0,1.0,5.0,3585.0,...,0.32,0.11,0.555,13.1,27.9,39.7,28.5,22.1,19.0,30.4
4,https://www.domain.com.au/3-49-leslie-street-d...,$680.00,"3/49 Leslie Street, Donvale VIC 3111",Townhouse,-37.781012,145.180705,3.0,2.0,2.0,2955.0,...,0.32,0.11,0.555,13.1,27.9,39.7,28.5,22.1,19.0,30.4
5,https://www.domain.com.au/6-81-97-mitcham-road...,$575.00,"6/81-97 Mitcham Road, Donvale VIC 3111",Townhouse,-37.797814,145.181397,2.0,1.0,2.0,2499.0,...,0.32,0.11,0.555,13.1,27.9,39.7,28.5,22.1,19.0,30.4
6,https://www.domain.com.au/4-49-leslie-street-d...,$680.00,"4/49 Leslie Street, Donvale VIC 3111",Townhouse,-37.780842,145.180724,3.0,2.0,2.0,2955.0,...,0.32,0.11,0.555,13.1,27.9,39.7,28.5,22.1,19.0,30.4
7,https://www.domain.com.au/10-51-55-leslie-stre...,$500 Per Week,"10/51-55 Leslie Street, Donvale VIC 3111",Apartment / Unit / Flat,-37.781431,145.181474,2.0,1.0,1.0,2173.0,...,0.32,0.11,0.555,13.1,27.9,39.7,28.5,22.1,19.0,30.4
8,https://www.domain.com.au/6-martha-street-donv...,$695 per week,"6 Martha Street, Donvale VIC 3111",Apartment / Unit / Flat,-37.795872,145.174319,3.0,2.0,1.0,3020.0,...,0.32,0.11,0.555,13.1,27.9,39.7,28.5,22.1,19.0,30.4
9,https://www.domain.com.au/55-darvall-street-do...,$800.00,"55 Darvall Street, Donvale VIC 3111",House,-37.802745,145.175619,4.0,2.0,2.0,3476.0,...,0.32,0.11,0.555,13.1,27.9,39.7,28.5,22.1,19.0,30.4


In [29]:

import pandas as pd

# Columns removed before the analysis, listed explicitly so the selection is easy to audit.
columns_to_drop = [
    'bond',
    'Facility Count',
    'price',
    'index_right',
    'chg_flag',
    'chg_lbl',
    'property_id',
    'nearest_hospital_id',
    'nearest_station_id',
    'latitude',
    'longitude',
]

# Rebind the name to the reduced frame (`columns=` is the keyword form of `axis=1`).
# NOTE: re-running this cell without reloading the data raises KeyError,
# because the columns are already gone.
domain_dataset = domain_dataset.drop(columns=columns_to_drop)


In [30]:
import regex

# Replace every '[', ']' and '<' in a column name with '_'.
# (Presumably to keep the names acceptable to libraries that reject these
# characters in feature names -- TODO confirm the motivation.)
#
# The previous call `regex.sub("_", col)` omitted an argument: the signature is
# sub(pattern, repl, string), so it raised TypeError as soon as a column name
# actually contained one of the characters.
_invalid_name_chars = regex.compile(r"[\[\]<]")

domain_dataset.columns = [
    # Names without any of the characters (including non-string labels) are kept as they are.
    _invalid_name_chars.sub("_", str(col)) if _invalid_name_chars.search(str(col)) else col
    for col in domain_dataset.columns.values
]

In [31]:
import pandas as pd

# Count the missing entries per column and keep only the columns that have gaps.
missing_values = (
    domain_dataset
    .isnull()
    .sum()
    .loc[lambda counts: counts > 0]
)

# Report the affected columns together with their missing-value counts.
print(missing_values)


Beds                            136
Baths                            67
Parking                           9
train_station_distance_km         2
mean_stores                    5373
total_stores                   5373
count_shopping_centres         5373
ERP_2021                          2
ERP_2026                          2
ERP_2031                          2
population_growth_2021_2026       2
population_growth_2026_2031       2
population_growth_2031_2036       2
total_population_growth           2
total_value_of_items           3853
total_number_of_items          3853
crime_frequency                3853
P80/P20                           5
P80/P50                           5
P20/P50                           5
P10/P50                           5
Gini coefficient                  5
Top 1%                            5
Top 5%                            5
Top 10%                           5
Lowest Quartile                   2
Second Quartile                   2
Third Quartile              

In [32]:
import pandas as pd

# Columns whose missing entries are replaced with 0.
# (Presumably a missing value here means "nothing recorded for this area" -- TODO confirm.)
columns_to_impute = [
    'mean_stores',
    'total_stores',
    'count_shopping_centres',
    'total_value_of_items',
    'total_number_of_items',
    'crime_frequency',
]

# Zero-fill only the selected columns; every other column keeps its NaNs.
domain_dataset[columns_to_impute] = domain_dataset.loc[:, columns_to_impute].fillna(value=0)

# Confirm the imputation: every listed column should now report zero missing values.
print(domain_dataset[columns_to_impute].isna().sum())


mean_stores               0
total_stores              0
count_shopping_centres    0
total_value_of_items      0
total_number_of_items     0
crime_frequency           0
dtype: int64


In [33]:
import pandas as pd

# Re-run the missing-value report to see which columns still contain gaps.
missing_values = (
    domain_dataset
    .isnull()
    .sum()
    .loc[lambda counts: counts > 0]
)

# Report the remaining columns together with their missing-value counts.
print(missing_values)


Beds                           136
Baths                           67
Parking                          9
train_station_distance_km        2
ERP_2021                         2
ERP_2026                         2
ERP_2031                         2
population_growth_2021_2026      2
population_growth_2026_2031      2
population_growth_2031_2036      2
total_population_growth          2
P80/P20                          5
P80/P50                          5
P20/P50                          5
P10/P50                          5
Gini coefficient                 5
Top 1%                           5
Top 5%                           5
Top 10%                          5
Lowest Quartile                  2
Second Quartile                  2
Third Quartile                   2
Highest Quartile                 2
dtype: int64


In [34]:
# List the column labels currently present in the working frame.
current_columns = domain_dataset.columns
print(current_columns)

Index(['url', 'address', 'property_type', 'Beds', 'Baths', 'Parking',
       'extracted_price', 'geometry', 'sa2_code', 'sa2_name',
       ...
       'P20/P50', 'P10/P50', 'Gini coefficient', 'Top 1%', 'Top 5%', 'Top 10%',
       'Lowest Quartile', 'Second Quartile', 'Third Quartile',
       'Highest Quartile'],
      dtype='object', length=103)


In [35]:
# Discard every row that still has at least one missing value, then report how many rows remain.
cleaned_df = domain_dataset.dropna(axis=0, how="any")
print(cleaned_df.shape[0])

9422


In [36]:

# Step 1: Absolute pairwise correlation between the numeric columns.
# `numeric_only=True` is passed explicitly: the frame also holds string columns
# (url, address, ...), and relying on the implicit default emits a FutureWarning
# on pandas 1.5 and raises on pandas >= 2.0.
corr_matrix = cleaned_df.corr(numeric_only=True).abs()

# Step 2: Collect every ordered pair of distinct features whose absolute
# correlation exceeds the threshold. Because the matrix is symmetric, each pair
# appears twice: once as (a, b) and once as (b, a).
threshold = 0.9
high_corr_var = [
    (i, j, corr_matrix.at[j, i])
    for i in corr_matrix.columns
    for j in corr_matrix.columns
    if i != j and corr_matrix.at[j, i] > threshold
]

# Step 3: Sort by correlation (strongest first) and keep at most `max_pairs`
# entries. The downstream feature-dropping cell iterates over this list.
max_pairs = 70
sorted_high_corr = sorted(high_corr_var, key=lambda x: x[2], reverse=True)[:max_pairs]

# Step 4: Print the retained pairs; the header reports the real number of
# entries instead of a hard-coded count that did not match the slice.
print(f"Top {len(sorted_high_corr)} ordered pairs with the highest correlation:")
for i, j, corr_value in sorted_high_corr:
    print(f"{i} and {j}: {corr_value:.4f}")

  corr_matrix = cleaned_df.corr().abs()


Top 20 pairs with the highest correlation:
sa3_code and SA2 code_x: 1.0000
SA2 code_x and sa3_code: 1.0000
areasqkm and area_km2: 1.0000
area_km2 and areasqkm: 1.0000
sa3_code and sa4_code: 1.0000
sa4_code and sa3_code: 1.0000
sa4_code and SA2 code_x: 1.0000
SA2 code_x and sa4_code: 1.0000
OCC_2031 and OCC_2036: 0.9995
OCC_2036 and OCC_2031: 0.9995
HHS_2031 and HHS_2036: 0.9992
HHS_2036 and HHS_2031: 0.9992
HHS_2026 and HHS_2031: 0.9987
HHS_2031 and HHS_2026: 0.9987
OCC_2026 and OCC_2031: 0.9983
OCC_2031 and OCC_2026: 0.9983
ERP_2036 and POPD_2036: 0.9974
POPD_2036 and ERP_2036: 0.9974
OCC_2026 and OCC_2036: 0.9972
OCC_2036 and OCC_2026: 0.9972
HHS_2026 and HHS_2036: 0.9966
HHS_2036 and HHS_2026: 0.9966
ERP_2031 and POPD_2031: 0.9963
POPD_2031 and ERP_2031: 0.9963
Top 5% and Top 10%: 0.9944
Top 10% and Top 5%: 0.9944
PNPD_2031 and PNPD_2036: 0.9940
PNPD_2036 and PNPD_2031: 0.9940
ERP_2021 and POPD_2021: 0.9937
POPD_2021 and ERP_2021: 0.9937
ERP_2026 and POPD_2026: 0.9937
POPD_2026 and 

In [37]:
# Step 1: Absolute correlation of every numeric feature with the target.
# `numeric_only=True` is explicit because the frame contains string columns;
# the implicit default is deprecated in pandas 1.5 and raises on pandas >= 2.0.
corr_with_target = cleaned_df.corr(numeric_only=True)['extracted_price'].abs()

# Step 2: Features selected for removal so far.
dropped_features = set()

# Step 3: Walk the highly correlated pairs from strongest to weakest and, for
# each pair, drop the member that is less correlated with the target.
for i, j, corr_value in sorted_high_corr:
    # If one member is already dropped, the redundancy of this pair is resolved.
    if i in dropped_features or j in dropped_features:
        continue

    # Features missing from the target-correlation series count as 0.
    corr_i = corr_with_target.get(i, 0)
    corr_j = corr_with_target.get(j, 0)

    # On a tie (or when the comparison is False because of NaN), `j` is dropped.
    if corr_i < corr_j:
        dropped_features.add(i)
    else:
        dropped_features.add(j)

# Step 4: Build the reduced frame without the redundant features.
cleaned_df_dropped = cleaned_df.drop(columns=list(dropped_features))

# Step 5: Report the dropped features in sorted order so the output is
# reproducible (iteration order of a set of strings varies between runs).
print(f"Features dropped: {sorted(dropped_features, key=str)}")


Features dropped: {'OPD_2031', 'SPD_2036', 'PNPD_2031', 'sa3_code', 'POPD_2036', 'PNPD_2021', 'HHS_2031', 'OCC_2026', 'OCC_2031', 'erp_2022_no.', 'POPD_2021', 'POPD_2031', 'HHS_2021', 'area_km2', 'Top 5%', 'ERP_2031', 'sa4_code', 'OPD_2026', 'HHS_2026', 'POPD_2026'}


  corr_with_target = cleaned_df.corr()['extracted_price'].abs()


In [38]:
# Correlation matrix restricted to the Beds and Baths columns
# (computed on the un-filtered dataset, not on `cleaned_df`).
beds_baths_corr = domain_dataset.loc[:, ['Beds', 'Baths']].corr()

# Show a heading followed by the 2x2 matrix on the next line.
print("Correlation between Beds and Baths:", beds_baths_corr, sep="\n")

Correlation between Beds and Baths:
          Beds    Baths
Beds   1.00000  0.62069
Baths  0.62069  1.00000


In [39]:
import numpy as np
import math
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

# Restrict the de-correlated frame to its numeric columns.
numeric_df = cleaned_df_dropped.select_dtypes(include=[np.number])

# Target is the extracted price; every remaining numeric column is a predictor.
y = numeric_df['extracted_price']
X = numeric_df.drop(columns='extracted_price')

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Build a 100-tree random forest and fit it on the training split
# (`fit` returns the estimator itself, so the calls can be chained).
model = RandomForestRegressor(n_estimators=100, random_state=42).fit(X_train, y_train)

# Score the held-out rows and report the error as RMSE.
y_pred = model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
rmse = math.sqrt(mse)
print(f'Root Mean Squared Error: {rmse}')

# One row per predictor, ordered from most to least important.
feature_importances = pd.DataFrame(
    {'importance': model.feature_importances_},
    index=X_train.columns,
).sort_values(by='importance', ascending=False)

# Show the 20 most important features.
print(feature_importances.head(n=20))


Root Mean Squared Error: 123.04969379413848
                                         importance
Baths                                      0.203269
Beds                                       0.157216
SA2 code_x                                 0.141317
Top 10%                                    0.061924
HHS_2036                                   0.048766
cbd_distance_km                            0.042036
train_station_distance_km                  0.031351
distance_to_closest_school                 0.030947
nearest_hospital_distance                  0.029218
nearest_parkres_distance                   0.027221
distance_to_closest_independent_school     0.026598
Mean                                       0.021493
Parking                                    0.016385
Highest Quartile                           0.015704
closest_school                             0.014393
Second Quartile                            0.011715
closest_independent_school                 0.010334
Gini coefficient    

In [40]:
# Persist the full importance ranking (feature names are written as the index column).
feature_importances.to_csv(path_or_buf="../data/curated/RF_feature_importance.csv")