In [151]:
# Import necessary libraries
import pandas as pd
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt
import numpy as np
import regex

In [152]:
# load data
data = pd.read_parquet('../data/curated/features_domain.parquet')  

In [153]:
data.drop(columns=['url','price','latitude','longitude','geometry', 'geometry_proj', 'index_right','bond'], inplace=True)

In [154]:
data.columns = [regex.sub("_", col) if any(x in str(col) for x in set(('[', ']', '<'))) else col for col in data.columns.values]

In [155]:
y = data['extracted_price']  # Target variable
data = data.drop('extracted_price', axis=1)

In [156]:
data.columns[:50]

Index(['address', 'property_type', 'Beds', 'Baths', 'Parking', 'sa2_code',
       'sa2_name', 'chg_flag', 'chg_lbl', 'sa3_code', 'sa3_name', 'sa4_code',
       'sa4_name', 'gcc_code', 'gcc_name', 'ste_code', 'ste_name', 'aus_code',
       'aus_name', 'areasqkm', 'loci_uri', 'suburb', 'property_id',
       'nearest_station_id', 'train_station_distance_km', 'cbd_distance_km',
       'nearest_hospital_id', 'nearest_hospital_distance', 'mean_stores',
       'total_stores', 'count_shopping_centres', 'SA2 code_x', 'Region',
       'ERP_2021', 'ERP_2026', 'ERP_2031', 'ERP_2036', 'PNPD_2021',
       'PNPD_2026', 'PNPD_2031', 'PNPD_2036', 'POPD_2021', 'POPD_2026',
       'POPD_2031', 'POPD_2036', 'OPD_2021', 'OPD_2026', 'OPD_2031',
       'OPD_2036', 'HHS_2021'],
      dtype='object')

In [159]:
# Step 1: Calculate the correlation matrix
corr_matrix = data.corr().abs()

# Step 2: Identify pairs of highly correlated features (greater than the threshold, e.g., 0.8)
threshold = 0.8
high_corr_var = [(i, j, corr_matrix[i][j]) for i in corr_matrix.columns for j in corr_matrix.columns if i != j and corr_matrix[i][j] > threshold]

# Step 3: Sort the pairs by correlation in descending order and select the top 10
sorted_high_corr = sorted(high_corr_var, key=lambda x: x[2], reverse=True)[:10]

# Step 4: Print out the top 10 pairs with their correlation values
print("Top 10 pairs with the highest correlation:")
for i, j, corr_value in sorted_high_corr:
    print(f"{i} and {j}: {corr_value:.4f}")

Top 10 pairs with the highest correlation:
erp_2022_no. and Earners: 0.8866
Earners and erp_2022_no.: 0.8866
P80/P50 and P20/P50: 0.8486
P20/P50 and P80/P50: 0.8486
Median and Mean: 0.8006
Mean and Median: 0.8006


  corr_matrix = data.corr().abs()


In [158]:
# Drop the specified columns from the DataFrame
data = data.drop(columns=['SA2 code_x', 'sa3_code', 'sa3_name', 'sa4_code', 'sa4_name', 'areasqkm', 'OCC_2036', 'HHS_2036', 'OCC_2026', 'HHS_2026', 'POPD_2036', 
       'HHS_2031', 'SPD_2021', 'SPD_2026', 'OCC_2021', 'OCC_2031','POPD_2031', 'PNPD_2031', 'POPD_2021', 'Top 5%', 'POPD_2026', 'SPD_2036', 'SPD_2031', 'erp_2023_no.', 'erp_change_no.', 
       'erp_change_percentage', 'ERP_2021','ERP_2026', 'ERP_2021',
       'ERP_2026', 'ERP_2031', 'ERP_2036', 'PNPD_2021', 'PNPD_2026',
       'PNPD_2036', 'OPD_2021', 'OPD_2026', 'OPD_2031', 'OPD_2036', 'HHS_2021',
       'population_growth_2021_2026', 'population_growth_2026_2031', 'population_growth_2031_2036', 'SA2 code_y', 'Top 1%', 'P80/P20', 'Gini coefficient', 'total_number_of_items', 'Highest Quartile', 'Third Quartile', 'Lowest Quartile', 'P10/P50', "Baths"])


In [162]:
X = data  # Features

# Step 2: Convert categorical columns if any
# Convert categorical columns to category dtype or one-hot encode them if necessary
X = pd.get_dummies(X, drop_first=True)  # One-hot encoding categorical features

# Step 3: Split into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Step 4: Initialize and train the XGBoost Regressor
model = XGBRegressor()
model.fit(X_train, y_train)

# Step 5: Make predictions and evaluate the model
y_pred = model.predict(X_test)
rmse = mean_squared_error(y_test, y_pred, squared=False)
print(f"RMSE: {rmse:.2f}")

# Step 6: Get Feature Importance
importance = model.feature_importances_


RMSE: 138.04




In [165]:
# Create a DataFrame for better visualization
feature_importance_df = pd.DataFrame({
    'Feature': X.columns,
    'Importance': importance
})

# Sort the DataFrame by Importance and select the top 10 features
top_features = feature_importance_df.sort_values(by='Importance', ascending=False).head(20)
top_features.to_csv('../data/curated/top_features_xgboost.csv')
# Print the top 10 most important features
print("Top 20 Most Important Features:")
print(top_features)


Top 20 Most Important Features:
                                                Feature  Importance
34                                              Top 10%    0.146026
31                                                 Mean    0.085039
0                                                  Beds    0.053227
19              population_density_persons_per_km2_2023    0.023603
9462                                property_type_House    0.023332
7                                       cbd_distance_km    0.018513
35                                      Second Quartile    0.013542
10490                                        suburb_NaN    0.013534
11326                nearest_parkres_name_Princes Wharf    0.012439
1                                               Parking    0.010299
15                          pop_change_natural_increase    0.009779
9468                            property_type_Townhouse    0.009166
11303  nearest_parkres_name_Lower Maribyrnong Parklands    0.008051
28              