In [199]:
# Import necessary libraries
import pandas as pd
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt
import numpy as np
import regex

In [200]:
# load data
data = pd.read_parquet('../data/curated/features_domain.parquet')  

In [201]:
data.drop(columns=['url','price','latitude','longitude','geometry', 'geometry_proj', 'index_right','bond'], inplace=True)

In [202]:
data.columns = [regex.sub("_", col) if any(x in str(col) for x in set(('[', ']', '<'))) else col for col in data.columns.values]

In [203]:
# Step 1: Calculate the correlation matrix
corr_matrix = data.corr().abs()

# Step 2: Identify pairs of highly correlated features (greater than the threshold, e.g., 0.8)
threshold = 0.8
high_corr_var = [(i, j, corr_matrix[i][j]) for i in corr_matrix.columns for j in corr_matrix.columns if i != j and corr_matrix[i][j] > threshold]

# Step 3: Sort the pairs by correlation in descending order and select the top 10
sorted_high_corr = sorted(high_corr_var, key=lambda x: x[2], reverse=True)[:10]

# Step 4: Print out the top 10 pairs with their correlation values
print("Top 10 pairs with the highest correlation:")
for i, j, corr_value in sorted_high_corr:
    print(f"{i} and {j}: {corr_value:.4f}")

Top 10 pairs with the highest correlation:
sa3_code and SA2 code_x: 1.0000
SA2 code_x and sa3_code: 1.0000
areasqkm and area_km2: 1.0000
area_km2 and areasqkm: 1.0000
sa3_code and sa4_code: 1.0000
sa4_code and sa3_code: 1.0000
sa4_code and SA2 code_x: 1.0000
SA2 code_x and sa4_code: 1.0000
OCC_2031 and OCC_2036: 0.9996
OCC_2036 and OCC_2031: 0.9996


  corr_matrix = data.corr().abs()


In [204]:
# Drop the specified columns from the DataFrame
data = data.drop(columns=['SA2 code_x', 'sa3_code', 'sa3_name', 'sa4_code', 'sa4_name', 'areasqkm', 'OCC_2036', 'HHS_2036', 'OCC_2026', 'HHS_2026', 'POPD_2036', 
       'HHS_2031', 'SPD_2021', 'SPD_2026', 'OCC_2021', 'OCC_2031','POPD_2031', 'PNPD_2031', 'POPD_2021', 'Top 5%', 'POPD_2026', 'SPD_2036', 'SPD_2031', 'erp_2023_no.', 'erp_change_no.', 
       'erp_change_percentage', 'ERP_2021','ERP_2026', 'ERP_2021',
       'ERP_2026', 'ERP_2031', 'ERP_2036', 'PNPD_2021', 'PNPD_2026',
       'PNPD_2036', 'OPD_2021', 'OPD_2026', 'OPD_2031', 'OPD_2036', 'HHS_2021',
       'population_growth_2021_2026', 'population_growth_2026_2031', 'population_growth_2031_2036', 'SA2 code_y', 'Top 1%', 'P80/P20', 'Gini coefficient', 'total_number_of_items', 'Highest Quartile', 'Third Quartile', 'Lowest Quartile', 'P10/P50', "Baths"])


In [205]:
X = data  # Features

# Step 2: Convert categorical columns if any
# Convert categorical columns to category dtype or one-hot encode them if necessary
X = pd.get_dummies(X, drop_first=True)  # One-hot encoding categorical features

# Step 3: Split into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Step 4: Initialize and train the XGBoost Regressor
model = XGBRegressor()
model.fit(X_train, y_train)

# Step 5: Make predictions and evaluate the model
y_pred = model.predict(X_test)
rmse = mean_squared_error(y_test, y_pred, squared=False)
print(f"RMSE: {rmse:.2f}")

# Step 6: Get Feature Importance
importance = model.feature_importances_


RMSE: 7.82




In [206]:
# Create a DataFrame for better visualization
feature_importance_df = pd.DataFrame({
    'Feature': X.columns,
    'Importance': importance
})

# Sort the DataFrame by Importance and select the top 10 features
top_features = feature_importance_df.sort_values(by='Importance', ascending=False).head(20)
feature_importance_df.to_csv('../data/curated/feature_importance_xgboost.csv')
# Print the top 10 most important features
print("Top 20 Most Important Features:")
print(top_features)


Top 20 Most Important Features:
                                Feature  Importance
2                       extracted_price    0.953673
14                       Facility Count    0.002576
16          pop_change_natural_increase    0.002410
18         pop_change_oversea_migration    0.002389
34                              P20/P50    0.002294
26                 total_value_of_items    0.001652
10                          mean_stores    0.001519
1                               Parking    0.001368
23           distance_to_closest_school    0.001296
11341  nearest_parkres_name_Wattle Park    0.001285
8                       cbd_distance_km    0.001110
19                             area_km2    0.001043
22                       closest_school    0.001026
9524                 sa2_code_207011149    0.000965
10691         nearest_hospital_id_H5383    0.000913
21             nearest_parkres_distance    0.000871
3                              chg_flag    0.000775
27                      crime_fr