In [215]:
# Import necessary libraries
import pandas as pd
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt
import numpy as np
import regex

In [216]:
# load data
data = pd.read_parquet('../data/curated/features_domain.parquet')  

In [217]:
data.drop(columns=['url','price','latitude','longitude','geometry', 'geometry_proj', 'index_right','bond'], inplace=True)

In [218]:
data.columns = [regex.sub("_", col) if any(x in str(col) for x in set(('[', ']', '<'))) else col for col in data.columns.values]

In [219]:
corr_matrix = data.corr().abs()

# Identify pairs of highly correlated features (greater than the threshold, e.g., 0.9)
threshold = 0.9
high_corr_var = [(i, j, corr_matrix[i][j]) for i in corr_matrix.columns for j in corr_matrix.columns if i != j and corr_matrix[i][j] > threshold]

# Sort the pairs by correlation in descending order and select the top 20
sorted_high_corr = sorted(high_corr_var, key=lambda x: x[2], reverse=True)[:70]

# Print out the top 20 pairs with their correlation values
print("Top 20 pairs with the highest correlation:")
for i, j, corr_value in sorted_high_corr:
    print(f"{i} and {j}: {corr_value:.4f}")

Top 20 pairs with the highest correlation:
sa3_code and SA2 code_x: 1.0000
SA2 code_x and sa3_code: 1.0000
areasqkm and area_km2: 1.0000
area_km2 and areasqkm: 1.0000
sa3_code and sa4_code: 1.0000
sa4_code and sa3_code: 1.0000
sa4_code and SA2 code_x: 1.0000
SA2 code_x and sa4_code: 1.0000
OCC_2031 and OCC_2036: 0.9996
OCC_2036 and OCC_2031: 0.9996
HHS_2031 and HHS_2036: 0.9992
HHS_2036 and HHS_2031: 0.9992
HHS_2026 and HHS_2031: 0.9987
HHS_2031 and HHS_2026: 0.9987
OCC_2026 and OCC_2031: 0.9984
OCC_2031 and OCC_2026: 0.9984
OCC_2026 and OCC_2036: 0.9974
OCC_2036 and OCC_2026: 0.9974
ERP_2036 and POPD_2036: 0.9974
POPD_2036 and ERP_2036: 0.9974
HHS_2026 and HHS_2036: 0.9967
HHS_2036 and HHS_2026: 0.9967
ERP_2031 and POPD_2031: 0.9962
POPD_2031 and ERP_2031: 0.9962
Top 5% and Top 10%: 0.9942
Top 10% and Top 5%: 0.9942
PNPD_2031 and PNPD_2036: 0.9941
PNPD_2036 and PNPD_2031: 0.9941
ERP_2021 and POPD_2021: 0.9936
POPD_2021 and ERP_2021: 0.9936
ERP_2026 and POPD_2026: 0.9936
POPD_2026 and 

  corr_matrix = data.corr().abs()


In [220]:
# Step 1: Calculate the correlation between each feature and 'extracted_price'
corr_with_target = data.corr()['extracted_price'].abs()

# Step 2: Set to keep track of dropped features
dropped_features = set()

# Step 3: Iterate over the sorted high-correlation pairs
for i, j, corr_value in sorted_high_corr:
    # Check if either feature has already been dropped
    if i in dropped_features or j in dropped_features:
        continue  # Skip this pair if one of them is already dropped

    # Get the correlation of both features with the target variable, ensuring scalar values are used
    corr_i = corr_with_target[i] if i in corr_with_target else 0
    corr_j = corr_with_target[j] if j in corr_with_target else 0

    # Drop the feature with the lower correlation with the target
    if corr_i < corr_j:
        dropped_features.add(i)
    else:
        dropped_features.add(j)

# Step 4: Drop the selected features from the DataFrame
cleaned_df_dropped = data.drop(columns=list(dropped_features))

# Step 5: Print the features that were dropped
print(f"Features dropped: {dropped_features}")


Features dropped: {'OCC_2026', 'SPD_2031', 'OCC_2031', 'HHS_2021', 'POPD_2031', 'HHS_2031', 'POPD_2036', 'POPD_2026', 'population_growth_2021_2026', 'Top 5%', 'HHS_2026', 'sa3_code', 'OPD_2031', 'SPD_2036', 'sa4_code', 'POPD_2021', 'erp_2022_no.', 'ERP_2031', 'PNPD_2021', 'SPD_2026', 'area_km2', 'PNPD_2031'}


  corr_with_target = data.corr()['extracted_price'].abs()


In [221]:
y = data['extracted_price']
data = data.drop(columns=['extracted_price'])

In [225]:
X = data  # Features

# Step 2: Convert categorical columns if any
# Convert categorical columns to category dtype or one-hot encode them if necessary
X = pd.get_dummies(X, drop_first=True)  # One-hot encoding categorical features

# Step 3: Split into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Step 4: Initialize and train the XGBoost Regressor
model = XGBRegressor()
model.fit(X_train, y_train)

# Step 5: Make predictions and evaluate the model
y_pred = model.predict(X_test)
rmse = mean_squared_error(y_test, y_pred, squared=False)
print(f"RMSE: {rmse:.2f}")

# Step 6: Get Feature Importance
importance = model.feature_importances_


In [None]:
# Create a DataFrame for better visualization
feature_importance_df = pd.DataFrame({
    'Feature': X.columns,
    'Importance': importance
})

# Sort the DataFrame by Importance and select the top 10 features
top_features = feature_importance_df.sort_values(by='Importance', ascending=False).head(20)
feature_importance_df.to_csv('../data/curated/feature_importance_xgboost.csv')
# Print the top 10 most important features
print("Top 20 Most Important Features:")
print(top_features)


Top 20 Most Important Features:
                         Feature  Importance
4                       sa3_code    0.099263
1                          Baths    0.073630
71                          Mean    0.057019
83              Highest Quartile    0.051511
76              Gini coefficient    0.035453
36                      HHS_2036    0.031068
79                       Top 10%    0.024519
72                       P80/P20    0.023076
34                      HHS_2026    0.020997
0                           Beds    0.020306
33                      HHS_2021    0.017377
10068                chg_lbl_New    0.015524
78                        Top 5%    0.013738
9510         property_type_House    0.012033
10580                 suburb_NaN    0.011202
10700  nearest_hospital_id_H0358    0.009829
39                      SPD_2031    0.009534
81               Second Quartile    0.008460
70                        Median    0.008245
82                Third Quartile    0.007972
