In [1]:
import pandas as pd
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import root_mean_squared_error
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import RandomizedSearchCV
from tqdm import tqdm

In [2]:
df = pd.read_csv('../../data/curated/individual_property_final.csv')
df.isnull().sum()

Address                             0
Cost                            18636
Property Type                       0
Bedrooms                            0
Bathrooms                           0
Latitude                            0
Longitude                           0
Closest Gov Secondary School        0
Age under 20                        0
Age 20-39                           0
Age 40-59                           0
Age 60+                             0
Postcode                            0
CBD Distance                        0
Train Distance                      0
Electricity Distance                0
Hospital Distance                   0
Library Distance                    0
Park Distance                       0
Tourist Attraction Distance         0
Grocery Distance                    0
Year                                0
SA2_CODE21                          0
Population                          0
Income                              0
LGA_CODE24                          0
Incidents Re

In [3]:
# Select features for predicted
df = df.drop(columns=['Address', 'Latitude', 'Longitude', 'Postcode', 'SA2_CODE21', 'LGA_CODE24', 'Suburb'], axis=1)

categorical_columns = ['Property Type','Closest Gov Secondary School']
# Apply LabelEncoder to each categorical column
label_encoders = {}
for col in categorical_columns:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    label_encoders[col] = le

In [4]:
df_train = df[(df['Year'] >= 2015) & (df['Year'] <= 2024)]
df_predict = df[(df['Year'] >= 2025) & (df['Year'] <= 2027)]

In [5]:
X = df_train.drop(columns=['Cost'])
y = df_train['Cost']

In [6]:
seed = 37 

In [7]:
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=seed)

In [8]:
# XGB with tuning

param_dist = {
    'n_estimators': [100, 200, 300, 400],
    'max_depth': [3, 5, 7, 10],
    'learning_rate': [0.01, 0.05, 0.1, 0.2],
    'subsample': [0.6, 0.8, 1.0],
    'colsample_bytree': [0.6, 0.8, 1.0],
    'gamma': [0, 0.1, 0.3, 0.5],
    'reg_alpha': [0, 0.01, 0.1],
    'reg_lambda': [0.1, 1, 10]
}


In [11]:
# Initialize the XGBoost Regressor
xgb_model = XGBRegressor(objective='reg:squarederror', random_state=seed)

random_search = RandomizedSearchCV(
    estimator=xgb_model,
    param_distributions=param_dist,
    n_iter=10,  # Reduce iterations for testing
    scoring='neg_mean_squared_error',
    cv=5,
    verbose=2,  # Set higher verbosity to see progress in console
    random_state=42,
    n_jobs=2
)


In [12]:
# Fit the model
random_search.fit(X_train, y_train)

# Output the best hyperparameters and RMSE
print(f'Best parameters: {random_search.best_params_}')
best_rmse = np.sqrt(-random_search.best_score_)
print(f'Best RMSE from RandomizedSearchCV: {best_rmse}')

Fitting 5 folds for each of 10 candidates, totalling 50 fits
[CV] END colsample_bytree=1.0, gamma=0.1, learning_rate=0.01, max_depth=7, n_estimators=200, reg_alpha=0, reg_lambda=0.1, subsample=0.6; total time=   3.8s
[CV] END colsample_bytree=1.0, gamma=0.1, learning_rate=0.01, max_depth=7, n_estimators=200, reg_alpha=0, reg_lambda=0.1, subsample=0.6; total time=   2.3s
[CV] END colsample_bytree=0.6, gamma=0, learning_rate=0.05, max_depth=10, n_estimators=400, reg_alpha=0.1, reg_lambda=1, subsample=1.0; total time=  14.7s
[CV] END colsample_bytree=0.6, gamma=0, learning_rate=0.05, max_depth=10, n_estimators=400, reg_alpha=0.1, reg_lambda=1, subsample=1.0; total time=  11.3s
[CV] END colsample_bytree=0.6, gamma=0.5, learning_rate=0.01, max_depth=5, n_estimators=400, reg_alpha=0.01, reg_lambda=10, subsample=1.0; total time=   4.3s
[CV] END colsample_bytree=0.6, gamma=0.5, learning_rate=0.01, max_depth=5, n_estimators=400, reg_alpha=0.01, reg_lambda=10, subsample=1.0; total time=   3.5s
[

KeyboardInterrupt: 

In [None]:
best_xgb_model = random_search.best_estimator_
