In [132]:
import pandas as pd
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from tqdm import tqdm

In [133]:
# Load the curated per-property dataset and show the number of missing
# values in each column.
df = pd.read_csv('../../data/curated/individual_property_final.csv')
df.isna().sum()

Address                             0
Cost                            18636
Property Type                       0
Bedrooms                            0
Bathrooms                           0
Latitude                            0
Longitude                           0
Closest Gov Secondary School        0
Age under 20                        0
Age 20-39                           0
Age 40-59                           0
Age 60+                             0
Postcode                            0
CBD Distance                        0
Train Distance                      0
Electricity Distance                0
Hospital Distance                   0
Library Distance                    0
Park Distance                       0
Tourist Attraction Distance         0
Grocery Distance                    0
Year                                0
SA2_CODE21                          0
Population                          0
Income                              0
LGA_CODE24                          0
Incidents Re

In [134]:
# Select the features used for prediction by removing the excluded columns.
non_feature_columns = ['Address', 'Latitude', 'Longitude', 'Postcode', 'SA2_CODE21', 'LGA_CODE24', 'Suburb']
df = df.drop(columns=non_feature_columns)

# Integer-encode every column listed as categorical. Each fitted encoder is
# kept in `label_encoders` (keyed by column name) so the codes can be mapped
# back to their original labels if needed.
categorical_columns = ['Property Type','Closest Gov Secondary School']
label_encoders = {}
for col in categorical_columns:
    label_encoders[col] = LabelEncoder()
    df[col] = label_encoders[col].fit_transform(df[col])

In [135]:
# Rows from 2015-2024 (inclusive) are used to fit and evaluate the model;
# rows from 2025-2027 (inclusive) are the ones whose Cost will be predicted.
train_years = df['Year'].between(2015, 2024)
predict_years = df['Year'].between(2025, 2027)
df_train = df[train_years]
df_predict = df[predict_years]

In [136]:
# Separate the target column ('Cost') from the predictor columns.
y = df_train['Cost']
X = df_train.drop(columns='Cost')

In [137]:
# Single random seed shared by the data split, the models and the search.
seed = 37

In [138]:
# Hold out 20% of the 2015-2024 rows as a validation split (reproducible via `seed`).
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=seed,
)

In [139]:
# Baseline: an XGBoost regressor with squared-error loss and otherwise
# default hyperparameters, fitted on the training split.
xgb_model = XGBRegressor(random_state=seed, objective='reg:squarederror')
xgb_model.fit(X_train, y_train)


In [140]:
# Make predictions on the held-out validation split (X_val) with the baseline model
y_pred = xgb_model.predict(X_val)


In [141]:
# Score the baseline on the validation split: RMSE and R².
mse = mean_squared_error(y_val, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_val, y_pred)
print(f'RMSE: {rmse}')
print(f'R²: {r2}')


RMSE: 51.23837502469361
R²: 0.8897943913780326


In [142]:
# XGB with tuning: candidate values sampled by the randomised search.
param_dist = dict(
    n_estimators=[100, 200, 300, 400],
    max_depth=[3, 5, 7, 10],
    learning_rate=[0.01, 0.05, 0.1, 0.2],
    subsample=[0.6, 0.8, 1.0],
    colsample_bytree=[0.6, 0.8, 1.0],
    gamma=[0, 0.1, 0.3, 0.5],
    reg_alpha=[0, 0.01, 0.1],
    reg_lambda=[0.1, 1, 10],
)


In [158]:
# Base estimator for the hyperparameter search.
# NOTE: device="cuda" is active and requests GPU training; on a machine
# without a usable CUDA device, remove or comment out that argument.
xgb_model = XGBRegressor(
    random_state=seed,
    device="cuda",
    objective='reg:squarederror',
)

# Randomised search: 100 sampled parameter combinations, each scored by
# 5-fold cross-validation on negative MSE (higher is better).
random_search = RandomizedSearchCV(
    xgb_model,
    param_dist,
    n_iter=100,  # lower this for a quick test run
    cv=5,
    scoring='neg_mean_squared_error',
    random_state=seed,
    n_jobs=1,  # adjust to your CPU capacity; use 1 if unsure
    verbose=2,  # print a progress line for every fit
)


In [159]:
# Run the randomised hyperparameter search on the training split.
random_search.fit(X_train, y_train)

# Output the best hyperparameters and their cross-validated RMSE.
# The search is scored with 'neg_mean_squared_error', so best_score_ is the
# mean *negative* MSE over the CV folds: negate it before taking the root.
best_cv_rmse = np.sqrt(-random_search.best_score_)
print(f'Best parameters: {random_search.best_params_}')
print(f'Best CV RMSE: {best_cv_rmse}')

Fitting 5 folds for each of 100 candidates, totalling 500 fits
[CV] END colsample_bytree=0.6, gamma=0.1, learning_rate=0.01, max_depth=5, n_estimators=400, reg_alpha=0.1, reg_lambda=0.1, subsample=0.6; total time=   0.7s
[CV] END colsample_bytree=0.6, gamma=0.1, learning_rate=0.01, max_depth=5, n_estimators=400, reg_alpha=0.1, reg_lambda=0.1, subsample=0.6; total time=   0.7s
[CV] END colsample_bytree=0.6, gamma=0.1, learning_rate=0.01, max_depth=5, n_estimators=400, reg_alpha=0.1, reg_lambda=0.1, subsample=0.6; total time=   0.7s
[CV] END colsample_bytree=0.6, gamma=0.1, learning_rate=0.01, max_depth=5, n_estimators=400, reg_alpha=0.1, reg_lambda=0.1, subsample=0.6; total time=   0.7s
[CV] END colsample_bytree=0.6, gamma=0.1, learning_rate=0.01, max_depth=5, n_estimators=400, reg_alpha=0.1, reg_lambda=0.1, subsample=0.6; total time=   0.7s
[CV] END colsample_bytree=0.8, gamma=0.1, learning_rate=0.01, max_depth=10, n_estimators=300, reg_alpha=0.1, reg_lambda=0.1, subsample=0.6; total t

Best parameters: {'subsample': 0.8, 'reg_lambda': 1, 'reg_alpha': 0.1, 'n_estimators': 400, 'max_depth': 7, 'learning_rate': 0.2, 'gamma': 0.1, 'colsample_bytree': 1.0} \
Use these parameters for our model.

In [160]:
# Keep the estimator configured with the best parameters found by the search.
# The search above does not pass `refit`, so scikit-learn's default applies and
# this estimator has been refitted on the data given to random_search.fit.
best_xgb_model = random_search.best_estimator_

In [161]:
# Evaluate the tuned model on the held-out validation split: RMSE and R².
y_pred = best_xgb_model.predict(X_val)
mse = mean_squared_error(y_val, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_val, y_pred)
print(f'RMSE: {rmse}')
print(f'R²: {r2}')

RMSE: 37.226658028971656
R²: 0.9418270482592196


In [162]:
# The validation score looks suspiciously good, so compare the error on the
# training split with the error on the validation split; a much lower
# training RMSE points to overfitting.
y_train_pred = best_xgb_model.predict(X_train)
y_val_pred = best_xgb_model.predict(X_val)

rmse_train, rmse_val = (
    np.sqrt(mean_squared_error(actual, predicted))
    for actual, predicted in ((y_train, y_train_pred), (y_val, y_val_pred))
)
print(f'Training RMSE: {rmse_train}')
print(f'Validation RMSE: {rmse_val}')

Training RMSE: 19.884556073039885
Validation RMSE: 37.226658028971656


As the tuned model achieved the best (lowest) validation RMSE, use it to predict future prices.

In [163]:
# Reload the original CSV, presumably so the exported files keep the columns
# that were dropped and the raw values that were label-encoded for modelling.
# NOTE(review): this rebinds `df` only; `df_train` / `df_predict` still refer
# to the encoded frames until they are re-split from this reloaded frame.
df = pd.read_csv('../../data/curated/individual_property_final.csv')

In [164]:
# Predict Cost for the three future years (2025-2027) with the tuned model.
# `df_predict` must still be the label-encoded frame here, because its columns
# have to match the features the model was trained on.
X_predict = df_predict.drop(columns='Cost')
predicted_cost = best_xgb_model.predict(X_predict)


In [165]:
# Combine back to full dataset
# NOTE(review): df_train / df_predict are still the label-encoded frames at
# this point, and this df_full is reassigned later (after the predictions are
# attached) without being used in between, so this cell looks redundant.
df_full = pd.concat([df_train, df_predict], ignore_index=True)

In [166]:
# Re-split by Year on the current `df`, using the same year windows as for
# modelling: 2015-2024 are the historical rows, 2025-2027 the rows to fill in.
historical_years = df['Year'].between(2015, 2024)
future_years = df['Year'].between(2025, 2027)
df_train = df[historical_years]
df_predict = df[future_years]

In [167]:
# Replace the Cost column of the future-year rows with the model predictions
# (the existing column is dropped first, so Cost becomes the last column),
# then stack those rows under the historical rows.
df_predict = df_predict.drop(columns='Cost').assign(Cost=predicted_cost)
df_full = pd.concat((df_train, df_predict), ignore_index=True)

In [168]:
# Write both results to CSV (without the index): the predicted future-year
# rows on their own, and the combined historical + predicted dataset.
exports = {
    '../../data/curated/prediction_25_to_27.csv': df_predict,
    '../../data/curated/final_with_predictions_15_to_27.csv': df_full,
}
for csv_path, frame in exports.items():
    frame.to_csv(csv_path, index=False)