In [None]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import train_test_split, GridSearchCV

# ---------------------------------------------------------------------------
# Random forest price model: load data, tune hyperparameters, evaluate.
# ---------------------------------------------------------------------------
file_path = '../data/curated/external/final_file.csv'
df = pd.read_csv(file_path)

# NOTE: dropna() without arguments removes every row that has a missing value
# in ANY column of the CSV, not only in the columns selected below.
df = df.dropna()

# Defining features (X) and target variable (y)
feature_columns = [
    'beds', 'baths', 'parking',
    'minimum_distance_station', 'minimum_distance_school',
    'minimum_distance_police', 'minimum_distance_supermarket',
    'minimum_distance_library', 'minimum_distance_gym',
    'minimum_distance_cbd',
    '2022_population', '2023_population', 'ERP change %',
    'Net overseas migration', 'Population density 2023 (persons/km2)',
    'Median_tot_prsnl_inc_weekly', '2021_population',
    'Mar 2021', 'Jun 2021', 'Sep 2021', 'Dec 2021',
    'Mar 2022', 'Jun 2022', 'Sep 2022', 'Dec 2022', 'Mar 2023',
    'Number_of_Schools', '2021crime', '2022crime', '2023crime',
    'Median age', 'People aged 0-14 years', 'People aged 15-64 years',
    'People aged 65 years and over',
]
X = df[feature_columns]
y = df['price']

# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Base estimator whose hyperparameters are tuned below
rf_model = RandomForestRegressor(random_state=42)

# Hyperparameter grid: 3 * 3 * 3 * 3 * 2 = 162 candidate combinations
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [3, 5, 7],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'bootstrap': [True, False],
}

# 162 candidates x 3 folds = 486 fits. n_jobs=-1 spreads them over all
# available cores instead of running them one after another; the fixed
# random_state on the estimator keeps each individual fit reproducible.
grid_search_rf = GridSearchCV(
    estimator=rf_model,
    param_grid=param_grid,
    cv=3,
    scoring='neg_mean_squared_error',
    verbose=1,
    n_jobs=-1,
)

# Train: run the search on the training split only
grid_search_rf.fit(X_train, y_train)

# Model with the best cross-validated score
best_rf_model = grid_search_rf.best_estimator_

# Predict on both splits so train and test error can be compared
y_train_pred_rf = best_rf_model.predict(X_train)
y_test_pred_rf = best_rf_model.predict(X_test)


def _regression_metrics(y_true, y_pred):
    """Return (MSE, RMSE, MAE, R²) for one set of predictions."""
    mse = mean_squared_error(y_true, y_pred)
    return mse, np.sqrt(mse), mean_absolute_error(y_true, y_pred), r2_score(y_true, y_pred)


# Evaluate the model on each split
mse_train_rf, rmse_train_rf, mae_train_rf, r2_train_rf = _regression_metrics(y_train, y_train_pred_rf)
mse_test_rf, rmse_test_rf, mae_test_rf, r2_test_rf = _regression_metrics(y_test, y_test_pred_rf)

print("Evaluation indicators on the training set:")
print(f"MSE: {mse_train_rf:.4f}, RMSE: {rmse_train_rf:.4f}, MAE: {mae_train_rf:.4f}, R²: {r2_train_rf:.4f}")

print("Evaluation indicators on the testing set:")
print(f"MSE: {mse_test_rf:.4f}, RMSE: {rmse_test_rf:.4f}, MAE: {mae_test_rf:.4f}, R²: {r2_test_rf:.4f}")


In [None]:
# Importance score the tuned forest assigns to each input column
importances = best_rf_model.feature_importances_

# Build the (feature, importance) table and rank it, most important first
feature_importance_df = pd.DataFrame(
    {'Feature': X.columns, 'Importance': importances}
).sort_values(by='Importance', ascending=False)

# Show the ranked table
print(feature_importance_df)

In [None]:
# Visual feature importance
import matplotlib.pyplot as plt
from pathlib import Path

# Horizontal bar chart with one bar per feature
fig, ax = plt.subplots(figsize=(12, 8))
ax.barh(feature_importance_df['Feature'], feature_importance_df['Importance'], color='skyblue')
ax.set_xlabel('Feature Importance')
ax.set_title('Random Forest Feature Importance')
# barh draws the first row at the bottom; flipping the axis puts the first
# row of feature_importance_df at the top of the chart.
ax.invert_yaxis()

# Save the plot as a PNG file.
# This must happen BEFORE plt.show(): once the figure has been shown, a later
# pyplot savefig call typically writes a blank image.
plot_path = Path('../plots/rf_importance.png')
plot_path.parent.mkdir(parents=True, exist_ok=True)  # make sure ../plots exists
fig.savefig(plot_path, format='png', bbox_inches='tight')

plt.show()