In [None]:
import pandas as pd

# Load the curated dataset; each row carries yearly population and crime counts.
file_path = '../data/curated/external/final_file.csv'
data = pd.read_csv(file_path)


def _percent_change(previous, current):
    """Return the change from `previous` to `current` relative to `previous`, in percent."""
    return (current - previous) / previous * 100


# Yearly population and crime columns, pulled out as individual Series.
population_2021, population_2022, population_2023 = (
    data[column]
    for column in ('2021_population', '2022_population', '2023_population')
)
crime_2021, crime_2022, crime_2023 = (
    data[column]
    for column in ('2021crime', '2022crime', '2023crime')
)

# Population growth: year-on-year percentages and their simple mean.
# All results are row-wise Series expressed in percent (2.0 means +2 %).
population_growth_rate_21_22 = _percent_change(population_2021, population_2022)
population_growth_rate_22_23 = _percent_change(population_2022, population_2023)
average_population_growth_rate_21_23 = (
    population_growth_rate_21_22 + population_growth_rate_22_23
) / 2

# Crime growth: same calculation applied to the crime counts.
crime_growth_rate_21_22 = _percent_change(crime_2021, crime_2022)
crime_growth_rate_22_23 = _percent_change(crime_2022, crime_2023)
average_crime_growth_rate_21_23 = (
    crime_growth_rate_21_22 + crime_growth_rate_22_23
) / 2


In [2]:
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import train_test_split, GridSearchCV

# Load the curated table and discard every row that has a missing value.
file_path = '../data/curated/external/final_file.csv'
df = pd.read_csv(file_path).dropna()

# Predictor columns, in the exact order the model is trained on.
FEATURE_COLUMNS = [
    'beds', 'baths', 'parking',
    'minimum_distance_station', 'minimum_distance_school',
    'minimum_distance_police', 'minimum_distance_supermarket',
    'minimum_distance_library', 'minimum_distance_gym',
    'minimum_distance_cbd',
    '2022_population', '2023_population',
    'ERP change %', 'Net overseas migration',
    'Population density 2023 (persons/km2)', 'Median_tot_prsnl_inc_weekly',
    '2021_population',
    'Mar 2021', 'Jun 2021', 'Sep 2021', 'Dec 2021',
    'Mar 2022', 'Jun 2022', 'Sep 2022', 'Dec 2022',
    'Mar 2023',
    'Number_of_Schools',
    '2021crime', '2022crime', '2023crime',
    'Median age',
    'People aged 0-14 years', 'People aged 15-64 years',
    'People aged 65 years and over',
]

# Feature matrix and regression target.
X = df[FEATURE_COLUMNS]
y = df['price']

# Hold out 20 % of the rows for testing; the seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Base regressor whose hyperparameters are tuned below.
model = xgb.XGBRegressor(objective='reg:squarederror')

# Candidate values explored exhaustively by the grid search.
param_grid = dict(
    learning_rate=[0.01, 0.1, 0.2],
    n_estimators=[100, 200, 300],
    max_depth=[3, 5, 7],
    subsample=[0.8, 1.0],
    colsample_bytree=[0.8, 1.0],
)

# 3-fold cross-validated grid search scored on negative MSE.
grid_search = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    cv=3,
    scoring='neg_mean_squared_error',
    verbose=1,
)
grid_search.fit(X_train, y_train)

# Estimator refitted on the training split with the best parameter combination.
best_model = grid_search.best_estimator_

# Predictions for both splits.
y_train_pred = best_model.predict(X_train)
y_test_pred = best_model.predict(X_test)


def _regression_metrics(actual, predicted):
    """Return (MSE, RMSE, MAE, R²) for the given true and predicted values."""
    mse = mean_squared_error(actual, predicted)
    rmse = np.sqrt(mse)
    mae = mean_absolute_error(actual, predicted)
    r2 = r2_score(actual, predicted)
    return mse, rmse, mae, r2


# Score the tuned model on the training and the held-out data.
mse_train, rmse_train, mae_train, r2_train = _regression_metrics(y_train, y_train_pred)
mse_test, rmse_test, mae_test, r2_test = _regression_metrics(y_test, y_test_pred)

print(f"Evaluation indicators on the training set: MSE: {mse_train:.4f}, RMSE: {rmse_train:.4f}, MAE: {mae_train:.4f}, R²: {r2_train:.4f}")
print(f"Evaluation indicators on the testing set: MSE: {mse_test:.4f}, RMSE: {rmse_test:.4f}, MAE: {mae_test:.4f}, R²: {r2_test:.4f}")

FileNotFoundError: [Errno 2] No such file or directory: '../data/curated/external/final_file.csv'

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Importance scores exposed by the fitted model, one per training column.
importances = best_model.feature_importances_

# Pair each feature name with its score and rank from most to least important.
feature_importance_df = (
    pd.DataFrame({'Feature': X.columns, 'Importance': importances})
    .sort_values(by='Importance', ascending=False)
)

# Persist the ranking as CSV (row index omitted).
output_file_path = '../data/curated/external/xgboost_feature_importance.csv'
feature_importance_df.to_csv(output_file_path, index=False)

In [1]:
import matplotlib.pyplot as plt

# Get feature importance based on gain from the underlying booster
importance = best_model.get_booster().get_score(importance_type='gain')

# Convert the importance data into a DataFrame and sort it (largest gain first)
importance_df = pd.DataFrame({
    'Feature': list(importance.keys()),
    'Importance': list(importance.values())
}).sort_values(by='Importance', ascending=False)

# Plot the feature importance as a horizontal bar chart
fig, ax = plt.subplots(figsize=(10, 6))
ax.barh(importance_df['Feature'], importance_df['Importance'], color='skyblue')
ax.set_xlabel('Gain')
ax.set_ylabel('Features')
ax.set_title('XGBoost Feature Importance (Based on Gain)')
ax.invert_yaxis()  # Largest bar at the top

# Save BEFORE showing: once plt.show() has rendered the figure it may be
# released, and a later savefig would then write an empty canvas.
# The path needs the '/' after '..' to point at the sibling plots directory.
fig.savefig('../plots/xgboost_importance.png', format='png', bbox_inches='tight')

plt.show()


NameError: name 'best_model' is not defined

In [None]:
def _average_growth_fraction(first, second, third):
    """Mean year-on-year growth over three consecutive yearly values.

    The result is a fraction (0.02 means +2 %), so it can be used directly in
    ``(1 + rate) ** years``. A year whose baseline value is zero is skipped
    because its relative change is undefined; if no year is usable the
    function returns 0.0 (no growth assumed).
    """
    yearly_rates = [
        (current - previous) / previous
        for previous, current in ((first, second), (second, third))
        if previous != 0
    ]
    return sum(yearly_rates) / len(yearly_rates) if yearly_rates else 0.0


FORECAST_YEARS = [2024, 2025, 2026]
POPULATION_COLUMNS = ['2021_population', '2022_population', '2023_population']
CRIME_COLUMNS = ['2021crime', '2022crime', '2023crime']

# Columns (and column order) the model was trained on.
feature_columns = list(X_train.columns)

# One baseline row per suburb: the first row of that suburb in df.
suburb_baselines = (
    df.drop_duplicates(subset='Suburb')
      .set_index('Suburb')[feature_columns]
)

# Iterate through each Suburb and forecast house prices for the next three years
future_predictions_list = []

for suburb, baseline in suburb_baselines.iterrows():
    # Growth rates must be scalars specific to this suburb. Using a Series that
    # spans every row of the dataset (or a percentage instead of a fraction)
    # would corrupt the feature frame handed to the model.
    population_growth_rate = _average_growth_fraction(
        *(baseline[column] for column in POPULATION_COLUMNS)
    )
    crime_growth_rate = _average_growth_fraction(
        *(baseline[column] for column in CRIME_COLUMNS)
    )

    # Create feature data for the forecast years: every feature stays at its
    # baseline value except population and crime, which are compounded.
    # NOTE(review): the exponent starts at 0, so the first forecast year (2024)
    # uses the unscaled baseline values - confirm that this offset is intended.
    future_rows = []
    for years_ahead in range(len(FORECAST_YEARS)):
        row = baseline.to_dict()
        for column in POPULATION_COLUMNS:
            row[column] = baseline[column] * (1 + population_growth_rate) ** years_ahead
        for column in CRIME_COLUMNS:
            row[column] = baseline[column] * (1 + crime_growth_rate) ** years_ahead
        future_rows.append(row)

    # Column order is fixed to match the training set.
    future_years_data = pd.DataFrame(future_rows, columns=feature_columns)

    # Use the trained model to forecast house prices over the forecast years
    future_predictions = best_model.predict(future_years_data)

    # Attach predictions and identifying columns
    future_years_data['predicted_price'] = future_predictions
    future_years_data['Suburb'] = suburb
    future_years_data['year'] = FORECAST_YEARS

    # Collect the predictions for each suburb
    future_predictions_list.append(future_years_data)

# Combine forecast results for all suburbs
all_future_predictions = pd.concat(future_predictions_list, ignore_index=True)

# Pivot the forecast results: one row per suburb, one column per year
future_predictions_pivot = all_future_predictions.pivot(index='Suburb', columns='year', values='predicted_price')

# Save the forecasted results to a CSV file
output_file_path_future = '../data/curated/future_predicted_prices_by_suburb.csv'
future_predictions_pivot.to_csv(output_file_path_future)