In [None]:
import pandas as pd
from statsmodels.tsa.arima.model import ARIMA
import numpy as np
from sklearn.metrics import mean_absolute_error
import matplotlib.pyplot as plt
import statsmodels.api as sm

## ARIMA

In [None]:

file_path = '../data/curated/external/cleaned_final_data.csv'
df = pd.read_csv(file_path)

### Drop the first 12 columns and the Jun/Sep/Dec quarters, so that only the
### March-quarter price of each year remains next to the other columns.
columns_to_drop = df.columns[:12].tolist() + ['Jun 2021', 'Sep 2021', 'Dec 2021', 'Jun 2022', 'Sep 2022', 'Dec 2022']
df_cleaned = df.drop(columns=columns_to_drop)

### Relabel the March-quarter columns as the yearly price columns.
df_cleaned = df_cleaned.rename(columns={
    'Mar 2021': '2021_average_suburb_price',
    'Mar 2022': '2022_average_suburb_price',
    'Mar 2023': '2023_average_suburb_price'
})

### Drop rows with missing values, then keep the first row of each suburb.
### The result is reassigned rather than mutated with inplace=True: mutating
### the frame returned by dropna() can raise SettingWithCopyWarning and makes
### the cell non-idempotent on re-run.
df_no_nulls = df_cleaned.dropna().drop_duplicates(subset='Suburb')

### One row per suburb, one column per year (2021, 2022, 2023).
price_columns = ['2021_average_suburb_price', '2022_average_suburb_price', '2023_average_suburb_price']
suburb_time_series = df_no_nulls[price_columns]
predictions = {}

### Fit one AR(1) model per suburb and forecast the next three years.
### 'Suburb' is unique and non-null after the cleaning above, so the suburb
### names and the price rows can be walked in lockstep instead of building a
### boolean mask over the whole frame for every suburb.
### NOTE(review): each model is fitted on only three observations, so
### statsmodels may emit convergence warnings and the forecasts are rough.
for suburb, suburb_data in zip(df_no_nulls['Suburb'], suburb_time_series.to_numpy(dtype=float)):
    model = ARIMA(suburb_data, order=(1, 0, 0))
    # The model must be fitted here; previously `model_fit` was never
    # assigned in this cell, so the forecast either raised NameError or
    # silently reused a fit left over from another cell.
    model_fit = model.fit()
    forecast = model_fit.forecast(steps=3)

    predictions[suburb] = forecast


### Rows are suburbs, columns are the three forecast years.
forecast_df = pd.DataFrame(predictions, index=['2024_predicted_price', '2025_predicted_price', '2026_predicted_price']).T

### Attach the forecasts to the cleaned data, matching on the suburb name.
df_forecast = df_no_nulls.merge(forecast_df, left_on='Suburb', right_index=True)

output_file_path = '../data/curated/forecasted_suburb_prices.csv'
df_forecast.to_csv(output_file_path, index=False)

print(f"saved into: {output_file_path}")








In [None]:
from sklearn.metrics import r2_score

### Build a single aggregate series: the mean price over all suburbs for each
### year. `suburb_time_series` contains only the three yearly price columns,
### so all of them are averaged; the earlier `.iloc[:, 1:]` slice dropped the
### 2021 column and left the model with just two observations.
merged_time_series = suburb_time_series.mean(axis=0).values.astype(float)

### Fit the same AR(1) specification that is used for the per-suburb models.
### NOTE(review): three observations is still very little data for an ARIMA
### fit; treat the diagnostics below as indicative only.
model = ARIMA(merged_time_series, order=(1, 0, 0))
model_fit = model.fit()

### AIC,BIC
total_aic = model_fit.aic
total_bic = model_fit.bic

print(f"total AIC: {total_aic}")
print(f"total BIC: {total_bic}")

### Out-of-sample forecast (three steps ahead) and in-sample predictions.
forecast = model_fit.forecast(steps=3)
predicted_values = model_fit.predict()

actual_values = merged_time_series

### R² of the in-sample predictions against the aggregate series.
r_squared = r2_score(actual_values, predicted_values)

print(f"R²: {r_squared}")


In [None]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression


### Reload the curated data set for the regression-based forecast.
file_path = '../data/curated/external/cleaned_final_data.csv'
df = pd.read_csv(file_path)

### Remove the first 12 columns and every non-March quarter, then relabel the
### March-quarter columns as the yearly price columns.
columns_to_drop = list(df.columns[:12]) + [
    'Jun 2021', 'Sep 2021', 'Dec 2021',
    'Jun 2022', 'Sep 2022', 'Dec 2022',
]
df_cleaned = df.drop(columns=columns_to_drop).rename(columns={
    'Mar 2021': '2021_average_suburb_price',
    'Mar 2022': '2022_average_suburb_price',
    'Mar 2023': '2023_average_suburb_price',
})

### Keep complete rows only, and a single row per suburb.
df_no_nulls = df_cleaned.dropna().drop_duplicates(subset='Suburb')

### Column groups that feed the regression (names exactly as in the CSV).
price_columns = ['2021_average_suburb_price', '2022_average_suburb_price', '2023_average_suburb_price']
population_columns = ['2021 popluation', '2022 popluation', '2023 popluation']
crime_columns = ['2021crime', '2022crime', '2023crime']

### Observed years and the years to forecast, both as column vectors.
years = np.array([[2021], [2022], [2023]])
future_years = np.array([[2024], [2025], [2026]])

predictions = {}

### One regression per suburb: price ~ year + population + crime.
### NOTE(review): three features are fitted on three samples per suburb, so
### the fit is exact rather than a smoothed trend -- confirm this is intended.
for suburb in df_no_nulls['Suburb'].unique():
    suburb_data = df_no_nulls.loc[df_no_nulls['Suburb'] == suburb]

    price_series = suburb_data[price_columns].to_numpy().ravel()
    population_data = suburb_data[population_columns].to_numpy().ravel()
    crime_data = suburb_data[crime_columns].to_numpy().ravel()

    exog_data = np.column_stack((years, population_data, crime_data))
    model = LinearRegression().fit(exog_data, price_series)

    ### Population and crime are held at their 2023 values for 2024-2026.
    future_exog = np.column_stack((
        future_years,
        np.repeat(population_data[-1], 3),
        np.repeat(crime_data[-1], 3),
    ))
    forecast = model.predict(future_exog)
    predictions[suburb] = forecast

### Suburbs as rows, forecast years as columns.
forecast_labels = ['2024_predicted_price', '2025_predicted_price', '2026_predicted_price']
forecast_df = pd.DataFrame(predictions, index=forecast_labels).transpose()

### Join the forecasts back onto the cleaned data by suburb name.
df_forecast = df_no_nulls.merge(forecast_df, left_on='Suburb', right_index=True)

### Persist the combined table.
output_path = '../data/curated/suburb_predictions_with_regression.csv'
df_forecast.to_csv(output_path, index=False)

print("saved 'suburb_predictions_with_regression.csv'")


In [None]:
### Year-over-year change in the predicted prices.
df_24_25 = df_forecast['2025_predicted_price'].sub(df_forecast['2024_predicted_price'])
df_25_26 = df_forecast['2026_predicted_price'].sub(df_forecast['2025_predicted_price'])

### Express each change as a percentage of the previous year's prediction.
df_forecast['2025 growth (%)'] = df_24_25.div(df_forecast['2024_predicted_price']).mul(100)
df_forecast['2026 growth (%)'] = df_25_26.div(df_forecast['2025_predicted_price']).mul(100)

### Rank suburbs by the sum of the two yearly growth percentages and show
### the ten highest.
tot = df_forecast['2025 growth (%)'].add(df_forecast['2026 growth (%)'])
df_forecast['growth'] = tot
predict_sort = df_forecast.sort_values(by='growth', ascending=False)
predict_sort.loc[:, ['Suburb', 'growth']].head(10)

### Correlation heat map of all features

In [None]:

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns


### Load the curated data set again so this cell does not depend on the
### frames built by the forecasting cells.
file_path = '../data/curated/external/cleaned_final_data.csv'
df = pd.read_csv(file_path)

### Leave out the suburb name and every historical quarterly price column.
columns_to_drop = ['Suburb'] + [
    'Mar 2021', 'Jun 2021', 'Sep 2021', 'Dec 2021',
    'Mar 2022', 'Jun 2022', 'Sep 2022', 'Dec 2022',
    'Mar 2023',
]
df_cleaned = df.drop(columns=columns_to_drop)

### Pairwise correlation between the remaining numeric columns.
correlation_matrix = df_cleaned.corr(numeric_only=True)

### Draw the annotated heat map on an explicit axes object.
fig, ax = plt.subplots(figsize=(20, 15))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', linewidths=0.5, ax=ax)
ax.set_title('Correlation Matrix')
plt.show()


### Map the growth rate predicted by XGBoost

In [None]:
### Read the predicted-growth table; the growth cell below reads its
### '2024', '2025' and '2026' columns.
growth_rate_path = '../data/curated/future_predicted_growth_rate.csv'
data = pd.read_csv(growth_rate_path)

In [None]:
### Year-over-year change between consecutive yearly columns.
pivot_24_25 = data['2025'].sub(data['2024'])
pivot_25_26 = data['2026'].sub(data['2025'])

### Convert each change into a percentage of the previous year's value.
data['2025 growth (%)'] = pivot_24_25.div(data['2024']).mul(100)
data['2026 growth (%)'] = pivot_25_26.div(data['2025']).mul(100)

### Rank by the sum of the two yearly growth percentages and show the ten
### highest rows.
tot = data['2025 growth (%)'].add(data['2026 growth (%)'])
data['growth'] = tot
predict_sort = data.sort_values(by='growth', ascending=False)
predict_sort.head(10)

In [None]:
import folium

### Initial centre (Melbourne CBD); the view is re-fitted to the markers below.
map_center = [-37.8136, 144.9631]
mymap = folium.Map(location=map_center, zoom_start=13)

### top 10
### Approximate town-centre coordinates as [latitude, longitude].
### Several entries previously carried another town's coordinates (for
### example Hoppers Crossing had Bendigo's and Bendigo had Frankston's), which
### put the markers in the wrong places.
### NOTE(review): values are approximate -- verify against a gazetteer or the
### project's suburb geometry before publishing the map.
locations = [
    {"name": "Ballarat", "coordinates": [-37.5622, 143.8503]},
    {"name": "Dandenong", "coordinates": [-37.9875, 145.2149]},
    {"name": "Werribee", "coordinates": [-37.9000, 144.6600]},
    {"name": "Hoppers Crossing", "coordinates": [-37.8828, 144.7000]},
    {"name": "Bendigo", "coordinates": [-36.7570, 144.2780]},
    {"name": "Frankston", "coordinates": [-38.1440, 145.1294]},
    {"name": "Cranbourne", "coordinates": [-38.0996, 145.2834]},
    {"name": "Shepparton", "coordinates": [-36.3805, 145.3995]},
    {"name": "St Albans", "coordinates": [-37.7447, 144.8000]},
    {"name": "Craigieburn", "coordinates": [-37.5985, 144.9174]},
]

### One blue info marker per location; clicking it shows the town name.
for location in locations:
    folium.Marker(
        location["coordinates"],
        popup=location["name"],
        icon=folium.Icon(color='blue', icon='info-sign')
    ).add_to(mymap)

### At zoom 13 on the CBD none of the markers are in view, so zoom the map to
### the bounding box [south-west, north-east] of all marker positions.
if locations:
    latitudes = [location["coordinates"][0] for location in locations]
    longitudes = [location["coordinates"][1] for location in locations]
    mymap.fit_bounds([
        [min(latitudes), min(longitudes)],
        [max(latitudes), max(longitudes)],
    ])

mymap