In [1]:
import pandas as pd
import json

In [2]:
# Read 'Table 1' of the workbook, skipping the 6 rows above the header
# and the 5 trailing footer rows.
population_df = pd.read_excel(
    '14100DO0001_2011-23.xlsx',
    sheet_name='Table 1',
    skiprows=6,
    skipfooter=5,
)

In [3]:
# Load the postcode mapping. Its values are iterated as collections of suburb
# names below. The encoding is pinned so the read does not depend on the
# platform's default locale.
with open('../../data/landing/postcode_mapping.json', encoding='utf-8') as f:
    postcode_mapping = json.load(f)
# No explicit f.close(): the `with` block has already closed the file.

# Flatten every suburb list into a single set for fast membership checks.
suburb_names = {suburb for suburbs in postcode_mapping.values() for suburb in suburbs}

In [4]:
# Columns carried forward from the raw table.
selected_columns = [
    'Label',
    'Year',
    'Estimated resident population (no.)',
    'Population density (persons/km2)',
    'Median age - persons (years)',
    'Total fertility rate (births per female) (rate)',
    'Standardised death rate (per 1000 people) (rate)',
]

# Build the cleaned frame as one chain that always returns new objects.
# The previous version called dropna(inplace=True) and assigned to ['Year']
# on filtered slices, which is the pattern that triggers pandas'
# SettingWithCopyWarning and can silently fail to update the frame.
#
# NOTE: dropna only removes real NaN values. Cells holding the '-' placeholder
# string are kept (they are visible in the 2023 rate columns when the frame is
# previewed), so the non-population columns are not guaranteed to be numeric.
population_df = (
    population_df
    # Keep only suburbs present in the postcode mapping, and only the columns we use.
    .loc[population_df['Label'].isin(suburb_names), selected_columns]
    # Drop rows with a missing value in any selected column.
    .dropna(axis=0, how='any')
    # Restrict to 2018-2023 (both bounds inclusive).
    .loc[lambda df: df['Year'].between(2018, 2023)]
    .astype({'Year': int})
)

In [5]:
# Spot-check the cleaned rows for a single suburb.
population_df.loc[population_df['Label'].eq('Carlton')]

Unnamed: 0,Label,Year,Estimated resident population (no.),Population density (persons/km2),Median age - persons (years),Total fertility rate (births per female) (rate),Standardised death rate (per 1000 people) (rate)
11019,Carlton,2018,20831,11453.8,25.7,0.82,4.6
11020,Carlton,2019,21029,11562.7,26.3,0.68,5.2
11021,Carlton,2020,20865,11472.5,27.1,0.55,4.9
11022,Carlton,2021,17064,9382.5,27.8,0.54,5.2
11023,Carlton,2022,18057,9928.5,26.9,0.56,5.7
11024,Carlton,2023,21376,11753.5,-,-,-


In [6]:
# Keep a single row per (suburb, year) pair; the first occurrence wins.
population_df = population_df.drop_duplicates(subset=['Label', 'Year'])

# Reshape from long to wide: one row per suburb, one population column per year.
pivot_df = population_df.pivot(index='Label', columns='Year', values='Estimated resident population (no.)')

# Give each year column a descriptive, string-typed name.
pivot_df = pivot_df.set_axis(
    [f'Estimated_population_{int(year)}' for year in pivot_df.columns],
    axis=1,
)

In [7]:
# Spot-check the pivoted row for a single suburb.
pivot_df.loc[pivot_df.index.isin(['Carlton'])]

Unnamed: 0_level_0,Estimated_population_2018,Estimated_population_2019,Estimated_population_2020,Estimated_population_2021,Estimated_population_2022,Estimated_population_2023
Label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Carlton,20831,21029,20865,17064,18057,21376


In [8]:
# Year columns produced by the pivot step.
population_columns = [col for col in pivot_df.columns if col.startswith('Estimated_population_')]

# The source data uses the string '-' as a missing-value placeholder.
# pd.to_numeric(errors='coerce') turns any non-numeric cell into NaN. This
# avoids replace('-', None), whose meaning differs between pandas versions
# (older releases pad-fill instead of inserting a missing value).
pivot_df = (
    pivot_df
    .assign(**{col: pd.to_numeric(pivot_df[col], errors='coerce') for col in population_columns})
    # Drop suburbs that are missing a population figure for any year.
    .dropna()
    # With no NaN left, the population columns can be cast to int.
    .astype({col: int for col in population_columns})
)

In [9]:
# Display the cleaned wide table (the rendered output is truncated for long frames).
pivot_df

Unnamed: 0_level_0,Estimated_population_2018,Estimated_population_2019,Estimated_population_2020,Estimated_population_2021,Estimated_population_2022,Estimated_population_2023
Label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Abbotsford,9527,9594,9672,9258,9513,10008
Airport West,8169,8390,8362,8240,8295,8464
Albert Park,16728,17081,16955,16011,16177,16861
Albion,3663,3929,4192,4371,4548,4708
Alexandra,6646,6687,6690,6771,6794,6836
...,...,...,...,...,...,...
Yackandandah,4778,4906,5052,5189,5329,5455
Yarram,5437,5474,5545,5555,5588,5580
Yarraville,15991,16092,16068,15651,15661,16020
Yarrawonga,8297,8418,8508,8593,8727,8812


In [10]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import numpy as np

# Features: estimated population for each year 2018-2022.
# Target: estimated population in 2023 (the population count itself).
feature_columns = [
    'Estimated_population_2018',
    'Estimated_population_2019',
    'Estimated_population_2020',
    'Estimated_population_2021',
    'Estimated_population_2022',
]
X = pivot_df.loc[:, feature_columns]
y = pivot_df.loc[:, 'Estimated_population_2023']

# Hold out a random 20% of suburbs (rows) for evaluation; the seed fixes the split.
split_parts = train_test_split(X, y, test_size=0.2, random_state=30034)
X_train, X_test, y_train, y_test = split_parts

In [11]:
from sklearn.linear_model import LinearRegression

# Fit ordinary least squares on the training suburbs (fit returns the estimator).
lr_model = LinearRegression().fit(X_train, y_train)

# Predicted 2023 population for the held-out suburbs.
lr_predictions = lr_model.predict(X_test)

In [12]:
from sklearn.ensemble import RandomForestRegressor

# Train the Random Forest model.
# random_state is fixed (same seed as the train/test split) so that the
# bootstrap sampling, and therefore the reported RMSE, is reproducible on
# Restart & Run All. Without it every run produces a different forest.
rf_model = RandomForestRegressor(random_state=30034)
rf_model.fit(X_train, y_train)

# Predicted 2023 population for the held-out suburbs.
rf_predictions = rf_model.predict(X_test)

In [13]:
from sklearn.ensemble import GradientBoostingRegressor

# Train the Gradient Boosting model.
# random_state is fixed (same seed as the train/test split) so repeated runs
# build the same trees and report the same RMSE.
gb_model = GradientBoostingRegressor(random_state=30034)
gb_model.fit(X_train, y_train)

# Predicted 2023 population for the held-out suburbs.
gb_predictions = gb_model.predict(X_test)

In [14]:
# Helper: root-mean-squared error between observed and predicted values.
def calculate_rmse(y_true, y_pred):
    """Return the square root of the mean squared error of `y_pred` against `y_true`."""
    mse = mean_squared_error(y_true, y_pred)
    return np.sqrt(mse)

# Score each model on the held-out suburbs and report it straight away.
lr_rmse = calculate_rmse(y_test, lr_predictions)
print(f"Linear Regression RMSE: {lr_rmse}")

rf_rmse = calculate_rmse(y_test, rf_predictions)
print(f"Random Forest RMSE: {rf_rmse}")

gb_rmse = calculate_rmse(y_test, gb_predictions)
print(f"Gradient Boosting RMSE: {gb_rmse}")

Linear Regression RMSE: 499.08874583186383
Random Forest RMSE: 21844.86762372526
Gradient Boosting RMSE: 19848.438238053142


In [15]:
# Apply the linear model to every suburb (training and held-out rows alike)
# and store predicted minus observed 2023 population.
lr_input_columns = [
    'Estimated_population_2018',
    'Estimated_population_2019',
    'Estimated_population_2020',
    'Estimated_population_2021',
    'Estimated_population_2022',
]
lr_all_predictions = lr_model.predict(pivot_df[lr_input_columns])
pivot_df['prediction_difference'] = lr_all_predictions - pivot_df['Estimated_population_2023']

In [17]:
# Mean absolute prediction difference across every row of pivot_df.
absolute_differences = pivot_df['prediction_difference'].abs()
absolute_differences.mean()

np.float64(167.64443669816353)