### Import Library

In [284]:
import os
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings("ignore")

In [285]:
# read yearly income csv
yearly_income = pd.read_csv('../data/curated/pop_income_postcode.csv', index_col=0)

# Keep only the specified columns
yearly_income = yearly_income[['SA2', 'SA2 name', '2016-17', '2017-18', '2018-19', '2019-20', '2020-21', 'average_income', 'income_growth_rate', 'affluence_index', 'postcode']]

In [286]:
yearly_income

Unnamed: 0,SA2,SA2 name,2016-17,2017-18,2018-19,2019-20,2020-21,average_income,income_growth_rate,affluence_index,postcode
0,201011001,Alfredton,50596.0,52448.0,53932.0,55204.0,58036.0,54043.2,0.147047,37830.284114,3350.0
1,201011002,Ballarat,50093.0,51736.0,53688.0,53784.0,55998.0,53059.8,0.117881,37141.895364,3354.0
2,201011005,Buninyong,48877.0,51034.0,52377.0,54308.0,56408.0,52600.8,0.154081,36820.606224,3357.0
3,201011006,Delacombe,46176.0,47759.0,49909.0,51915.0,52792.0,49710.2,0.143278,34797.182983,3352.0
4,201011006,Delacombe,46176.0,47759.0,49909.0,51915.0,52792.0,49710.2,0.143278,34797.182983,3358.0
...,...,...,...,...,...,...,...,...,...,...,...
984,217041478,Moyne - West,41889.0,43136.0,45256.0,46843.0,50360.0,45496.8,0.202225,31847.820667,3287.0
985,217041478,Moyne - West,41889.0,43136.0,45256.0,46843.0,50360.0,45496.8,0.202225,31847.820667,3286.0
986,217041478,Moyne - West,41889.0,43136.0,45256.0,46843.0,50360.0,45496.8,0.202225,31847.820667,3289.0
987,217041479,Warrnambool - North,44509.0,46945.0,48855.0,50654.0,52518.0,48696.2,0.179941,34087.393982,3280.0


In [287]:
from statsmodels.tsa.arima.model import ARIMA

In [288]:
# Extract only income columns for prediction
income_columns = ['2016-17', '2017-18', '2018-19', '2019-20', '2020-21']
suburbs = yearly_income[['SA2']]  # Keeping track of suburb names

# Function to fit ARIMA model and predict future 3 years income
def predict_future_income(income_series, forecast_years=3):
    # Handle NaN by filling with the median 
    income_series = income_series.fillna(income_series.median())

    # Fit ARIMA model
    model = ARIMA(income_series, order=(2, 1, 2))
    model_fit = model.fit()

    # Forecast the next 3 years
    forecast = model_fit.forecast(steps=forecast_years)
    
    return list(forecast)

# Predicting future income for each suburb and storing the results
for idx, row in yearly_income.iterrows():

    past_income = row[income_columns].astype(float)  # Convert income data to float for ARIMA model
    # Get future income predictions
    future_income = predict_future_income(past_income, forecast_years=3)
    # Adding predicted future income to the dataset
    yearly_income.at[idx, '2021-22(pred)'] =  future_income[0]
    yearly_income.at[idx, '2022-23(pred)'] =  future_income[1]
    yearly_income.at[idx, '2023-24(pred)'] =  future_income[2]

In [289]:
yearly_income = yearly_income.dropna(axis='rows')
yearly_income

Unnamed: 0,SA2,SA2 name,2016-17,2017-18,2018-19,2019-20,2020-21,average_income,income_growth_rate,affluence_index,postcode,2021-22(pred),2022-23(pred),2023-24(pred)
0,201011001,Alfredton,50596.0,52448.0,53932.0,55204.0,58036.0,54043.2,0.147047,37830.284114,3350.0,59598.456960,61736.679369,63299.471674
1,201011002,Ballarat,50093.0,51736.0,53688.0,53784.0,55998.0,53059.8,0.117881,37141.895364,3354.0,55849.340973,57143.146334,56690.672918
2,201011005,Buninyong,48877.0,51034.0,52377.0,54308.0,56408.0,52600.8,0.154081,36820.606224,3357.0,58390.032469,60078.161740,61977.834711
3,201011006,Delacombe,46176.0,47759.0,49909.0,51915.0,52792.0,49710.2,0.143278,34797.182983,3352.0,53340.484044,53479.345721,53435.657234
4,201011006,Delacombe,46176.0,47759.0,49909.0,51915.0,52792.0,49710.2,0.143278,34797.182983,3358.0,53340.484044,53479.345721,53435.657234
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
984,217041478,Moyne - West,41889.0,43136.0,45256.0,46843.0,50360.0,45496.8,0.202225,31847.820667,3287.0,51787.538934,54570.351073,56003.012955
985,217041478,Moyne - West,41889.0,43136.0,45256.0,46843.0,50360.0,45496.8,0.202225,31847.820667,3286.0,51787.538934,54570.351073,56003.012955
986,217041478,Moyne - West,41889.0,43136.0,45256.0,46843.0,50360.0,45496.8,0.202225,31847.820667,3289.0,51787.538934,54570.351073,56003.012955
987,217041479,Warrnambool - North,44509.0,46945.0,48855.0,50654.0,52518.0,48696.2,0.179941,34087.393982,3280.0,54318.128580,56066.440822,57780.848537


In [290]:
# Group by postcode and calculate the median predicted income for each postcode
yearly_income_grouped = yearly_income.groupby('postcode')['2023-24(pred)'].median().reset_index()

yearly_income_grouped.rename(columns={'2023-24(pred)': 'median income of 2023-24'}, inplace=True)

# Merge the result as a new median income column into the original dataframe
yearly_income = pd.merge(yearly_income, yearly_income_grouped, on='postcode')

yearly_income

Unnamed: 0,SA2,SA2 name,2016-17,2017-18,2018-19,2019-20,2020-21,average_income,income_growth_rate,affluence_index,postcode,2021-22(pred),2022-23(pred),2023-24(pred),median income of 2023-24
0,201011001,Alfredton,50596.0,52448.0,53932.0,55204.0,58036.0,54043.2,0.147047,37830.284114,3350.0,59598.456960,61736.679369,63299.471674,57507.352645
1,201011483,Canadian - Mount Clear,43944.0,46025.0,47887.0,48536.0,50350.0,47348.4,0.145776,33143.923733,3350.0,50691.987185,51716.464491,51715.233615,57507.352645
2,201011002,Ballarat,50093.0,51736.0,53688.0,53784.0,55998.0,53059.8,0.117881,37141.895364,3354.0,55849.340973,57143.146334,56690.672918,56690.672918
3,201011005,Buninyong,48877.0,51034.0,52377.0,54308.0,56408.0,52600.8,0.154081,36820.606224,3357.0,58390.032469,60078.161740,61977.834711,61977.834711
4,201011006,Delacombe,46176.0,47759.0,49909.0,51915.0,52792.0,49710.2,0.143278,34797.182983,3352.0,53340.484044,53479.345721,53435.657234,52503.896743
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
890,217041478,Moyne - West,41889.0,43136.0,45256.0,46843.0,50360.0,45496.8,0.202225,31847.820667,3304.0,51787.538934,54570.351073,56003.012955,56003.012955
891,217041478,Moyne - West,41889.0,43136.0,45256.0,46843.0,50360.0,45496.8,0.202225,31847.820667,3301.0,51787.538934,54570.351073,56003.012955,56003.012955
892,217041478,Moyne - West,41889.0,43136.0,45256.0,46843.0,50360.0,45496.8,0.202225,31847.820667,3287.0,51787.538934,54570.351073,56003.012955,56003.012955
893,217041479,Warrnambool - North,44509.0,46945.0,48855.0,50654.0,52518.0,48696.2,0.179941,34087.393982,3280.0,54318.128580,56066.440822,57780.848537,57780.848537


In [291]:
# select the column we need
yearly_income_selected = pd.DataFrame(yearly_income, columns=["postcode", "SA2", "median income of 2023-24"])

In [292]:
yearly_income_selected

Unnamed: 0,postcode,SA2,median income of 2023-24
0,3350.0,201011001,57507.352645
1,3350.0,201011483,57507.352645
2,3354.0,201011002,56690.672918
3,3357.0,201011005,61977.834711
4,3352.0,201011006,52503.896743
...,...,...,...
890,3304.0,217041478,56003.012955
891,3301.0,217041478,56003.012955
892,3287.0,217041478,56003.012955
893,3280.0,217041479,57780.848537


In [293]:
yearly_income_selected['postcode'] = yearly_income_selected['postcode'].astype('int64')
yearly_income_selected

Unnamed: 0,postcode,SA2,median income of 2023-24
0,3350,201011001,57507.352645
1,3350,201011483,57507.352645
2,3354,201011002,56690.672918
3,3357,201011005,61977.834711
4,3352,201011006,52503.896743
...,...,...,...
890,3304,217041478,56003.012955
891,3301,217041478,56003.012955
892,3287,217041478,56003.012955
893,3280,217041479,57780.848537


In [294]:
# preprocess past rent data
rent_data = pd.read_csv("../data/curated/rent_cleaned.csv", index_col=0)
rent_data_selected = pd.DataFrame(rent_data, columns=["suburb", "postcode", "latitude", "longitude", "price_per_week"])
rent_data_selected

Unnamed: 0,suburb,postcode,latitude,longitude,price_per_week
0,Mount Martha,3934,-38.268818,145.014034,1600.0
1,Bentleigh East,3165,-37.909091,145.055152,780.0
2,Ferntree Gully,3156,-37.890452,145.267922,585.0
3,Frankston North,3200,-38.115353,145.163260,475.0
4,Camberwell,3124,-37.840108,145.094830,590.0
...,...,...,...,...,...
2957,Melbourne,3000,-37.809580,144.960120,680.0
2958,Melbourne,3000,-37.810002,144.958818,800.0
2959,Capel Sound,3940,-38.372306,144.884492,500.0
2960,Bendigo,3550,-36.759161,144.267363,560.0


In [295]:
rent_data_selected.isnull().sum()

suburb            0
postcode          0
latitude          0
longitude         0
price_per_week    0
dtype: int64

In [296]:
rent_data_selected = rent_data_selected.groupby(['postcode', 'suburb', 'latitude', 'longitude'])['price_per_week'].median().reset_index()
rent_data_selected.rename(columns={'price_per_week': 'median rent per week of 2023-24'}, inplace=True)
rent_data_selected

Unnamed: 0,postcode,suburb,latitude,longitude,median rent per week of 2023-24
0,3000,Melbourne,-37.851214,144.980093,670.0
1,3000,Melbourne,-37.845242,144.979642,700.0
2,3000,Melbourne,-37.843986,144.976371,495.0
3,3000,Melbourne,-37.842589,144.977041,480.0
4,3000,Melbourne,-37.841987,144.975566,550.0
...,...,...,...,...,...
2710,3984,Pioneer Bay,-38.379923,145.548373,350.0
2711,3995,Kilcunda,-38.545338,145.459290,440.0
2712,3995,North Wonthaggi,-38.583300,145.595902,540.0
2713,3995,Wonthaggi,-38.618064,145.588395,385.0


In [297]:
# Use the weekly rent price to get the total annual rent price
rent_data_selected['median rent per week of 2023-24'] = rent_data_selected['median rent per week of 2023-24']*52
rent_data_selected

Unnamed: 0,postcode,suburb,latitude,longitude,median rent per week of 2023-24
0,3000,Melbourne,-37.851214,144.980093,34840.0
1,3000,Melbourne,-37.845242,144.979642,36400.0
2,3000,Melbourne,-37.843986,144.976371,25740.0
3,3000,Melbourne,-37.842589,144.977041,24960.0
4,3000,Melbourne,-37.841987,144.975566,28600.0
...,...,...,...,...,...
2710,3984,Pioneer Bay,-38.379923,145.548373,18200.0
2711,3995,Kilcunda,-38.545338,145.459290,22880.0
2712,3995,North Wonthaggi,-38.583300,145.595902,28080.0
2713,3995,Wonthaggi,-38.618064,145.588395,20020.0


In [306]:
combined_data = pd.merge(yearly_income_selected, rent_data_selected, on=['postcode'], how='inner')
combined_data = combined_data[['postcode', 'SA2', 'suburb', 'latitude', 'longitude', 'median income of 2023-24', 'median rent per week of 2023-24']]
combined_data

Unnamed: 0,postcode,SA2,suburb,latitude,longitude,median income of 2023-24,median rent per week of 2023-24
0,3350,201011001,Alfredton,-37.566019,143.785460,57507.352645,24960.0
1,3350,201011001,Alfredton,-37.565994,143.792568,57507.352645,25480.0
2,3350,201011001,Alfredton,-37.565575,143.798101,57507.352645,23660.0
3,3350,201011001,Alfredton,-37.564730,143.799695,57507.352645,28080.0
4,3350,201011001,Alfredton,-37.561417,143.817393,57507.352645,21840.0
...,...,...,...,...,...,...,...
4052,3280,217041479,Warrnambool,-38.373465,142.506210,57780.848537,24960.0
4053,3280,217041479,Warrnambool,-38.364072,142.513889,57780.848537,36400.0
4054,3280,217041479,Warrnambool,-38.359425,142.512095,57780.848537,33800.0
4055,3280,217041479,Warrnambool,-38.357031,142.467634,57780.848537,28080.0


In [307]:
combined_data.isnull().sum()

postcode                           0
SA2                                0
suburb                             0
latitude                           0
longitude                          0
median income of 2023-24           0
median rent per week of 2023-24    0
dtype: int64

In [308]:
combined_data['affordable index of 2023-24'] = combined_data['median income of 2023-24'] / combined_data['median rent per week of 2023-24']
combined_data

Unnamed: 0,postcode,SA2,suburb,latitude,longitude,median income of 2023-24,median rent per week of 2023-24,affordable index of 2023-24
0,3350,201011001,Alfredton,-37.566019,143.785460,57507.352645,24960.0,2.303980
1,3350,201011001,Alfredton,-37.565994,143.792568,57507.352645,25480.0,2.256960
2,3350,201011001,Alfredton,-37.565575,143.798101,57507.352645,23660.0,2.430573
3,3350,201011001,Alfredton,-37.564730,143.799695,57507.352645,28080.0,2.047983
4,3350,201011001,Alfredton,-37.561417,143.817393,57507.352645,21840.0,2.633121
...,...,...,...,...,...,...,...,...
4052,3280,217041479,Warrnambool,-38.373465,142.506210,57780.848537,24960.0,2.314938
4053,3280,217041479,Warrnambool,-38.364072,142.513889,57780.848537,36400.0,1.587386
4054,3280,217041479,Warrnambool,-38.359425,142.512095,57780.848537,33800.0,1.709493
4055,3280,217041479,Warrnambool,-38.357031,142.467634,57780.848537,28080.0,2.057723


In [309]:
combined_data = combined_data.drop_duplicates()
combined_data

Unnamed: 0,postcode,SA2,suburb,latitude,longitude,median income of 2023-24,median rent per week of 2023-24,affordable index of 2023-24
0,3350,201011001,Alfredton,-37.566019,143.785460,57507.352645,24960.0,2.303980
1,3350,201011001,Alfredton,-37.565994,143.792568,57507.352645,25480.0,2.256960
2,3350,201011001,Alfredton,-37.565575,143.798101,57507.352645,23660.0,2.430573
3,3350,201011001,Alfredton,-37.564730,143.799695,57507.352645,28080.0,2.047983
4,3350,201011001,Alfredton,-37.561417,143.817393,57507.352645,21840.0,2.633121
...,...,...,...,...,...,...,...,...
4052,3280,217041479,Warrnambool,-38.373465,142.506210,57780.848537,24960.0,2.314938
4053,3280,217041479,Warrnambool,-38.364072,142.513889,57780.848537,36400.0,1.587386
4054,3280,217041479,Warrnambool,-38.359425,142.512095,57780.848537,33800.0,1.709493
4055,3280,217041479,Warrnambool,-38.357031,142.467634,57780.848537,28080.0,2.057723


In [310]:
# save data to csv

combined_data.to_csv("../data/curated/rent_income_2023-24.csv")

In [311]:
# Sort by affordable index by 
top_affordable_index = combined_data.sort_values(by='affordable index of 2023-24', ascending=False)

# Find top 10 areas with affordable indexs
top_affordable_index.iloc[:10,:]

Unnamed: 0,postcode,SA2,suburb,latitude,longitude,median income of 2023-24,median rent per week of 2023-24,affordable index of 2023-24
621,3212,203021043,Lara,-38.013475,144.401553,60759.903158,10400.0,5.842298
2680,3981,212011548,Koo Wee Rup,-38.181543,145.469598,60515.983555,10400.0,5.818845
1324,3000,206041119,Melbourne,-37.812024,144.969996,88862.163133,16120.0,5.512541
988,3058,206011497,Coburg North,-37.721361,144.958501,62347.983754,11440.0,5.449999
994,3058,210031236,Coburg North,-37.721361,144.958501,62347.983754,11440.0,5.449999
2260,3081,209011199,Heidelberg Heights,-37.732063,145.060126,62008.956946,11440.0,5.420363
1678,3181,206061136,Windsor,-37.853356,144.99253,81175.176456,15080.0,5.382969
782,3690,204031492,Wodonga,-36.128816,146.863823,55626.836071,10400.0,5.348734
1670,3181,206061136,Prahran,-37.848507,144.990328,81175.176456,15990.0,5.076621
3955,3396,215011394,Hopetoun,-35.733631,142.366467,52549.283369,10400.0,5.052816


In [312]:
duplicates = top_affordable_index[top_affordable_index.duplicated()]
print(duplicates)

Empty DataFrame
Columns: [postcode, SA2, suburb, latitude, longitude, median income of 2023-24, median rent per week of 2023-24, affordable index of 2023-24]
Index: []


In [313]:
import folium

# Create the base map, setting the center location
m = folium.Map(location=[-37.8136, 144.9631], zoom_start=10)  # Melbourne city center, adjust as needed

# Loop through the top 10 most affordable areas
for idx, row in top_affordable_index.iloc[:10].iterrows():
    # Create a marker; location parameter can be latitude and longitude for each area
    # Assume that you have latitude and longitude coordinates for each suburb or postcode
    folium.Marker(
        location=[row['latitude'], row['longitude']],  # Use coordinates based on the postcode
        popup=f"{row['suburb']} (Affordable Index: {row['affordable index of 2023-24']:.2f})",
    ).add_to(m)

# Display the map
m