## Preprocess Income Dataset 

### Import Libraries & Data 

In [248]:
# need to make script to scrape correspondence file
# https://www.abs.gov.au/statistics/standards/australian-statistical-geography-standard-asgs-edition-3/jul2021-jun2026/access-and-downloads/correspondences/CG_SA2_2016_SA2_2021.csv

In [249]:
import warnings
import regex as re
import pandas as pd

warnings.filterwarnings('ignore')

In [250]:
# Household Income 
income_2016 = pd.read_csv("../data/landing/2016Census_G29_VIC_SA2.csv")
income_2021 = pd.read_csv("../data/landing/2021Census_G33_VIC_SA2.csv")

# SA2 Codes and Suburb Names 
sa2_2016 = pd.read_csv("../data/landing/SA2_codes_2016.csv")
sa2_2021 = pd.read_excel("../data/landing/SA2_codes_2021.xlsx", sheet_name=0)

correspondence = pd.read_csv('../data/landing/CG_SA2_2016_SA2_2021.csv')

### Preprocess Income Data

In [251]:
# Check Schema and Null Values 
# print(income_2016.info())
# print(income_2021.info())

# Make column names lowercase
income_2016.columns = income_2016.columns.str.lower()
income_2021.columns = income_2021.columns.str.lower()

# Make the column names cohesive
income_2016 = income_2016.rename(columns={
    "sa2_maincode_2016": "sa2_code_2016"
})

# Only retrieve the total household income columns 
income_2016 = income_2016[['sa2_code_2016'] + income_2016.filter(regex='_tot$').columns.tolist()]
income_2021 = income_2021[['sa2_code_2021'] + income_2021.filter(regex='_tot$').columns.tolist()]

# Drop the last 3 columns as they are not relevant 
income_2016 = income_2016.iloc[:, :-3]
income_2021 = income_2021.iloc[:, :-3]

# Drop the second column as it is not relevant 
income_2016 = income_2016.drop(income_2016.columns[1], axis=1)
income_2021 = income_2021.drop(income_2021.columns[1], axis=1)

### Preprocess the SA2 Codes and Suburb Names Dataframes

In [252]:
# Make column names lowercase
sa2_2016.columns = sa2_2016.columns.str.lower()
sa2_2021.columns = sa2_2021.columns.str.lower()

# Make all string columns lower case
sa2_2016 = sa2_2016.applymap(lambda x: x.lower() if isinstance(x, str) else x)
sa2_2021 = sa2_2021.applymap(lambda x: x.lower() if isinstance(x, str) else x)

# Only retrieve the SA2 codes and suburb names 
sa2_2016 = sa2_2016[["sa2_maincode_2016", "sa2_name_2016"]]
sa2_2021 = sa2_2021[["asgs_structure", "census_code_2021", "census_name_2021"]]

# For 2021, filter only for SA2 codes and names 
sa2_2021 = sa2_2021[
    (sa2_2021['asgs_structure'] == 'sa2') & 
    (sa2_2021['census_code_2021'].astype(str).str.startswith('2'))
]

# Clean the column names and dataframe 
sa2_2016 = sa2_2016.rename(columns={
    "sa2_maincode_2016": "sa2_code_2016"
})

sa2_2021 = sa2_2021.rename(columns={
    "census_code_2021": "sa2_code_2021",
    "census_name_2021": "sa2_name_2021"
}).drop(columns=["asgs_structure"])

# Exlcude the (vic.) that some rows have in sa2_name 
sa2_2016['sa2_name_2016'] = sa2_2016['sa2_name_2016'].str.replace(r'\s*\(vic\.\)', '', regex=True)
sa2_2021['sa2_name_2021'] = sa2_2021['sa2_name_2021'].str.replace(r'\s*\(vic\.\)', '', regex=True)

# Check Schema 
# print(sa2_2016.info())
# print(sa2_2021.info())

# Convert code columns to int64 and name columns to string 
sa2_2016['sa2_code_2016'] = sa2_2016['sa2_code_2016'].astype('int64')
sa2_2016['sa2_name_2016'] = sa2_2016['sa2_name_2016'].astype('str')

sa2_2021['sa2_code_2021'] = sa2_2021['sa2_code_2021'].astype('int64')
sa2_2021['sa2_name_2021'] = sa2_2021['sa2_name_2021'].astype('str')

### Map SA2 Codes in the Income Dataset to the Suburb Names

In [253]:
# Merge income datasets with sa2 datasets to replace the SA2 code with the SA2 name
income_2016 = income_2016.merge(sa2_2016, left_on='sa2_code_2016', right_on='sa2_code_2016', how='inner')
income_2016['sa2_code_2016'] = income_2016['sa2_name_2016']  
income_2016 = income_2016.drop(['sa2_code_2016'], axis=1)  

# Reorder to place sa2_name_2016 as the first column
income_2016 = income_2016[['sa2_name_2016'] + [col for col in income_2016.columns if col != 'sa2_name_2016']]

# Do the same for 2021 income dataset
income_2021 = income_2021.merge(sa2_2021, left_on='sa2_code_2021', right_on='sa2_code_2021', how='inner')
income_2021['sa2_code_2021'] = income_2021['sa2_name_2021']  
income_2021 = income_2021.drop(['sa2_code_2021'], axis=1)  

income_2021 = income_2021[['sa2_name_2021'] + [col for col in income_2021.columns if col != 'sa2_name_2021']]

### Correspondence

This is where I thought of doing the mapping for 2016 sa2 name to 2021 sa2 name

In [254]:
correspondence = pd.read_csv('../data/landing/CG_SA2_2016_SA2_2021.csv')

correspondence['SA2_NAME_2016'] = correspondence['SA2_NAME_2016'].str.lower()
correspondence['SA2_NAME_2021'] = correspondence['SA2_NAME_2021'].str.lower()

correspondence = correspondence[~correspondence['SA2_NAME_2016'].isna() & ~correspondence['SA2_NAME_2021'].isna()]

# Only get data for Victoria
correspondence = correspondence[correspondence['SA2_CODE_2021'].str.startswith('2')]

### Summing and Removing Directional Modifiers (Saleha's Preprocessing Code)

In [255]:
# Define directional modifiers and the word 'surrounds' to be removed
directional_modifiers = [' - east', ' - west', ' - north', ' - south', ' - central', ' surrounds', ' (north)', ' (south)', ' (east)', ' (west)', ' region']
pattern = '|'.join([re.escape(suffix) for suffix in directional_modifiers])

income_2016['sa2_name_2016'] = income_2016['sa2_name_2016'].str.replace(pattern, '', regex=True)
income_2021['sa2_name_2021'] = income_2021['sa2_name_2021'].str.replace(pattern, '', regex=True)

# Split sa2_name where multiple names are separated by hyphens
income_2016['sa2_name_2016'] = income_2016['sa2_name_2016'].str.split(' - ')
income_2021['sa2_name_2021'] = income_2021['sa2_name_2021'].str.split(' - ')

# Explode the lists into separate rows
income_2016_exploded = income_2016.explode('sa2_name_2016')
income_2016_exploded = income_2016_exploded.reset_index(drop=True)

income_2021_exploded = income_2021.explode('sa2_name_2021')
income_2021_exploded = income_2021_exploded.reset_index(drop=True)

# Mapping for the SA2 names to the correct suburbs
sa2_name_mapping = {
    'ballarat' : 'ballarat central',
    'flemington racecourse' : 'flemington',
    'southbank wharf' : 'south wharf',
    'port melbourne industrial' : 'port melbourne',
    'reservoir east' : 'reservoir',
    'reservoir west' : 'reservoir',
    'research warrandyte' : 'warrandyte',
    'essendon airport' : 'essendon',
    'gladstone parkmeadows' : 'gladstone park',
    'craigieburn west' : 'craigieburn',
    'wandin' : 'wandin north',
    'pakenham east' : 'pakenham',
    'pakenham west' : 'pakenham',
    'narre warren west' : 'narre warren',
    'berwick east' : 'berwick',
    'berwick west' : 'berwick',
    'point cook east' : 'point cook',
    'point cook west' : 'point cook',
    'truganina east' : 'truganina',
    'truganina west' : 'truganina',
    'melbourne cbd' : 'melbourne'
}

# Remove the "(vic.)" from sa2_name values
income_2016_exploded['sa2_name_2016'] = income_2016_exploded['sa2_name_2016'].str.replace(r'\s*\(vic\.\)', '', regex=True)
income_2016_exploded['sa2_name_2016'] = income_2016_exploded['sa2_name_2016'].replace(sa2_name_mapping)

income_2021_exploded['sa2_name_2021'] = income_2021_exploded['sa2_name_2021'].str.replace(r'\s*\(vic\.\)', '', regex=True)
income_2021_exploded['sa2_name_2021'] = income_2021_exploded['sa2_name_2021'].replace(sa2_name_mapping)

In [256]:
tot_cols = income_2016.columns[income_2016.columns.str.contains('tot')]

income_2016_exploded[tot_cols] = income_2016_exploded[tot_cols].astype('int')
income_2021_exploded[tot_cols] = income_2021_exploded[tot_cols].astype('int')

# Create the aggregation dictionary
aggregation_functions = {col: 'sum' for col in tot_cols}

# Apply the groupby and aggregation
income_2016_grouped = income_2016_exploded.groupby('sa2_name_2016').agg(aggregation_functions).reset_index()
income_2021_grouped = income_2021_exploded.groupby('sa2_name_2021').agg(aggregation_functions).reset_index()

In [257]:
# List of suburbs that are in 2016 but not in 2021
# list(income_2021_grouped[~income_2021_grouped['suburb'].isin(income_2016_grouped['suburb'])]['suburb'].unique())

# Is exactly the same as thr 3rd code chunk under Predictions

In [258]:
# Rename SA2 name column to be suburb
income_2016_grouped = income_2016_grouped.rename(columns={
    "sa2_name_2016": "suburb"
})

income_2021_grouped = income_2021_grouped.rename(columns={
    "sa2_name_2021": "suburb"
})

# Check dataframes that there are no suburb names with '-'
# print(income_2016_grouped[income_2016_grouped['suburb'].str.contains(r'-', na=False)])
# print(income_2021_grouped[income_2021_grouped['suburb'].str.contains(r'-', na=False)])

# Check both dataframes have the same column names 
print(income_2016_grouped.columns.equals(income_2021_grouped.columns))

# add a tag for the year to each column
income_2016_grouped = income_2016_grouped.rename(columns={col: f"{col}_2016" if col != 'suburb' else col for col in income_2016_grouped.columns})
income_2021_grouped = income_2021_grouped.rename(columns={col: f"{col}_2021" if col != 'suburb' else col for col in income_2021_grouped.columns})

# Check dataframe dimensions
print(income_2016_grouped.shape)
print(income_2021_grouped.shape)

# View final dataframes 
income_2021_grouped.head()

### Save Dataframes

In [261]:
income_2016_grouped.to_csv('../data/curated/income_2016.csv', index=False)
income_2021_grouped.to_csv('../data/curated/income_2021.csv', index=False)

print("All Saved!")

All Saved!


## Predictions

In [262]:
income_2016 = pd.read_csv('../data/curated/income_2016.csv')
income_2021 = pd.read_csv('../data/curated/income_2021.csv')

In [263]:
# Assuming the columns are structured similarly
df_2016 = income_2016.set_index('suburb')
df_2021 = income_2021.set_index('suburb')

# Rename columns for merging clarity
df_2016.columns = df_2016.columns.str.replace('_2016', '')
df_2021.columns = df_2021.columns.str.replace('_2021', '')

# Create a combined DataFrame for interpolation and prediction
df_combined = pd.concat([df_2016.assign(year=2016), df_2021.assign(year=2021)], axis=0).reset_index()

# Set multi-index with year and suburb
#df_combined = df_combined.set_index(['suburb', 'year'])

df_combined

Unnamed: 0,suburb,hi_1_149_tot,hi_150_299_tot,hi_300_399_tot,hi_400_499_tot,hi_500_649_tot,hi_650_799_tot,hi_800_999_tot,hi_1000_1249_tot,hi_1250_1499_tot,hi_1500_1749_tot,hi_1750_1999_tot,hi_2000_2499_tot,hi_2500_2999_tot,hi_3000_3499_tot,hi_3500_3999_tot,hi_4000_more_tot,year
0,abbotsford,27,72,68,114,95,120,175,187,239,210,223,502,255,216,238,407,2016
1,aberfeldie,70,134,213,461,345,528,607,759,719,618,555,1150,713,504,556,1294,2016
2,airport west,24,56,90,200,120,250,198,243,251,186,150,316,220,135,100,127,2016
3,albert park,54,94,129,206,160,253,267,334,355,326,302,700,346,270,650,1198,2016
4,albion,40,95,124,281,168,231,236,249,246,170,131,243,156,76,46,54,2016
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
994,yarraville,40,53,87,240,134,206,224,269,289,287,302,656,424,470,367,1587,2021
995,yarrawonga,23,45,85,324,195,360,312,282,283,173,135,321,196,136,73,170,2021
996,yarriambiack,23,49,103,316,164,260,204,232,203,135,148,239,131,84,44,116,2021
997,yea,15,32,60,140,80,139,128,137,118,87,82,155,94,62,37,106,2021


In [245]:
# Step 1: Identify suburbs that don't have a corresponding entry for the year 2016
suburbs_without_2016 = df_combined.loc[df_combined['year'] == 2016, 'suburb'].unique()

# Step 2: Filter out rows where the suburb does not have the year 2016
df_no_2016_suburb = df_combined[~df_combined['suburb'].isin(suburbs_without_2016)]

# Suburbs that are in 2021 but not in 2016
df_no_2016_suburb['suburb'].unique()

array(['armstrong creek', 'avondale heights', 'ballarat east',
       'ballarat north', 'baranduda', 'brookfield', 'canadian',
       'charlemont', 'cheltenham ', 'clifton hill', 'clyde north',
       'cobblebank', 'cremorne', 'diggers rest', 'exford', 'eynesbury',
       'fraser rise', 'industrial', 'invermay', 'kialla', 'kurunjang',
       'leneva', 'lovely banks', 'manor lakes', 'migratory',
       'mount clear', 'mount duneed', 'no usual address', 'notting hill',
       'oak park', 'offshore', 'plumpton', 'quandong', 'redan',
       'residential', 'royal botanic gardens victoria', 'sebastopol',
       'shepparton east', 'shipping', 'south wharf', 'strathtulloh',
       'toolern vale', 'warrenheip', 'weir views'], dtype=object)

In [None]:
import numpy as np 

all_years = np.arange(2016, 2022)

# Ensure each suburb has rows for every year (including the missing years)
df_complete = (
    df_combined
    .groupby('suburb')
    .apply(lambda group: group.set_index('year').reindex(all_years).ffill().bfill().reset_index())
    .reset_index(drop=True)
)

columns_with_tot = [col for col in df_complete.columns if 'tot' in col]

# Set the values to NaN for the years 2017 to 2020 for these columns
df_complete.loc[df_complete['year'].between(2017, 2020), columns_with_tot] = np.nan

# df_complete.head(20)

In [242]:
# Fill in the missing values for 2017 - 2020
df_interpolated = df_complete.groupby('suburb').apply(lambda x: x.interpolate(method='linear')).reset_index(drop=True)

# Make them integers since they're counts
df_interpolated[columns_with_tot] = df_interpolated[columns_with_tot].astype(int)

In [269]:
# Note that for suburbs that did not exist in 2016, their 2017-2020 values == 2021 value
# For an example, try uncommenting the line below
# df_interpolated[df_interpolated['suburb'] == 'armstrong creek']

In [277]:
import pandas as pd
from statsmodels.tsa.arima.model import ARIMA

# Step 1: Ensure the DataFrame is sorted by suburb and year
df_interpolated = df_interpolated.sort_values(by=['suburb', 'year'])

# Step 2: Define a function to fit ARIMA and make predictions
def predict_arima(suburb_df, column, order=(1,1,1), start_year=2022, end_year=2027):
    # Fit ARIMA model
    model = ARIMA(suburb_df[column], order=order)
    model_fit = model.fit()

    # Predict values for the future years (2022 to 2027)
    future_years = list(range(start_year, end_year + 1))
    forecast = model_fit.predict(start=len(suburb_df), end=len(suburb_df) + len(future_years) - 1)
    
    return pd.Series(forecast, index=future_years)

# Step 3: Initialize an empty DataFrame to store the results
predicted_data = pd.DataFrame()

# Step 4: Loop over each suburb and each income band column
for suburb in df_interpolated['suburb'].unique():
    suburb_df = df_interpolated[df_interpolated['suburb'] == suburb].set_index('year')
    
    for column in df_interpolated.columns:
        if 'tot' in column and column != 'year':  # Only apply to income band columns
            predictions = predict_arima(suburb_df, column)
            
            # Store the predictions in a new DataFrame with the suburb and year information
            temp_df = pd.DataFrame({column: predictions})
            temp_df['suburb'] = suburb
            temp_df['year'] = temp_df.index
            
            # Append the results to the main DataFrame
            predicted_data = pd.concat([predicted_data, temp_df], axis=0)

# Step 5: Combine the original and predicted data
final_df = pd.concat([df_interpolated, predicted_data]).sort_values(by=['suburb', 'year'])

# Reset index for better readability
final_df = final_df.reset_index(drop=True)

# Display final DataFrame with predictions
final_df.head()


  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  return get_prediction_index(
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  return get_prediction_index(
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  return get_prediction_index(
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  return get_prediction_index(
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  return get_prediction_index(
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  return get_prediction_index(
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  return get_prediction_index(
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  return get_pr

Unnamed: 0,year,suburb,hi_1_149_tot,hi_150_299_tot,hi_300_399_tot,hi_400_499_tot,hi_500_649_tot,hi_650_799_tot,hi_800_999_tot,hi_1000_1249_tot,hi_1250_1499_tot,hi_1500_1749_tot,hi_1750_1999_tot,hi_2000_2499_tot,hi_2500_2999_tot,hi_3000_3499_tot,hi_3500_3999_tot,hi_4000_more_tot
0,2016,abbotsford,27.0,72.0,68.0,114.0,95.0,120.0,175.0,187.0,239.0,210.0,223.0,502.0,255.0,216.0,238.0,407.0
1,2017,abbotsford,27.0,66.0,70.0,115.0,104.0,118.0,172.0,201.0,252.0,218.0,233.0,536.0,263.0,239.0,227.0,503.0
2,2018,abbotsford,27.0,60.0,73.0,116.0,113.0,117.0,170.0,215.0,266.0,227.0,243.0,571.0,271.0,262.0,217.0,600.0
3,2019,abbotsford,27.0,54.0,76.0,117.0,122.0,115.0,168.0,230.0,280.0,235.0,253.0,605.0,279.0,285.0,206.0,697.0
4,2020,abbotsford,27.0,48.0,79.0,118.0,131.0,114.0,166.0,244.0,294.0,244.0,263.0,640.0,287.0,308.0,196.0,794.0


In [280]:
final_df[final_df['suburb'] == 'melbourne'].head(20)

Unnamed: 0,year,suburb,hi_1_149_tot,hi_150_299_tot,hi_300_399_tot,hi_400_499_tot,hi_500_649_tot,hi_650_799_tot,hi_800_999_tot,hi_1000_1249_tot,hi_1250_1499_tot,hi_1500_1749_tot,hi_1750_1999_tot,hi_2000_2499_tot,hi_2500_2999_tot,hi_3000_3499_tot,hi_3500_3999_tot,hi_4000_more_tot
29172,2016,melbourne,438.0,592.0,446.0,522.0,617.0,765.0,950.0,1268.0,1093.0,901.0,768.0,1352.0,567.0,363.0,541.0,626.0
29173,2017,melbourne,420.0,588.0,492.0,563.0,691.0,838.0,1061.0,1411.0,1242.0,1016.0,875.0,1584.0,635.0,468.0,514.0,788.0
29174,2018,melbourne,402.0,585.0,539.0,605.0,765.0,911.0,1173.0,1555.0,1391.0,1131.0,982.0,1817.0,703.0,574.0,488.0,951.0
29175,2019,melbourne,385.0,581.0,586.0,647.0,840.0,984.0,1284.0,1698.0,1540.0,1246.0,1090.0,2049.0,771.0,680.0,462.0,1114.0
29176,2020,melbourne,367.0,578.0,633.0,689.0,914.0,1057.0,1396.0,1842.0,1689.0,1361.0,1197.0,2282.0,839.0,786.0,436.0,1277.0
29177,2021,melbourne,350.0,575.0,680.0,731.0,989.0,1131.0,1508.0,1986.0,1839.0,1476.0,1305.0,2515.0,907.0,892.0,410.0,1440.0
29178,2022,melbourne,,,,,,,,,,,,,,,,
29179,2022,melbourne,,,,,,,,,,,,,,,,
29180,2022,melbourne,,,,,,,,,,,,,,,,
29181,2022,melbourne,,,,,,,,,,,,,,,,


In [None]:
warnings.filterwarnings('ignore')