## PLEASE READ: DATA DOWNLOAD

### Before running this notebook, please ensure that you have downloaded the files in the `Income Landing Data` folder from our [Google Drive](https://drive.google.com/drive/folders/1JzqWIVPAHOvMeD0X1u3RefYBSj1PehZ0?usp=sharing), saving them to the `data/landing/` directory.

## Preprocess Income Dataset 

### Import Libraries & Data 

In [3]:
import warnings
import regex as re
import numpy as np 
import pandas as pd
from statsmodels.tsa.arima.model import ARIMA

# Silence every warning category for the rest of the session
warnings.simplefilter('ignore')

# Household income tables from the 2016 and 2021 census extracts
income_2016, income_2021 = (
    pd.read_csv(census_csv)
    for census_csv in (
        "../../data/landing/2016Census_G29_VIC_SA2.csv",
        "../../data/landing/2021Census_G33_VIC_SA2.csv",
    )
)

# Lookup tables pairing SA2 codes with suburb names
# (2016 ships as a CSV, 2021 as the first sheet of an Excel workbook)
sa2_2016 = pd.read_csv("../../data/landing/SA2_codes_2016.csv")
sa2_2021 = pd.read_excel("../../data/landing/SA2_codes_2021.xlsx", sheet_name=0)

### Preprocess Income Data

In [4]:
def _household_income_totals(census, code_col):
    """Reduce a census income table to its SA2 code plus the retained total columns.

    Keeps `code_col` and every column whose name ends in '_tot', then drops
    the last three of those columns and the column in second position
    (i.e. the first '_tot' column). The drops are positional, so they rely
    on the column order of the source file.
    """
    total_cols = census.filter(regex='_tot$').columns.tolist()
    totals = census[[code_col] + total_cols]

    # Last three columns are not relevant
    totals = totals.iloc[:, :-3]

    # Neither is the column sitting directly after the SA2 code
    return totals.drop(totals.columns[1], axis=1)


# Lower-case the headers; the 2016 file also needs its code column renamed
# so that both years follow the same `sa2_code_<year>` convention
income_2016 = (
    income_2016
    .rename(columns=str.lower)
    .rename(columns={"sa2_maincode_2016": "sa2_code_2016"})
)
income_2021 = income_2021.rename(columns=str.lower)

# Only retrieve the total household income columns
income_2016 = _household_income_totals(income_2016, 'sa2_code_2016')
income_2021 = _household_income_totals(income_2021, 'sa2_code_2021')

### Preprocess the SA2 Codes and Suburb Names Dataframes

In [5]:
def _lower_if_str(value):
    """Lower-case string cells and pass every other value through untouched."""
    return value.lower() if isinstance(value, str) else value


# Make column names lowercase
sa2_2016.columns = sa2_2016.columns.str.lower()
sa2_2021.columns = sa2_2021.columns.str.lower()

# Keep only the code / name columns *before* lower-casing their contents, so the
# element-wise pass touches just the columns that are used. Series.map replaces
# DataFrame.applymap, which is deprecated since pandas 2.1.
sa2_2016 = (
    sa2_2016[["sa2_maincode_2016", "sa2_name_2016"]]
    .apply(lambda column: column.map(_lower_if_str))
    .rename(columns={"sa2_maincode_2016": "sa2_code_2016"})
)

sa2_2021 = (
    sa2_2021[["asgs_structure", "census_code_2021", "census_name_2021"]]
    .apply(lambda column: column.map(_lower_if_str))
)

# For 2021, keep only rows whose structure is SA2 and whose code starts with '2'
# (presumably the Victorian codes -- TODO confirm against the workbook)
is_sa2_row = (
    (sa2_2021['asgs_structure'] == 'sa2')
    & sa2_2021['census_code_2021'].astype(str).str.startswith('2')
)

sa2_2021 = (
    sa2_2021.loc[is_sa2_row]
    .rename(columns={
        "census_code_2021": "sa2_code_2021",
        "census_name_2021": "sa2_name_2021"
    })
    .drop(columns=["asgs_structure"])
)

# Convert code columns to int64 and name columns to string
sa2_2016 = sa2_2016.astype({'sa2_code_2016': 'int64', 'sa2_name_2016': 'str'})
sa2_2021 = sa2_2021.astype({'sa2_code_2021': 'int64', 'sa2_name_2021': 'str'})

### Map SA2 Codes in the Income Dataset to the Suburb Names

In [8]:
# Attach the suburb name to every income row via its SA2 code, then drop the code.
# The inner join discards income rows whose code has no entry in the lookup table.
# (The previous version first copied the name into the code column and then
# dropped that column on the next line -- a dead store, removed here.)
income_2016 = (
    income_2016
    .merge(sa2_2016, on='sa2_code_2016', how='inner')
    .drop(columns=['sa2_code_2016'])
)

# Reorder to place sa2_name_2016 as the first column
income_2016 = income_2016[['sa2_name_2016'] + [col for col in income_2016.columns if col != 'sa2_name_2016']]

# Do the same for 2021 income dataset
income_2021 = (
    income_2021
    .merge(sa2_2021, on='sa2_code_2021', how='inner')
    .drop(columns=['sa2_code_2021'])
)
income_2021 = income_2021[['sa2_name_2021'] + [col for col in income_2021.columns if col != 'sa2_name_2021']]

### Summing and Removing Directional Modifiers

In [9]:
# Define directional modifiers and the words 'surrounds' / 'region' to be removed.
# NOTE(review): the pattern is unanchored, so it also fires in the middle of a
# compound name: 'gladstone park - westmeadows' loses ' - west' and becomes
# 'gladstone parkmeadows'. Several keys of sa2_name_mapping below look like
# patches for exactly these artefacts, so anchoring the pattern would require
# reworking the mapping at the same time; behaviour is left unchanged here.
directional_modifiers = [' - east', ' - west', ' - north', ' - south', ' - central', ' surrounds', ' (north)', ' (south)', ' (east)', ' (west)', ' region']
pattern = '|'.join([re.escape(suffix) for suffix in directional_modifiers])

# Mapping for the SA2 names to the correct suburbs
sa2_name_mapping = {
    'ballarat' : 'ballarat central',
    'flemington racecourse' : 'flemington',
    'southbank wharf' : 'south wharf',
    'port melbourne industrial' : 'port melbourne',
    'reservoir east' : 'reservoir',
    'reservoir west' : 'reservoir',
    'research warrandyte' : 'warrandyte',
    'essendon airport' : 'essendon',
    'gladstone parkmeadows' : 'gladstone park',
    'craigieburn west' : 'craigieburn',
    'wandin' : 'wandin north',
    'pakenham east' : 'pakenham',
    'pakenham west' : 'pakenham',
    'narre warren west' : 'narre warren',
    'berwick east' : 'berwick',
    'berwick west' : 'berwick',
    'point cook east' : 'point cook',
    'point cook west' : 'point cook',
    'truganina east' : 'truganina',
    'truganina west' : 'truganina',
    'melbourne cbd' : 'melbourne'
}


def _explode_sa2_names(frame, name_col):
    """Return a new frame with one row per cleaned suburb name.

    Steps, in order: strip the directional modifiers, split names joined by
    ' - ', explode each part onto its own row (every part keeps the full
    counts of the original row), trim trailing whitespace, remove '(vic.)'
    and apply `sa2_name_mapping`.

    `frame` itself is not modified, so the cell can be re-run safely.
    """
    split_names = (
        frame[name_col]
        .str.replace(pattern, '', regex=True)
        .str.split(' - ')
    )
    exploded = (
        frame
        .assign(**{name_col: split_names})
        .explode(name_col)
        .reset_index(drop=True)
    )
    exploded[name_col] = (
        exploded[name_col]
        .str.rstrip()
        .str.replace(r'\s*\(vic\.\)', '', regex=True)
        .replace(sa2_name_mapping)
    )
    return exploded


income_2016_exploded = _explode_sa2_names(income_2016, 'sa2_name_2016')
income_2021_exploded = _explode_sa2_names(income_2021, 'sa2_name_2021')

# Convert counts to integers. The count columns are looked up per frame: the
# 2016 column list used to be applied to the 2021 frame as well, which raises
# a KeyError as soon as the two censuses differ in any column name.
tot_cols = income_2016_exploded.columns[income_2016_exploded.columns.str.contains('tot')]
tot_cols_2021 = income_2021_exploded.columns[income_2021_exploded.columns.str.contains('tot')]

income_2016_exploded[tot_cols] = income_2016_exploded[tot_cols].astype('int')
income_2021_exploded[tot_cols_2021] = income_2021_exploded[tot_cols_2021].astype('int')

# Create the aggregation dictionaries
aggregation_functions = {col: 'sum' for col in tot_cols}
aggregation_functions_2021 = {col: 'sum' for col in tot_cols_2021}

# Group by SA2 names and aggregate
income_2016_grouped = income_2016_exploded.groupby('sa2_name_2016').agg(aggregation_functions).reset_index()
income_2021_grouped = income_2021_exploded.groupby('sa2_name_2021').agg(aggregation_functions_2021).reset_index()

# Rename SA2 name column to be suburb
income_2016_grouped = income_2016_grouped.rename(columns={
    "sa2_name_2016": "suburb"
})

income_2021_grouped = income_2021_grouped.rename(columns={
    "sa2_name_2021": "suburb"
})

# Save as CSV files
income_2016_grouped.to_csv('../../data/curated/income_2016.csv', index=False)
income_2021_grouped.to_csv('../../data/curated/income_2021.csv', index=False)

## Interpolations & Extrapolations

In [12]:
# Reload the curated per-suburb income tables written by the previous section
income_2016, income_2021 = (
    pd.read_csv(curated_csv)
    for curated_csv in (
        '../../data/curated/income_2016.csv',
        '../../data/curated/income_2021.csv',
    )
)

In [14]:
# Index each census year by suburb and strip any year suffix from the column
# names so that both frames share identical headers
df_2016 = (
    income_2016
    .set_index('suburb')
    .rename(columns=lambda label: label.replace('_2016', ''))
)
df_2021 = (
    income_2021
    .set_index('suburb')
    .rename(columns=lambda label: label.replace('_2021', ''))
)

# Stack the two years on top of each other, tagging every row with its census year
stacked_years = pd.concat(
    [df_2016.assign(year=2016), df_2021.assign(year=2021)],
    axis=0,
).reset_index()

# Suburbs that appear in the 2021 census
suburbs_with_2021 = stacked_years.loc[stacked_years['year'] == 2021, 'suburb']

# Keep only rows (of either year) belonging to a suburb that has 2021 data
df_combined = stacked_years[stacked_years['suburb'].isin(suburbs_with_2021)]

### Linear Interpolation

In [15]:
all_years = np.arange(2016, 2022)

columns_with_tot = [col for col in df_combined.columns if 'tot' in col]

# Build one row per (suburb, year) for every year in `all_years`. Reindexing on
# an explicit grid leaves NaN wherever there is no census row (2017 - 2020, and
# 2016 for suburbs that only appear in 2021). This avoids groupby.apply, whose
# passing of the grouping column into the function is deprecated in pandas 2.2.
suburb_year_grid = pd.MultiIndex.from_product(
    [sorted(df_combined['suburb'].unique()), all_years],
    names=['suburb', 'year'],
)
census_counts = (
    df_combined
    .set_index(['suburb', 'year'])[columns_with_tot]
    .reindex(suburb_year_grid)
)

# Same layout as before: 'year' and 'suburb' first, then the count columns
df_complete = census_counts.reset_index()[['year', 'suburb'] + columns_with_tot]

# Fill the gaps suburb by suburb. Only the numeric count columns are
# interpolated (interpolating object columns is deprecated in pandas 2.1).
# limit_direction='both' back-fills a missing 2016 value from 2021, which
# matches the previous ffill/bfill behaviour for suburbs absent from 2016.
interpolated_counts = census_counts.groupby(level='suburb').transform(
    lambda counts: counts.interpolate(method='linear', limit_direction='both')
)

# Make them integers since they're counts. Round first: a bare astype(int)
# truncates, so a float result such as 11.999999 would silently become 11 and
# every fractional value would be biased downwards.
df_interpolated = (
    interpolated_counts
    .round()
    .astype(int)
    .reset_index()[['year', 'suburb'] + columns_with_tot]
)

### Forecasting Future Income Data

In [None]:
df = df_interpolated.copy()

# Columns to forecast: everything except the two key columns. Selecting by
# name instead of by position (`columns[2:]`) keeps this correct even if the
# column order of df_interpolated changes.
columns_to_forecast = [col for col in df.columns if col not in ('suburb', 'year')]

# Forecast years
forecast_years = [2022, 2023, 2024, 2025, 2026, 2027]

# Create an empty list to store forecasted results
forecast_results = []

# NOTE(review): each series has only six yearly points (2016 - 2021), four of
# which come from linear interpolation, so an ARIMA(1,1,1) fit has very little
# information to work with -- worth validating the forecasts.

# Iterate over each suburb; groupby yields each suburb's rows in a single pass
# instead of re-filtering the whole frame once per suburb
for suburb, suburb_rows in df.groupby('suburb', sort=False):

    suburb_data = suburb_rows.set_index('year')

    # Dictionary to store the forecasted values for each column
    suburb_forecast = {'suburb': suburb, 'year': forecast_years}

    for column in columns_to_forecast:
        # Extract the time series data for the current column
        ts = suburb_data[column]

        # Fit the ARIMA model (p, d, q can be optimized or set manually)
        model = ARIMA(ts, order=(1,1,1))
        fitted_model = model.fit()

        # Forecast one value per entry of forecast_years
        forecast = fitted_model.forecast(steps=len(forecast_years))

        # Add the forecasted values to the suburb_forecast dictionary
        suburb_forecast[column] = forecast.values

    # Append the forecast to the result list
    forecast_results.append(pd.DataFrame(suburb_forecast))

# Concatenate all forecast results into a single DataFrame
forecast_df = pd.concat(forecast_results).reset_index(drop=True)

# Concatenate df_interpolated and forecast_df along the row axis (axis=0)
df_combined = pd.concat([df_interpolated, forecast_df], axis=0)

# Sort by suburb and year to maintain chronological order
df_combined = df_combined.sort_values(by=['suburb', 'year']).reset_index(drop=True)

In [None]:
# Select only the numeric columns (this includes 'year' as well as the counts)
numeric_columns = df_combined.select_dtypes(include=[np.number]).columns

# Counts cannot be negative and must be whole numbers: clamp at zero, then
# round to the nearest integer. Done with vectorised operations in one pass
# instead of two element-wise applymap calls (DataFrame.applymap is deprecated
# since pandas 2.1, and the first pass was already covered by the second).
# fillna(0) keeps the earlier outcome for missing values: max(0, nan) evaluates
# to 0, so a NaN forecast was written out as 0.
df_combined[numeric_columns] = (
    df_combined[numeric_columns]
    .clip(lower=0)
    .fillna(0)
    .round()
    .astype('int64')
)

# Save as csv
df_combined.to_csv('../../data/curated/income_forecasted.csv', index=False)