Before running this notebook, ensure that you have run `population_scrape.py` under the `scripts` directory.

Also, ensure that you have downloaded the income dataset from this link in CSV format and saved it as `income.csv` in the `data/landing` folder:
- https://digital.atlas.gov.au/datasets/digitalatlas::abs-income-including-government-allowances-by-2021-sa2-nov-2023/about 

Alternatively, you can download the income dataset directly from this link:
- https://drive.google.com/file/d/1A79Anfe2QT1IpOtsBVMor-KvRcSuFZME/view?usp=sharing

Note that only UniMelb accounts can access the Google Drive link.

### Import Libraries & Read in Files

In [16]:
import os
import warnings

import pandas as pd

# Silence every warning category for the remainder of the notebook
warnings.filterwarnings('ignore')

# Load the two landing-layer datasets used below
population = pd.read_csv('../data/landing/population.csv')
income = pd.read_csv('../data/landing/income.csv')

### Feature Selection & Column Renaming for Income Dataset

In [12]:
# Select the income features of interest, give them short snake_case names,
# and strip trailing whitespace from the SA2 name so it can be used as a
# merge key.

# Columns whose label starts with the Age pension / Commonwealth rent
# assistance prefix
gov_pensions_regex = r'^Government pensions and allowances: (Age pension|Commonwealth rent assistance)'

# Personal-income columns (excl. government pensions and allowances).
# The negative lookahead rejects any label whose remainder contains
# "Income share", "Quartile" or a lowercase "p".
# NOTE(review): the bare `p` alternative is very broad -- presumably intended
# to drop percentile-style columns; confirm against the dataset's labels.
total_income_regex = (
    r'^Personal income: (Total income earners|Median total income|Mean total income|Total income)'
    r' \(excl\. Government pensions and allowances\)(?!.*(Income share|Quartile|p))'
)

# Find the relevant columns using the regex patterns
gov_pensions_columns = income.filter(regex=gov_pensions_regex).columns
total_income_columns = income.filter(regex=total_income_regex).columns

sa2_name = ['Statistical Areas Level 2 2021 name']

# Combine all selected columns (SA2 name first, then pensions, then income)
selected_columns = sa2_name + list(gov_pensions_columns) + list(total_income_columns)

# Mapping from the verbose source labels to short snake_case names.
# Labels that are not present in the frame are silently skipped by `rename`.
new_income_names = {
    'Statistical Areas Level 2 2021 name': 'sa2_name',
    'Government pensions and allowances: Age pension (no.) (Data year: 2023)': 'gov_age_pension_count_2023',
    'Government pensions and allowances: Commonwealth rent assistance (no.) (Data year: 2023)': 'gov_rent_assist_count_2023',
    'Personal income: Total income earners (excl. Government pensions and allowances)(no.) (Data year: 2020)': 'personal_income_count_2020',
    'Personal income: Total income earners (excl. Government pensions and allowances): Median age (years) (Data year: 2020)': 'personal_income_median_age_2020',
    'Personal income: Total income (excl. Government pensions and allowances) ($m) (Data year: 2020)': 'personal_total_income_millions_2020',
    'Personal income: Median total income (excl. Government pensions and allowances) ($) (Data year: 2020)': 'median_personal_total_income_2020',
    'Personal income: Mean total income (excl. Government pensions and allowances) ($) (Data year: 2020)': 'mean_personal_total_income_2020',
    'Personal income: Total income (excl. Government pensions and allowances): Gini coefficient (Data year: 2020)' : 'gini_coef_2020'
}

# Build the filtered frame in one non-mutating chain. The previous version
# mutated a frame derived from `income[...]` (inplace rename + column
# assignment), which is the pattern that raises SettingWithCopyWarning.
filtered_income = (
    income[selected_columns]
    .rename(columns=new_income_names)
    # Get rid of trailing whitespaces for matching
    .assign(sa2_name=lambda frame: frame['sa2_name'].str.rstrip())
)

### Feature Selection & Column Renaming for Population Dataset

In [13]:
# Geography levels above SA2 that are not needed downstream
columns_to_drop = ['GCCSA code', 'GCCSA name', 'SA4 code', 'SA4 name', 'SA3 code', 'SA3 name']

# Mapping from the source column labels to short snake_case names
new_population_cols = {
    'SA2 code': 'sa2_code',
    'SA2 name': 'sa2_name',
    'ERP at 30 June 2022 no.': 'erp_june_2022_count',
    'ERP at 30 June 2023 no.': 'erp_june_2023_count',
    'ERP change 2022-23 no.': 'erp_change_count',
    'ERP change 2022-23 %': 'erp_change_percentage',
    'Components of population change 2022-23 Natural increase no.': 'natural_increase_count',
    'Components of population change 2022-23 Net internal migration no.': 'net_internal_migration_count',
    'Components of population change 2022-23 Net overseas migration no.': 'net_overseas_migration_count',
    'Area (km2)': 'area_km2',
    'Population density 2023 (persons/km2)': 'pop_density_persons_km2'
}

# Clean the population table in a single pipeline:
#   1. drop the higher-level geography columns
#   2. rename the remaining columns
#   3. discard rows with a missing SA2 code or SA2 name
#   4. drop the SA2 code, which is only needed for the validity check
#   5. strip trailing whitespace from the SA2 name for matching
population = (
    population
    .drop(columns=columns_to_drop)
    .rename(columns=new_population_cols)
    .dropna(subset=['sa2_code', 'sa2_name'])
    .drop(columns='sa2_code')
    .assign(sa2_name=lambda frame: frame['sa2_name'].str.rstrip())
)

### Merge Population & Income Dataset

In [14]:
# Join the population and income features on the SA2 name.
# pd.merge defaults to an inner join, so SA2s that appear in only one of the
# two frames are dropped from the result.
demographics = pd.merge(population, filtered_income, on='sa2_name')

# Make sure the output folder exists before writing; to_csv does not create
# missing directories and would otherwise fail on a fresh checkout.
demographics_path = '../data/raw/demographics.csv'
os.makedirs(os.path.dirname(demographics_path), exist_ok=True)
demographics.to_csv(demographics_path, index=False)