In [1]:
import pandas as pd

In [2]:
# Load the Excel file from `data/landing` directory
file_path = '../../data/landing/income_14100DO0004_2011-23.xlsx'  
excel_data = pd.ExcelFile(file_path)

In [3]:
# `data/raw` directory: Convert Table 1 Sheet in Excel to CSV 
for sheet in excel_data.sheet_names:
    if sheet == 'Table 1':
        sheet_df = pd.read_excel(file_path, sheet_name=sheet)
        csv_file = f"../../data/raw/Income_SA2_{sheet}.csv"
        sheet_df.to_csv(csv_file, index=False)
        print(f"Saved '{sheet}' as {csv_file}")

Saved 'Table 1' as ../../data/raw/Income_SA2_Table 1.csv


# Table 1 Sheet (Victoria state by SA2)
### INCOME (INCLUDING GOVERNMENT ALLOWANCES), Australia, State and Territory, Statistical Areas Level 2-4, Greater Capital City Statistical Areas, 2011, 2016-2022

In [4]:
# `data/raw directory: Load the CSV file for Table 1, skipping the 6 lines of Header
table1_df = pd.read_csv('../../data/raw/Income_SA2_Table 1.csv', skiprows=6)

# display tables and features
table1_df.head()
print(f"Total entries: {len(table1_df)}")

Total entries: 23278


  table1_df = pd.read_csv('../../data/raw/Income_SA2_Table 1.csv', skiprows=6)


In [5]:
# select Victoria suburbs which has the 'Code' starting with '2', then remove GCCSA Code (Greater Capital City Statistical Areas) 
suburbs_victoria_df = table1_df[table1_df['Code'].astype(str).apply(lambda x: x.isnumeric() and x.startswith('2'))]

print(f'Total suburbs: {len(suburbs_victoria_df)}')
suburbs_victoria_df.head(50)

Total suburbs: 4848


Unnamed: 0,Code,Label,Year,Employee income earners (no.),Employee income earners - median age (years),Total employee income ($m),Median employee income ($),Mean employee income ($),Employee income as main source of income (%),Own unincorporated business income earners (no.),...,Income inadequately described or not stated (%),Median equivalised total household income (weekly) ($),$1-$499 per week (%).1,$500-$999 per week (%).1,$1000-$1999 per week (%).1,$2000-$2999 per week (%).1,$3000 or more per week (%).1,Nil income (%).1,Partial income stated (%),All incomes not stated (%)
16,2,Victoria,2011.0,-,-,-,-,-,-,-,...,-,751,-,-,-,-,-,-,-,-
17,2,Victoria,2016.0,3048110,38,174887.8,47469,57376,76.1,501879,...,8.7,864,21,29.1,28.9,6.4,2.7,2,7.5,2.5
18,2,Victoria,2017.0,3151400,38,184293.1,48219,58480,76.4,523202,...,-,-,-,-,-,-,-,-,-,-
19,2,Victoria,2018.0,3239361,38,196542,50019,60673,77,539640,...,-,-,-,-,-,-,-,-,-,-
20,2,Victoria,2019.0,3347084,38,208873.6,51688,62405,76.9,559515,...,-,-,-,-,-,-,-,-,-,-
21,2,Victoria,2020.0,3393012,38,219234.7,53323,64614,77.2,568765,...,-,-,-,-,-,-,-,-,-,-
22,2,Victoria,2021.0,-,-,-,-,-,-,-,...,6.8,1074,15.7,25.8,34.5,10.6,5.2,2,4.4,1.8
23,2,Victoria,2022.0,-,-,-,-,-,-,-,...,-,-,-,-,-,-,-,-,-,-
432,201,Ballarat,2011.0,-,-,-,-,-,-,-,...,-,616,-,-,-,-,-,-,-,-
433,201,Ballarat,2016.0,73277,41,3664.4,43732,50008,75.6,13365,...,9.6,708,27.8,32.8,23.6,3.2,1.3,1.3,6.8,3.1


In [6]:
# select entries where 'Label' contains 'Victoria'
label_victoria_df = table1_df[table1_df['Label'].str.contains('Victoria', na=False)]
print(f'Total entries containing Victoria: {len(label_victoria_df)}')
label_victoria_df.tail(10)

Total entries containing Victoria: 56


Unnamed: 0,Code,Label,Year,Employee income earners (no.),Employee income earners - median age (years),Total employee income ($m),Median employee income ($),Mean employee income ($),Employee income as main source of income (%),Own unincorporated business income earners (no.),...,Income inadequately described or not stated (%),Median equivalised total household income (weekly) ($),$1-$499 per week (%).1,$500-$999 per week (%).1,$1000-$1999 per week (%).1,$2000-$2999 per week (%).1,$3000 or more per week (%).1,Nil income (%).1,Partial income stated (%),All incomes not stated (%)
19798,506021123,Victoria Park - Lathlain - Burswood,2021.0,-,-,-,-,-,-,-,...,6.5,1314,11.1,20.7,35.5,14.7,9,1.9,6,1
19799,506021123,Victoria Park - Lathlain - Burswood,2022.0,-,-,-,-,-,-,-,...,-,-,-,-,-,-,-,-,-,-
22160,702051068,Victoria River,2011.0,-,-,-,-,-,-,-,...,-,535,-,-,-,-,-,-,-,-
22161,702051068,Victoria River,2016.0,260,37,9.8,29004,37565,93.1,-,...,27.7,527,37.1,15.1,20.3,4.5,1.7,3,11.9,6.4
22162,702051068,Victoria River,2017.0,295,35,11.6,29193,39258,95.2,18,...,-,-,-,-,-,-,-,-,-,-
22163,702051068,Victoria River,2018.0,338,35,12.2,24001,36181,95.9,15,...,-,-,-,-,-,-,-,-,-,-
22164,702051068,Victoria River,2019.0,357,35,13.2,26434,37104,95.7,23,...,-,-,-,-,-,-,-,-,-,-
22165,702051068,Victoria River,2020.0,386,35,15.3,28639,39614,95.7,23,...,-,-,-,-,-,-,-,-,-,-
22166,702051068,Victoria River,2021.0,-,-,-,-,-,-,-,...,14.2,442,50.1,13.3,16.9,6.3,1.5,0.6,7.6,3.8
22167,702051068,Victoria River,2022.0,-,-,-,-,-,-,-,...,-,-,-,-,-,-,-,-,-,-


We notice that other Victorian locations were included like: 
* Royal Botanic Gardens, 
* Victoria River, 
* Victoria Park, 
* Victoria Point

In [7]:
# combine the two filtered datasets above
cleaned_victoria_df = pd.concat([suburbs_victoria_df, label_victoria_df]).drop_duplicates()
print(f'Total rows after filtered suburbs and : {len(cleaned_victoria_df)}')

Total rows after filtered suburbs and : 4888


In [8]:
print(f"Total columns: {len(cleaned_victoria_df.columns)}")
cleaned_victoria_df.columns

Total columns: 68


Index(['Code', 'Label', 'Year', 'Employee income earners (no.)',
       'Employee income earners - median age (years)',
       'Total employee income ($m)', 'Median employee income ($)',
       'Mean employee income ($)',
       'Employee income as main source of income (%)',
       'Own unincorporated business income earners (no.)',
       'Own unincorporated business income earners - median age (years)',
       'Total own unincorporated business income ($m)',
       'Median own unincorporated business income ($)',
       'Mean own unincorporated business income ($)',
       'Own unincorporated business income as main source of income (%)',
       'Investment income earners (no.)',
       'Investment income earners - median age (years)',
       'Total investment income ($m)', 'Median investment income ($)',
       'Mean investment income ($)',
       'Investment income as main source of income (%)',
       'Superannuation and annuity income earners (no.)',
       'Superannuation and a

In [9]:
# filter relevant columns 
relevant_columns = [
    'Code', 'Label', 'Year', 
    'Employee income earners (no.)', 'Total employee income ($m)', 
    'Median employee income ($)', 'Mean employee income ($)', 
    'Own unincorporated business income earners (no.)', 
    'Total own unincorporated business income ($m)', 
    'Median own unincorporated business income ($)', 
    'Mean own unincorporated business income ($)', 
    'Investment income earners (no.)', 
    'Total investment income ($m)', 'Median investment income ($)', 
    'Mean investment income ($)', 
    'Total income (excl. Government pensions and allowances) ($m)', 
    'Median total income (excl. Government pensions and allowances) ($)', 
    'Mean total income (excl. Government pensions and allowances) ($)', 
    'Total income (excl. Government pensions and allowances) - gini coefficient', 
    'Total income (excl. Government pensions and allowances) - income share of top 1% of earners'
]

cleaned_victoria_df = cleaned_victoria_df[relevant_columns]


In [10]:
# drop missing values
cleaned_victoria_df = cleaned_victoria_df.dropna()
print(f'Total entries after dropping missing values: {len(cleaned_victoria_df)}')

Total entries after dropping missing values: 4888


In [11]:
# total locations in victoria 
len(cleaned_victoria_df)

4888

In [12]:
# final table
cleaned_victoria_df

Unnamed: 0,Code,Label,Year,Employee income earners (no.),Total employee income ($m),Median employee income ($),Mean employee income ($),Own unincorporated business income earners (no.),Total own unincorporated business income ($m),Median own unincorporated business income ($),Mean own unincorporated business income ($),Investment income earners (no.),Total investment income ($m),Median investment income ($),Mean investment income ($),Total income (excl. Government pensions and allowances) ($m),Median total income (excl. Government pensions and allowances) ($),Mean total income (excl. Government pensions and allowances) ($),Total income (excl. Government pensions and allowances) - gini coefficient,Total income (excl. Government pensions and allowances) - income share of top 1% of earners
16,2,Victoria,2011.0,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-
17,2,Victoria,2016.0,3048110,174887.8,47469,57376,501879,12011,11177,23932,2416209,23279.2,293,6854,202318.3,46984,60350,0.479,9.5
18,2,Victoria,2017.0,3151400,184293.1,48219,58480,523202,12933.2,11268,24719,2446441,23835.4,243,9743,213668.9,47709,61489,0.48,9.5
19,2,Victoria,2018.0,3239361,196542,50019,60673,539640,13763.8,11707,25505,2517491,25895.6,217,10286,228631.1,49266,63442,0.481,9.6
20,2,Victoria,2019.0,3347084,208873.6,51688,62405,559515,14371.5,11711,25686,2641282,27614.3,171,10455,242783.1,51027,65366,0.477,9.5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
22163,702051068,Victoria River,2018.0,338,12.2,24001,36181,15,0.3,21000,19417,88,0.1,33,1078,12,33656,44652,0.447,-
22164,702051068,Victoria River,2019.0,357,13.2,26434,37104,23,0.1,4021,5632,103,0.3,34,2994,13,34493,43083,0.454,-
22165,702051068,Victoria River,2020.0,386,15.3,28639,39614,23,-0.1,1378,-6478,114,0.1,7,1179,14.6,36161,44761,0.452,-
22166,702051068,Victoria River,2021.0,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-


In [13]:
# `/data/curated/` directory: Save to a CSV file
cleaned_victoria_df.to_csv('../../data/curated/Preprocessed_Income_Victoria_SA2.csv', index=False)