In [1]:
import pandas as pd
import numpy as np
import os

In [2]:
# set main directory
def get_main_directory(cwd):
    """Return the project root for a working directory, with a trailing separator.

    If ``cwd`` is the project's ``notebooks`` folder, its parent directory is
    returned; otherwise ``cwd`` itself is treated as the project root.

    ``str.strip('notebooks')`` is deliberately avoided: it removes any of the
    characters n/o/t/e/b/k/s from BOTH ends of the string rather than the
    literal suffix, so a path such as ``/home/u/test`` would be truncated.
    """
    if os.path.basename(cwd) == 'notebooks':
        cwd = os.path.dirname(cwd)
    # joining with '' appends the path separator only when it is missing, so
    # later cells can keep building paths with main_directory + "data/..."
    return os.path.join(cwd, '')


main_directory = get_main_directory(os.getcwd())

In [3]:
# Raw input: Table 3 of the ABS workbook at
# https://www.abs.gov.au/statistics/people/population/regional-population-age-and-sex/2021/32350DS0001_2021.xlsx
# (the heading rows were removed from the original Excel file before export)
raw_age_csv = main_directory+"data/raw/SA2_AGE_POP.csv"
age_SA2 = pd.read_csv(raw_age_csv)

In [4]:
# discard the row whose index label is 0 (per the original note, the leading
# NA entry); this removes a row, not a column
age_SA2.drop(index=[0], inplace=True)

In [5]:
# keep only the Victorian rows; every other state/territory is discarded

is_victoria = age_SA2['S/T name'].eq('Victoria')
age_SA2 = age_SA2.loc[is_victoria]
age_SA2

Unnamed: 0,S/T code,S/T name,GCCSA code,GCCSA name,SA4 code,SA4 name,SA3 code,SA3 name,SA2 code,SA2 name,...,45-49,50-54,55-59,60-64,65-69,70-74,75-79,80-84,85 and over,Total persons
642,2,Victoria,2RVIC,Rest of Vic.,201,Ballarat,20101,Ballarat,201011001,Alfredton,...,1142,974,848,709,666,596,419,253,164,16823
643,2,Victoria,2RVIC,Rest of Vic.,201,Ballarat,20101,Ballarat,201011002,Ballarat,...,818,833,812,822,721,736,526,387,380,12076
644,2,Victoria,2RVIC,Rest of Vic.,201,Ballarat,20101,Ballarat,201011005,Buninyong,...,532,495,452,526,474,398,204,134,85,7232
645,2,Victoria,2RVIC,Rest of Vic.,201,Ballarat,20101,Ballarat,201011006,Delacombe,...,565,530,538,479,428,432,280,175,228,10640
646,2,Victoria,2RVIC,Rest of Vic.,201,Ballarat,20101,Ballarat,201011007,Smythes Creek,...,326,344,312,311,242,186,95,40,31,4213
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1159,2,Victoria,2RVIC,Rest of Vic.,217,Warrnambool and South West,21703,Colac - Corangamite,217031476,Otway,...,264,303,363,422,404,311,182,103,71,3978
1160,2,Victoria,2RVIC,Rest of Vic.,217,Warrnambool and South West,21704,Warrnambool,217041477,Moyne - East,...,444,477,555,526,475,339,210,153,113,6989
1161,2,Victoria,2RVIC,Rest of Vic.,217,Warrnambool and South West,21704,Warrnambool,217041478,Moyne - West,...,618,675,735,731,781,671,420,228,252,9963
1162,2,Victoria,2RVIC,Rest of Vic.,217,Warrnambool and South West,21704,Warrnambool,217041479,Warrnambool - North,...,1374,1435,1444,1404,1245,1150,843,543,504,22462


In [6]:
# remove the first eight columns (by position), which are not SA2-level fields

leading_columns = age_SA2.columns[0:8]
age_SA2.drop(columns=leading_columns, inplace=True)


In [7]:
# create bins based on age groups
# structure for age groups
#  0-14 years (children), 15-24 years (early working age), 25-54 years (prime working age),
#  55-64 years (mature working age), 65 years and over (elderly)
#
# Column layout at this point (by position):
#   0-1   SA2 code, SA2 name
#   2-19  eighteen five-year age bands; the displayed frame shows the tail as
#         45-49 ... 80-84, '85 and over'
#         (assumes the hidden leading bands are 0-4, 5-9, ... 40-44 - TODO confirm)
#   20    'Total persons'
#
# The previous slices (7:14, 14:16, 16:21) put 55-59 into the prime group,
# 65-69 into the mature group and added 'Total persons' to the elderly group.
# The band columns are captured once, before any new column is appended, so
# the grouping never picks up 'Total persons' or a freshly created column.

age_bands = age_SA2.columns[2:20]

age_group_bands = {
    'Children': age_bands[0:3],              # 0-4, 5-9, 10-14
    'Early_Working_age': age_bands[3:5],     # 15-19, 20-24
    'Prime_Working_age': age_bands[5:11],    # 25-29 ... 50-54
    'Mature_Working_age': age_bands[11:13],  # 55-59, 60-64
    'Elderly': age_bands[13:18],             # 65-69 ... 85 and over
}

# new columns are appended in the same order as before
for group_name, band_columns in age_group_bands.items():
    age_SA2[group_name] = age_SA2[band_columns].sum(axis=1)


In [8]:
# remove the raw columns at positions 2-20 now that the grouped totals exist
# (per the displayed frame these hold the individual age bands and
# 'Total persons' - confirm that dropping the total is intended)

raw_band_columns = age_SA2.columns[2:21]
age_SA2.drop(columns=raw_band_columns, inplace=True)

In [9]:
# write the curated Victorian SA2 age-group table (the index is written too)
curated_csv = main_directory+"data/curated/SA2_Vic_age_clean.csv"
age_SA2.to_csv(curated_csv)