In [12]:
import pandas as pd

### 1. Dataset for the Index of Economic Resources (IER) 

In [13]:
# Load the IER worksheet ("Table 4"). header=5 tells pandas to use zero-based
# row 5 of the sheet as the column header row.
df_index_economic_resources = pd.read_excel(
    "../../data/raw/affluence_dataset/affluence_part1.xlsx",
    sheet_name="Table 4",
    header=5,
)

In [14]:
# Columns whose header cell is empty are labelled 'Unnamed: N' by pandas.
# Drop the two such columns by their real labels rather than renaming both to
# one shared placeholder first, which would create duplicate column labels.
df_index_economic_resources = df_index_economic_resources.drop(
    columns=['Unnamed: 4', 'Unnamed: 8']
)

# The second Rank/Decile/Percentile group arrives with a '.1' suffix because
# the header labels repeat; give those columns explicit state-level names and
# shorten the long SA1 coverage label.
df_index_economic_resources = df_index_economic_resources.rename(columns={
    'Rank.1': 'State Rank',
    'Decile.1': 'State Decile',
    'Percentile.1': 'State Percentile',
    '% Usual Resident Population without an SA1 level score': 'SA1 Missing Population %'
})

# Preview the tidied frame.
df_index_economic_resources.head()


Unnamed: 0,2021 Statistical Area Level 2 (SA2) 9-Digit Code,2021 Statistical Area Level 2 (SA2) Name,Usual Resident Population,Score,Rank,Decile,Percentile,State,State Rank,State Decile,State Percentile,Minimum score for SA1s in area,Maximum score for SA1s in area,SA1 Missing Population %
0,101021007,Braidwood,4343.0,1027.06161,1515.0,7.0,65.0,NSW,406.0,7.0,65.0,965.49647,1089.600613,0.0
1,101021008,Karabar,8517.0,1000.067743,1144.0,5.0,49.0,NSW,316.0,6.0,51.0,741.774041,1125.960432,0.0
2,101021009,Queanbeyan,11342.0,944.71457,477.0,3.0,21.0,NSW,122.0,2.0,20.0,804.157734,1111.608982,0.001587
3,101021010,Queanbeyan - East,5085.0,969.44332,732.0,4.0,32.0,NSW,191.0,4.0,31.0,808.734984,1172.367031,0.006686
4,101021012,Queanbeyan West - Jerrabomberra,12744.0,1109.054901,2234.0,10.0,95.0,NSW,565.0,9.0,90.0,911.382971,1235.509932,0.0


In [15]:
# Tag every column with the 'IER_' prefix unless its label contains 'Code'
# or '2021 SA Name'.
# NOTE(review): no label in this sheet contains '2021 SA Name', so the SA2
# name column is prefixed as well (visible in the printed Index). A later cell
# renames it from its prefixed label, so the check is deliberately left as-is.
df_index_economic_resources.columns = [
    label if ('Code' in label or '2021 SA Name' in label) else f"IER_{label}"
    for label in df_index_economic_resources.columns
]

print(df_index_economic_resources.columns)


Index(['2021 Statistical Area Level 2  (SA2) 9-Digit Code',
       'IER_2021 Statistical Area Level 2 (SA2) Name ',
       'IER_Usual Resident Population', 'IER_Score', 'IER_Rank', 'IER_Decile',
       'IER_Percentile', 'IER_State', 'IER_State Rank', 'IER_State Decile',
       'IER_State Percentile', 'IER_Minimum score for SA1s in area',
       'IER_Maximum score for SA1s in area', 'IER_SA1 Missing Population %'],
      dtype='object')


In [16]:
# Discard the non-state Rank/Decile/Percentile columns and the SA1 summary
# columns; the score, population, state and state-level rankings remain.
ier_columns_to_drop = [
    "IER_Rank",
    "IER_Decile",
    "IER_Percentile",
    "IER_Minimum score for SA1s in area",
    "IER_Maximum score for SA1s in area",
    "IER_SA1 Missing Population %",
]
df_index_economic_resources = df_index_economic_resources.drop(columns=ier_columns_to_drop)

In [17]:
# Shorten the two identifier columns to 'SA2 code' / 'SA2 name'. The source
# labels are matched exactly, including the double space inside the code
# label and the trailing space on the (prefixed) name label.
ier_id_labels = {
    '2021 Statistical Area Level 2  (SA2) 9-Digit Code': 'SA2 code',
    'IER_2021 Statistical Area Level 2 (SA2) Name ': 'SA2 name',
}
df_index_economic_resources = df_index_economic_resources.rename(columns=ier_id_labels)

# Show the resulting dtypes; 'SA2 code' is converted in the following step.
print(df_index_economic_resources.dtypes)

SA2 code                          object
SA2 name                          object
IER_Usual Resident Population    float64
IER_Score                        float64
IER_State                         object
IER_State Rank                   float64
IER_State Decile                 float64
IER_State Percentile             float64
dtype: object


In [18]:
# Keep only the rows whose 'SA2 code' can be parsed as a number; values that
# fail to parse become NaN under errors='coerce' and are filtered out.
ier_code_is_numeric = pd.to_numeric(
    df_index_economic_resources['SA2 code'], errors='coerce'
).notna()

# Take an explicit copy of the filtered rows so the column assignment below
# writes to an independent frame instead of a slice of the unfiltered one
# (the original pattern triggers pandas' SettingWithCopyWarning).
df_index_economic_resources = df_index_economic_resources.loc[ier_code_is_numeric].copy()

# Store the codes with pandas' nullable integer dtype.
df_index_economic_resources['SA2 code'] = df_index_economic_resources['SA2 code'].astype('Int64')

print(df_index_economic_resources.dtypes)

SA2 code                           Int64
SA2 name                          object
IER_Usual Resident Population    float64
IER_Score                        float64
IER_State                         object
IER_State Rank                   float64
IER_State Decile                 float64
IER_State Percentile             float64
dtype: object


### Filter out all non-VIC rows

In [19]:
# Restrict the table to rows whose state is 'VIC', then drop the state column,
# which is constant after the filter.
ier_is_vic = df_index_economic_resources['IER_State'] == 'VIC'
df_index_economic_resources = (
    df_index_economic_resources
    .loc[ier_is_vic]
    .drop(columns=["IER_State"])
)

In [20]:
# Preview the first five rows (head() defaults to n=5).
df_index_economic_resources.head()

Unnamed: 0,SA2 code,SA2 name,IER_Usual Resident Population,IER_Score,IER_State Rank,IER_State Decile,IER_State Percentile
627,201011001,Alfredton,16835.0,1035.994075,332.0,7.0,65.0
628,201011002,Ballarat,12131.0,978.412277,160.0,4.0,32.0
629,201011005,Buninyong,7261.0,1052.539664,392.0,8.0,77.0
630,201011006,Delacombe,10661.0,1006.242565,244.0,5.0,48.0
631,201011007,Smythes Creek,4230.0,1090.794166,479.0,10.0,94.0


### 2. Dataset for the Index of Education and Occupation (IEO)

In [21]:
# Load the IEO worksheet ("Table 5"). header=5 tells pandas to use zero-based
# row 5 of the sheet as the column header row.
df_index_education_occupation = pd.read_excel(
    "../../data/raw/affluence_dataset/affluence_part1.xlsx",
    sheet_name="Table 5",
    header=5,
)

# Columns whose header cell is empty are labelled 'Unnamed: N' by pandas.
# Drop the two such columns by their real labels rather than renaming both to
# one shared placeholder first, which would create duplicate column labels.
df_index_education_occupation = df_index_education_occupation.drop(
    columns=['Unnamed: 4', 'Unnamed: 8']
)

# The second Rank/Decile/Percentile group arrives with a '.1' suffix because
# the header labels repeat; give those columns explicit state-level names and
# shorten the long SA1 coverage label.
df_index_education_occupation = df_index_education_occupation.rename(columns={
    'Rank.1': 'State Rank',
    'Decile.1': 'State Decile',
    'Percentile.1': 'State Percentile',
    '% Usual Resident Population without an SA1 level score': 'SA1 Missing Population %'
})

# Tag every column with the 'IEO_' prefix unless its label contains 'Code'
# or '2021 SA Name'.
# NOTE(review): no label in this sheet contains '2021 SA Name', so the SA2
# name column is prefixed as well (visible in the printed Index). A later cell
# renames it from its prefixed label, so the check is deliberately left as-is.
df_index_education_occupation.columns = [
    label if ('Code' in label or '2021 SA Name' in label) else f"IEO_{label}"
    for label in df_index_education_occupation.columns
]

print(df_index_education_occupation.columns)


Index(['2021 Statistical Area Level 2  (SA2) 9-Digit Code',
       'IEO_2021 Statistical Area Level 2 (SA2) Name ',
       'IEO_Usual Resident Population', 'IEO_Score', 'IEO_Rank', 'IEO_Decile',
       'IEO_Percentile', 'IEO_State', 'IEO_State Rank', 'IEO_State Decile',
       'IEO_State Percentile', 'IEO_Minimum score for SA1s in area',
       'IEO_Maximum score for SA1s in area', 'IEO_SA1 Missing Population %'],
      dtype='object')


In [22]:
# Discard the non-state Rank/Decile/Percentile columns, the SA1 summary
# columns and the population column; the score, state and state-level
# rankings remain.
ieo_columns_to_drop = [
    "IEO_Rank",
    "IEO_Decile",
    "IEO_Percentile",
    "IEO_Minimum score for SA1s in area",
    "IEO_Maximum score for SA1s in area",
    "IEO_SA1 Missing Population %",
    "IEO_Usual Resident Population",
]
df_index_education_occupation = df_index_education_occupation.drop(columns=ieo_columns_to_drop)

In [23]:
# Shorten the two identifier columns to 'SA2 code' / 'SA2 name'. The source
# labels are matched exactly, including the double space inside the code
# label and the trailing space on the (prefixed) name label.
ieo_id_labels = {
    '2021 Statistical Area Level 2  (SA2) 9-Digit Code': 'SA2 code',
    'IEO_2021 Statistical Area Level 2 (SA2) Name ': 'SA2 name',
}
df_index_education_occupation = df_index_education_occupation.rename(columns=ieo_id_labels)

# Show the resulting dtypes; 'SA2 code' is converted in the following step.
print(df_index_education_occupation.dtypes)

SA2 code                 object
SA2 name                 object
IEO_Score               float64
IEO_State                object
IEO_State Rank          float64
IEO_State Decile        float64
IEO_State Percentile    float64
dtype: object


In [24]:
# Keep only the rows whose 'SA2 code' can be parsed as a number; values that
# fail to parse become NaN under errors='coerce' and are filtered out.
ieo_code_is_numeric = pd.to_numeric(
    df_index_education_occupation['SA2 code'], errors='coerce'
).notna()

# Take an explicit copy of the filtered rows so the column assignment below
# writes to an independent frame instead of a slice of the unfiltered one
# (the original pattern triggers pandas' SettingWithCopyWarning).
df_index_education_occupation = df_index_education_occupation.loc[ieo_code_is_numeric].copy()

# Store the codes with pandas' nullable integer dtype.
df_index_education_occupation['SA2 code'] = df_index_education_occupation['SA2 code'].astype('Int64')

print(df_index_education_occupation.dtypes)

SA2 code                  Int64
SA2 name                 object
IEO_Score               float64
IEO_State                object
IEO_State Rank          float64
IEO_State Decile        float64
IEO_State Percentile    float64
dtype: object


In [25]:
# Restrict the table to rows whose state is 'VIC', then drop the state column,
# which is constant after the filter.
ieo_is_vic = df_index_education_occupation['IEO_State'] == 'VIC'
df_index_education_occupation = (
    df_index_education_occupation
    .loc[ieo_is_vic]
    .drop(columns=["IEO_State"])
)

# Non-null count per column, as a quick completeness check.
df_index_education_occupation.count()

SA2 code                516
SA2 name                516
IEO_Score               516
IEO_State Rank          516
IEO_State Decile        516
IEO_State Percentile    516
dtype: int64

In [26]:
# Preview the first five rows (head() defaults to n=5).
df_index_education_occupation.head()

Unnamed: 0,SA2 code,SA2 name,IEO_Score,IEO_State Rank,IEO_State Decile,IEO_State Percentile
629,201011001,Alfredton,1009.718884,267.0,6.0,52.0
630,201011002,Ballarat,1075.750257,380.0,8.0,74.0
631,201011005,Buninyong,1049.966026,340.0,7.0,66.0
632,201011006,Delacombe,936.676307,87.0,2.0,17.0
633,201011007,Smythes Creek,959.52356,147.0,3.0,29.0


## 3. Combine the datasets together (IER & IEO)

In [27]:
# Suffix given to right-hand (IER) columns whose label also exists in the
# left-hand (IEO) frame, so the duplicates can be identified and removed.
duplicate_suffix = '_drop'

# Left join on 'SA2 code': every IEO row is kept, and IER values are attached
# where the code matches (unmatched rows get NaN in the IER columns).
df_combined_IEO_IER = pd.merge(
    df_index_education_occupation,
    df_index_economic_resources,
    on='SA2 code',
    how='left',
    suffixes=('', duplicate_suffix),
)

# Remove the duplicated right-hand columns. Match the exact suffix rather than
# the substring 'drop', so a genuine column whose label merely contains
# "drop" can never be deleted by accident.
df_combined_IEO_IER = df_combined_IEO_IER.drop(
    columns=[col for col in df_combined_IEO_IER.columns if col.endswith(duplicate_suffix)]
)

# Non-null counts per column. In the recorded run the IER columns have 3 fewer
# non-null values than the IEO columns, i.e. 3 rows found no IER match.
df_combined_IEO_IER.count()

SA2 code                         516
SA2 name                         516
IEO_Score                        516
IEO_State Rank                   516
IEO_State Decile                 516
IEO_State Percentile             516
IER_Usual Resident Population    513
IER_Score                        513
IER_State Rank                   513
IER_State Decile                 513
IER_State Percentile             513
dtype: int64

In [28]:
# Print the final column layout of the merged frame.
combined_columns = df_combined_IEO_IER.columns
print(combined_columns)

Index(['SA2 code', 'SA2 name', 'IEO_Score', 'IEO_State Rank',
       'IEO_State Decile', 'IEO_State Percentile',
       'IER_Usual Resident Population', 'IER_Score', 'IER_State Rank',
       'IER_State Decile', 'IER_State Percentile'],
      dtype='object')


In [29]:
# Write the merged table to CSV; index=False leaves the row index out of the
# file.
combined_output_path = "../../data/curated/affluence_cleaned/combined_IEO_IER.csv"
df_combined_IEO_IER.to_csv(combined_output_path, index=False)