In [1]:
import regex as re
import pandas as pd

In [2]:
# Load the landing-zone population table and reduce it to Victorian rows with
# the SA2 identifier columns plus one `erp_<year>` column per year.
# The raw frame is kept as `population_raw` so this cell can be re-run without
# reloading, and `population` is built in one chain so no column is ever
# assigned on a filtered slice (which would raise SettingWithCopyWarning).
population_raw = pd.read_csv('../data/landing/new_population.csv')

# Explicit column names: eight geography columns followed by erp_2001..erp_2023.
# Assigning a list of the wrong length raises ValueError, which guards against
# an unexpected file layout.
new_cols = [
    "gccsa_code", "gccsa_name", "sa4_code", "sa4_name", "sa3_code", "sa3_name",
    "sa2_code", "sa2_name",
] + [f"erp_{year}" for year in range(2001, 2024)]

# The first data row is a second header line; it is discarded because the
# column names are assigned explicitly above.
population = population_raw.drop(0).reset_index(drop=True)
population.columns = new_cols

# Only keep data in Victoria: GCCSA codes containing 'vic' or 'mel'
# (case-insensitive). na=False also removes rows with a missing gccsa_code.
is_victoria = population['gccsa_code'].str.contains('vic|mel', case=False, na=False)

# Columns we don't need after filtering
drop_cols = ["gccsa_code", "gccsa_name", "sa4_code", "sa4_name", "sa3_code", "sa3_name"]

population = (
    population
    .loc[is_victoria]
    .drop(columns=drop_cols)
    # Lower-case suburb names
    .assign(sa2_name=lambda frame: frame['sa2_name'].str.lower())
)

In [31]:
# Turn each SA2 name into individual suburb names, one row per suburb.
# The cleaned names are built in a separate Series and attached with .assign,
# so `population` itself is never modified and this cell can be re-run safely
# (writing the split lists back into `population` made a second run operate on
# lists instead of strings).

# Define directional modifiers and the word 'surrounds' to be removed
directional_modifiers = [' - east', ' - west', ' - north', ' - south', ' - central', ' surrounds', ' (north)', ' (south)', ' (east)', ' (west)']
# The alternation is unanchored: a modifier is removed wherever it occurs in the
# name, not only at the end.
# NOTE(review): mapping keys such as 'gladstone parkmeadows' and
# 'southbank wharf' look like products of that mid-name removal - revisit
# sa2_name_mapping before anchoring or changing this pattern.
pattern = '|'.join(re.escape(suffix) for suffix in directional_modifiers)

sa2_name_parts = (
    population['sa2_name']
    .str.replace(pattern, '', regex=True)
    # Split sa2_name where multiple names are separated by hyphens
    .str.split(' - ')
)

# Explode the lists into separate rows; every other column is repeated for each
# name taken from the same source row.
# NOTE(review): a multi-name SA2 therefore carries its full population figures
# into each of its names - confirm this is intended for any later aggregation.
population_exploded = (
    population
    .assign(sa2_name=sa2_name_parts)
    .explode('sa2_name')
    .reset_index(drop=True)
)

# Mapping for the SA2 names to the correct suburbs
sa2_name_mapping = {
    'ballarat' : 'ballarat central',
    'flemington racecourse' : 'flemington',
    'southbank wharf' : 'south wharf',
    'port melbourne industrial' : 'port melbourne',
    'reservoir east' : 'reservoir',
    'reservoir west' : 'reservoir',
    'research warrandyte' : 'warrandyte',
    'essendon airport' : 'essendon',
    'gladstone parkmeadows' : 'gladstone park',
    'craigieburn west' : 'craigieburn',
    'wandin' : 'wandin north',
    'pakenham east' : 'pakenham',
    'pakenham west' : 'pakenham',
    'narre warren west' : 'narre warren',
    'berwick east' : 'berwick',
    'berwick west' : 'berwick',
    'point cook east' : 'point cook',
    'point cook west' : 'point cook',
    'truganina east' : 'truganina',
    'truganina west' : 'truganina',
    'melbourne cbd' : 'melbourne'
}

# Remove the "(vic.)" marker from sa2_name values, then replace whole values
# that match a mapping key with the mapped suburb name
population_exploded['sa2_name'] = (
    population_exploded['sa2_name']
    .str.replace(r'\s*\(vic\.\)', '', regex=True)
    .replace(sa2_name_mapping)
)

In [42]:
# Population columns are the ones whose name contains 'erp'
is_erp_col = population.columns.str.contains('erp')
erp_cols = population.columns[is_erp_col]

# Cast the population columns to int before aggregating
population_exploded[erp_cols] = population_exploded[erp_cols].astype('int')

# Every population column is summed within its group
aggregation_functions = dict.fromkeys(erp_cols, 'sum')

# Group by suburb name and aggregate, giving one row per sa2_name with
# sa2_name restored as an ordinary column
population_grouped = (
    population_exploded
    .groupby('sa2_name')
    .agg(aggregation_functions)
    .reset_index()
)

In [46]:
# Write the per-suburb population table to the curated data folder,
# without the DataFrame index
curated_population_path = '../data/curated/new_population.csv'
population_grouped.to_csv(curated_population_path, index=False)