In [209]:
import regex as re
import pandas as pd
import warnings
warnings.filterwarnings('ignore')

In [210]:
# Load the raw population extract. Row 0 of the parsed frame is the second
# line of the source file's two-line header, not data, so it is dropped.
population = (
    pd.read_csv('../data/landing/new_population.csv')
    .drop(0)
    .reset_index(drop=True)
)

# Replace the source header with flat snake_case names. The list is applied
# positionally, so it must contain exactly one entry per column in the file.
new_cols = [
    "gccsa_code", "gcsa_name", "sa4_code", "sa4_name", "sa3_code", "sa3_name",
    "sa2_code", "sa2_name", "erp_2001", "erp_2002", "erp_2003", "erp_2004",
    "erp_2005", "erp_2006", "erp_2007", "erp_2008", "erp_2009", "erp_2010",
    "erp_2011", "erp_2012", "erp_2013", "erp_2014", "erp_2015", "erp_2016",
    "erp_2017", "erp_2018", "erp_2019", "erp_2020", "erp_2021", "erp_2022",
    "erp_2023"
]
population.columns = new_cols

# Higher-level geography columns that are not needed after filtering
drop_cols = ["gccsa_code", "gcsa_name", "sa4_code", "sa4_name", "sa3_code", "sa3_name"]

population = (
    population
    # Discard rows with a missing gccsa_code (they would also break the
    # string match on the next step).
    .loc[lambda df: df['gccsa_code'].notna()]
    # Keep Victoria only: codes containing "vic" or "mel", case-insensitive.
    .loc[lambda df: df['gccsa_code'].str.contains('vic|mel', case=False)]
    .drop(columns=drop_cols)
    # assign() returns a new frame, so lower-casing the suburb name never
    # writes into a filtered slice of the original frame.
    .assign(sa2_name=lambda df: df['sa2_name'].str.lower())
)

In [211]:
# Directional modifiers (and the word 'surrounds') to strip from SA2 names.
# NOTE(review): the pattern is unanchored, so a modifier that is only the start
# of the next word is stripped too (' - west' also matches inside
# ' - westmeadows'). Keys such as 'gladstone parkmeadows' in the mapping below
# appear to compensate for exactly that; confirm before changing the pattern.
directional_modifiers = [' - east', ' - west', ' - north', ' - south', ' - central', ' surrounds', ' (north)', ' (south)', ' (east)', ' (west)']
pattern = '|'.join(re.escape(suffix) for suffix in directional_modifiers)

# Strip the modifiers, then split names that combine several suburbs with
# ' - ' into a list of names. This is computed as a separate Series so that
# `population` itself is left untouched and the cell can be re-run safely.
split_names = (
    population['sa2_name']
    .str.replace(pattern, '', regex=True)
    .str.split(' - ')
)

# One row per individual suburb name; the ERP values of a combined SA2 are
# repeated on each of its rows.
population_exploded = (
    population
    .assign(sa2_name=split_names)
    .explode('sa2_name')
    .reset_index(drop=True)
)

# Mapping for the SA2 names to the correct suburbs
sa2_name_mapping = {
    'ballarat': 'ballarat central',
    'flemington racecourse': 'flemington',
    'southbank wharf': 'south wharf',
    'port melbourne industrial': 'port melbourne',
    'reservoir east': 'reservoir',
    'reservoir west': 'reservoir',
    'research warrandyte': 'warrandyte',
    'essendon airport': 'essendon',
    'gladstone parkmeadows': 'gladstone park',
    'craigieburn west': 'craigieburn',
    'wandin': 'wandin north',
    'pakenham east': 'pakenham',
    'pakenham west': 'pakenham',
    'narre warren west': 'narre warren',
    'berwick east': 'berwick',
    'berwick west': 'berwick',
    'point cook east': 'point cook',
    'point cook west': 'point cook',
    'truganina east': 'truganina',
    'truganina west': 'truganina',
    'melbourne cbd': 'melbourne'
}

# Remove the "(vic.)" qualifier first, then apply the exact-match renames.
population_exploded['sa2_name'] = (
    population_exploded['sa2_name']
    .str.replace(r'\s*\(vic\.\)', '', regex=True)
    .replace(sa2_name_mapping)
)

In [212]:
# ERP year columns are the ones whose name contains 'erp'.
erp_cols = population.columns[['erp' in name for name in population.columns]]

# Cast every ERP column to integer so the values can be summed.
population_exploded = population_exploded.astype({name: 'int' for name in erp_cols})

# Sum each ERP column over all rows that share a suburb name.
aggregation_functions = dict.fromkeys(erp_cols, 'sum')
population_grouped = (
    population_exploded
    .groupby('sa2_name')
    .agg(aggregation_functions)
    .reset_index()
)

# Extrapolation

In [213]:
# Relabel the columns positionally: the suburb name followed by the bare year
# strings "2001" through "2023".
new_cols = ["sa2_name"] + [str(year) for year in range(2001, 2024)]

population_grouped.columns = new_cols

In [214]:
# Training window: the columns at positions 1-23 (years 2001 to 2023), i.e.
# everything after the leading sa2_name column.
train_data = population_grouped.loc[:, population_grouped.columns[1:24]]

In [215]:
# numpy was used below without ever being imported, which raised NameError on
# a fresh kernel; import it next to the other cell-local dependency.
import numpy as np
from statsmodels.tsa.arima.model import ARIMA

# Fit one ARIMA(1, 1, 1) model per row of train_data (one row per suburb) and
# collect a four-step-ahead forecast for each.
predictions_arima = []

for _, yearly_erp in train_data.iterrows():
    model_fit = ARIMA(yearly_erp.values, order=(1, 1, 1)).fit()
    predictions_arima.append(model_fit.forecast(steps=4))

# Stack the per-row forecasts: one row per suburb, one column per forecast step.
predictions_arima = np.array(predictions_arima)

In [217]:
# Label the four forecast steps with integer years 2024-2027, then append them
# to the right of the historic table (rows are aligned on the index).
predictions_df = pd.DataFrame(
    data=predictions_arima,
    columns=list(range(2024, 2028)),
)
final_population = pd.concat(
    objs=[population_grouped, predictions_df],
    axis='columns',
)

In [218]:
# Only the forecast columns carry integer labels (the historic year columns
# are labelled with strings), so this selects the forecast years.
numbered_columns = [col for col in final_population.columns if isinstance(col, int)]

# Round to the nearest whole person before casting: a bare astype(int)
# truncates toward zero and would bias every forecast downwards.
final_population[numbered_columns] = (
    final_population[numbered_columns].round().astype(int)
)

# Drop 2001 to 2015 columns
columns_to_drop = [str(year) for year in range(2001, 2016)]
final_population = final_population.drop(columns=columns_to_drop)

In [220]:
# Reshape from wide (one column per year) to long (one row per suburb-year).
melted_df = final_population.melt(id_vars=['sa2_name'],
                                  var_name='year',
                                  value_name='erp')

# Year labels are a mix of strings (historic) and ints (forecast); convert
# them all to int for consistency.
melted_df['year'] = melted_df['year'].astype(int)

# Sort by suburb and year. sort_values returns a new frame, so the result must
# be assigned back; previously it was discarded and the CSV was written unsorted.
melted_df = melted_df.sort_values(by=['sa2_name', 'year']).reset_index(drop=True)

# Save as csv
melted_df.to_csv('../data/curated/final_population.csv', index=False)