In [3]:
import os
import pandas as pd

# Input CSV locations (past and forecast population tables) and the
# directory that the merged output is written to by the next cell.
population_past = '../../data/raw/ABS_population/past_population_data_filtered.csv'
population_forecast = '../../data/raw/ABS_population/population_forecast_2024_2027.csv'
output_dir = '../../data/raw/merge_past_forcasting_data/'

# Load each CSV into its own DataFrame.
df_past = pd.read_csv(population_past)
df_forecast = pd.read_csv(population_forecast)

# List the column names of both tables, heading first and the list on the
# following line, so the two layouts can be compared before merging.
print("Columns in the first CSV file:", df_past.columns.tolist(), sep="\n")
print("\nColumns in the second CSV file:", df_forecast.columns.tolist(), sep="\n")

Columns in the first CSV file:
['SA2 code', 'population 2001', 'population 2002', 'population 2003', 'population 2004', 'population 2005', 'population 2006', 'population 2007', 'population 2008', 'population 2009', 'population 2010', 'population 2011', 'population 2012', 'population 2013', 'population 2014', 'population 2015', 'population 2016', 'population 2017', 'population 2018', 'population 2019', 'population 2020', 'population 2021', 'population 2022', 'population 2023']

Columns in the second CSV file:
['area code', 'population 2024', 'population 2025', 'population 2026', 'population 2027']


In [4]:
def _melt_population(wide_df, year_columns):
    """Reshape a wide population table into long (SA2 code, Year, Population) rows.

    Parameters
    ----------
    wide_df : pandas.DataFrame
        Table with an 'SA2 code' column plus one column per year, each named
        so that it contains the year as digits (e.g. 'population 2015').
    year_columns : list of str
        The per-year columns of ``wide_df`` to unpivot.

    Returns
    -------
    pandas.DataFrame
        Columns 'SA2 code', 'Year' (int) and 'Population'.
    """
    long_df = wide_df.melt(id_vars=['SA2 code'],
                           value_vars=year_columns,
                           var_name='Year', value_name='Population')
    # Raw string: '\d' in a normal string literal is an invalid escape sequence
    # (a warning today, slated to become an error). expand=False returns a
    # Series rather than a one-column DataFrame.
    long_df['Year'] = long_df['Year'].str.extract(r'(\d+)', expand=False).astype(int)
    return long_df


# Re-read the CSV files so this cell can be re-run on its own: df_past and
# df_forecast are narrowed/renamed below.
df_past = pd.read_csv(population_past)
df_forecast = pd.read_csv(population_forecast)

# Keep only the past population columns for the years 2015 to 2023
years_to_keep = [f'population {year}' for year in range(2015, 2024)]
df_past = df_past[['SA2 code'] + years_to_keep]

# Rename 'area code' in the forecast data to 'SA2 code' for consistency
# (plain reassignment instead of inplace=True)
df_forecast = df_forecast.rename(columns={'area code': 'SA2 code'})

# Reshape both tables to long format with an integer Year column
df_past_melted = _melt_population(df_past, years_to_keep)
df_forecast_melted = _melt_population(
    df_forecast,
    [col for col in df_forecast.columns if 'population' in col],
)

# Stack past and forecast rows, order them by 'Year' then 'SA2 code', and
# put 'Year' first in the column order
merged_df = pd.concat([df_past_melted, df_forecast_melted], ignore_index=True)
merged_df = merged_df.sort_values(by=['Year', 'SA2 code']).reset_index(drop=True)
merged_df = merged_df[['Year', 'SA2 code', 'Population']]

# Create the output directory if needed; exist_ok avoids the
# check-then-create race of a separate os.path.exists() test
os.makedirs(output_dir, exist_ok=True)

# Save the merged DataFrame to a new CSV file
output_file = os.path.join(output_dir, 'merged_population_data.csv')
merged_df.to_csv(output_file, index=False)

print("Merged CSV file created successfully!")

Merged CSV file created successfully!
