# Preprocessing range data by region

Import libraries

In [1]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
import os

Read the CSV files from the landing data

In [2]:
# Folder that holds the long-run regional CSV files in the landing layer.
landing_dir = '../../data/landing/region_data/long_run/'

# Attribute name -> CSV file name inside landing_dir.
source_files = {
    'population': 'Estimated_resident_population.csv',
    'population_density': 'Population_density_persons_km2.csv',
    'percentage_working_population': 'Working_age_population_aged_1564_years.csv',
    'internal_arrivals': 'Internal_arrivals.csv',
    'nBusiness': 'Total_number_of_businesses.csv',
    'nHouses': 'Houses__total.csv',
    'nTownhouses': 'Townhouses__total.csv',
    'nApartments': 'Apartments__total.csv',
    'nDwellings': 'Total_dwellings.csv',
    'median_income': 'Median_total_income_excl._Government_pensions_and_allowances_$.csv',
    'nJob': 'Number_of_jobs.csv',
    'percentage_year_12': 'Completed_year_12_or_equivalent_%.csv',
    'nEmployed': 'Employed.csv',
    'nUnEmployed': 'Unemployed.csv',
    'nRented': 'Rented.csv',
    'nHomeless': 'Count_of_homeless_persons.csv',
}

# Load every file into a dataframe, keyed by its attribute name.
dataframes = {
    attribute: pd.read_csv(landing_dir + file_name)
    for attribute, file_name in source_files.items()
}

## Preprocessing

In [3]:
# Clean every loaded dataframe in place.
for frame in dataframes.values():
    # Remove every space character (regex replacement applied to the whole frame).
    frame.replace(' ', '', regex=True, inplace=True)
    # Convert each column after the first to numbers; cells that cannot be
    # parsed become NaN.
    for column in frame.columns[1:]:
        frame[column] = pd.to_numeric(frame[column], errors='coerce')

In [4]:
# Remove the 2023 column from the working-age population table.
working_population = dataframes['percentage_working_population']
dataframes['percentage_working_population'] = working_population.drop(columns=['2023'])

# Remove the 2018-2021 columns from the internal arrivals table.
arrival_years_to_drop = ['2018', '2019', '2020', '2021']
dataframes['internal_arrivals'] = dataframes['internal_arrivals'].drop(columns=arrival_years_to_drop)

In [5]:
# Display the working-age population table to inspect its remaining columns.
dataframes['percentage_working_population']

Unnamed: 0,Region,2018,2019,2020,2021,2022
0,Alfredton,8636,9177,9858.0,10662.0,11402.0
1,Ballarat,7754,7789,7678.0,7525.0,7401.0
2,Buninyong,4721,4712,4566.0,4407.0,4392.0
3,Delacombe,5290,5755,6304.0,6806.0,7619.0
4,SmythesCreek,2627,2692,2703.0,2716.0,2724.0
...,...,...,...,...,...,...
514,Otway,2315,2353,2385.0,2389.0,2352.0
515,Moyne-East,4212,4236,4258.0,4297.0,4350.0
516,Moyne-West,5825,5819,5779.0,5759.0,5787.0
517,Warrnambool-North,13808,13939,13994.0,13892.0,13946.0


In [6]:
# Start the wide table from the region names of the population dataframe.
region_names = dataframes['population']['Region']
range_region_df = region_names.copy()

# Append the non-Region columns of every attribute except 'population',
# prefixing each column label with the attribute name.
for key, df in dataframes.items():
    if key == 'population':
        continue
    df_years_only = df.drop(columns=['Region']).add_prefix(f"{key}_")
    range_region_df = pd.concat([range_region_df, df_years_only], axis=1)

# Destination of the combined table.
csv_file_path = '../../data/raw/region_data/range_region_data.csv'

# Create the target folder if it is missing, then write the table without
# the index column.
output_dir = os.path.dirname(csv_file_path)
os.makedirs(output_dir, exist_ok=True)
range_region_df.to_csv(csv_file_path, index=False)


## Predict each attribute with linear regression

In [7]:
def _fill_negative_predictions(values):
    """Replace negative predictions in place with a neighbouring value.

    For each negative entry the search walks backwards over negative entries;
    if it stops on a non-negative value, that value is copied. Otherwise the
    search walks forwards over negative entries and copies the first
    non-negative value it stops on. Entries are left unchanged when neither
    search finds a non-negative value. NaN never compares below zero, so NaN
    entries are never modified.

    Args:
        values: Mutable sequence of predicted numbers (list or 1-D array).

    Returns:
        The same ``values`` object, after modification.
    """
    for i in range(len(values)):
        if not values[i] < 0:
            continue
        # Look backwards first; earlier entries have already been repaired.
        j = i - 1
        while j >= 0 and values[j] < 0:
            j -= 1
        if j >= 0 and values[j] >= 0:
            values[i] = values[j]
            continue
        # Nothing usable before this entry: look forwards instead.
        k = i + 1
        while k < len(values) and values[k] < 0:
            k += 1
        if k < len(values) and values[k] >= 0:
            values[i] = values[k]
    return values


# Years to project, as a column vector for the regression model.
years_to_predict = np.arange(2001, 2027).reshape(-1, 1)
# Column labels of every projected dataframe, derived from the same years so
# the two cannot drift apart.
new_columns = ['Region'] + [str(year) for year in years_to_predict.flatten()]
predicted_dataframes = {}

# Fit one linear regression per region for each attribute dataframe.
for name, df in dataframes.items():
    # The year labels are identical for every row, so parse them once.
    existing_years = np.array([int(col) for col in df.columns[1:]]).reshape(-1, 1)
    prediction_results = []

    for index, row in df.iterrows():
        # Positional access (.iloc) avoids the deprecated integer-key lookup
        # on a Series that has string labels.
        existing_values = row.iloc[1:].values
        existing_values_numeric = pd.to_numeric(existing_values, errors='coerce').reshape(-1, 1)

        # Keep only the years that have a value.
        non_nan_mask = ~np.isnan(existing_values_numeric).flatten()
        existing_years_filtered = existing_years[non_nan_mask]
        existing_values_filtered = existing_values_numeric[non_nan_mask]

        # A line needs at least two points; otherwise the region gets NaN.
        if len(existing_years_filtered) >= 2:
            model = LinearRegression()
            model.fit(existing_years_filtered, existing_values_filtered)
            predicted_values = model.predict(years_to_predict).flatten()
        else:
            predicted_values = np.full(len(years_to_predict), np.nan)

        # Replace negative predictions with a neighbouring non-negative one.
        _fill_negative_predictions(predicted_values)

        prediction_results.append([row.iloc[0]] + list(predicted_values))

    # Assemble the projected values of this attribute into a dataframe.
    predicted_df = pd.DataFrame(prediction_results, columns=new_columns)
    predicted_dataframes[name] = predicted_df

In [8]:
# Show the first rows of the projected 'nHomeless' table.
predicted_dataframes['nHomeless'].head()

Unnamed: 0,Region,2001,2002,2003,2004,2005,2006,2007,2008,2009,...,2017,2018,2019,2020,2021,2022,2023,2024,2025,2026
0,Alfredton,2.6,2.6,2.6,2.6,2.6,2.6,2.6,2.6,2.6,...,30.6,36.2,41.8,47.4,53.0,58.6,64.2,69.8,75.4,81.0
1,Ballarat,150.5,146.6,142.7,138.8,134.9,131.0,127.1,123.2,119.3,...,88.1,84.2,80.3,76.4,72.5,68.6,64.7,60.8,56.9,53.0
2,Buninyong,,,,,,,,,,...,,,,,,,,,,
3,Delacombe,21.166667,22.266667,23.366667,24.466667,25.566667,26.666667,27.766667,28.866667,29.966667,...,38.766667,39.866667,40.966667,42.066667,43.166667,44.266667,45.366667,46.466667,47.566667,48.666667
4,SmythesCreek,,,,,,,,,,...,,,,,,,,,,


Save as CSV; the first column is the region name

In [9]:
# One row per region, labelled with the region names of the projected
# population table.
region_names = predicted_dataframes['population']['Region']
projection_df = pd.DataFrame({'region_name': region_names})

# Labels of the projected year columns.
year_labels = [str(year) for year in range(2001, 2027)]

# For every attribute, store one {year: value} dictionary per region in a
# column named after the attribute.
for attribute, df in predicted_dataframes.items():
    projection_df[attribute] = [
        {label: row[label] for label in year_labels}
        for _, row in df.iterrows()
    ]

# Destination of the projection table.
csv_file_path = '../../data/raw/region_data/cleaned_range_data_with_projection.csv'

# Create the target folder if it is missing, then write the table without
# the index column.
output_dir = os.path.dirname(csv_file_path)
os.makedirs(output_dir, exist_ok=True)
projection_df.to_csv(csv_file_path, index=False)

# Preview the first rows of the result.
print(projection_df.head())


    region_name                                         population  \
0     Alfredton  {'2001': 1046.4380952380598, '2002': 1046.4380...   
1      Ballarat  {'2001': 14016.247619047645, '2002': 13917.790...   
2     Buninyong  {'2001': 8041.771428571432, '2002': 8005.88571...   
3     Delacombe  {'2001': 443.06666666665114, '2002': 443.06666...   
4  SmythesCreek  {'2001': 3320.028571428571, '2002': 3363.51428...   

                                  population_density  \
0  {'2001': 19.802857142858556, '2002': 19.802857...   
1  {'2001': 1132.236190476191, '2002': 1124.28476...   
2  {'2001': 155.8161904761905, '2002': 155.124761...   
3  {'2001': 12.873333333329356, '2002': 12.873333...   
4  {'2001': 31.665714285714216, '2002': 32.082857...   

                       percentage_working_population  \
0  {'2001': 123.20000000018626, '2002': 123.20000...   
1  {'2001': 9472.399999999994, '2002': 9375.39999...   
2  {'2001': 6389.3000000000175, '2002': 6293.0, '...   
3  {'2001': 74.899

## Calculate the growth rate

In [10]:
def calculate_growth_rates(dataframes):
    """Estimate a linear growth rate per region for every attribute.

    For each dataframe the first column is skipped and the remaining column
    labels are parsed as integer years. For every row a straight line is
    fitted through the non-null values against the years those values belong
    to; the slope of that line is the growth rate.

    Args:
        dataframes: Mapping of attribute name to a dataframe that has a
            'Region' column and whose columns after the first are labelled
            with years.

    Returns:
        A dict mapping attribute name to a dataframe with the columns
        'Region' and 'Growth Rate'. The growth rate is None for rows with
        fewer than two non-null values. Attributes with fewer than two year
        columns are left out and a message is printed for them.
    """
    growth_rates = {}
    for attribute, df in dataframes.items():
        valid_columns = df.columns[1:]
        if len(valid_columns) < 2:
            print(f"Attribute {attribute} does not have enough data.")
            continue

        years = np.array([int(year) for year in valid_columns]).reshape(-1, 1)
        slopes = []
        for row_values in df.iloc[:, 1:].to_numpy():
            # Select years and values with the same mask so that every value
            # stays paired with its own year even when the series has gaps.
            observed = pd.notnull(row_values)
            if observed.sum() < 2:
                slopes.append(None)
                continue
            model = LinearRegression()
            model.fit(
                years[observed],
                row_values[observed].astype(float).reshape(-1, 1),
            )
            slopes.append(model.coef_[0][0])

        growth_rates[attribute] = pd.DataFrame({
            'Region': df['Region'],
            'Growth Rate': slopes
        })

    return growth_rates


In [11]:
# Compute the per-region growth rate of every loaded attribute table.
growth_rates = calculate_growth_rates(dataframes)

In [None]:
# One 'Growth Rate' column per attribute, named after the attribute.
rate_columns = {
    attribute: rates['Growth Rate']
    for attribute, rates in growth_rates.items()
}

# Attach the rate columns to the region names of the population table.
merged_growth_rates = pd.DataFrame(
    {'Region': dataframes['population']['Region']}
).assign(**rate_columns)

# Destination of the merged growth-rate table.
output_csv_path = '../../data/raw/region_data/region_growth_rates.csv'

# Create the target folder if it is missing, then write the table without
# the index column.
output_dir = os.path.dirname(output_csv_path)
os.makedirs(output_dir, exist_ok=True)
merged_growth_rates.to_csv(output_csv_path, index=False)