In [153]:
import pandas as pd

In [154]:
# Load every curated input table in one pass; the unpacking order matches
# the order of the paths below.
data, coast, schools, tourism, airbnb = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../data/curated/NO_DOMAIN_DATASET.csv",
        "../data/curated/vic_beach_proximity.csv",
        "../data/curated/schools_by_SAL.csv",
        "../data/curated/victoria_gdp_tourism.csv",
        "../data/curated/airbnb_counts.csv",
    )
)

In [155]:
# Start the training frame from the base dataset. This is a plain name
# binding (no copy is made), so both names refer to the same DataFrame
# until data_train is reassigned.
data_train = data
# Last expression of the cell: displays (rows, columns) of the starting frame.
data_train.shape


(7358, 63)

In [156]:
# Enrich the base frame with the tourism, Airbnb and school tables.
#   * tourism: inner join on 'year'     -> rows whose year is absent from
#     the tourism table are dropped.
#   * airbnb:  left join on the SAL code -> every row is kept; suburbs with
#     no Airbnb record get NaN in the Airbnb columns.
#   * schools: inner join on 'SAL_CODE' -> rows whose suburb is absent from
#     the schools table are dropped.
# The Airbnb 'counts' column is then renamed to 'airbnb_count' (rename is a
# no-op if no column called 'counts' exists).
data_train = (
    data_train
    .merge(tourism, on='year', how='inner')
    .merge(airbnb, left_on='SAL_CODE', right_on='SAL_CODE21', how='left')
    .merge(schools, on='SAL_CODE', how='inner')
    .rename(columns={'counts': 'airbnb_count'})
)

In [157]:
data_train.shape

(5551, 65)

In [158]:
# SAL codes that must be removed from the training data, as a plain list.
to_exclude = pd.read_csv('../data/landing/filtered_sal_codes.csv')['SAL_CODE'].tolist()


In [159]:
# Remove rows that should not be modelled:
#   * any suburb whose name contains 'Airport',
#   * any suburb whose SAL code is listed in to_exclude,
#   * any suburb whose name contains 'Ravenhall'.
# regex=False makes the name checks literal substring matches, and na=False
# treats a missing suburb name as "no match", so the mask is always strictly
# boolean (a NaN in the mask would make the `~` / boolean indexing fail).
suburb_name = data_train['SAL_suburb']
is_airport = suburb_name.str.contains('Airport', regex=False, na=False)
is_ravenhall = suburb_name.str.contains('Ravenhall', regex=False, na=False)
is_excluded_code = data_train['SAL_CODE'].isin(to_exclude)

data_train = data_train[~(is_airport | is_excluded_code | is_ravenhall)]

# Number of remaining rows per year, ordered by year.
year_counts = data_train['year'].value_counts().sort_index()


In [160]:
# Columns kept for modelling. The order here becomes the column order of
# data_train, so it is preserved exactly.
cols = [
    'distance_to_CBD',
    'time_to_CBD',
    'distance_to_station',
    'time_to_station',
    'year',
    'A Crimes against the person',
    'Average_household_size',
    'Average_num_psns_per_bedroom',
    'B Property and deception offences',
    'C Drug offences',
    'D Public order and security offences',
    'E Justice procedures offences',
    'F Other offences',
    'Median_age_persons',
    'Median_tot_fam_inc_weekly',
    'Median_tot_hhd_inc_weekly',
    'Median_tot_prsnl_inc_weekly',
    'Tot_P_P',
    'average_quarterly_count',
    'average_weekly_rent',
    'commercial',
    'education',
    'food_establishments',
    'healthcare',
    'industrial',
    'public_transport',
    'recreation',
    'residential',
    'shopping',
    'inflation',
    'SAL_CODE',
    'median_score',
    'average_price',
    'proximity_to_beach',
    'gdp_impact',
    'airbnb_count',
]

# Selecting with a list raises KeyError if any listed column is missing.
# NOTE(review): the recorded run of this cell failed because 'gdp_impact'
# and 'airbnb_count' were not present -- confirm the column names produced
# by the tourism and Airbnb merges before relying on this selection.
data_train = data_train[cols]



KeyError: "['gdp_impact', 'airbnb_count'] not in index"

In [138]:
import geopandas as gpd

In [139]:
# Count columns that are converted into per-square-kilometre densities.
div_area = ['commercial', 'education', 'food_establishments','healthcare', 'industrial', 'public_transport', 'recreation','residential', 'shopping']

# SAL boundary shapefile: only the suburb code and its area (sq km) are used.
sal_boundaries = gpd.read_file('../data/landing/SAL_data/SAL_2021_AUST_GDA2020.shp')
sal_boundaries = sal_boundaries[['SAL_CODE21','AREASQKM21']]

# Keep only rows whose code is purely numeric so it can be cast to int and
# joined against SAL_CODE. The explicit .copy() makes the filtered frame
# independent of the original, so the column assignment below is not a
# chained assignment on a slice (which triggers SettingWithCopyWarning).
has_numeric_code = sal_boundaries['SAL_CODE21'].map(lambda code: str(code).isdigit())
sal_boundaries = sal_boundaries[has_numeric_code].copy()
sal_boundaries['SAL_CODE21'] = sal_boundaries['SAL_CODE21'].astype(int)

# Inner join: rows whose SAL_CODE has no boundary record are dropped.
data_train = pd.merge(data_train, sal_boundaries, left_on='SAL_CODE', right_on='SAL_CODE21', how='inner')

# Append '<name> density' for each count column, then the population density.
# NOTE(review): an area of 0 would produce inf/NaN here -- confirm that
# AREASQKM21 is always positive for the joined suburbs.
area_sqkm = data_train['AREASQKM21']
for col in div_area:
    data_train[col + ' density'] = data_train[col] / area_sqkm

data_train['pop density'] = data_train['Tot_P_P'] / area_sqkm

# Drop the raw counts and the join helpers in one step instead of mutating
# the frame in place once per column.
data_train = data_train.drop(columns=div_area + ['AREASQKM21','SAL_CODE21'])

In [140]:
# Interest rate (%) per year. Keeping year and rate side by side in one
# mapping guarantees the two derived lists below always line up.
rate_by_year = {
    2016: 1.75,
    2017: 1.50,
    2018: 1.50,
    2019: 1.50,
    2020: 0.25,  # COVID-19 impact, very low rates
    2021: 0.10,  # near-zero rates
    2022: 0.35,  # start of increase
    2023: 3.10,  # rise due to inflation concerns
    2024: 4.00,  # projection
    2025: 3.75,  # projection
    2026: 3.50,  # projection
    2027: 3.25,  # projection
    2028: 3.00,  # projection
    2029: 2.75,  # projection
}
years_interest = list(rate_by_year)
interest_rates = list(rate_by_year.values())

# Presentation-style table of the same values.
interest_rate_table = pd.DataFrame({
    "Year": years_interest,
    "Projected Interest Rate (%)": interest_rates
})

# Join-ready table keyed on 'year'. Despite its name, gdp_df holds the
# interest rate, not a GDP figure.
gdp_df = pd.DataFrame({
    'year': years_interest,
    'interest_rate': interest_rates
})

# Attach the interest rate to every row by year; the left join keeps all rows.
data_train = data_train.merge(gdp_df, on='year', how='left')

In [141]:
import pandas as pd

# Replace the yearly inflation rate in 'inflation' with the cumulative
# inflation factor compounded from 2016 up to each row's year.

# sort_values returns a new frame, so data_train itself is not modified.
df = data_train.sort_values('year')

# 'inflation' is stored as a percentage; convert it to a fraction.
df['inflation_decimal'] = df['inflation'] / 100

# One row per year from 2016 up to the latest year present in the data.
years = pd.DataFrame({'year': range(2016, df['year'].max() + 1)})

# Attach the distinct (year, rate) pairs to the full list of years.
# NOTE(review): assumes a single inflation value per year -- if a year
# carried two different values it would appear twice and be compounded twice.
inflation_data = pd.merge(years, df[['year', 'inflation_decimal']].drop_duplicates(), on='year', how='left')

# Carry the previous year's rate forward for any year without a value.
# Series.ffill() replaces the deprecated fillna(method='ffill').
inflation_data['inflation_decimal'] = inflation_data['inflation_decimal'].ffill()

# Running product of (1 + rate): the price level relative to the start of 2016.
inflation_data['cumulative_inflation_factor'] = (1 + inflation_data['inflation_decimal']).cumprod()

# year -> cumulative factor lookup.
cumulative_inflation_dict = inflation_data.set_index('year')['cumulative_inflation_factor'].to_dict()

# Overwrite 'inflation' with the cumulative factor for each row's year and
# remove the helper column.
df['inflation'] = df['year'].map(cumulative_inflation_dict)
df = df.drop(columns=['inflation_decimal'])


In [142]:
import numpy as np

In [143]:
import matplotlib.pyplot as plt
import seaborn as sns

In [144]:
# Per-year summary: add each year's interest rate (held in gdp_df) to the
# inflation table, then display it. The left join keeps every year already
# in inflation_data.
inflation_data = inflation_data.merge(gdp_df, on='year', how='left')
inflation_data

Unnamed: 0,year,inflation_decimal,cumulative_inflation_factor,interest_rate
0,2016,0.013,1.013,1.75
1,2017,0.021,1.034273,1.5
2,2018,0.019,1.053924,1.5
3,2019,0.013,1.067625,1.5
4,2020,0.022,1.091113,0.25
5,2021,0.011,1.103115,0.1
6,2022,0.051,1.159374,0.35
7,2023,0.07,1.24053,3.1
8,2024,0.036,1.285189,4.0
9,2025,0.03,1.323745,3.75


In [145]:
# Monetary columns that are deflated by the value in 'inflation'.
inflation_affected_cols = ['average_weekly_rent', 'Median_tot_fam_inc_weekly', 'Median_tot_hhd_inc_weekly', 'Median_tot_prsnl_inc_weekly', 'gdp_impact']

# Append '<name>/inflation' for each monetary column ...
for money_col in inflation_affected_cols:
    df[money_col + '/inflation'] = df[money_col] / df['inflation']

# ... then remove the unadjusted originals in a single step.
df = df.drop(columns=inflation_affected_cols)

In [146]:

# Interaction features: log of the inflation-adjusted GDP impact multiplied
# by a location/tourism column. The log is computed once and reused.
log_gdp_impact = np.log(df['gdp_impact/inflation'])

# new column name -> column it is multiplied with.
# NOTE(review): 'gpd' in the last name looks like a typo for 'gdp'; it is
# kept unchanged because later consumers may rely on the exact name.
interaction_sources = {
    'gdp_cbd/inflation/beach': 'proximity_to_beach',
    'gdp_cbd/inflation/cbd': 'distance_to_CBD',
    'gpd/inflation/airbnb': 'airbnb_count',
}
for new_col, source_col in interaction_sources.items():
    df[new_col] = log_gdp_impact * df[source_col]

# The adjusted GDP column is only needed to build the interactions above.
del df['gdp_impact/inflation']

In [147]:
# The inflation column itself is not kept as a feature; remove it in place.
del df['inflation']

In [148]:
# Convert inflation-adjusted weekly rent into rent per household member.
# pop() removes the source column from df and returns it in one step.
# The 0.01 offset is presumably there to avoid dividing by a zero household
# size -- confirm.
adjusted_rent = df.pop('average_weekly_rent/inflation')
df['average_weekly_rent/inflation/household_size'] = adjusted_rent / (df['Average_household_size'] + 0.01)

In [149]:
import numpy as np

In [150]:
# Offence-count columns that are rescaled to a per-person rate.
crime_cols = ['A Crimes against the person', 'B Property and deception offences', 'C Drug offences', 'D Public order and security offences', 'E Justice procedures offences', 'F Other offences']

# Population with a 0.01 offset (presumably to avoid division by zero for
# an empty suburb -- confirm); computed once because it does not change
# inside the loop.
population = df['Tot_P_P'] + 0.01

# Replace each count column with '<name>/per_person'; pop() removes the raw
# count from df as it is read.
for offence_col in crime_cols:
    df[offence_col + '/per_person'] = df.pop(offence_col) / population

In [151]:
# Apply log(1 + x) to every column except the rent column and the
# SAL_CODE / year / interest_rate columns, which are left as they are.
# np.log1p works on whole columns at once (no per-element Python lambda)
# and is more accurate than log(x + 1) for values close to zero.
untransformed_cols = ['average_weekly_rent/inflation/household_size', 'SAL_CODE', 'year', 'interest_rate']
log_cols = [col for col in df.columns if col not in untransformed_cols]
df[log_cols] = np.log1p(df[log_cols])

In [152]:
# Persist the processed feature table; index=False leaves the row index
# out of the written file.
df.to_csv('../data/curated/Processed Data Final.csv', index=False)