# Joining Features

In [1]:
# Data handling (pandas) and JSON serialisation used by the cells below.
import pandas as pd
import json

# tqdm.pandas() registers tqdm's progress-bar variants of apply/map on pandas objects.
from tqdm.auto import tqdm
tqdm.pandas()

# Add the parent directory to sys.path so the project-local `scripts` package
# can be imported relative to where this notebook lives.
import sys, os
sys.path.append(os.path.abspath('../'))
from scripts.utils import create_dir
# Wall-clock timestamp taken when this cell runs.
import time
start_time = time.time()

## Preparing the Historical Dataset (2017-2023)

In [None]:
# Load the cleaned historical rental listings and shape them for joining.
# Written as one non-mutating chain (no `inplace=True`, no column assignment on
# a derived frame), so re-running the cell always starts again from the CSV.
hist_columns = [
    'address', 'suburb', 'bed', 'bath', 'car', 'type', 'year', 'sa2_code',
    'median_income', 'population', 'cpi', 'unemployment_rate',
    'lat', 'lng', 'geometry', 'rented_price',
]
rental_hist_df = (
    pd.read_csv('../data/curated/rental_data_cleaned.csv')
    .rename(columns={'sa2_code_left': 'sa2_code'})
    # keep only the modelling columns (raises KeyError if one is absent)
    .loc[:, hist_columns]
    .assign(
        # strip leading whitespace only; trailing characters are left untouched
        suburb=lambda frame: frame['suburb'].str.lstrip(),
        # convert via nullable Int64 before casting to string
        # NOTE(review): a missing code does not stay null after the string
        # cast (pandas 2.x renders it as the text '<NA>') - confirm the
        # cleaned file has no missing SA2 codes.
        sa2_code=lambda frame: frame['sa2_code'].astype('Int64').astype(str),
    )
    # keep houses and units/apartments only
    .loc[lambda frame: frame['type'].isin(['House', 'Unit/apmt'])]
)
rental_hist_df

In [None]:
# Proximity table for the historical listings; joined to the rentals on
# ('lat', 'lng') further down.
proximity_hist_path = '../data/raw/proximity/time_city.csv'
proximity_hist_df = pd.read_csv(proximity_hist_path)
proximity_hist_df

In [None]:
# Property price table keyed by suburb and year. `price_df` keeps every year
# because the 2024 slice is taken from it later in the notebook.
price_df = pd.read_csv('../data/raw/property-buy-sell/oldlistings_buy_avg.csv')
# Historical window: 2017 through 2023, both end years included.
in_hist_window = price_df['year'].between(2017, 2023)
price_hist_df = price_df[in_hist_window]
price_hist_df

Join the historical rental, proximity, and property-price tables

In [None]:
# Attach the proximity columns by coordinate, then the property prices by
# suburb and year. Both joins are inner joins, so listings without a match in
# either table are dropped.
# NOTE(review): the coordinate join needs exactly equal lat/lng values on both
# sides - confirm the two files carry identical coordinates.
hist_df = (
    rental_hist_df
    .merge(proximity_hist_df, on=['lat', 'lng'], how='inner')
    .merge(price_hist_df, on=['suburb', 'year'], how='inner')
)
hist_df

## Preparing the Current Dataset (2024)

In [None]:
# Load the 2024 rental listings and bring them to the same column layout as
# the historical frame.
curr_columns = [
    'address', 'suburb', 'bed', 'bath', 'car', 'type', 'year', 'sa2_code',
    'median_income', 'population', 'cpi', 'unemployment_rate',
    'lat', 'lng', 'geometry', 'rented_price',
]
rental_curr_df = (
    pd.read_csv('../data/raw/rental/rental_data_2024.csv')
    # align the raw column names with the historical dataset
    .rename(columns={
        'cpi_x': 'cpi',
        'Statistical Areas Level 2 2021 name': 'sa2_name',
    })
    .loc[:, curr_columns]
    .assign(
        # strip leading whitespace only
        suburb=lambda frame: frame['suburb'].str.lstrip(),
        # SA2 code as a string, converted via nullable Int64 first
        sa2_code=lambda frame: frame['sa2_code'].astype('Int64').astype(str),
    )
    # keep houses and units/apartments only
    .loc[lambda frame: frame['type'].isin(['House', 'Unit/apmt'])]
)
rental_curr_df

In [None]:
# Proximity table for the 2024 listings; joined to the rentals on
# ('lat', 'lng') below.
proximity_curr_path = '../data/raw/proximity/time_city_2024.csv'
proximity_curr_df = pd.read_csv(proximity_curr_path)
proximity_curr_df

In [None]:
# Property prices for the current year (2024) only.
is_current_year = price_df['year'] == 2024
price_curr_df = price_df.loc[is_current_year]
price_curr_df

In [None]:
# Same two inner joins as for the historical data: proximity by coordinate,
# then property prices by suburb and year. Unmatched listings are dropped.
curr_df = (
    rental_curr_df
    .merge(proximity_curr_df, on=['lat', 'lng'], how='inner')
    .merge(price_curr_df, on=['suburb', 'year'], how='inner')
)
curr_df

In [None]:
# Stack the historical and current frames row-wise; ignore_index gives the
# combined frame a fresh 0..n-1 index.
yearly_frames = [hist_df, curr_df]
df = pd.concat(yearly_frames, ignore_index=True)
df

## Preparing the Future Dataset (2029)

In [None]:
# Build a {suburb: sa2_code} lookup from the combined data.
# sa2_code is created with .astype('Int64').astype(str), which can turn a
# missing code into the literal text '<NA>'. Such placeholder (or still-null)
# codes are excluded so the lookup - and the JSON written from it - never maps
# a suburb to a fake code.
suburb_sa2_pairs = df[['suburb', 'sa2_code']].drop_duplicates()
has_real_code = (
    suburb_sa2_pairs['sa2_code'].notna()
    & suburb_sa2_pairs['sa2_code'].ne('<NA>')
)
# NOTE(review): if one suburb appears with several SA2 codes, the last pair in
# row order wins - confirm that is acceptable.
suburb_sa2_dict = (
    suburb_sa2_pairs[has_real_code]
    .set_index('suburb')['sa2_code']
    .to_dict()
)
suburb_sa2_dict

In [None]:
# Distinct property types present in the combined data.
df.loc[:, 'type'].unique()

In [None]:
# Keep the suburb/type/SA2 identifiers plus the listing-level columns that are
# summarised per suburb in the next step.
future_columns = [
    'suburb', 'type', 'sa2_code', 'bed', 'bath', 'car', 'time_city',
    'avg_property_price', 'rented_price',
]
future_df = df.loc[:, future_columns]
future_df

In [None]:
# One row per (suburb, type, sa2_code); every other column is replaced by its
# mean within the group.
group_keys = ['suburb', 'type', 'sa2_code']
averaged_columns = [
    'bed', 'bath', 'car', 'time_city', 'avg_property_price', 'rented_price',
]
future_df = (
    future_df
    .groupby(group_keys)
    .agg({column: 'mean' for column in averaged_columns})
    .reset_index()
)
future_df

In [None]:
# import future predictions data

# CPI: a single scalar taken from the 2029 row of the predictions file
future_cpi = pd.read_csv('../data/raw/predictions/cpi_data.csv')
cpi_is_2029 = future_cpi['year'] == 2029
# .iloc[0] takes the first matching row (raises IndexError if there is none)
future_cpi_2029 = float(
    future_cpi.loc[cpi_is_2029, 'Annual change (%)'].iloc[0]
)

# Unemployment rate: a single scalar taken from the 2029 row
future_unemployment = pd.read_csv('../data/raw/predictions/yearly_avg_unemployment.csv')
unemployment_is_2029 = future_unemployment['year'] == 2029
# .iloc[0] takes the first matching row (raises IndexError if there is none)
future_unemployment_2029 = float(
    future_unemployment
    .loc[unemployment_is_2029, 'Yearly Average Seasonally Adjusted (%)']
    .iloc[0]
)

# Median income: 2029 rows only, keyed by SA2 code
future_income = pd.read_csv('../data/raw/predictions/predicted_median_income.csv')
future_income_2029_df = (
    future_income
    .loc[future_income['year'] == 2029, ['sa2_code', 'median_income']]
    # string key so it can be matched against the string sa2_code in future_df
    .assign(sa2_code=lambda frame: frame['sa2_code'].astype(str))
)

# Population: 2029 rows only, with the source headers shortened to the names
# used elsewhere in this notebook
future_population = pd.read_csv('../data/raw/predictions/population_prediction.csv')
future_population_2029_df = (
    future_population
    .loc[
        future_population['year'] == 2029,
        ['Statistical Areas Level 2 2021 code', 'Projected persons (total)'],
    ]
    .rename(columns={
        'Statistical Areas Level 2 2021 code': 'sa2_code',
        'Projected persons (total)': 'population',
    })
    # string key so it can be matched against the string sa2_code in future_df
    .assign(sa2_code=lambda frame: frame['sa2_code'].astype(str))
)

# Combine the per-suburb averages with the 2029 projections: income and
# population are matched per SA2 code (inner joins, so SA2 codes missing from
# either projection are dropped); CPI and unemployment are the same scalar on
# every row.
future_df_merge = (
    future_df
    .merge(future_income_2029_df, on='sa2_code', how='inner')
    .merge(future_population_2029_df, on='sa2_code', how='inner')
    .assign(
        cpi=future_cpi_2029,
        unemployment_rate=future_unemployment_2029,
    )
)
future_df_merge

## Output to CSVs

In [None]:
# Nothing is written to disk unless this switch is set to True.
output = False
if output:
    create_dir('../data/curated')
    # Export of the combined 2017-2024 frame is currently disabled:
    # df.to_csv('../data/curated/rental-17-24.csv', index=False)
    future_df_merge.to_csv('../data/curated/suburb-rental-29.csv', index=False)
    # Persist the suburb -> SA2 code lookup alongside the CSV.
    with open('../data/curated/suburb_to_sa2.json', 'w') as json_file:
        json.dump(suburb_sa2_dict, json_file)