## This notebook merges the values of external features into the dataset

Time periods covered by the collected data:

- income per person: 2006 - 2019
- population density: 2001 - 2021
- GDP: 1960 - 2020
- saving rate: 1970 - 2020
- crime cases: 2013 - 2022

Since we plan to train models for 2013 - 2022, any years for which an external feature has no observed value (e.g. 2020 - 2022 for income per person) are filled with predicted values of that feature.

In [22]:
import pandas as pd
import os
import numpy as np

In [23]:
# Output directory for the per-year merged datasets.
new_path = '../../data/curated/merged_dataset/'

# exist_ok=True makes the cell safe to re-run and avoids the
# check-then-create race of a separate os.path.exists() test.
os.makedirs(new_path, exist_ok=True)

## Merge GDP and saving rate, and collapse the monthly records so every attribute is a yearly value

In [24]:
# Read the observed GDP / saving-rate table and its predicted counterpart
# (the file name suggests the predictions cover 2021-2027).
gdp_sr, pred_gdp_sr = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../../data/curated/gdp_with_saving_rate.csv',
        '../../data/curated/feature_prediction/21_27_gdp_with_saving_predicted.csv',
    )
)

In [25]:
year = [2013, 2014, 2015, 2016, 2017, 2018, 2019, 2020, 2021, 2022]

# Years whose GDP / saving rate are taken from the predicted table
# instead of the observed one.
predicted_gdp_years = [2021, 2022]

# Distance features, listed in the order they appear in the output file.
distance_cols = [
    'min_distance_to_cbd', 'min_distance_to_park', 'min_distance_to_prim', 'min_distance_to_second',
    'min_distance_to_train', 'min_distance_to_hosp', 'min_distance_to_poli', 'min_distance_to_shop',
]

# Columns that identify one property listing within a year.
property_keys = ['year', 'address', 'latitude', 'longitude', 'postcode',
                 'sa2_2021', 'sa2_2016', 'residence_type', 'nbed', 'nbath', 'ncar']

# Yearly aggregation: keep the first distance value, average the weekly rent.
yearly_agg = {col: 'first' for col in distance_cols}
yearly_agg['weekly_rent'] = 'mean'

# The merged columns reuse the names of the 3rd and 4th columns of gdp_sr,
# for both the observed and the predicted years.
gdp_col, sr_col = gdp_sr.columns[2], gdp_sr.columns[3]

for i in year:
    data = pd.read_csv(f'../../data/curated/min_distance_sa2_organised/{i}_property_with_SA2.csv', index_col=[0])

    # fill up missing distance data with the maximum distance observed in that column
    data = data.fillna({col: np.nanmax(data[col].values) for col in distance_cols})

    # drop months to get values of all attributes based on year
    # NOTE(review): groupby's default dropna=True discards rows with a missing value
    # in any key column (e.g. ncar or sa2_2016) — confirm this is intended.
    data = data.groupby(property_keys, as_index=False).agg(yearly_agg)

    # look up this year's GDP and saving rate: predicted for 2021/2022, observed otherwise
    if i in predicted_gdp_years:
        gdp = pred_gdp_sr.loc[pred_gdp_sr['year'] == i, 'gdp'].iloc[0]
        sr = pred_gdp_sr.loc[pred_gdp_sr['year'] == i, 'saving'].iloc[0]
    else:
        gdp = gdp_sr.loc[gdp_sr['year'] == i, 'gdp(USD Millioins)'].iloc[0]
        sr = gdp_sr.loc[gdp_sr['year'] == i, 'saving_rate(% of GDP)'].iloc[0]

    # the same scalar is broadcast to every row of the year
    data[gdp_col] = gdp
    data[sr_col] = sr

    # output csv file of merged dataset
    data.to_csv(f'../../data/curated/merged_dataset/{i}_merged_data.csv')



## Merge income per person for each sa2

In [26]:
# Observed income per person per SA2; its 'SA2' column is renamed to 'sa2_2016'
# so it shares a key name with the merged dataset.
income = (
    pd.read_csv('../../data/curated/income_per_person_sa2.csv', index_col=[0])
    .rename(columns={'SA2': 'sa2_2016'})
)

# Predicted income per person per SA2 (the file name suggests 2020-2027).
pred_income = pd.read_csv(
    '../../data/curated/feature_prediction/20_27_income_per_person_2016sa2.csv',
    index_col=[0],
)

In [27]:
year = [2013, 2014, 2015, 2016, 2017, 2018, 2019, 2020, 2021, 2022]

# Years with no observed income; the predicted values are used instead.
predicted_income_years = [2020, 2021, 2022]

for i in year:
    data = pd.read_csv(f'../../data/curated/merged_dataset/{i}_merged_data.csv', index_col=[0])

    if i not in predicted_income_years:
        # observed income: one column per year, named by the year as a string
        value_col = str(i)
        income_data = income.loc[:, ['sa2_2016', value_col]]
    else:
        # predicted income: long format, one row per (Year, sa2_2016)
        value_col = 'income_per_person_sa2'
        income_data = pred_income.loc[pred_income['Year'] == i, ['sa2_2016', value_col]]

    # validate='many_to_one' raises a MergeError if an SA2 appears more than once in
    # the lookup; without it a duplicated key would add rows to the merge result and
    # the values would silently be assigned to the wrong properties.
    merged = data.merge(income_data, on=['sa2_2016'], how='left', validate='many_to_one')

    # a left merge keeps the row order of `data`, so assign by position
    data['income_per_person'] = merged[value_col].to_numpy()

    # output csv file of merged dataset
    data.to_csv(f'../../data/curated/merged_dataset/{i}_merged_data.csv')


## Merge population density for each sa2

In [28]:
# Both population tables name their key column 'SA2 code'; rename it to 'sa2_2021'
# so it shares a key name with the merged dataset.
sa2_key_rename = {'SA2 code': 'sa2_2021'}

# Observed population density per SA2.
population = (
    pd.read_csv('../../data/curated/vic_population_density_sa2.csv', index_col=[0])
    .rename(columns=sa2_key_rename)
)

# Predicted population density per SA2 (the file name suggests 2022-2027).
pred_population = (
    pd.read_csv('../../data/curated/feature_prediction/22_27_population.csv', index_col=[0])
    .rename(columns=sa2_key_rename)
)

In [29]:
year = [2013, 2014, 2015, 2016, 2017, 2018, 2019, 2020, 2021, 2022]

for i in year:
    data = pd.read_csv(f'../../data/curated/merged_dataset/{i}_merged_data.csv', index_col=[0])

    if i != 2022:
        # observed density: one column per year, named population_density_of_<year>
        value_col = f'population_density_of_{i}'
        population_data = population.loc[:, ['sa2_2021', value_col]]
    else:
        # 2022 has no observed density; use the predicted value instead
        value_col = 'pred'
        population_data = pred_population.loc[pred_population['year'] == i, ['sa2_2021', value_col]]

    # validate='many_to_one' raises a MergeError if an SA2 appears more than once in
    # the lookup; without it a duplicated key would add rows to the merge result and
    # the values would silently be assigned to the wrong properties.
    merged = data.merge(population_data, on=['sa2_2021'], how='left', validate='many_to_one')

    # a left merge keeps the row order of `data`, so assign by position
    data['population_density'] = merged[value_col].to_numpy()

    # output csv file of merged dataset
    data.to_csv(f'../../data/curated/merged_dataset/{i}_merged_data.csv')


## Merge crime cases by postcode

In [30]:
# Crime cases table; its 'Postcode' column is renamed to 'postcode'
# so it shares a key name with the merged dataset.
crime_cases = (
    pd.read_csv('../../data/curated/crime_cases.csv', index_col=[0])
    .rename(columns={'Postcode': 'postcode'})
)

In [31]:
year = [2013, 2014, 2015, 2016, 2017, 2018, 2019, 2020, 2021, 2022]

for i in year:
    data = pd.read_csv(f'../../data/curated/merged_dataset/{i}_merged_data.csv', index_col=[0])

    # crime cases recorded for this year, keyed by postcode (all years use the crime_cases table)
    crime_cases_data = crime_cases.loc[crime_cases['Year'] == i, ['postcode', 'Offence Count']]

    # validate='many_to_one' raises a MergeError if a postcode appears more than once
    # for the year; without it a duplicated key would add rows to the merge result and
    # the counts would silently be assigned to the wrong properties.
    merged = data.merge(crime_cases_data, on=['postcode'], how='left', validate='many_to_one')

    # a left merge keeps the row order of `data`, so assign by position
    data['crime_cases'] = merged['Offence Count'].to_numpy()

    # output csv file of merged dataset
    # NOTE: this final stage writes WITHOUT the index column, while every merge stage
    # (this one included) reads the file with index_col=[0]. Re-running this cell, or
    # an earlier merge cell, without first regenerating the files from the GDP stage
    # would therefore consume the first data column as the index.
    data.to_csv(f'../../data/curated/merged_dataset/{i}_merged_data.csv', index=False)
