In [7]:
import pandas as pd
import os
import numpy as np

In [8]:
# Directory that receives the per-year merged CSV files written by the cells below.
new_path = '../../data/curated/merged_dataset/'

# exist_ok=True makes this a no-op when the directory is already present and
# avoids the check-then-create race of a separate os.path.exists() test.
os.makedirs(new_path, exist_ok=True)

## Merge GDP and saving rate, and aggregate over months so every attribute has one value per year

In [9]:
# Read the GDP / saving-rate table and its predicted counterpart.
# Both files are loaded with default options (no index column).
gdp_sr, pred_gdp_sr = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../../data/curated/gdp_with_saving_rate.csv',
        '../../data/curated/feature_prediction/21_27_gdp_with_saving_predicted.csv',
    )
)

In [10]:
year = [2013, 2014, 2015, 2016, 2017, 2018, 2019, 2020, 2021, 2022]

# Years whose GDP / saving rate are taken from the predicted table
# (pred_gdp_sr) instead of gdp_sr.
predicted_gdp_years = [2021, 2022]

# Distance features. The order is significant: the aggregated columns are
# emitted in this order in the output file.
distance_cols = [
    'min_distance_to_cbd', 'min_distance_to_park', 'min_distance_to_prim', 'min_distance_to_second',
    'min_distance_to_train', 'min_distance_to_hosp', 'min_distance_to_poli', 'min_distance_to_shop',
]

# Columns identifying one property within one year; rows sharing all of these
# are collapsed into a single row.
group_keys = ['year', 'address', 'latitude', 'longitude', 'postcode',
              'sa2_2021', 'sa2_2016', 'residence_type', 'nbed', 'nbath', 'ncar']

# Distances keep the first value of each group, weekly rent is averaged.
agg_spec = {col: 'first' for col in distance_cols}
agg_spec['weekly_rent'] = 'mean'

# The output columns reuse the names of the third and fourth columns of gdp_sr,
# for observed and predicted years alike.
gdp_out_col = gdp_sr.columns[2]
sr_out_col = gdp_sr.columns[3]

for i in year:
    data = pd.read_csv(f'../../data/curated/min_distance_sa2_organised/{i}_property_with_SA2.csv', index_col=[0])

    # fill up missing distance data with the maximum distance observed in that column
    fill_values = {col: np.nanmax(data[col].values) for col in distance_cols}
    data = data.fillna(fill_values)

    # drop months to get values of all attributes based on year
    # NOTE(review): with pandas' default dropna=True, rows with NaN in any of
    # the group keys are dropped by this groupby -- confirm that is intended.
    data = data.groupby(group_keys, as_index=False).agg(agg_spec)

    # pick the table and column names that hold this year's GDP / saving rate
    if i in predicted_gdp_years:
        macro, gdp_src_col, sr_src_col = pred_gdp_sr, 'gdp', 'saving'
    else:
        macro, gdp_src_col, sr_src_col = gdp_sr, 'gdp(USD Millioins)', 'saving_rate(% of GDP)'

    macro_rows = macro.loc[macro['year'] == i]
    if macro_rows.empty:
        # Same exception type as the former list(...)[0] lookup, with a usable message.
        raise IndexError(f'no GDP / saving rate row found for year {i}')

    # broadcast the single yearly value onto every property row
    data[gdp_out_col] = macro_rows[gdp_src_col].iloc[0]
    data[sr_out_col] = macro_rows[sr_src_col].iloc[0]

    # output csv file of merged dataset
    data.to_csv(f'../../data/curated/merged_dataset/{i}_merged_data.csv')



## Merge income per person for each sa2

In [11]:
# Income per person per SA2: the 'SA2' column is renamed to 'sa2_2016' so it
# can be used as the join key in the merge below.
income = (
    pd.read_csv('../../data/curated/income_per_person_sa2.csv', index_col=[0])
    .rename(columns={'SA2': 'sa2_2016'})
)

# Predicted income per person per SA2.
pred_income = pd.read_csv(
    '../../data/curated/feature_prediction/20_27_income_per_person_2016sa2.csv',
    index_col=[0],
)

In [12]:
year = [2013, 2014, 2015, 2016, 2017, 2018, 2019, 2020, 2021, 2022]

# Years whose income comes from the predicted table (pred_income) instead of income.
predicted_income_years = [2020, 2021, 2022]

for i in year:
    data = pd.read_csv(f'../../data/curated/merged_dataset/{i}_merged_data.csv', index_col=[0])

    if i in predicted_income_years:
        # predicted income: one row per (Year, sa2_2016); keep this year's rows
        value_col = 'income_per_person_sa2'
        income_data = pred_income.loc[pred_income['Year'] == i, ['sa2_2016', value_col]]
    else:
        # income: one column per year, named by the year as a string
        value_col = str(i)
        income_data = income.loc[:, ['sa2_2016', value_col]]

    # validate='many_to_one' raises if an SA2 appears more than once in the
    # lookup table. Without it a duplicated key would add rows to the merge
    # result and the values would be attached to the wrong properties.
    merged = data.merge(income_data, on=['sa2_2016'], how='left', validate='many_to_one')

    # A validated left merge keeps exactly one row per row of `data`, in the
    # same order, so assign by position rather than relying on index labels.
    data['income_per_person'] = merged[value_col].to_numpy()

    data.to_csv(f'../../data/curated/merged_dataset/{i}_merged_data.csv')


## Merge population density for each sa2

In [13]:
# Population density per SA2, one population_density_of_<year> column per year.
# 'SA2 code' is renamed to 'sa2_2021' to serve as the join key below.
population = (
    pd.read_csv('../../data/curated/vic_population_density_sa2.csv', index_col=[0])
    .rename(columns={'SA2 code': 'sa2_2021'})
)
population

Unnamed: 0,S/T name,sa2_2021,SA2 name,population_density_of_2001,population_density_of_2002,population_density_of_2003,population_density_of_2004,population_density_of_2005,population_density_of_2006,population_density_of_2007,...,population_density_of_2012,population_density_of_2013,population_density_of_2014,population_density_of_2015,population_density_of_2016,population_density_of_2017,population_density_of_2018,population_density_of_2019,population_density_of_2020,population_density_of_2021
651,Victoria,201011001,Alfredton,109.222011,115.597723,119.411765,122.960152,126.148008,128.292220,133.472486,...,171.916509,184.326376,196.166983,209.468691,224.895636,239.829222,256.546490,273.396584,293.586338,319.222011
652,Victoria,201011002,Ballarat,927.177419,944.193548,968.951613,982.983871,989.435484,996.451613,1000.645161,...,996.532258,996.129032,994.112903,991.935484,992.016129,988.790323,986.854839,992.822581,982.500000,973.870968
653,Victoria,201011005,Buninyong,103.100775,104.631783,107.693798,108.914729,113.507752,116.996124,118.817829,...,132.829457,135.348837,137.248062,139.360465,141.686047,143.546512,143.662791,144.418605,142.810078,140.155039
654,Victoria,201011006,Delacombe,121.461988,123.538012,127.807018,130.555556,137.543860,147.397661,152.222222,...,176.023392,183.245614,192.485380,200.175439,210.380117,222.719298,239.005848,259.532164,284.678363,311.111111
655,Victoria,201011007,Smythes Creek,31.680993,32.263610,32.578797,33.170965,33.505253,33.829990,34.326648,...,36.981853,37.382999,37.679083,37.879656,38.108883,38.223496,38.586437,39.245463,39.617956,40.238777
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1168,Victoria,217031476,Otway,2.292317,2.310246,2.331496,2.331496,2.318879,2.296965,2.316887,...,2.294973,2.322199,2.336809,2.349426,2.361379,2.412511,2.461651,2.521416,2.593134,2.641610
1169,Victoria,217041477,Moyne - East,2.050797,2.046523,2.037975,2.027902,2.026375,2.030649,2.016607,...,2.064839,2.060260,2.055681,2.050186,2.048049,2.049576,2.057818,2.072776,2.098113,2.133525
1170,Victoria,217041478,Moyne - West,4.081362,4.115713,4.146629,4.164786,4.179507,4.220728,4.266366,...,4.498479,4.563745,4.604475,4.645696,4.712435,4.750712,4.795858,4.825302,4.829718,4.889096
1171,Victoria,217041479,Warrnambool - North,273.285256,279.631410,284.070513,287.451923,291.217949,296.923077,302.516026,...,324.567308,330.320513,335.416667,340.016026,343.621795,347.371795,351.506410,355.080128,358.637821,359.967949


In [14]:
# Predicted values per (year, SA2); 'SA2 code' is renamed to 'sa2_2021' to
# serve as the join key below.
pred_population = (
    pd.read_csv('../../data/curated/feature_prediction/22_27_population.csv', index_col=[0])
    .rename(columns={'SA2 code': 'sa2_2021'})
)
pred_population

Unnamed: 0,year,sa2_2021,pred
0,2022,201011001,595.0
1,2022,201011002,1343.0
2,2022,201011005,533.0
3,2022,201011006,551.0
4,2022,201011007,415.0
...,...,...,...
3127,2027,217031476,542.0
3128,2027,217041477,559.0
3129,2027,217041478,570.0
3130,2027,217041479,861.0


In [15]:
year = [2013, 2014, 2015, 2016, 2017, 2018, 2019, 2020, 2021, 2022]

for i in year:
    data = pd.read_csv(f'../../data/curated/merged_dataset/{i}_merged_data.csv', index_col=[0])

    if i != 2022:
        # population: one density column per year
        value_col = f'population_density_of_{i}'
        population_data = population.loc[:, ['sa2_2021', value_col]]
    else:
        # 2022 is taken from the predicted table, filtered to that year
        # NOTE(review): in the previews shown in this notebook the 2022 `pred`
        # values (e.g. 595.0 for SA2 201011001) are far larger than the 2021
        # densities of the same SA2 (319.22) -- confirm that `pred` is on the
        # same scale as the population_density_of_<year> columns.
        value_col = 'pred'
        population_data = pred_population.loc[pred_population['year'] == i, ['sa2_2021', value_col]]

    # validate='many_to_one' raises if an SA2 appears more than once in the
    # lookup table. Without it a duplicated key would add rows to the merge
    # result and the values would be attached to the wrong properties.
    merged = data.merge(population_data, on=['sa2_2021'], how='left', validate='many_to_one')

    # A validated left merge keeps exactly one row per row of `data`, in the
    # same order, so assign by position rather than relying on index labels.
    data['population_density'] = merged[value_col].to_numpy()

    data.to_csv(f'../../data/curated/merged_dataset/{i}_merged_data.csv')


## Merge crime cases for each postcode

In [16]:
# Offence counts per (Year, Postcode); 'Postcode' is renamed to 'postcode' to
# match the join key used in the merge below.
crime_cases = (
    pd.read_csv('../../data/curated/crime_cases.csv', index_col=[0])
    .rename(columns={'Postcode': 'postcode'})
)
crime_cases

Unnamed: 0,Year,postcode,Offence Count
0,2013,3000,17553
1,2013,3002,933
2,2013,3003,837
3,2013,3004,5305
4,2013,3006,2433
...,...,...,...
6889,2022,3990,2
6890,2022,3991,75
6891,2022,3992,94
6892,2022,3995,1179


In [17]:
year = [2013, 2014, 2015, 2016, 2017, 2018, 2019, 2020, 2021, 2022]

for i in year:
    data = pd.read_csv(f'../../data/curated/merged_dataset/{i}_merged_data.csv', index_col=[0])

    # offence counts recorded for this year, keyed by postcode
    crime_cases_data = crime_cases.loc[crime_cases['Year'] == i, ['postcode', 'Offence Count']]

    # validate='many_to_one' raises if a postcode appears more than once for
    # the year. Without it a duplicated key would add rows to the merge result
    # and the counts would be attached to the wrong properties.
    merged = data.merge(crime_cases_data, on=['postcode'], how='left', validate='many_to_one')

    # A validated left merge keeps exactly one row per row of `data`, in the
    # same order, so assign by position rather than relying on index labels.
    data['crime_cases'] = merged['Offence Count'].to_numpy()

    # Unlike the earlier cells this write omits the index, so re-reading the
    # file with index_col=[0] would turn its first data column into the index.
    data.to_csv(f'../../data/curated/merged_dataset/{i}_merged_data.csv', index=False)


## Merge ERP for each sa2 (cells currently commented out)

In [18]:
# Disabled cell: every line below is commented out, so nothing is loaded here.
# If enabled, it would read erp_vic.csv through Spark, convert it to pandas,
# rename 'sa2_maincode_2016' to 'sa2_2016' and cast that column to int.
#from pyspark.sql import SparkSession

#spark = (
#    SparkSession.builder.appName("MAST30034 project 2")
#    .config("spark.sql.repl.eagerEval.enabled", True) 
#    .config("spark.sql.parquet.cacheMetadata", "true")
#    .getOrCreate()
#)

#erp = spark.read.csv('../../data/curated/erp_vic.csv', header= True)
#erp = erp.toPandas()
#erp = erp.rename(columns={'sa2_maincode_2016': 'sa2_2016'})
#erp['sa2_2016'] = erp['sa2_2016'].astype(int)
#erp

In [19]:
# Disabled cell: every line below is commented out, so no 'erp' column is added.
# NOTE(review): the crime-cases cell writes the merged files with index=False,
# so the index_col=[0] reads below would consume the first data column as the
# index if this cell were re-enabled as written.
#year = [2013, 2014, 2015, 2016, 2017, 2018, 2019, 2020, 2021, 2022]

#for i in year:
#    if i != 2022:
#        data = pd.read_csv(f'../../data/curated/merged_dataset/{i}_merged_data.csv', index_col=[0])

#        erp_data = erp.loc[:, ['sa2_2016', f'erp_{str(i)}']]

#        data['erp'] = (data.merge(erp_data, on=['sa2_2016'], how='left')[f'erp_{str(i)}'])

#    else:
#        data = pd.read_csv(f'../../data/curated/merged_dataset/{i}_merged_data.csv', index_col=[0])
        
#        data['erp'] = None

#    data.to_csv(f'../../data/curated/merged_dataset/{i}_merged_data.csv')