In [2]:
import pandas as pd
import os
import numpy as np

In [2]:
# Directory that receives the per-year merged CSV files written by the cells below.
new_path = '../../data/curated/merged_dataset/'

# exist_ok=True creates the directory (and any missing parents) in one call and is a
# no-op when it already exists, avoiding the check-then-create race of a separate
# os.path.exists() test.
os.makedirs(new_path, exist_ok=True)

## Merge GDP and saving rate, and aggregate the monthly records so that every attribute has one value per year

In [3]:
# Read the GDP / saving-rate table and its predicted counterpart, in that order.
gdp_sr, pred_gdp_sr = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../../data/curated/gdp_with_saving_rate.csv',
        '../../data/curated/feature_prediction/21_27_gdp_with_saving_predicted.csv',
    )
)

In [4]:
year = [2013, 2014, 2015, 2016, 2017, 2018, 2019, 2020, 2021, 2022]

# Years whose GDP / saving rate come from the predicted table instead of the observed one.
predicted_years = [2021, 2022]

# Distance features; the order here is also the column order produced by the aggregation.
distance_cols = [
    'min_distance_to_cbd', 'min_distance_to_park', 'min_distance_to_prim', 'min_distance_to_second',
    'min_distance_to_train', 'min_distance_to_hosp', 'min_distance_to_poli', 'min_distance_to_shop',
]

# Columns identifying one property within a year; grouping on them collapses the monthly rows.
group_cols = ['year', 'address', 'latitude', 'longitude', 'postcode',
              'sa2_2021', 'sa2_2016', 'residence_type', 'nbed', 'nbath', 'ncar']

# Distances keep the first value seen in each group; weekly rent is averaged over the year.
yearly_agg = {col: 'first' for col in distance_cols}
yearly_agg['weekly_rent'] = 'mean'

# Output column names are taken from the 3rd and 4th columns of the observed table, for
# every year (predicted years included), so all yearly files share the same header.
gdp_col, sr_col = gdp_sr.columns[2], gdp_sr.columns[3]

for i in year:
    data = pd.read_csv(f'../../data/curated/min_distance_sa2_organised/{i}_property_with_SA2.csv', index_col=[0])

    # fill up missing distance data with the maximum distance observed in that year's file
    data = data.fillna({col: np.nanmax(data[col].values) for col in distance_cols})

    # drop months to get values of all attributes based on year
    data = data.groupby(group_cols, as_index=False).agg(yearly_agg)

    # Pick the table and column names holding this year's GDP and saving rate.
    # ('gdp(USD Millioins)' is spelled exactly as the column appears in the CSV.)
    if i in predicted_years:
        source, gdp_name, sr_name = pred_gdp_sr, 'gdp', 'saving'
    else:
        source, gdp_name, sr_name = gdp_sr, 'gdp(USD Millioins)', 'saving_rate(% of GDP)'

    # .iloc[0] raises IndexError when the table has no row for this year.
    year_rows = source.loc[source['year'] == i]
    data[gdp_col] = year_rows[gdp_name].iloc[0]
    data[sr_col] = year_rows[sr_name].iloc[0]

    # output csv file of merged dataset (index is written; later cells read it back with index_col=[0])
    data.to_csv(f'../../data/curated/merged_dataset/{i}_merged_data.csv')



## Merge income per person for each sa2

In [3]:
# Income per person by SA2: the observed table has its 'SA2' column renamed to 'sa2_2016'
# so it can be joined on the same key as the property data.
income = (
    pd.read_csv('../../data/curated/income_per_person_sa2.csv', index_col=[0])
    .rename(columns={'SA2': 'sa2_2016'})
)

# Predicted income per person by SA2.
pred_income = pd.read_csv(
    '../../data/curated/feature_prediction/20_27_income_per_person_2016sa2.csv',
    index_col=[0],
)

In [7]:
year = [2013, 2014, 2015, 2016, 2017, 2018, 2019, 2020, 2021, 2022]

# Years whose income comes from the predicted table instead of the observed one.
predicted_years = [2020, 2021, 2022]

for i in year:
    data = pd.read_csv(f'../../data/curated/merged_dataset/{i}_merged_data.csv', index_col=[0])

    if i in predicted_years:
        # predicted income per person for this year, keyed by 2016 sa2
        value_col = 'income_per_person_sa2'
        income_data = pred_income.loc[pred_income['Year'] == i, ['sa2_2016', value_col]]
    else:
        # observed income per person: one column per year, keyed by 2016 sa2
        value_col = str(i)
        income_data = income.loc[:, ['sa2_2016', value_col]]

    # validate='many_to_one' raises if an sa2 appears more than once in income_data; without
    # it a duplicate key would add rows to the merge result and shift every later value onto
    # the wrong property. With unique keys a left merge keeps the rows of `data` in order,
    # so the column can be assigned positionally, independent of the index of `data`.
    merged = data.merge(income_data, on=['sa2_2016'], how='left', validate='many_to_one')
    data['income_per_person'] = merged[value_col].values

    data.to_csv(f'../../data/curated/merged_dataset/{i}_merged_data.csv')


## Merge population density for each sa2

In [8]:
# Population table with one column per year; 'SA2 code' is renamed to the 'sa2_2021' join key.
population = (
    pd.read_csv('../../data/curated/vic_population_sa2.csv', index_col=[0])
    .rename(columns={'SA2 code': 'sa2_2021'})
)
population

Unnamed: 0,S/T name,sa2_2021,SA2 name,2001,2002,2003,2004,2005,2006,2007,...,2012,2013,2014,2015,2016,2017,2018,2019,2020,2021
651,Victoria,201011001,Alfredton,5756,6092,6293,6480,6648,6761,7034,...,9060,9714,10338,11039,11852,12639,13520,14408,15472,16823
652,Victoria,201011002,Ballarat,11497,11708,12015,12189,12269,12356,12408,...,12357,12352,12327,12300,12301,12261,12237,12311,12183,12076
653,Victoria,201011005,Buninyong,5320,5399,5557,5620,5857,6037,6131,...,6854,6984,7082,7191,7311,7407,7413,7452,7369,7232
654,Victoria,201011006,Delacombe,4154,4225,4371,4465,4704,5041,5206,...,6020,6267,6583,6846,7195,7617,8174,8876,9736,10640
655,Victoria,201011007,Smythes Creek,3317,3378,3411,3473,3508,3542,3594,...,3872,3914,3945,3966,3990,4002,4040,4109,4148,4213
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1168,Victoria,217031476,Otway,3452,3479,3511,3511,3492,3459,3489,...,3456,3497,3519,3538,3556,3633,3707,3797,3905,3978
1169,Victoria,217041477,Moyne - East,6718,6704,6676,6643,6638,6652,6606,...,6764,6749,6734,6716,6709,6714,6741,6790,6873,6989
1170,Victoria,217041478,Moyne - West,8317,8387,8450,8487,8517,8601,8694,...,9167,9300,9383,9467,9603,9681,9773,9833,9842,9963
1171,Victoria,217041479,Warrnambool - North,17053,17449,17726,17937,18172,18528,18877,...,20253,20612,20930,21217,21442,21676,21934,22157,22379,22462


In [9]:
# Predicted population in long format (year, sa2_2021, pred); any 'SA2 code' column is
# renamed to 'sa2_2021' so the table joins on the same key as the observed one.
pred_population = (
    pd.read_csv('../../data/curated/feature_prediction/22_27_population.csv', index_col=[0])
    .rename(columns={'SA2 code': 'sa2_2021'})
)
pred_population

Unnamed: 0,year,sa2_2021,pred
0,2022,201011001,595.0
1,2022,201011002,1343.0
2,2022,201011005,533.0
3,2022,201011006,551.0
4,2022,201011007,415.0
...,...,...,...
3127,2027,217031476,542.0
3128,2027,217041477,559.0
3129,2027,217041478,570.0
3130,2027,217041479,861.0


In [10]:
year = [2013, 2014, 2015, 2016, 2017, 2018, 2019, 2020, 2021, 2022]

# NOTE(review): the values stored in 'population_density' are taken straight from the
# population tables, so they appear to be head counts rather than a density - confirm.
# NOTE(review): in the outputs displayed above, SA2 201011001 has 16823 for 2021 but a
# 2022 prediction of 595.0; the predicted values look like they are on a different scale
# from the observed ones - confirm before using 2022 alongside earlier years.
for i in year:
    data = pd.read_csv(f'../../data/curated/merged_dataset/{i}_merged_data.csv', index_col=[0])

    if i != 2022:
        # observed population: one column per year, keyed by 2021 sa2
        value_col = str(i)
        population_data = population.loc[:, ['sa2_2021', value_col]]
    else:
        # 2022 is not in the observed table, so use the predicted value for that year
        value_col = 'pred'
        population_data = pred_population.loc[pred_population['year'] == i, ['sa2_2021', value_col]]

    # validate='many_to_one' raises if an sa2 appears more than once in population_data;
    # without it a duplicate key would add rows to the merge result and misalign the new
    # column. With unique keys a left merge keeps the rows of `data` in order, so the
    # column can be assigned positionally.
    merged = data.merge(population_data, on=['sa2_2021'], how='left', validate='many_to_one')
    data['population_density'] = merged[value_col].values

    data.to_csv(f'../../data/curated/merged_dataset/{i}_merged_data.csv')


## Merge crime cases for each postcode

In [11]:
# Offence counts per year and postcode; 'Postcode' is renamed to match the property data key.
crime_cases = (
    pd.read_csv('../../data/curated/crime_cases.csv', index_col=[0])
    .rename(columns={'Postcode': 'postcode'})
)
crime_cases

Unnamed: 0,Year,postcode,Offence Count
0,2013,3000,17553
1,2013,3002,933
2,2013,3003,837
3,2013,3004,5305
4,2013,3006,2433
...,...,...,...
6889,2022,3990,2
6890,2022,3991,75
6891,2022,3992,94
6892,2022,3995,1179


In [12]:
year = [2013, 2014, 2015, 2016, 2017, 2018, 2019, 2020, 2021, 2022]

for i in year:
    data = pd.read_csv(f'../../data/curated/merged_dataset/{i}_merged_data.csv', index_col=[0])

    # offence counts recorded for this year, keyed by postcode
    crime_cases_data = crime_cases.loc[crime_cases['Year'] == i, ['postcode', 'Offence Count']]

    # validate='many_to_one' raises if a postcode appears more than once for this year;
    # without it a duplicate key would add rows to the merge result and misalign the new
    # column. With unique keys a left merge keeps the rows of `data` in order, so the
    # column can be assigned positionally.
    merged = data.merge(crime_cases_data, on=['postcode'], how='left', validate='many_to_one')
    data['crime_cases'] = merged['Offence Count'].values

    # The final files are written WITHOUT the index column. After this cell has run, any
    # cell that reads them with index_col=[0] (including this one) would consume the first
    # data column as the index, so rebuild the files from the first merge cell before
    # re-running.
    data.to_csv(f'../../data/curated/merged_dataset/{i}_merged_data.csv', index=False)


## ERP (currently disabled)

In [None]:
# NOTE: this cell is entirely commented out, so nothing here runs and `erp` is never defined.
# When enabled it reads erp_vic.csv through Spark, converts it to pandas, renames
# 'sa2_maincode_2016' to 'sa2_2016' and casts that key to int.
#from pyspark.sql import SparkSession

#spark = (
#    SparkSession.builder.appName("MAST30034 project 2")
#    .config("spark.sql.repl.eagerEval.enabled", True) 
#    .config("spark.sql.parquet.cacheMetadata", "true")
#    .getOrCreate()
#)

#erp = spark.read.csv('../../data/curated/erp_vic.csv', header= True)
#erp = erp.toPandas()
#erp = erp.rename(columns={'sa2_maincode_2016': 'sa2_2016'})
#erp['sa2_2016'] = erp['sa2_2016'].astype(int)
#erp

In [None]:
# NOTE: this cell is entirely commented out. It would add an 'erp' column from the
# 'erp_<year>' columns of `erp` (joined on sa2_2016) for every year except 2022, which is
# set to None.
# NOTE(review): if re-enabled as written, it reads the merged files with index_col=[0],
# but the crime-cases cell writes them with index=False, so the first data column would
# be consumed as the index - adjust the read before enabling.
#year = [2013, 2014, 2015, 2016, 2017, 2018, 2019, 2020, 2021, 2022]

#for i in year:
#    if i != 2022:
#        data = pd.read_csv(f'../../data/curated/merged_dataset/{i}_merged_data.csv', index_col=[0])

#        erp_data = erp.loc[:, ['sa2_2016', f'erp_{str(i)}']]

#        data['erp'] = (data.merge(erp_data, on=['sa2_2016'], how='left')[f'erp_{str(i)}'])

#    else:
#        data = pd.read_csv(f'../../data/curated/merged_dataset/{i}_merged_data.csv', index_col=[0])
        
#        data['erp'] = None

#    data.to_csv(f'../../data/curated/merged_dataset/{i}_merged_data.csv')