In [114]:
import pandas as pd
import numpy as np

# Map properties to LGA

In [115]:
# read in properties data
data = pd.read_csv("../data/curated/properties_proximity.csv")

In [116]:
data = data.drop(columns='Unnamed: 0')

In [117]:
data.head()

Unnamed: 0,name,geometry,cost,beds,baths,parkings,lga_code,lga_name,nearest_station,nearest_park,nearest_shop,nearest_hospital,nearest_school,nearest_supermarket,distance_to_cbd
0,904/265 Exhibition Street Melbourne VIC 3000,POINT (144.9691204 -37.8095116),850.0,2.0,2.0,1.0,24600,Melbourne,1.486667,4.193333,1.875,5.448333,0.831667,0.743333,2.398333
1,210/422 Collins Street Melbourne VIC 3000,POINT (144.9601487 -37.8170971),475.0,1.0,1.0,0.0,24600,Melbourne,2.421667,6.305,2.455,5.94,3.503333,0.488333,3.843333
2,1902/200 Spencer Street Melbourne VIC 3000,POINT (144.9532465 -37.816228),630.0,2.0,1.0,1.0,24600,Melbourne,2.486667,7.198333,0.753333,5.675,1.56,0.813333,4.753333
3,312B/399 Bourke Street Melbourne VIC 3000,POINT (144.9621291 -37.8147259),450.0,1.0,1.0,0.0,24600,Melbourne,2.425,5.855,2.003333,5.488333,3.716667,0.701667,3.391667
4,3313/228 La Trobe Street Melbourne VIC 3000,POINT (144.962371 -37.8096052),900.0,2.0,1.0,2.0,24600,Melbourne,1.19,4.521667,2.173333,4.193333,1.408333,0.881667,2.156667


In [118]:
def custom_median(series):
    """
    find out median for each column, and ensure there are no 'average median' when there are 
    two median values, as beds, baths and parkings can not be decimals
    """
    sorted_series = sorted(series)
    n = len(sorted_series)
    if n % 2 == 1:
        return sorted_series[n // 2]
    else:
        return sorted_series[n // 2 - 1]

# using median values to represent the overall features for each LGA area
properties_median = data.groupby(['lga_name', 'lga_code']).agg(custom_median).reset_index()

# change the lga_name to lower case
properties_median['lga_name'] = properties_median['lga_name'].str.lower()

In [119]:
properties_median.head()

Unnamed: 0,lga_name,lga_code,name,geometry,cost,beds,baths,parkings,nearest_station,nearest_park,nearest_shop,nearest_hospital,nearest_school,nearest_supermarket,distance_to_cbd
0,alpine,20110,34 O'Donnell Avenue Myrtleford VIC 3737,POINT (146.7271339 -36.5522656),300.0,2.0,1.0,2.0,46.428333,5.346667,38.88,29.888333,2.526667,2.446667,999.0
1,ararat,20260,4 Grano Street Ararat VIC 3377,POINT (142.9414137 -37.2818641),400.0,3.0,1.0,1.0,2.428333,28.928333,2.536667,999.0,1.433333,3.033333,999.0
2,ballarat,20570,3/41 Longley Street Alfredton VIC 3350,POINT (143.8376317 -37.5653948),410.0,3.0,2.0,2.0,7.341667,9.785,6.483333,24.81,2.235,4.776667,999.0
3,banyule,20660,3 Clinton Street Heidelberg Heights VIC 3081,POINT (145.0598679 -37.7598674),550.0,3.0,1.0,1.0,2.876667,11.215,9.528333,5.353333,1.76,2.63,23.14
4,bass coast,20740,23 Carpathia Street Coronet Bay VIC 3984,POINT (145.4968004 -38.4790177),440.0,3.0,2.0,2.0,86.75,32.095,22.93,39.63,3.885,7.376667,116.675


In [120]:
# create data from 2017 to 2026, the columns are null at this stage

years = list(range(2017, 2027))

train = []
test = []

for year in years:
    # Create a copy of the dataframe for the current year
    temp_df = properties_median.copy()
    
    # Update the 'year' column
    temp_df['year'] = year
    
    # If the year is not 2023, set the 'cost' column to NaN
    if year != 2023:
        temp_df['cost'] = np.nan
    
    # Append the dataframe to the list
    if year < 2024: 
        train.append(temp_df)
    else:
        test.append(temp_df)

train_df = pd.concat(train, ignore_index=True)
test_df = pd.concat(test, ignore_index=True)


# Criminal Rate

In [121]:
offence = pd.read_excel("../data/raw/offence_count.xlsx", sheet_name='Table 03')
offence.head()

Unnamed: 0,Year,Year ending,Local Government Area,Postcode,Suburb/Town Name,Offence Division,Offence Subdivision,Offence Subgroup,Offence Count
0,2023,March,Alpine,3691,Dederang,A Crimes against the person,A20 Assault and related offences,A212 Non-FV Serious assault,1
1,2023,March,Alpine,3691,Dederang,A Crimes against the person,Other crimes against the person,Other crimes against the person,1
2,2023,March,Alpine,3691,Dederang,B Property and deception offences,B40 Theft,B42 Steal from a motor vehicle,2
3,2023,March,Alpine,3691,Dederang,B Property and deception offences,B40 Theft,B49 Other theft,1
4,2023,March,Alpine,3691,Dederang,D Public order and security offences,D10 Weapons and explosives offences,D11 Firearms offences,1


In [122]:
# renaming Merri-bek to Moreland the reason is we are using 2021 LGA area name, but from 2022 onwards,
# Moreland was renamed to Merri-bek, more information can be found here
# https://conversations.merri-bek.vic.gov.au/renaming
offence['Local Government Area'] = offence['Local Government Area'].replace('Merri-bek', 'Moreland')

# sum up the offence records by year and LGA area
offence_sums = offence.groupby(['Year', 'Local Government Area'])['Offence Count'].sum().reset_index()

# rename the column
offence_sums = offence_sums.rename(columns={'Year': 'year', 'Local Government Area': 'lga_name', 'Offence Count': 'offence_count'})

# change the values to lowercase
offence_sums['lga_name'] = offence_sums['lga_name'].str.lower()

# merge the original data frame
merged_df = pd.merge(train_df, offence_sums, on=['year', 'lga_name'], how='left')

In [123]:
merged_df.head()

Unnamed: 0,lga_name,lga_code,name,geometry,cost,beds,baths,parkings,nearest_station,nearest_park,nearest_shop,nearest_hospital,nearest_school,nearest_supermarket,distance_to_cbd,year,offence_count
0,alpine,20110,34 O'Donnell Avenue Myrtleford VIC 3737,POINT (146.7271339 -36.5522656),,2.0,1.0,2.0,46.428333,5.346667,38.88,29.888333,2.526667,2.446667,999.0,2017,396
1,ararat,20260,4 Grano Street Ararat VIC 3377,POINT (142.9414137 -37.2818641),,3.0,1.0,1.0,2.428333,28.928333,2.536667,999.0,1.433333,3.033333,999.0,2017,1249
2,ballarat,20570,3/41 Longley Street Alfredton VIC 3350,POINT (143.8376317 -37.5653948),,3.0,2.0,2.0,7.341667,9.785,6.483333,24.81,2.235,4.776667,999.0,2017,11885
3,banyule,20660,3 Clinton Street Heidelberg Heights VIC 3081,POINT (145.0598679 -37.7598674),,3.0,1.0,1.0,2.876667,11.215,9.528333,5.353333,1.76,2.63,23.14,2017,9703
4,bass coast,20740,23 Carpathia Street Coronet Bay VIC 3984,POINT (145.4968004 -38.4790177),,3.0,2.0,2.0,86.75,32.095,22.93,39.63,3.885,7.376667,116.675,2017,2613


In [124]:
# fill in NaN values in test_df
test_df['offence_count'] = np.NaN

# Processing Historical Rent

In [125]:
import pandas as pd
from openpyxl import load_workbook

In [126]:
df = pd.read_excel('../data/raw/Quarterly median rents by local government area - March quarter 2023.xlsx', sheet_name='All Properties', engine='openpyxl')

In [127]:
df.head()

Unnamed: 0.1,Unnamed: 0,Mar 2016,Unnamed: 2,Jun 2016,Unnamed: 4,Sep 2016,Unnamed: 6,Dec 2016,Unnamed: 8,Mar 2017,...,Mar 2022,Unnamed: 50,Jun 2022,Unnamed: 52,Sep 2022,Unnamed: 54,Dec 2022,Unnamed: 56,Mar 2023,Unnamed: 58
0,,Count,Median,Count,Median,Count,Median,Count,Median,Count,...,Count,Median,Count,Median,Count,Median,Count,Median,Count,Median
1,Colac Otway,159,270,140,280,158,265,145,260,129,...,83,380,82,385,83,395,82,398,74,420
2,Corangamite,70,250,80,230,92,240,65,245,94,...,52,320,38,360,49,340,47,360,37,380
3,Glenelg,128,230,108,220,145,220,120,223,111,...,96,350,90,350,89,350,79,390,93,380
4,Greater Geelong,2218,320,1923,320,2057,320,2036,330,2260,...,1871,430,1721,425,2009,440,1930,450,1880,450


In [128]:
# Identify the "Median" columns
median_columns = df.columns[2::2]

# Create a mapping of old column names to new column names
rename_dict = {old: df[old][0] for old in median_columns}

# Rename the "Median" columns
df.rename(columns=rename_dict, inplace=True)

# Drop the "Count" columns
df = df.drop(df.columns[1::2], axis=1)

# Drop the first row (which contains 'Count' and 'Median' labels)
df = df.drop(0)


In [129]:
new_columns = [
    "lga_name",
    "Mar 2016", "Jun 2016", "Sep 2016", "Dec 2016", "Mar 2017", "Jun 2017", "Sep 2017", "Dec 2017", "Mar 2018",
    "Jun 2018", "Sep 2018", "Dec 2018", "Mar 2019", "Jun 2019", "Sep 2019", "Dec 2019", "Mar 2020", "Jun 2020",
    "Sep 2020", "Dec 2020", "Mar 2021", "Jun 2021", "Sep 2021", "Dec 2021", "Mar 2022", "Jun 2022", "Sep 2022",
    "Dec 2022", "Mar 2023"
]
df.columns = new_columns

In [130]:
df.head()

Unnamed: 0,lga_name,Mar 2016,Jun 2016,Sep 2016,Dec 2016,Mar 2017,Jun 2017,Sep 2017,Dec 2017,Mar 2018,...,Dec 2020,Mar 2021,Jun 2021,Sep 2021,Dec 2021,Mar 2022,Jun 2022,Sep 2022,Dec 2022,Mar 2023
1,Colac Otway,270,280,265,260,280,280,280,285,280,...,325,340,340,350,380,380,385,395,398,420
2,Corangamite,250,230,240,245,240,238,235,240,250,...,290,310,285,315,343,320,360,340,360,380
3,Glenelg,230,220,220,223,230,220,230,228,250,...,310,330,330,330,350,350,350,350,390,380
4,Greater Geelong,320,320,320,330,330,330,330,345,350,...,380,395,400,410,415,430,425,440,450,450
5,Moyne,280,280,280,300,280,283,280,283,280,...,363,360,380,345,360,400,445,400,400,380


In [131]:
# We are using the September data to represent each year's historical rent
# because our property data was scrapped in September
sep_columns = ['lga_name'] + [col for col in df.columns if 'Sep' in col]
df_historical = df[sep_columns]

# Rename columns
df_historical.columns = [col.split(' ')[1] if 'Sep' in col else col for col in df_historical.columns]

In [132]:
df_historical.head()

Unnamed: 0,lga_name,2016,2017,2018,2019,2020,2021,2022
1,Colac Otway,265,280,300,310,340,350,395
2,Corangamite,240,235,240,260,280,315,340
3,Glenelg,220,230,250,270,300,330,350
4,Greater Geelong,320,330,350,370,375,410,440
5,Moyne,280,280,275,315,320,345,400


In [133]:
# melt the df
df_historical_melted = df_historical.melt(id_vars=['lga_name'], 
                                          var_name='year', 
                                          value_name='historical_cost')

df_historical_melted['year'] = df_historical_melted['year'].astype(int)

# replace Merri-bek to Moreland
df_historical_melted['lga_name'] = df_historical_melted['lga_name'].replace('Merri-bek', 'Moreland')

# change to lower case
df_historical_melted['lga_name'] = df_historical_melted['lga_name'].str.lower()

In [134]:
merged_df = pd.merge(merged_df, df_historical_melted, on=['year', 'lga_name'], how='left')
merged_df['cost'] = merged_df['cost'].fillna(merged_df['historical_cost'])

merged_df = merged_df.drop(columns=['historical_cost'])

In [135]:
merged_df.head()

Unnamed: 0,lga_name,lga_code,name,geometry,cost,beds,baths,parkings,nearest_station,nearest_park,nearest_shop,nearest_hospital,nearest_school,nearest_supermarket,distance_to_cbd,year,offence_count
0,alpine,20110,34 O'Donnell Avenue Myrtleford VIC 3737,POINT (146.7271339 -36.5522656),270,2.0,1.0,2.0,46.428333,5.346667,38.88,29.888333,2.526667,2.446667,999.0,2017,396
1,ararat,20260,4 Grano Street Ararat VIC 3377,POINT (142.9414137 -37.2818641),260,3.0,1.0,1.0,2.428333,28.928333,2.536667,999.0,1.433333,3.033333,999.0,2017,1249
2,ballarat,20570,3/41 Longley Street Alfredton VIC 3350,POINT (143.8376317 -37.5653948),280,3.0,2.0,2.0,7.341667,9.785,6.483333,24.81,2.235,4.776667,999.0,2017,11885
3,banyule,20660,3 Clinton Street Heidelberg Heights VIC 3081,POINT (145.0598679 -37.7598674),395,3.0,1.0,1.0,2.876667,11.215,9.528333,5.353333,1.76,2.63,23.14,2017,9703
4,bass coast,20740,23 Carpathia Street Coronet Bay VIC 3984,POINT (145.4968004 -38.4790177),285,3.0,2.0,2.0,86.75,32.095,22.93,39.63,3.885,7.376667,116.675,2017,2613


# Population Data

In [136]:
# Read into population data and SA2 LGA correspondence table
population = pd.read_csv('../data/landing/Victoria.csv')
mapping_df = pd.read_csv("../data/raw/sa2_to_lga.csv")
mapping_df = mapping_df.drop(columns=['Unnamed: 0'])

In [137]:
# Select the total population
population = population[population['SEX'] == 'Persons']
population = population.reset_index(drop=True)

In [138]:
# Read into SA2 2016 and 2021 correspondence table, as our population dataset
# was recorded in 2016 SA2 regions
sa2_2016_2021 = pd.read_csv("../data/landing/CG_SA2_2016_SA2_2021.csv")
sa2_2016_2021.head()

Unnamed: 0,SA2_MAINCODE_2016,SA2_NAME_2016,SA2_CODE_2021,SA2_NAME_2021,RATIO_FROM_TO,INDIV_TO_REGION_QLTY_INDICATOR,OVERALL_QUALITY_INDICATOR,BMOS_NULL_FLAG
0,101021007.0,Braidwood,101021007,Braidwood,1.0,Good,Good,0
1,101021008.0,Karabar,101021008,Karabar,1.0,Good,Good,0
2,101021009.0,Queanbeyan,101021009,Queanbeyan,1.0,Good,Good,0
3,101021010.0,Queanbeyan - East,101021010,Queanbeyan - East,1.0,Good,Good,0
4,101021011.0,Queanbeyan Region,101021610,Googong,0.09671,Poor,Good,0


In [139]:
# Map the population data to use 2021 SA2 regions
population_sa2_2021 = population.merge(sa2_2016_2021, left_on='SA2_CODE', right_on='SA2_MAINCODE_2016', how='left')

population_sa2_2021.drop('SA2_CODE', axis=1, inplace=True)
population_sa2_2021.rename(columns={'SA2_CODE_2021': 'sa2_code'}, inplace=True)


In [140]:
population_sa2_2021.head()

Unnamed: 0,YEAR,SA2_NAME,SEX,Age0-4,Age5-9,Age10-14,Age15-19,Age20-24,Age25-29,Age30-34,...,Age85+,Total,SA2_MAINCODE_2016,SA2_NAME_2016,sa2_code,SA2_NAME_2021,RATIO_FROM_TO,INDIV_TO_REGION_QLTY_INDICATOR,OVERALL_QUALITY_INDICATOR,BMOS_NULL_FLAG
0,2017,Alfredton,Persons,926,976,1073,971,795,811,868,...,144,12525,201011001.0,Alfredton,201011001,Alfredton,1.0,Good,Good,0
1,2017,Ballarat,Persons,601,694,738,930,854,729,655,...,353,12227,201011002.0,Ballarat,201011002,Ballarat,1.0,Good,Good,0
2,2017,Ballarat - North,Persons,1605,1541,1367,1356,1688,1727,1540,...,599,23906,201011003.0,Ballarat - North,201011481,Ballarat East - Warrenheip,0.395627,Poor,Good,0
3,2017,Ballarat - North,Persons,1605,1541,1367,1356,1688,1727,1540,...,599,23906,201011003.0,Ballarat - North,201011482,Ballarat North - Invermay,0.604373,Poor,Good,0
4,2017,Ballarat - South,Persons,1708,1534,1336,1429,2505,2149,1763,...,598,25033,201011004.0,Ballarat - South,201011483,Canadian - Mount Clear,0.467037,Poor,Good,0


In [141]:
population_sa2_2021 = population_sa2_2021[['YEAR', 'Total', 'sa2_code']]

In [142]:
population_sa2_2021.head()

Unnamed: 0,YEAR,Total,sa2_code
0,2017,12525,201011001
1,2017,12227,201011002
2,2017,23906,201011481
3,2017,23906,201011482
4,2017,25033,201011483


In [143]:
# change the data type
population_sa2_2021['sa2_code'] = population_sa2_2021['sa2_code'].astype('int64')
mapping_df['SA2_CODE21'] = mapping_df['SA2_CODE21'].astype('int64')
mapping_df = mapping_df.rename(columns={"SA2_CODE21": "sa2_code"})
population_LGA = pd.merge(population_sa2_2021, mapping_df, on=['sa2_code'], how='left')

In [144]:
# sum up the population by year and LGA Area
grouped_population = population_LGA.groupby(['YEAR', 'LGA_CODE21', 'LGA_NAME21'])['Total'].sum().reset_index()

In [145]:
grouped_population = grouped_population.rename(columns={'YEAR': 'year', 'LGA_CODE21': 'lga_code', 'LGA_NAME21': 'lga_name', 'Total': 'total'})

In [146]:
grouped_population.head()

Unnamed: 0,year,lga_code,lga_name,total
0,2017,20110,Alpine,13113
1,2017,20260,Ararat,11613
2,2017,20570,Ballarat,152520
3,2017,20660,Banyule,129192
4,2017,20740,Bass Coast,34166


In [147]:
merged_df = pd.merge(merged_df, grouped_population, on=['year', 'lga_code'], how='left')

In [148]:
test_df = pd.merge(test_df, grouped_population, on = ['year', 'lga_code'], how = 'left')

In [149]:
merged_df.head()

Unnamed: 0,lga_name_x,lga_code,name,geometry,cost,beds,baths,parkings,nearest_station,nearest_park,nearest_shop,nearest_hospital,nearest_school,nearest_supermarket,distance_to_cbd,year,offence_count,lga_name_y,total
0,alpine,20110,34 O'Donnell Avenue Myrtleford VIC 3737,POINT (146.7271339 -36.5522656),270,2.0,1.0,2.0,46.428333,5.346667,38.88,29.888333,2.526667,2.446667,999.0,2017,396,Alpine,13113.0
1,ararat,20260,4 Grano Street Ararat VIC 3377,POINT (142.9414137 -37.2818641),260,3.0,1.0,1.0,2.428333,28.928333,2.536667,999.0,1.433333,3.033333,999.0,2017,1249,Ararat,11613.0
2,ballarat,20570,3/41 Longley Street Alfredton VIC 3350,POINT (143.8376317 -37.5653948),280,3.0,2.0,2.0,7.341667,9.785,6.483333,24.81,2.235,4.776667,999.0,2017,11885,Ballarat,152520.0
3,banyule,20660,3 Clinton Street Heidelberg Heights VIC 3081,POINT (145.0598679 -37.7598674),395,3.0,1.0,1.0,2.876667,11.215,9.528333,5.353333,1.76,2.63,23.14,2017,9703,Banyule,129192.0
4,bass coast,20740,23 Carpathia Street Coronet Bay VIC 3984,POINT (145.4968004 -38.4790177),285,3.0,2.0,2.0,86.75,32.095,22.93,39.63,3.885,7.376667,116.675,2017,2613,Bass Coast,34166.0


In [150]:
# Drop the lga_name_y column
test_df.drop('lga_name_y', axis=1, inplace=True)

# Rename the lga_name_x column to lga_name
test_df.rename(columns={'lga_name_x': 'lga_name'}, inplace=True)
test_df = test_df.rename(columns={'total': 'population'})

In [151]:
merged_df.drop('lga_name_y', axis=1, inplace=True)
merged_df.rename(columns={'lga_name_x': 'lga_name'}, inplace=True)
merged_df = merged_df.rename(columns={'total': 'population'})


# Processing Census Data

In [152]:
import pandas as pd

# read in 2021 census data
df_2021 = pd.read_csv("../data/landing/census2021/2021 Census GCP Local Government Areas for VIC/2021Census_G02_VIC_LGA.csv")
df_2021['LGA_CODE_2021'] = df_2021['LGA_CODE_2021'].str.replace('LGA', '')
df_2021['LGA_CODE_2021'] = df_2021['LGA_CODE_2021'].astype(int)
df_2021 = df_2021.rename(columns={"LGA_CODE_2021": "lga_code"})
weekly_income_2021 = df_2021[['lga_code', 'Median_tot_prsnl_inc_weekly']]

In [153]:
# read in 2016 census data
df_2016 = pd.read_csv("../data/landing/census2016/2016 Census GCP Local Government Areas for VIC/2016Census_G02_VIC_LGA.csv")
df_2016['LGA_CODE_2016'] = df_2016['LGA_CODE_2016'].str.replace('LGA', '')
df_2016['LGA_CODE_2016'] = df_2016['LGA_CODE_2016'].astype(int)
df_2016 = df_2016.rename(columns={"LGA_CODE_2016": "lga_code"})
weekly_income_2016 = df_2016[['lga_code', 'Median_tot_prsnl_inc_weekly']]

In [154]:
import pandas as pd

# merge the 2016 income and 2021 income
merged_income = weekly_income_2016.merge(weekly_income_2021, on='lga_code', suffixes=('_2016', '_2021'))

# Calculate annual growth rate
merged_income['growth_rate_prsnl_inc'] = (merged_income['Median_tot_prsnl_inc_weekly_2021'] / merged_income['Median_tot_prsnl_inc_weekly_2016']) ** (1/5) - 1

# Create a new dataframe to store the long-format data
long_df = pd.DataFrame(columns=['year', 'lga_code', 'Median_tot_prsnl_inc_weekly'])

# Fill out the data for the years 2016 to 2021
for year in range(2016, 2027):
    if year == 2016:
        temp_df = merged_income[['lga_code', 'Median_tot_prsnl_inc_weekly_2016']]
        temp_df.columns = ['lga_code', 'Median_tot_prsnl_inc_weekly']
    elif year == 2021:
        temp_df = merged_income[['lga_code', 'Median_tot_prsnl_inc_weekly_2021']]
        temp_df.columns = ['lga_code', 'Median_tot_prsnl_inc_weekly']
    else:
        prev_year = year - 1
        temp_df = long_df[long_df['year'] == prev_year].copy()
        
        # Use .map() to match the SA2_CODE_2021 values and apply the growth rate
        temp_df['Median_tot_prsnl_inc_weekly'] = temp_df['Median_tot_prsnl_inc_weekly'] * (1 + temp_df['lga_code'].map(merged_income.set_index('lga_code')['growth_rate_prsnl_inc']))
        
    
    temp_df['year'] = year
    long_df = pd.concat([long_df, temp_df], ignore_index=True)

# Print the long-format dataframe
long_df


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  temp_df['year'] = year
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  temp_df['year'] = year


Unnamed: 0,year,lga_code,Median_tot_prsnl_inc_weekly
0,2016,20110,562
1,2016,20260,556
2,2016,20570,590
3,2016,20660,728
4,2016,20740,507
...,...,...,...
897,2026,27450,978.806597
898,2026,27630,795.570265
899,2026,29399,959.040486
900,2026,29499,0.0


In [155]:
personal_income = long_df.rename(columns={'Median_tot_prsnl_inc_weekly': 'weekly_income'})

In [156]:
merged_df = pd.merge(merged_df, personal_income, on=['year', 'lga_code'], how='left')

In [157]:
test_df = pd.merge(test_df, personal_income, on=['year', 'lga_code'], how='left')

In [160]:
merged_df.to_csv("../data/development/train.csv")

In [161]:
test_df.to_csv("../data/development/test.csv")