In [1]:
import pandas as pd
import numpy as np

# Map properties to LGA

In [2]:
data = pd.read_csv("../data/curated/properties_proximity.csv")

In [3]:
data = data.drop(columns='Unnamed: 0')

In [4]:
data.head()

Unnamed: 0,name,geometry,cost,beds,baths,parkings,LGA_CODE23,LGA_NAME23,Nearest_station,Nearest_park,Nearest_shop,Nearest_hospital,Nearest_school,Nearest_supermarket,distance_to_cbd
0,904/265 Exhibition Street Melbourne VIC 3000,POINT (144.9691204 -37.8095116),850.0,2.0,2.0,1.0,24600,Melbourne,1.486667,4.193333,1.875,5.448333,0.831667,0.743333,2.398333
1,210/422 Collins Street Melbourne VIC 3000,POINT (144.9601487 -37.8170971),475.0,1.0,1.0,0.0,24600,Melbourne,2.421667,6.305,2.455,5.94,3.503333,0.488333,3.843333
2,1902/200 Spencer Street Melbourne VIC 3000,POINT (144.9532465 -37.816228),630.0,2.0,1.0,1.0,24600,Melbourne,2.486667,7.198333,0.753333,5.675,1.56,0.813333,4.753333
3,312B/399 Bourke Street Melbourne VIC 3000,POINT (144.9621291 -37.8147259),450.0,1.0,1.0,0.0,24600,Melbourne,2.425,5.855,2.003333,5.488333,3.716667,0.701667,3.391667
4,3313/228 La Trobe Street Melbourne VIC 3000,POINT (144.962371 -37.8096052),900.0,2.0,1.0,2.0,24600,Melbourne,1.19,4.521667,2.173333,4.193333,1.408333,0.881667,2.156667


In [5]:
def custom_median(series):
    sorted_series = sorted(series)
    n = len(sorted_series)
    if n % 2 == 1:
        return sorted_series[n // 2]
    else:
        return sorted_series[n // 2 - 1]

properties_median = data.groupby(['LGA_NAME23', 'LGA_CODE23']).agg(custom_median).reset_index()

In [6]:
properties_median.head()

Unnamed: 0,LGA_NAME23,LGA_CODE23,name,geometry,cost,beds,baths,parkings,Nearest_station,Nearest_park,Nearest_shop,Nearest_hospital,Nearest_school,Nearest_supermarket,distance_to_cbd
0,Alpine,20110,34 O'Donnell Avenue Myrtleford VIC 3737,POINT (146.7271339 -36.5522656),300.0,2.0,1.0,2.0,46.428333,5.346667,38.88,29.888333,2.526667,2.446667,999.0
1,Ararat,20260,4 Grano Street Ararat VIC 3377,POINT (142.9414137 -37.2818641),400.0,3.0,1.0,1.0,2.428333,28.928333,2.536667,999.0,1.433333,3.033333,999.0
2,Ballarat,20570,3/41 Longley Street Alfredton VIC 3350,POINT (143.8376317 -37.5653948),410.0,3.0,2.0,2.0,7.341667,9.785,6.483333,24.81,2.235,4.776667,999.0
3,Banyule,20660,3 Clinton Street Heidelberg Heights VIC 3081,POINT (145.0598679 -37.7598674),550.0,3.0,1.0,1.0,2.876667,11.215,9.528333,5.353333,1.76,2.63,23.14
4,Bass Coast,20740,23 Carpathia Street Coronet Bay VIC 3984,POINT (145.4968004 -38.4790177),440.0,3.0,2.0,2.0,86.75,32.095,22.93,39.63,3.885,7.376667,116.675


In [7]:
years = list(range(2017, 2027))

train = []
test = []

for year in years:
    # Create a copy of the dataframe for the current year
    temp_df = properties_median.copy()
    
    # Update the 'year' column
    temp_df['year'] = year
    
    # If the year is not 2023, set the 'cost' column to NaN
    if year != 2023:
        temp_df['cost'] = np.nan
    
    # Append the dataframe to the list
    if year < 2024: 
        train.append(temp_df)
    else:
        test.append(temp_df)

train_df = pd.concat(train, ignore_index=True)
test_df = pd.concat(test, ignore_index=True)


# Criminal Rate

In [8]:
offence = pd.read_excel("../data/external_SA2/offence_count.xlsx", sheet_name='Table 03')
offence.head()

Unnamed: 0,Year,Year ending,Local Government Area,Postcode,Suburb/Town Name,Offence Division,Offence Subdivision,Offence Subgroup,Offence Count
0,2023,March,Alpine,3691,Dederang,A Crimes against the person,A20 Assault and related offences,A212 Non-FV Serious assault,1
1,2023,March,Alpine,3691,Dederang,A Crimes against the person,Other crimes against the person,Other crimes against the person,1
2,2023,March,Alpine,3691,Dederang,B Property and deception offences,B40 Theft,B42 Steal from a motor vehicle,2
3,2023,March,Alpine,3691,Dederang,B Property and deception offences,B40 Theft,B49 Other theft,1
4,2023,March,Alpine,3691,Dederang,D Public order and security offences,D10 Weapons and explosives offences,D11 Firearms offences,1


In [9]:
offence_sums = offence.groupby(['Year', 'Local Government Area'])['Offence Count'].sum().reset_index()
offence_sums = offence_sums.rename(columns={'Year': 'year', 'Local Government Area': 'LGA_NAME23'})
merged_df = pd.merge(train_df, offence_sums, on=['year', 'LGA_NAME23'], how='left')

In [10]:
merged_df.head()

Unnamed: 0,LGA_NAME23,LGA_CODE23,name,geometry,cost,beds,baths,parkings,Nearest_station,Nearest_park,Nearest_shop,Nearest_hospital,Nearest_school,Nearest_supermarket,distance_to_cbd,year,Offence Count
0,Alpine,20110,34 O'Donnell Avenue Myrtleford VIC 3737,POINT (146.7271339 -36.5522656),,2.0,1.0,2.0,46.428333,5.346667,38.88,29.888333,2.526667,2.446667,999.0,2017,396
1,Ararat,20260,4 Grano Street Ararat VIC 3377,POINT (142.9414137 -37.2818641),,3.0,1.0,1.0,2.428333,28.928333,2.536667,999.0,1.433333,3.033333,999.0,2017,1249
2,Ballarat,20570,3/41 Longley Street Alfredton VIC 3350,POINT (143.8376317 -37.5653948),,3.0,2.0,2.0,7.341667,9.785,6.483333,24.81,2.235,4.776667,999.0,2017,11885
3,Banyule,20660,3 Clinton Street Heidelberg Heights VIC 3081,POINT (145.0598679 -37.7598674),,3.0,1.0,1.0,2.876667,11.215,9.528333,5.353333,1.76,2.63,23.14,2017,9703
4,Bass Coast,20740,23 Carpathia Street Coronet Bay VIC 3984,POINT (145.4968004 -38.4790177),,3.0,2.0,2.0,86.75,32.095,22.93,39.63,3.885,7.376667,116.675,2017,2613


In [11]:
test_df['Offence Count'] = np.NaN

# Processing Historical Rent

In [12]:
import pandas as pd
from openpyxl import load_workbook

In [13]:
df = pd.read_excel('../data/landing/Quarterly median rents by local government area - March quarter 2023.xlsx', sheet_name='All Properties', engine='openpyxl')

In [14]:
# Identify the "Median" columns
median_columns = df.columns[2::2]

# Create a mapping of old column names to new column names
rename_dict = {old: df[old][0] for old in median_columns}

# Rename the "Median" columns
df.rename(columns=rename_dict, inplace=True)

# Drop the "Count" columns
df = df.drop(df.columns[1::2], axis=1)

# Drop the first row (which contains 'Count' and 'Median' labels)
df = df.drop(0)



In [15]:
new_columns = [
    "Local Government Area",
    "Mar 2016", "Jun 2016", "Sep 2016", "Dec 2016", "Mar 2017", "Jun 2017", "Sep 2017", "Dec 2017", "Mar 2018",
    "Jun 2018", "Sep 2018", "Dec 2018", "Mar 2019", "Jun 2019", "Sep 2019", "Dec 2019", "Mar 2020", "Jun 2020",
    "Sep 2020", "Dec 2020", "Mar 2021", "Jun 2021", "Sep 2021", "Dec 2021", "Mar 2022", "Jun 2022", "Sep 2022",
    "Dec 2022", "Mar 2023"
]
df.columns = new_columns

In [16]:
sep_columns = ['Local Government Area'] + [col for col in df.columns if 'Sep' in col]
df_historical = df[sep_columns]

# Rename columns
df_historical.columns = [col.split(' ')[1] if 'Sep' in col else col for col in df_historical.columns]

In [17]:
df_historical_melted = df_historical.melt(id_vars=['Local Government Area'], 
                                          var_name='year', 
                                          value_name='historical_cost')

df_historical_melted['year'] = df_historical_melted['year'].astype(int)

df_historical_melted = df_historical_melted.rename(columns={'Local Government Area': 'LGA_NAME23'})

In [18]:
merged_df = pd.merge(merged_df, df_historical_melted, on=['year', 'LGA_NAME23'], how='left')
merged_df['cost'] = merged_df['cost'].fillna(merged_df['historical_cost'])

merged_df = merged_df.drop(columns=['historical_cost'])

In [19]:
merged_df.head()

Unnamed: 0,LGA_NAME23,LGA_CODE23,name,geometry,cost,beds,baths,parkings,Nearest_station,Nearest_park,Nearest_shop,Nearest_hospital,Nearest_school,Nearest_supermarket,distance_to_cbd,year,Offence Count
0,Alpine,20110,34 O'Donnell Avenue Myrtleford VIC 3737,POINT (146.7271339 -36.5522656),270,2.0,1.0,2.0,46.428333,5.346667,38.88,29.888333,2.526667,2.446667,999.0,2017,396
1,Ararat,20260,4 Grano Street Ararat VIC 3377,POINT (142.9414137 -37.2818641),260,3.0,1.0,1.0,2.428333,28.928333,2.536667,999.0,1.433333,3.033333,999.0,2017,1249
2,Ballarat,20570,3/41 Longley Street Alfredton VIC 3350,POINT (143.8376317 -37.5653948),280,3.0,2.0,2.0,7.341667,9.785,6.483333,24.81,2.235,4.776667,999.0,2017,11885
3,Banyule,20660,3 Clinton Street Heidelberg Heights VIC 3081,POINT (145.0598679 -37.7598674),395,3.0,1.0,1.0,2.876667,11.215,9.528333,5.353333,1.76,2.63,23.14,2017,9703
4,Bass Coast,20740,23 Carpathia Street Coronet Bay VIC 3984,POINT (145.4968004 -38.4790177),285,3.0,2.0,2.0,86.75,32.095,22.93,39.63,3.885,7.376667,116.675,2017,2613


In [20]:
merged_df.head()

Unnamed: 0,LGA_NAME23,LGA_CODE23,name,geometry,cost,beds,baths,parkings,Nearest_station,Nearest_park,Nearest_shop,Nearest_hospital,Nearest_school,Nearest_supermarket,distance_to_cbd,year,Offence Count
0,Alpine,20110,34 O'Donnell Avenue Myrtleford VIC 3737,POINT (146.7271339 -36.5522656),270,2.0,1.0,2.0,46.428333,5.346667,38.88,29.888333,2.526667,2.446667,999.0,2017,396
1,Ararat,20260,4 Grano Street Ararat VIC 3377,POINT (142.9414137 -37.2818641),260,3.0,1.0,1.0,2.428333,28.928333,2.536667,999.0,1.433333,3.033333,999.0,2017,1249
2,Ballarat,20570,3/41 Longley Street Alfredton VIC 3350,POINT (143.8376317 -37.5653948),280,3.0,2.0,2.0,7.341667,9.785,6.483333,24.81,2.235,4.776667,999.0,2017,11885
3,Banyule,20660,3 Clinton Street Heidelberg Heights VIC 3081,POINT (145.0598679 -37.7598674),395,3.0,1.0,1.0,2.876667,11.215,9.528333,5.353333,1.76,2.63,23.14,2017,9703
4,Bass Coast,20740,23 Carpathia Street Coronet Bay VIC 3984,POINT (145.4968004 -38.4790177),285,3.0,2.0,2.0,86.75,32.095,22.93,39.63,3.885,7.376667,116.675,2017,2613


# Population Data

In [21]:
population = pd.read_csv('../data/external_SA2/Victoria.csv')
mapping_df = pd.read_csv("../data/sa2_to_lga.csv")
mapping_df = mapping_df.drop(columns=['Unnamed: 0'])

In [22]:
population = population[population['SEX'] == 'Persons']
population = population.reset_index(drop=True)

In [23]:
sa2_2016_2021 = pd.read_csv("../data/landing/CG_SA2_2016_SA2_2021.csv")
sa2_2016_2021.head()

Unnamed: 0,SA2_MAINCODE_2016,SA2_NAME_2016,SA2_CODE_2021,SA2_NAME_2021,RATIO_FROM_TO,INDIV_TO_REGION_QLTY_INDICATOR,OVERALL_QUALITY_INDICATOR,BMOS_NULL_FLAG
0,101021007.0,Braidwood,101021007,Braidwood,1.0,Good,Good,0
1,101021008.0,Karabar,101021008,Karabar,1.0,Good,Good,0
2,101021009.0,Queanbeyan,101021009,Queanbeyan,1.0,Good,Good,0
3,101021010.0,Queanbeyan - East,101021010,Queanbeyan - East,1.0,Good,Good,0
4,101021011.0,Queanbeyan Region,101021610,Googong,0.09671,Poor,Good,0


In [24]:
population_sa2_2021 = population.merge(sa2_2016_2021, left_on='SA2_CODE', right_on='SA2_MAINCODE_2016', how='left')

population_sa2_2021.drop('SA2_CODE', axis=1, inplace=True)
population_sa2_2021.rename(columns={'SA2_CODE_2021': 'SA2_CODE21'}, inplace=True)


In [25]:
population_sa2_2021.head()

Unnamed: 0,YEAR,SA2_NAME,SEX,Age0-4,Age5-9,Age10-14,Age15-19,Age20-24,Age25-29,Age30-34,...,Age85+,Total,SA2_MAINCODE_2016,SA2_NAME_2016,SA2_CODE21,SA2_NAME_2021,RATIO_FROM_TO,INDIV_TO_REGION_QLTY_INDICATOR,OVERALL_QUALITY_INDICATOR,BMOS_NULL_FLAG
0,2017,Alfredton,Persons,926,976,1073,971,795,811,868,...,144,12525,201011001.0,Alfredton,201011001,Alfredton,1.0,Good,Good,0
1,2017,Ballarat,Persons,601,694,738,930,854,729,655,...,353,12227,201011002.0,Ballarat,201011002,Ballarat,1.0,Good,Good,0
2,2017,Ballarat - North,Persons,1605,1541,1367,1356,1688,1727,1540,...,599,23906,201011003.0,Ballarat - North,201011481,Ballarat East - Warrenheip,0.395627,Poor,Good,0
3,2017,Ballarat - North,Persons,1605,1541,1367,1356,1688,1727,1540,...,599,23906,201011003.0,Ballarat - North,201011482,Ballarat North - Invermay,0.604373,Poor,Good,0
4,2017,Ballarat - South,Persons,1708,1534,1336,1429,2505,2149,1763,...,598,25033,201011004.0,Ballarat - South,201011483,Canadian - Mount Clear,0.467037,Poor,Good,0


In [26]:
population_sa2_2021 = population_sa2_2021[['YEAR', 'Total', 'SA2_CODE21', 'SA2_NAME_2021']]

In [27]:
population_sa2_2021.head()

Unnamed: 0,YEAR,Total,SA2_CODE21,SA2_NAME_2021
0,2017,12525,201011001,Alfredton
1,2017,12227,201011002,Ballarat
2,2017,23906,201011481,Ballarat East - Warrenheip
3,2017,23906,201011482,Ballarat North - Invermay
4,2017,25033,201011483,Canadian - Mount Clear


In [28]:
population_sa2_2021['SA2_CODE21'] = population_sa2_2021['SA2_CODE21'].astype('int64')
mapping_df['SA2_CODE21'] = mapping_df['SA2_CODE21'].astype('int64')
population_LGA = pd.merge(population_sa2_2021, mapping_df, on=['SA2_CODE21'], how='left')

In [29]:
grouped_population = population_LGA.groupby(['YEAR', 'LGA_CODE23', 'LGA_NAME23'])['Total'].sum().reset_index()

In [30]:
grouped_population = grouped_population.rename(columns={'YEAR': 'year'})

In [31]:
merged_df = pd.merge(merged_df, grouped_population, on=['year', 'LGA_NAME23'], how='left')

In [32]:
test_df = pd.merge(test_df, grouped_population, on = ['year', 'LGA_NAME23'], how = 'left')

In [33]:
# Drop the LGA_CODE23_y column
test_df.drop('LGA_CODE23_y', axis=1, inplace=True)

# Rename the LGA_CODE23_x column to LGA_CODE23
test_df.rename(columns={'LGA_CODE23_x': 'LGA_CODE23'}, inplace=True)
test_df = test_df.rename(columns={'Total': 'population'})

In [34]:
# Drop the LGA_CODE23_y column
merged_df.drop('LGA_CODE23_y', axis=1, inplace=True)

# Rename the LGA_CODE23_x column to LGA_CODE23
merged_df.rename(columns={'LGA_CODE23_x': 'LGA_CODE23'}, inplace=True)


In [35]:
merged_df = merged_df.rename(columns={'Total': 'population'})

# Processing Census Data

In [36]:
import pandas as pd

df_2021 = pd.read_csv("../data/landing/census2021/2021 Census GCP All Geographies for VIC/SA2/VIC/2021Census_G02_VIC_SA2.csv")
df_2021

Unnamed: 0,SA2_CODE_2021,Median_age_persons,Median_mortgage_repay_monthly,Median_tot_prsnl_inc_weekly,Median_rent_weekly,Median_tot_fam_inc_weekly,Average_num_psns_per_bedroom,Median_tot_hhd_inc_weekly,Average_household_size
0,201011001,34,1698,865,370,2218,0.8,1952,2.8
1,201011002,45,1700,842,313,2276,0.7,1573,2.2
2,201011005,42,1662,805,330,2270,0.8,1927,2.7
3,201011006,33,1500,775,360,1855,0.7,1627,2.6
4,201011007,41,1733,802,350,2236,0.8,2065,3.0
...,...,...,...,...,...,...,...,...,...
519,217041478,46,1517,758,280,1911,0.7,1522,2.4
520,217041479,40,1451,760,300,1892,0.7,1451,2.4
521,217041480,44,1387,761,287,1899,0.8,1380,2.2
522,297979799,50,0,2200,0,0,0.0,0,0.0


In [37]:
df_2016 = pd.read_csv("../data/landing/census2016/2016 Census GCP All Geographies for VIC/SA2/VIC/2016Census_G02_VIC_SA2.csv")
df_2016

Unnamed: 0,SA2_MAINCODE_2016,Median_age_persons,Median_mortgage_repay_monthly,Median_tot_prsnl_inc_weekly,Median_rent_weekly,Median_tot_fam_inc_weekly,Average_num_psns_per_bedroom,Median_tot_hhd_inc_weekly,Average_household_size
0,201011001,34,1615,702,310,1833,0.8,1585,2.8
1,201011002,42,1500,670,260,1867,0.8,1327,2.3
2,201011003,39,1347,615,250,1574,0.8,1198,2.3
3,201011004,36,1257,528,240,1242,0.8,945,2.2
4,201011005,38,1580,638,300,1921,0.8,1634,2.8
...,...,...,...,...,...,...,...,...,...
459,217041478,44,1400,617,223,1470,0.8,1217,2.5
460,217041479,39,1500,625,250,1510,0.8,1210,2.5
461,217041480,43,1364,610,250,1498,0.8,1144,2.3
462,297979799,49,0,3000,0,0,0.0,0,0.0


In [38]:
corresponding_table = pd.read_csv("../data/landing/CG_SA2_2016_SA2_2021.csv")
corresponding_table.head()

Unnamed: 0,SA2_MAINCODE_2016,SA2_NAME_2016,SA2_CODE_2021,SA2_NAME_2021,RATIO_FROM_TO,INDIV_TO_REGION_QLTY_INDICATOR,OVERALL_QUALITY_INDICATOR,BMOS_NULL_FLAG
0,101021007.0,Braidwood,101021007,Braidwood,1.0,Good,Good,0
1,101021008.0,Karabar,101021008,Karabar,1.0,Good,Good,0
2,101021009.0,Queanbeyan,101021009,Queanbeyan,1.0,Good,Good,0
3,101021010.0,Queanbeyan - East,101021010,Queanbeyan - East,1.0,Good,Good,0
4,101021011.0,Queanbeyan Region,101021610,Googong,0.09671,Poor,Good,0


In [39]:
# Merge the 2016 census data with the corresponding table
merged_2016 = pd.merge(df_2016, corresponding_table[['SA2_MAINCODE_2016', 'SA2_CODE_2021']], on='SA2_MAINCODE_2016', how='left')

# Drop the old SA2_MAINCODE_2016 column
merged_2016.drop('SA2_MAINCODE_2016', axis=1, inplace=True)

merged_2016


Unnamed: 0,Median_age_persons,Median_mortgage_repay_monthly,Median_tot_prsnl_inc_weekly,Median_rent_weekly,Median_tot_fam_inc_weekly,Average_num_psns_per_bedroom,Median_tot_hhd_inc_weekly,Average_household_size,SA2_CODE_2021
0,34,1615,702,310,1833,0.8,1585,2.8,201011001
1,42,1500,670,260,1867,0.8,1327,2.3,201011002
2,39,1347,615,250,1574,0.8,1198,2.3,201011481
3,39,1347,615,250,1574,0.8,1198,2.3,201011482
4,36,1257,528,240,1242,0.8,945,2.2,201011483
...,...,...,...,...,...,...,...,...,...
527,44,1400,617,223,1470,0.8,1217,2.5,217041478
528,39,1500,625,250,1510,0.8,1210,2.5,217041479
529,43,1364,610,250,1498,0.8,1144,2.3,217041480
530,49,0,3000,0,0,0.0,0,0.0,297979799


In [40]:
df_2016_cleaned = merged_2016.drop_duplicates(subset='SA2_CODE_2021', keep='first')

# Check the shape of the cleaned dataframe
df_2016_cleaned.shape

(524, 9)

In [41]:
import pandas as pd


# Filter columns
df_2021 = df_2021[['SA2_CODE_2021', 'Median_tot_prsnl_inc_weekly']]
df_2016 = df_2016_cleaned[['SA2_CODE_2021', 'Median_tot_prsnl_inc_weekly']]

df_2021['SA2_CODE_2021'] = df_2021['SA2_CODE_2021'].astype(int)
df_2016['SA2_CODE_2021'] = df_2016['SA2_CODE_2021'].astype(int)

# Merge dataframes
merged = df_2021.merge(df_2016, on='SA2_CODE_2021', suffixes=('_2021', '_2016'))

# Calculate annual growth rate
merged['growth_rate_prsnl_inc'] = (merged['Median_tot_prsnl_inc_weekly_2021'] / merged['Median_tot_prsnl_inc_weekly_2016']) ** (1/5) - 1

# Create a new dataframe to store the long-format data
long_df = pd.DataFrame(columns=['year', 'SA2_CODE_2021', 'Median_tot_prsnl_inc_weekly'])

# Fill out the data for the years 2016 to 2021
for year in range(2016, 2027):
    if year == 2016:
        temp_df = merged[['SA2_CODE_2021', 'Median_tot_prsnl_inc_weekly_2016']]
        temp_df.columns = ['SA2_CODE_2021', 'Median_tot_prsnl_inc_weekly']
    elif year == 2021:
        temp_df = merged[['SA2_CODE_2021', 'Median_tot_prsnl_inc_weekly_2021']]
        temp_df.columns = ['SA2_CODE_2021', 'Median_tot_prsnl_inc_weekly']
    else:
        prev_year = year - 1
        temp_df = long_df[long_df['year'] == prev_year].copy()
        
        # Use .map() to match the SA2_CODE_2021 values and apply the growth rate
        temp_df['Median_tot_prsnl_inc_weekly'] = temp_df['Median_tot_prsnl_inc_weekly'] * (1 + temp_df['SA2_CODE_2021'].map(merged.set_index('SA2_CODE_2021')['growth_rate_prsnl_inc']))
        
    
    temp_df['year'] = year
    long_df = pd.concat([long_df, temp_df], ignore_index=True)

# Print the long-format dataframe
long_df


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_2021['SA2_CODE_2021'] = df_2021['SA2_CODE_2021'].astype(int)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_2016['SA2_CODE_2021'] = df_2016['SA2_CODE_2021'].astype(int)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  temp_df['year'] = year
A value is trying to be set on a copy of a slice from 

Unnamed: 0,year,SA2_CODE_2021,Median_tot_prsnl_inc_weekly
0,2016,201011001,702
1,2016,201011002,670
2,2016,201011005,638
3,2016,201011006,595
4,2016,201011007,646
...,...,...,...
5759,2026,217041478,931.222042
5760,2026,217041479,924.16
5761,2026,217041480,949.378689
5762,2026,297979799,1613.333333


In [42]:
long_df = long_df.rename(columns={'SA2_CODE_2021': 'SA2_CODE21', 'Median_tot_prsnl_inc_weekly': 'weekly_income'})

In [43]:
personal_income = pd.merge(long_df, mapping_df, on=['SA2_CODE21'], how='left')
personal_income = personal_income.drop(columns=['SA2_CODE21', 'SA2_NAME21'])

In [44]:
merged_df = pd.merge(merged_df, personal_income, on=['year', 'LGA_NAME23'], how='left')

In [45]:
merged_df = merged_df.drop_duplicates(subset=['LGA_NAME23', 'year'])

In [46]:
merged_df.reset_index(drop=True)
merged_df = merged_df.drop(columns = ['LGA_CODE23_y'])
merged_df = merged_df.rename(columns={'LGA_CODE23_x': 'LGA_CODE23'})

In [47]:
test_df = pd.merge(test_df, personal_income, on=['year', 'LGA_NAME23'], how='left')

In [48]:
test_df = test_df.drop_duplicates(subset=['LGA_NAME23', 'year'])

In [49]:
test_df.reset_index(drop=True)
test_df = test_df.drop(columns = ['LGA_CODE23_y'])
test_df = test_df.rename(columns={'LGA_CODE23_x': 'LGA_CODE23'})

In [50]:
test_df = test_df.reset_index(drop=True)

In [51]:
merged_df = merged_df.reset_index(drop=True)

In [52]:
merged_df

Unnamed: 0,LGA_NAME23,LGA_CODE23,name,geometry,cost,beds,baths,parkings,Nearest_station,Nearest_park,Nearest_shop,Nearest_hospital,Nearest_school,Nearest_supermarket,distance_to_cbd,year,Offence Count,population,weekly_income
0,Alpine,20110,34 O'Donnell Avenue Myrtleford VIC 3737,POINT (146.7271339 -36.5522656),270,2.0,1.0,2.0,46.428333,5.346667,38.880000,29.888333,2.526667,2.446667,999.000000,2017,396,13113.0,621.947682
1,Ararat,20260,4 Grano Street Ararat VIC 3377,POINT (142.9414137 -37.2818641),260,3.0,1.0,1.0,2.428333,28.928333,2.536667,999.000000,1.433333,3.033333,999.000000,2017,1249,11613.0,583.176092
2,Ballarat,20570,3/41 Longley Street Alfredton VIC 3350,POINT (143.8376317 -37.5653948),280,3.0,2.0,2.0,7.341667,9.785000,6.483333,24.810000,2.235000,4.776667,999.000000,2017,11885,152520.0,731.935668
3,Banyule,20660,3 Clinton Street Heidelberg Heights VIC 3081,POINT (145.0598679 -37.7598674),395,3.0,1.0,1.0,2.876667,11.215000,9.528333,5.353333,1.760000,2.630000,23.140000,2017,9703,129192.0,573.955394
4,Bass Coast,20740,23 Carpathia Street Coronet Bay VIC 3984,POINT (145.4968004 -38.4790177),285,3.0,2.0,2.0,86.750000,32.095000,22.930000,39.630000,3.885000,7.376667,116.675000,2017,2613,34166.0,549.541548
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
548,Wodonga,27170,20 Klim Street Killara VIC 3691,POINT (146.8857882 -36.1313959),450.0,3.0,2.0,2.0,10.213333,999.000000,5.975000,81.821667,2.803333,3.890000,999.000000,2023,3750,74233.0,804.199699
549,Wyndham,27260,3 Victor Court Hoppers Crossing VIC 3029,POINT (144.6772403 -37.8273763),480.0,4.0,2.0,2.0,6.440000,12.295000,7.668333,13.736667,2.543333,3.648333,34.695000,2023,18155,685662.0,782.273895
550,Yarra,27350,310/8 Howard Street Richmond VIC 3121,POINT (144.9938461 -37.82125690000001),650.0,2.0,1.0,1.0,3.283333,6.468333,4.310000,4.795000,1.603333,1.371667,9.305000,2023,13415,153858.0,1488.80435
551,Yarra Ranges,27450,315 MACCLESFIELD ROAD Macclesfield VIC 3782,POINT (145.3260355 -37.75560160000001),530.0,3.0,2.0,2.0,4.188333,15.453333,6.746667,9.856667,2.313333,3.796667,44.118333,2023,7130,166556.0,576.7732


In [53]:
merged_df.to_csv("../data/development/train.csv")

In [54]:
test_df.to_csv("../data/development/test.csv")