# Preprocess Historical Data (Part 2)
Combines the historical data with external data sources. Not substantially used in the analysis, as the historical data is limited to very few suburbs.

Written by Daksh Agrawal

In [78]:
import pandas as pd
from sklearn.metrics import mean_absolute_error, mean_squared_error
import numpy as np
import geopandas as gpd

In [79]:
# Read the raw historical rental CSV into a DataFrame and preview its first rows
raw_historical_path = '../data/raw/historical_data.csv'
historical_data = pd.read_csv(raw_historical_path)
historical_data.head()

Unnamed: 0,type,suburb,Count,Date,Median
0,1 bedroom flat,Albert Park-Middle Park-West St Kilda,352.0,Mar 2000,165.0
1,1 bedroom flat,Armadale,210.0,Mar 2000,150.0
2,1 bedroom flat,Carlton North,87.0,Mar 2000,150.0
3,1 bedroom flat,Carlton-Parkville,303.0,Mar 2000,165.0
4,1 bedroom flat,CBD-St Kilda Rd,755.0,Mar 2000,250.0


In [80]:
# Derive the calendar year of each observation from the textual 'Date' column.
# NOTE(review): the previewed rows use 'Mar 2000'-style strings; if every row
# follows that pattern, passing format='%b %Y' would avoid per-element format
# inference -- confirm against the full file before changing.
parsed_dates = pd.to_datetime(historical_data['Date'])
historical_data['year_completed'] = parsed_dates.dt.year

  historical_data['year_completed'] = pd.to_datetime(historical_data['Date']).dt.year


In [81]:
# Replace the textual 'Date' column with its parsed datetime equivalent
date_as_datetime = pd.to_datetime(historical_data['Date'])
historical_data['Date'] = date_as_datetime

  historical_data['Date'] = pd.to_datetime(historical_data['Date'])


In [82]:
# Load the Victorian district boundary shapefile into a GeoDataFrame and preview it
victoria_boundaries_path = '../data/landing/boundaries/Victoria/vic_dist_boundaries.shp'
victoria_gdf = gpd.read_file(victoria_boundaries_path)
victoria_gdf.head()

Unnamed: 0,sa2_code,sa2_name,chg_flag,chg_lbl,sa3_code,sa3_name,sa4_code,sa4_name,gcc_code,gcc_name,ste_code,ste_name,aus_code,aus_name,areasqkm,loci_uri,geometry
0,201011001,Alfredton,0,No change,20101,Ballarat,201,Ballarat,2RVIC,Rest of Vic.,2,Victoria,AUS,Australia,52.7109,http://linked.data.gov.au/dataset/asgsed3/SA2/...,"POLYGON ((143.78281 -37.56667, 143.75557 -37.5..."
1,201011002,Ballarat,0,No change,20101,Ballarat,201,Ballarat,2RVIC,Rest of Vic.,2,Victoria,AUS,Australia,12.3787,http://linked.data.gov.au/dataset/asgsed3/SA2/...,"POLYGON ((143.81896 -37.55583, 143.81644 -37.5..."
2,201011005,Buninyong,0,No change,20101,Ballarat,201,Ballarat,2RVIC,Rest of Vic.,2,Victoria,AUS,Australia,51.5855,http://linked.data.gov.au/dataset/asgsed3/SA2/...,"POLYGON ((143.8417 -37.61597, 143.84175 -37.61..."
3,201011006,Delacombe,0,No change,20101,Ballarat,201,Ballarat,2RVIC,Rest of Vic.,2,Victoria,AUS,Australia,34.1607,http://linked.data.gov.au/dataset/asgsed3/SA2/...,"POLYGON ((143.75049 -37.5912, 143.75044 -37.59..."
4,201011007,Smythes Creek,0,No change,20101,Ballarat,201,Ballarat,2RVIC,Rest of Vic.,2,Victoria,AUS,Australia,104.7274,http://linked.data.gov.au/dataset/asgsed3/SA2/...,"POLYGON ((143.73295 -37.62334, 143.73262 -37.6..."


In [83]:
# Attach an SA2 code to each row by matching the suburb name against the
# boundary file's SA2 names. merge() defaults to an inner join, so suburbs
# without an exactly matching sa2_name are dropped from the result.
sa2_lookup = victoria_gdf[["sa2_name", "sa2_code"]]
historical_data = historical_data.merge(
    sa2_lookup,
    left_on='suburb',
    right_on='sa2_name',
)
historical_data

Unnamed: 0,type,suburb,Count,Date,Median,year_completed,sa2_name,sa2_code
0,1 bedroom flat,Armadale,210.0,2000-03-01,150.0,2000,Armadale,206061135
1,1 bedroom flat,Docklands,,2000-03-01,,2000,Docklands,206041118
2,1 bedroom flat,East Melbourne,203.0,2000-03-01,180.0,2000,East Melbourne,206041119
3,1 bedroom flat,Elwood,585.0,2000-03-01,150.0,2000,Elwood,206051129
4,1 bedroom flat,Fitzroy,176.0,2000-03-01,140.0,2000,Fitzroy,206071142
...,...,...,...,...,...,...,...,...
29619,All properties,Seymour,143.0,2023-03-01,365.0,2023,Seymour,204011059
29620,All properties,Swan Hill,299.0,2023-03-01,320.0,2023,Swan Hill,215031404
29621,All properties,Torquay,547.0,2023-03-01,620.0,2023,Torquay,203031053
29622,All properties,Warragul,497.0,2023-03-01,440.0,2023,Warragul,205011079


In [84]:
# Read the historical population CSV, then parse 'Date' to datetime and cast
# 'SA2_CODE' to string so both can serve as merge keys later on.
population_data = pd.read_csv('../data/raw/population_historical.csv').assign(
    Date=lambda frame: pd.to_datetime(frame['Date']),
    SA2_CODE=lambda frame: frame['SA2_CODE'].astype(str),
)

In [85]:
# Move every population date back by one month so that it lines up with the
# dates used in the historical data
one_month_back = pd.DateOffset(months=-1)
population_data['Date'] = population_data['Date'] + one_month_back

In [86]:
# Join population figures onto the historical data, matching on the full date
# and the SA2 district code (default inner join: unmatched rows are dropped)
historical_keys = ['Date', 'sa2_code']
population_keys = ['Date', 'SA2_CODE']
historical_data = pd.merge(
    historical_data,
    population_data,
    left_on=historical_keys,
    right_on=population_keys,
)
historical_data

Unnamed: 0.1,type,suburb,Count,Date,Median,year_completed,sa2_name,sa2_code,Unnamed: 0,SA2_CODE,Population
0,1 bedroom flat,Armadale,210.0,2000-03-01,150.0,2000,Armadale,206061135,673,206061135,8593.575342
1,1 bedroom flat,Docklands,,2000-03-01,,2000,Docklands,206041118,650,206041118,-427.643836
2,1 bedroom flat,East Melbourne,203.0,2000-03-01,180.0,2000,East Melbourne,206041119,651,206041119,3634.561644
3,1 bedroom flat,Elwood,585.0,2000-03-01,150.0,2000,Elwood,206051129,666,206051129,13222.958904
4,1 bedroom flat,Fitzroy,176.0,2000-03-01,140.0,2000,Fitzroy,206071142,681,206071142,8173.356164
...,...,...,...,...,...,...,...,...,...,...,...
29619,All properties,Seymour,143.0,2023-03-01,365.0,2023,Seymour,204011059,48609,204011059,6686.698630
29620,All properties,Swan Hill,299.0,2023-03-01,320.0,2023,Swan Hill,215031404,49037,215031404,10927.287671
29621,All properties,Torquay,547.0,2023-03-01,620.0,2023,Torquay,203031053,48601,203031053,26174.054795
29622,All properties,Warragul,497.0,2023-03-01,440.0,2023,Warragul,205011079,48630,205011079,24572.342466


In [87]:
# Order rows chronologically within each (suburb, property type) series so that
# consecutive rows are consecutive observations of the same series.
series_keys = ['suburb', 'type']
historical_data = historical_data.sort_values(by=series_keys + ['Date'])

# Fractional change from the previous observation of the same series, for the
# median rental price and for the population.
# pct_change() used to forward-fill missing values implicitly; that default is
# deprecated and emits a FutureWarning when NaNs are present. The forward-fill
# is therefore done explicitly within each series and pct_change() is told not
# to fill, which keeps the results unchanged across pandas versions.
for value_column, change_column in [
    ('Median', 'percentage_change_rental_price'),
    ('Population', 'percentage_change_population'),
]:
    carried_forward = historical_data.groupby(series_keys)[value_column].ffill()
    historical_data[change_column] = carried_forward.groupby(
        [historical_data[key] for key in series_keys]
    ).pct_change(fill_method=None)

  historical_data['percentage_change_rental_price'] = historical_data.groupby(['suburb', 'type'])['Median'].pct_change()


In [88]:
# Display the sorted frame to inspect the two percentage-change columns
historical_data

Unnamed: 0.1,type,suburb,Count,Date,Median,year_completed,sa2_name,sa2_code,Unnamed: 0,SA2_CODE,Population,percentage_change_rental_price,percentage_change_population
15,1 bedroom flat,Altona,87.0,2000-03-01,95.0,2000,Altona,213021341,930,213021341,12297.095890,,
61,1 bedroom flat,Altona,94.0,2000-06-01,100.0,2000,Altona,213021341,1452,213021341,12271.915068,0.052632,-0.002048
107,1 bedroom flat,Altona,97.0,2000-09-01,105.0,2000,Altona,213021341,1974,213021341,12246.457534,0.050000,-0.002074
153,1 bedroom flat,Altona,98.0,2000-12-01,105.0,2000,Altona,213021341,2496,213021341,12221.000000,0.000000,-0.002079
199,1 bedroom flat,Altona,89.0,2001-03-01,105.0,2001,Altona,213021341,3018,213021341,12196.095890,0.000000,-0.002038
...,...,...,...,...,...,...,...,...,...,...,...,...,...
29439,All properties,Wodonga,1233.0,2022-03-01,380.0,2022,Wodonga,204031492,46538,204031492,14863.726027,0.027027,-0.001094
29485,All properties,Wodonga,1267.0,2022-06-01,390.0,2022,Wodonga,204031492,47060,204031492,14847.271233,0.026316,-0.001107
29531,All properties,Wodonga,1251.0,2022-09-01,400.0,2022,Wodonga,204031492,47582,204031492,14830.635616,0.025641,-0.001120
29577,All properties,Wodonga,1191.0,2022-12-01,410.0,2022,Wodonga,204031492,48104,204031492,14814.000000,0.025000,-0.001122


In [89]:
# Write the enriched dataset to the curated data folder, leaving out the row index
curated_output_path = '../data/curated/historical_data.csv'
historical_data.to_csv(curated_output_path, index=False)