In [1]:
import pandas as pd
import geopandas as gpd
from shapely import wkt
import statsmodels.api as sm
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

# Import Rental Data


In [2]:
rental_data = pd.read_csv("../data/raw/rental_data.csv")

In [3]:
# get a new column year from the rental dataframe
rental_data['year'] = pd.DatetimeIndex(rental_data['date']).year
rental_data

Unnamed: 0,lat,lng,address,bed,bath,car,type,rented_price,date,year
0,-37.813730,144.955580,"201/560 LONSDALE STREET, MELBOURNE",2.0,2.0,1.0,Unit/apmt,800,2023-08-01,2023
1,-37.813730,144.955580,"201/560 LONSDALE STREET, MELBOURNE",2.0,2.0,1.0,Unit/apmt,730,2024-03-01,2024
2,-37.813730,144.955580,"201/560 LONSDALE STREET, MELBOURNE",2.0,2.0,1.0,Unit/apmt,800,2023-07-01,2023
3,-37.813730,144.955580,"201/560 LONSDALE STREET, MELBOURNE",2.0,2.0,1.0,Unit/apmt,540,2021-08-01,2021
4,-37.813730,144.955580,"1702/560 LONSDALE STREET, MELBOURNE",2.0,1.0,0.0,Unit/apmt,720,2023-08-01,2023
...,...,...,...,...,...,...,...,...,...,...
3388731,-37.883631,144.732710,"96 BOARDWALK BOULEVARD, POINT COOK",3.0,2.0,2.0,Unit/apmt,330,2011-12-01,2011
3388732,-37.883631,144.732710,"96 BOARDWALK BOULEVARD, POINT COOK",3.0,2.0,2.0,Unit/apmt,330,2011-11-01,2011
3388733,-37.883631,144.732710,"96 BOARDWALK BOULEVARD, POINT COOK",3.0,2.0,2.0,Unit/apmt,280,2009-11-01,2009
3388734,-37.895171,144.759854,"53 BROADBEACH CIRCUIT, POINT COOK",4.0,2.0,2.0,Townhouse,420,2019-09-01,2019


Select only years after 2017

In [5]:
rental_data = rental_data[rental_data['year'] >= 2017]

# Import population and income data

In [7]:
population_data = pd.read_csv("../data/raw/population/population-01-23.csv")
income_data_2 = pd.read_csv("../data/raw/income/income-17-21.csv")

population_data

Unnamed: 0,sa2_code,sa2_name,year,population,geometry
0,201011001,Alfredton,2001,5756.0,POLYGON ((143.78282104711133 -37.5666578080732...
1,201011002,Ballarat,2001,11497.0,POLYGON ((143.81896375194268 -37.5558184903035...
2,201011005,Buninyong,2001,5320.0,POLYGON ((143.8417079819547 -37.61596158212406...
3,201011006,Delacombe,2001,4154.0,POLYGON ((143.7505006002539 -37.59119089617515...
4,201011007,Smythes Creek,2001,3317.0,POLYGON ((143.7329551500814 -37.62333175526272...
...,...,...,...,...,...
12001,217031476,Otway,2023,3983.0,MULTIPOLYGON (((143.40263218222083 -38.7815167...
12002,217041477,Moyne - East,2023,7132.0,POLYGON ((142.41438437646818 -38.0930256782602...
12003,217041478,Moyne - West,2023,10148.0,MULTIPOLYGON (((142.00870118938005 -38.4171452...
12004,217041479,Warrnambool - North,2023,22762.0,POLYGON ((142.4366836580644 -38.35544112113083...


In [8]:
# change the type of income_data_2 to float
income_data_2['median_income'] = income_data_2['median_income'].str.replace(',', '').astype(float)

In [9]:
population_data["geometry"] = population_data["geometry"].apply(wkt.loads)
income_data_2["geometry"] = income_data_2["geometry"].apply(wkt.loads)

gdf_population = gpd.GeoDataFrame(population_data, geometry="geometry", crs="EPSG:4326")
gdf_income_2 = gpd.GeoDataFrame(income_data_2, geometry="geometry", crs="EPSG:4326")

gdf_rental = gpd.GeoDataFrame(rental_data, geometry=gpd.points_from_xy(rental_data["lng"], rental_data["lat"]), crs="EPSG:4326")

# join rental data with population and income

In [10]:
# left sptail join rental lat lon on population geometry column

joined_gdf = gpd.sjoin(gdf_rental,gdf_population, how="left", predicate = "within", on_attribute=["year"])


In [11]:
joined_gdf = joined_gdf.dropna()

Only selecting the data recorded after year 2016

In [14]:
joined_gdf2 = joined_gdf[joined_gdf["year"] > 2016]

In [15]:
joined_gdf2.drop(columns=["index_right"], inplace=True)
joined_gdf_new2 = gpd.sjoin(joined_gdf2,gdf_income_2, how="left", predicate = "within", on_attribute=["year"])
joined_gdf_new2

Unnamed: 0,lat,lng,address,bed,bath,car,type,rented_price,date,year,geometry,sa2_code_left,sa2_name_left,population,index_right,sa2_code_right,sa2_name_right,median_income
0,-37.813730,144.955580,"201/560 LONSDALE STREET, MELBOURNE",2.0,2.0,1.0,Unit/apmt,800,2023-08-01,2023,POINT (144.95558 -37.81373),206041505.0,Melbourne CBD - West,20027.0,,,,
2,-37.813730,144.955580,"201/560 LONSDALE STREET, MELBOURNE",2.0,2.0,1.0,Unit/apmt,800,2023-07-01,2023,POINT (144.95558 -37.81373),206041505.0,Melbourne CBD - West,20027.0,,,,
3,-37.813730,144.955580,"201/560 LONSDALE STREET, MELBOURNE",2.0,2.0,1.0,Unit/apmt,540,2021-08-01,2021,POINT (144.95558 -37.81373),206041505.0,Melbourne CBD - West,16098.0,2364.0,206041505.0,Melbourne CBD - West,39300.0
4,-37.813730,144.955580,"1702/560 LONSDALE STREET, MELBOURNE",2.0,1.0,0.0,Unit/apmt,720,2023-08-01,2023,POINT (144.95558 -37.81373),206041505.0,Melbourne CBD - West,20027.0,,,,
5,-37.813730,144.955580,"1702/560 LONSDALE STREET, MELBOURNE",2.0,1.0,0.0,Unit/apmt,650,2023-06-01,2023,POINT (144.95558 -37.81373),206041505.0,Melbourne CBD - West,20027.0,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3388726,-37.883631,144.732710,"96 BOARDWALK BOULEVARD, POINT COOK",3.0,2.0,2.0,Unit/apmt,370,2019-09-01,2019,POINT (144.73271 -37.88363),213051581.0,Point Cook - North West,9739.0,1408.0,213051581.0,Point Cook - North West,57433.0
3388727,-37.883631,144.732710,"96 BOARDWALK BOULEVARD, POINT COOK",3.0,2.0,2.0,Unit/apmt,370,2019-08-01,2019,POINT (144.73271 -37.88363),213051581.0,Point Cook - North West,9739.0,1408.0,213051581.0,Point Cook - North West,57433.0
3388728,-37.883631,144.732710,"96 BOARDWALK BOULEVARD, POINT COOK",3.0,2.0,2.0,Unit/apmt,370,2019-07-01,2019,POINT (144.73271 -37.88363),213051581.0,Point Cook - North West,9739.0,1408.0,213051581.0,Point Cook - North West,57433.0
3388729,-37.883631,144.732710,"96 BOARDWALK BOULEVARD, POINT COOK",3.0,2.0,2.0,Unit/apmt,365,2018-06-01,2018,POINT (144.73271 -37.88363),213051581.0,Point Cook - North West,9678.0,889.0,213051581.0,Point Cook - North West,57001.0


In [40]:
joined_gdf = joined_gdf_new2

joined_gdf_new2 is the dataframe that has the data from 2021 - 2023

In [41]:
joined_gdf = joined_gdf.dropna()

In [42]:
joined_gdf.drop(columns = "index_right",inplace=True)

KeyError: "['index_right'] not found in axis"

# Import cpidata, unemployement rate and housing interest data

In [27]:
cpi_data = pd.read_csv("../data/curated/cpi_data.csv")
unemployment_data = pd.read_csv("../data/curated/yearly_avg_unemployment.csv")
housing_interest_data = pd.read_csv("../data/curated/housing_interest.csv")

In [28]:
# change the time column to year only
housing_interest_data["time"] = housing_interest_data["time"].apply(lambda x: x[:4])

In [29]:
# aggregate the housing_interest_data by year
housing_interest_data = housing_interest_data.groupby("time")["interest_rate"].mean().reset_index()
housing_interest_data

Unnamed: 0,time,interest_rate
0,2019,4.171667
1,2020,3.656667
2,2021,3.415
3,2022,4.308333
4,2023,6.374167
5,2024,6.661667


In [30]:
housing_interest_data.rename(columns={"time":"year"}, inplace=True)

In [31]:
unemployment_data.rename(columns = {"Yearly Average Seasonally Adjusted (%)":"unemployment_rate"}, inplace=True)

In [32]:
cpi_data.drop(columns = ["time","Change from previous quarter (%)"] ,inplace=True)
cpi_data.rename(columns = {"Annual change (%)" : "cpi" }, inplace=True)
# add a row in unemployment_data that 2024 has rate 4.2
new_row = pd.DataFrame({"year": [2024], "cpi": [3.8]})
cpi_data = pd.concat([cpi_data, new_row], ignore_index=True)
cpi_data

Unnamed: 0,cpi,year
0,1.7,2014
1,1.7,2015
2,1.5,2016
3,1.9,2017
4,1.8,2018
5,1.8,2019
6,0.9,2020
7,3.5,2021
8,7.8,2022
9,4.1,2023


In [33]:
# add a row in unemployment_data that 2021 has rate 4.2
new_row = pd.DataFrame({"year": [2021], "unemployment_rate": [4.2]})
unemployment_data = pd.concat([unemployment_data, new_row], ignore_index=True)
unemployment_data

Unnamed: 0,year,unemployment_rate
0,2014,6.216667
1,2015,6.058333
2,2016,5.716667
3,2017,5.608333
4,2018,5.283333
5,2019,5.175
6,2020,5.166667
7,2022,3.611111
8,2023,3.691667
9,2024,4.014286


# model median income using time series

In [96]:
# join the cpi_data and unemployment_data on year
joined_gdf_with_cpi = pd.merge(joined_gdf, cpi_data, on="year", how="left")
joined_gdf_with_cpi_and_unem = pd.merge(joined_gdf_with_cpi, unemployment_data, on="year", how="left")

In [98]:
joined_gdf_with_cpi_and_unem.drop(columns = ["lat","lng","address","sa2_code_left","sa2_name_left","sa2_code_right","date"], inplace=True)
joined_gdf_with_cpi_and_unem.rename(columns={"sa2_name_right":"region"},inplace = True)

In [100]:
# find the unique values of the region column
unique_region = joined_gdf_with_cpi_and_unem["region"].unique()
median_income_pred = pd.DataFrame(columns=["year","region","median_income"])

In [102]:
# Predict using each data for a specific region

for j in range(2022,2030):
    for i in range(0,len(unique_region)):
        
        region_0 = joined_gdf_with_cpi_and_unem[joined_gdf_with_cpi_and_unem["region"] == unique_region[i]]
        X = region_0[["cpi","year","unemployment_rate"]]
        y = region_0["median_income"]

        model = LinearRegression()
        model.fit(X,y)

        data_2022 = [cpi_data[cpi_data["year"] == j]["cpi"].values[0],j,unemployment_data[unemployment_data["year"] == j]["unemployment_rate"].values[0]]

        new_row = pd.DataFrame({"year": [j], "region": [unique_region[i]], "median_income": model.predict([data_2022])[0]})

        median_income_pred = pd.concat([median_income_pred, new_row], ignore_index=True)

median_income_pred.rename(columns={"median_income":"median_pred_income"},inplace=True)
income_data_2.rename(columns={"sa2_name":"region"}, inplace=True)
income_data_2.head(1)

# join the median_income_pred with income_data_2 on region
median_income_pred = pd.merge(median_income_pred, income_data_2, on="region", how="left")

median_income_pred.drop(columns=["year_y","median_income"], inplace=True)
median_income_pred.rename(columns={"year_x":"year"}, inplace=True)

median_income_pred.drop_duplicates(inplace=True)
median_income_pred.rename(columns={"median_pred_income":"median_income"}, inplace=True)

# join the median_income_pred with income_data_2 on region
income_data_full = pd.concat([income_data_2, median_income_pred], ignore_index=True)
income_data_full

  median_income_pred = pd.concat([median_income_pred, new_row], ignore_index=True)


2022




2023




2024


IndexError: index 0 is out of bounds for axis 0 with size 0

create new dataframe that used our prediction

In [None]:
gdf_income_new = gpd.GeoDataFrame(income_data_full, geometry="geometry", crs="EPSG:4326")

In [76]:
final_rental = gpd.sjoin(gdf_rental,gdf_income_new, how="left", predicate = "within", on_attribute=["year"])
final_rental.drop(columns=["index_right"], inplace=True)
final_rental = gpd.sjoin(final_rental,gdf_population, how="left", predicate = "within", on_attribute=["year"])
final_rental = final_rental.dropna()

In [78]:
final_rental = final_rental.merge(cpi_data, on="year", how="left")
final_rental = final_rental.merge(unemployment_data, on="year", how="left")


In [80]:
final_rental.to_csv("../data/curated/rental_data_cleaned.csv", index=False)

In [39]:
income_data_full.to_csv("../data/curated/predicted_median_income.csv")

# Population prediction from abs

In [68]:
population_pred = pd.read_csv("../data/landing/population_prediction.csv")


population_pred.dropna(inplace=True)

population_pred["Statistical Areas Level 2 2021 code"] = population_pred["Statistical Areas Level 2 2021 code"].astype(str)
# remove all rows that doesn't start with 2 in Statistical Areas Level 2 2021 code

population_pred = population_pred[population_pred["Statistical Areas Level 2 2021 code"].str.startswith("2")]


  population_pred = pd.read_csv("../data/landing/population_prediction.csv")


In [69]:
# remove all columns other than the Projected persons (total) column
population_pred = population_pred[["Statistical Areas Level 2 2021 code","Projected persons (total)","Statistical Areas Level 2 2021 name","Reference date"]]

In [70]:
population_pred["year"] = population_pred["Reference date"].apply(lambda x: x[5:10])

# find the distinct values of the Statistical Areas Level 2 2021 code
unique_num = population_pred["Statistical Areas Level 2 2021 code"].unique()
len(unique_num)

522

In [81]:
population_pred.to_csv("../data/curated/population_prediction.csv", index=False)