In [1]:
import pandas as pd
from sklearn.preprocessing import StandardScaler, Normalizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import root_mean_squared_error, r2_score

In [2]:
all_properties = pd.read_csv("../data/curated/houses_all_properties.csv")
all_properties.head()

Unnamed: 0,address,parking,type,num_schools,cost,suburb,beds,baths,cost/(beds+baths),geometry,...,NUMBER_OF_JOBS_PERSONS_2018-19,NUMBER_OF_JOBS_PERSONS_2019-20,NUMBER_OF_JOBS_PERSONS_2020-21,MEDIAN_INCOME_PERSONS_2016-17,MEDIAN_INCOME_PERSONS_2017-18,MEDIAN_INCOME_PERSONS_2018-19,MEDIAN_INCOME_PERSONS_2019-20,MEDIAN_INCOME_PERSONS_2020-21,centroid,distance_to_centroid
0,"(Leased) 3 Yarra Street, South Yarra VIC 3141",1,Apartment / Unit / Flat,13,460.0,South Yarra,1,1,230.0,POINT (144.992828 -37.838173),...,13060.0,12759.0,12394.0,26306.5,27750.5,29043.5,32390.5,34577.5,POINT (144.9965000772275 -37.83614617221807),0.004194
1,"004B/12 Albert Street, Hawthorn East VIC 3123",0,Apartment / Unit / Flat,14,400.0,Hawthorn East,1,1,200.0,POINT (145.048408 -37.823472),...,15422.0,14730.0,14159.0,26100.5,27117.0,26803.5,31088.0,34211.0,POINT (145.04991149745675 -37.83126983822268),0.007941
2,"04/949 Dandenong Road, Malvern East VIC 3145",0,Apartment / Unit / Flat,10,365.0,Malvern East,1,1,182.5,POINT (145.048262 -37.878631),...,18950.0,18895.0,19034.0,25636.0,26307.0,25956.5,28608.0,30642.5,POINT (145.065853863916 -37.87682309041124),0.017685
3,"0711/333 Exhibition St, Melbourne VIC 3000",0,Apartment / Unit / Flat,12,675.0,Melbourne,2,2,168.75,POINT (144.96834 -37.807831),...,13693.0,12473.0,11002.0,10656.0,10929.0,11120.5,12060.0,14478.0,POINT (144.96859287539095 -37.812787018836396),0.004962
4,"1 & 2/23 Koonawarra Street, Clayton VIC 3168",1,House,5,340.0,Clayton,1,1,170.0,POINT (145.124589 -37.90863),...,6188.0,6445.0,7536.0,11981.0,12020.0,11625.0,12689.5,12635.0,POINT (145.1381453634467 -37.91053619756948),0.01369


In [3]:
stratify_column = 'SA2_Name'

house_counts = all_properties[stratify_column].value_counts()

all_properties = all_properties[all_properties[stratify_column].isin(house_counts[house_counts >= 50].index)]

In [4]:
# Create a mapping dictionary
mapping = {category: index for index, category in enumerate(all_properties['SA2_Name'].unique())}

# Map the categorical variable to numerical indices
all_properties['map_SA2_Name'] = all_properties['SA2_Name'].map(mapping)

all_properties['type'] = all_properties['type'].apply(lambda x: 0 if x == 'House' else 1) 

In [5]:
columns_to_drop = []
    
for item in all_properties.columns:
    if "name" in item:
        columns_to_drop.append(item)
    if "centroid" in item:
        columns_to_drop.append(item)
    if "increase" in item:
        columns_to_drop.append(item)
    if "migration" in item:
        columns_to_drop.append(item)
    if "ERP" in item:
        columns_to_drop.append(item)
    if "NUMBER" in item:
        columns_to_drop.append(item)
    if "MEDIAN" in item:
        columns_to_drop.append(item)
        
all_properties = all_properties.drop(columns_to_drop, axis = 1)

all_properties = all_properties.drop(["address", "cost/(beds+baths)", "suburb", "geometry"], axis = 1)

all_properties.dtypes

parking                                int64
type                                   int64
num_schools                            int64
cost                                 float64
beds                                   int64
baths                                  int64
closest_train_station_distance_km    float64
closest_tram_station_distance_km     float64
closest_hospital_distance_km         float64
closest_grocery_distance_km          float64
SA2_Name                              object
map_SA2_Name                           int64
dtype: object

In [6]:
exclude_columns = ['parking', 'type', 'num_schools', 'beds', 'baths', 'map_SA2_Name', 'cost']
columns_to_scale = [col for col in all_properties.columns if all_properties[col].dtype != object and col not in exclude_columns]

# Initialize the StandardScaler
scaler = Normalizer()

# Scale the selected columns
data_scaled = all_properties.copy()
data_scaled[columns_to_scale] = scaler.fit_transform(data_scaled[columns_to_scale])

In [7]:
display(data_scaled.describe())

Unnamed: 0,parking,type,num_schools,cost,beds,baths,closest_train_station_distance_km,closest_tram_station_distance_km,closest_hospital_distance_km,closest_grocery_distance_km,map_SA2_Name
count,4430.0,4430.0,4430.0,4430.0,4430.0,4430.0,4430.0,4430.0,4430.0,4430.0,4430.0
mean,1.017833,0.640858,8.681264,642.020377,2.358239,1.498871,0.309436,0.381845,0.621051,0.172785,24.918962
std,0.890175,0.479803,3.587438,260.052483,1.101988,0.56453,0.263852,0.388418,0.328015,0.12199,14.836719
min,0.0,0.0,0.0,165.0,1.0,1.0,0.002231,0.002222,0.001991,0.00017,0.0
25%,0.0,0.0,7.0,500.0,2.0,1.0,0.101746,0.065071,0.331758,0.077536,12.0
50%,1.0,1.0,10.0,580.0,2.0,1.0,0.230563,0.149192,0.745395,0.149443,24.0
75%,2.0,1.0,11.0,700.0,3.0,2.0,0.42545,0.888253,0.927273,0.247173,39.0
max,11.0,1.0,15.0,3800.0,9.0,6.0,0.991671,0.999938,0.998828,0.69121,46.0


In [8]:
X = data_scaled.drop(columns=["cost", "SA2_Name"])
y = data_scaled["cost"]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=data_scaled['map_SA2_Name']
)

In [9]:
linear_model = LinearRegression()

linear_model.fit(X_train, y_train)

y_pred_linear = linear_model.predict(X_test)

rmse_linear = root_mean_squared_error(y_test, y_pred_linear) 
r2_linear = r2_score(y_test, y_pred_linear) 

print(f'Root Mean Squared Error: {rmse_linear}')
print(f'R² Score: {r2_linear}')

Root Mean Squared Error: 189.2797034181879
R² Score: 0.5156328672610988


In [10]:
rf_model = RandomForestRegressor(max_depth = 5)

rf_model.fit(X_train, y_train)

y_pred_rf_test = rf_model.predict(X_test)
y_pred_rf_train = rf_model.predict(X_train)

rmse_rf_test = root_mean_squared_error(y_test, y_pred_rf_test) 
rmse_rf_train = root_mean_squared_error(y_train, y_pred_rf_train)
r2_rf_test = r2_score(y_test, y_pred_rf_test)
r2_rf_train = r2_score(y_train, y_pred_rf_train)

print(f'Root Mean Squared Error: Test: {rmse_rf_test} and Train: {rmse_rf_train}')
print(f'R² Score: Test: {r2_rf_test} and Train: {r2_rf_train}')

Root Mean Squared Error: Test: 151.1770198934034 and Train: 137.23582705945043
R² Score: Test: 0.6910144766052859 and Train: 0.7147200224543111


In [11]:
y_pred_series = pd.Series(rf_model.predict(X), name='Predicted').reset_index(drop=True)
X_reset = X.reset_index(drop=True)
predicted_test = pd.concat([X_reset, y_pred_series], axis=1)
predicted_test = pd.concat([predicted_test, data_scaled["SA2_Name"]], axis = 1)

In [12]:
aggregated_df = predicted_test.groupby("SA2_Name")['Predicted'].median().reset_index(name='SA2_Median')
aggregated_df

Unnamed: 0,SA2_Name,SA2_Median
0,Albert Park,589.984695
1,Alfredton,537.314066
2,Barwon Heads - Armstrong Creek,630.539136
3,Bendigo,540.633839
4,Carlton,541.725559
5,Carnegie,613.329411
6,Caulfield - North,613.329411
7,Clyde North - South,540.253866
8,Docklands,540.551672
9,Doncaster,613.329411


In [13]:
threshold_subs = aggregated_df["SA2_Name"].unique()

In [14]:
len(threshold_subs)

47

In [15]:
subs_w_hist = pd.read_csv("../data/curated/suburb_w_hist.csv")
subs_w_hist['Suburb'] = subs_w_hist['Suburb'].replace('Melbourne', 'CBD')

In [16]:
contain = []
not_contain = []

for suburb in subs_w_hist["Suburb"].unique():
    found = False
    for sub in threshold_subs:
        if suburb == sub:
            contain.append((suburb, sub))
            break
        elif suburb in sub:
            contain.append((suburb, sub))
            found = True
    if not found:
        not_contain.append(suburb)
    
comp = pd.DataFrame(contain, columns = ["sub_w_hist", "sa2"])

In [17]:
len(comp["sa2"].unique())

35

In [18]:
from collections import Counter

count = Counter(comp["sa2"])

duplicate = []

for item in count:
    if count[item] > 1:
        print(item, count[item])
        duplicate.append(item)

Malvern - Glen Iris 2
Hawthorn East 2
Malvern East 2
Prahran - Windsor 2
St Kilda East 2


In [19]:
results = []

for suburb in comp["sa2"]:
    
    if suburb in duplicate:
        if "-" in suburb:
            hist_subs = list(comp[comp["sa2"] == suburb]["sub_w_hist"])
            hist_data = pd.concat([subs_w_hist[subs_w_hist["Suburb"] == hist_subs[0]][['date', 'median_rental_price']], 
                                   subs_w_hist[subs_w_hist["Suburb"] == hist_subs[1]][['date', 'median_rental_price']]])
        else:
            hist_subs = list(comp[comp["sa2"] == suburb]["sub_w_hist"])
            if hist_subs[0] == suburb:
                hist_data = subs_w_hist[subs_w_hist["Suburb"] == hist_subs[0]][['date', 'median_rental_price']]
            else:
                hist_data = subs_w_hist[subs_w_hist["Suburb"] == hist_subs[1]][['date', 'median_rental_price']]
                
    else:
        hist_sub = list(comp[comp["sa2"] == suburb]["sub_w_hist"])[0]
        hist_data = subs_w_hist[subs_w_hist["Suburb"] == hist_sub][['date', 'median_rental_price']]
        
    sep_2024_median_df = pd.DataFrame({'date': '2024-09-01', 'median_rental_price': aggregated_df[aggregated_df["SA2_Name"] == suburb]["SA2_Median"]})
    suburb_df = pd.concat([hist_data, sep_2024_median_df], ignore_index=True)
    suburb_df['date'] = pd.to_datetime(suburb_df['date'])
    
    # Use only recent data for linear regression
    suburb_df = suburb_df[suburb_df['date'] >= '2021-01-01']

    # Extract the year from the date
    suburb_df['year'] = suburb_df['date'].dt.year

    # Prepare data for the linear regression model
    X_historical = suburb_df[['year']]  # Independent variable (year)
    y_historical = suburb_df['median_rental_price']  # Dependent variable (rental price)

    # Fit a linear regression model
    future_model = LinearRegression()
    future_model.fit(X_historical, y_historical)

    # Forecast for the years 2025, 2026, and 2027
    forecast_years = pd.DataFrame({'year': [2025, 2026, 2027]})
    forecasted_prices = future_model.predict(forecast_years)

    # Create a DataFrame for the forecast results
    forecast_yearly = pd.DataFrame({
        'Suburb': [suburb] * 3,
        'forecasted_median_rental_price': forecasted_prices
    }, index=[f'2025', f'2026', f'2027'])
    
    results.append(forecast_yearly)

# Combine all suburb forecasts
forecast_df_combined = pd.concat(results)

# Pivot the table to have 'Suburb' as columns and years as rows
final_results = forecast_df_combined.pivot_table(values='forecasted_median_rental_price', index=forecast_df_combined.index, columns='Suburb')

# Transpose and reset the index for the final result
final_df = final_results.T.reset_index()

# Rename the columns for clarity
final_df = final_df.rename(columns={'index': 'Suburb'})

# Reorder the DataFrame columns to have 'Suburb' first, followed by dates
columns_order = ['Suburb'] + [col for col in final_df.columns if col != 'Suburb']

final_df = final_df[columns_order]

In [20]:
final_df.to_csv("../data/curated/future_predict.csv")