In [2]:
import pandas as pd
import numpy as np
import geopandas as gpd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.dummy import DummyRegressor
from sklearn.linear_model import LinearRegression
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score

from tqdm.auto import tqdm
tqdm.pandas()


import sys, os
sys.path.append(os.path.abspath('../'))
from scripts.utils import create_dir, get_runtime
import time
start_time = time.time()


In [None]:
# Minimum number of rows a per-suburb partition must contain to be kept by the
# partition-size filter applied later in this notebook.
MIN_INSTANCES_PER_SUBURB = 100

In [8]:
# Read the curated rental listings, store the SA2 code as a string (via the
# nullable integer dtype), and keep only the columns used in this notebook.
rental_df = (
    pd.read_csv('../data/curated/rental-17-24.csv')
    .assign(sa2_code=lambda raw: raw['sa2_code'].astype('Int64').astype(str))
    .loc[:, [
        'suburb', 'sa2_code', 'type', 'year',
        'bed', 'bath', 'car',
        'median_income', 'population', 'cpi', 'unemployment_rate',
        'time_city', 'avg_property_price',
        'rented_price',
    ]]
)
rental_df

Unnamed: 0,suburb,sa2_code,type,year,bed,bath,car,median_income,population,cpi,unemployment_rate,time_city,avg_property_price,rented_price
0,MELBOURNE,206041505,Unit/apmt,2023,2.0,2.0,1.0,44492.400500,20027.0,5.60,3.691667,263.3,682488.770000,800
1,MELBOURNE,206041505,Unit/apmt,2023,2.0,2.0,1.0,44492.400500,20027.0,5.60,3.691667,263.3,682488.770000,800
2,MELBOURNE,206041505,Unit/apmt,2021,2.0,2.0,1.0,39300.000000,16098.0,3.50,4.200000,263.3,619543.745192,540
3,MELBOURNE,206041505,Unit/apmt,2023,2.0,1.0,0.0,44492.400500,20027.0,5.60,3.691667,263.3,682488.770000,720
4,MELBOURNE,206041505,Unit/apmt,2023,2.0,1.0,0.0,44492.400500,20027.0,5.60,3.691667,263.3,682488.770000,650
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1409722,PAKENHAM,212011552,House,2024,3.0,1.0,0.0,55683.420715,28466.0,3.53,4.014286,3251.4,843219.858156,520
1409723,PAKENHAM,212011551,House,2024,3.0,2.0,2.0,57772.670788,7044.0,3.53,4.014286,3126.4,843219.858156,500
1409724,PAKENHAM,212011551,House,2024,3.0,2.0,2.0,57772.670788,7044.0,3.53,4.014286,3126.4,843219.858156,500
1409725,PAKENHAM,212011550,Unit/apmt,2024,4.0,2.0,2.0,58924.808098,9820.0,3.53,4.014286,3297.1,843219.858156,580


In [9]:
# Rows from 2017 through 2023 (both ends inclusive) go to hist_df;
# rows from 2024 go to curr_df.
hist_df = rental_df[rental_df['year'].between(2017, 2023)]
curr_df = rental_df[rental_df['year'].eq(2024)]

In [10]:
# Partition the curr and hist dataframes by dwelling type: 'House' and
# 'Unit/apmt'. The 'type' column is constant within each partition, so it is
# dropped. A non-inplace `drop` is used on the filtered result so that each
# partition is an independent frame: mutating a boolean-filtered slice in place
# raises SettingWithCopyWarning and makes the cell depend on hidden state.
hist_house_df = hist_df[hist_df['type'] == 'House'].drop(columns=['type'])
hist_unit_df = hist_df[hist_df['type'] == 'Unit/apmt'].drop(columns=['type'])
curr_house_df = curr_df[curr_df['type'] == 'House'].drop(columns=['type'])
curr_unit_df = curr_df[curr_df['type'] == 'Unit/apmt'].drop(columns=['type'])

# print the shape of each dataframe
print('hist_house_df:', hist_house_df.shape)
print('hist_unit_df:', hist_unit_df.shape)
print('curr_house_df:', curr_house_df.shape)
print('curr_unit_df:', curr_unit_df.shape)

hist_house_df: (488339, 14)
hist_unit_df: (742645, 14)
curr_house_df: (29701, 14)
curr_unit_df: (57744, 14)


In [16]:
def get_suburb_partitions(df):
    """Split ``df`` into one sub-frame per distinct value of its 'suburb' column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a 'suburb' column.

    Returns
    -------
    dict
        Maps each suburb to the rows of ``df`` belonging to it. Keys appear in
        order of first occurrence, and each partition keeps the original index
        and row order.

    Notes
    -----
    A single ``groupby`` pass replaces one full boolean scan per suburb.
    Rows whose suburb is missing are skipped (``groupby`` drops NA keys by
    default), so no empty partition is created under a NaN key.
    """
    # sort=False keeps first-appearance key order, matching df['suburb'].unique().
    return {
        suburb: partition
        for suburb, partition in df.groupby('suburb', sort=False)
    }

# Build the suburb -> partition dictionary for each of the four frames.
(
    hist_house_partitions,
    hist_unit_partitions,
    curr_house_partitions,
    curr_unit_partitions,
) = map(
    get_suburb_partitions,
    (hist_house_df, hist_unit_df, curr_house_df, curr_unit_df),
)

In [17]:
# Keep only the suburb partitions that have at least MIN_INSTANCES_PER_SUBURB
# rows. The "after removal" counts are taken from the filtered dictionaries;
# reporting the unfiltered ones would print the same number before and after.
print('Num suburbs in hist house before removal', len(hist_house_partitions))
hist_house_parts_new = {
    suburb: partition
    for suburb, partition in hist_house_partitions.items()
    if partition.shape[0] >= MIN_INSTANCES_PER_SUBURB
}
print('Num suburbs in hist house after removal', len(hist_house_parts_new))

print('Num suburbs in hist unit before removal', len(hist_unit_partitions))
hist_unit_parts_new = {
    suburb: partition
    for suburb, partition in hist_unit_partitions.items()
    if partition.shape[0] >= MIN_INSTANCES_PER_SUBURB
}
print('Num suburbs in hist unit after removal', len(hist_unit_parts_new))


Num suburbs in hist house before removal 1130
Num suburbs in hist house after removal 1130
Num suburbs in hist unit before removal 963
Num suburbs in hist unit after removal 963


In [7]:
# # Convert preprocessed data back into a DataFrame with proper feature names
# preprocessed_df = pd.DataFrame(X_preprocessed, columns=all_features)

# # Compute correlation matrix (add back the target column)
# preprocessed_df['rented_price'] = y.copy()
# corr_matrix = preprocessed_df.corr()

# # Plot the correlation matrix
# plt.figure(figsize=(10, 8))
# sns.heatmap(corr_matrix, annot=True, fmt=".2f", cmap="coolwarm", cbar=True)
# plt.title("Correlation Matrix of Scaled/Encoded Dataset")
# plt.show()

In [8]:
# # Define the model and Sequential Feature Selector
# model = LinearRegression()
# sfs = SequentialFeatureSelector(model, n_features_to_select=5, direction='backward')

# # Creating the pipeline with preprocessing, feature selection, and the model
# pipeline = Pipeline(steps=[
#     ('preprocessor', preprocessor),
#     ('feature_selection', sfs),
#     ('model', model)
# ])

# # Fitting the pipeline
# pipeline.fit(X, y)

# # Output selected feature indices
# selected_indices = sfs.get_support(indices=True)
# selected_feature_names = [all_features[i] for i in selected_indices]

# print(f"Selected feature names: {selected_feature_names}")

In [9]:
# Column groups consumed by the preprocessor. They must be real lists of
# column names defined in this cell: leaving them commented out makes the cell
# raise NameError on a fresh kernel, and a single comma-joined string would be
# treated as one (non-existent) column name.
numeric_features = [
    'bed', 'bath', 'car', 'year', 'median_income', 'population', 'cpi',
    'unemployment_rate', 'time_city',
]
categorical_features = ['type']

# Standardise the numeric columns and one-hot encode 'type'.
# NOTE(review): the per-type frames built earlier in this notebook drop the
# 'type' column; fit this preprocessor only on frames that still contain it.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numeric_features),
        ('cat', OneHotEncoder(), categorical_features)
    ]
)

def run_0R(X_train, X_test, y_train):
    """Fit a mean-strategy DummyRegressor baseline and predict for ``X_test``.

    Returns the array of predictions produced for ``X_test``.
    """
    baseline = DummyRegressor(strategy='mean').fit(X_train, y_train)
    return baseline.predict(X_test)

def run_LR(X_train, X_test, y_train):
    """Fit a LinearRegression on the training data and predict for ``X_test``.

    Returns the array of predictions produced for ``X_test``.
    """
    regressor = LinearRegression().fit(X_train, y_train)
    return regressor.predict(X_test)

def run_RF(X_train, X_test, y_train, n_estimators=100, random_state=42, n_jobs=None):
    """Fit a RandomForestRegressor on the training data and predict for ``X_test``.

    Parameters
    ----------
    X_train, y_train
        Training features and targets passed to ``fit``.
    X_test
        Features to predict for.
    n_estimators : int, default 100
        Number of trees in the forest.
    random_state : int or None, default 42
        Seed forwarded to the forest so repeated runs give the same model.
    n_jobs : int or None, default None
        Parallelism forwarded to the forest (e.g. ``-1`` to use all cores).

    Returns
    -------
    The array of predictions produced for ``X_test``.

    The defaults reproduce the previously hard-coded configuration, so
    existing three-argument calls behave exactly as before.
    """
    rf_model = RandomForestRegressor(
        n_estimators=n_estimators,
        random_state=random_state,
        n_jobs=n_jobs,
    )
    rf_model.fit(X_train, y_train)
    return rf_model.predict(X_test)

def display_metrics(y_test, y_pred_zero_r, y_pred_lr, y_pred_rf):
    """Print MSE and R2 for the three models, then the variance of ``y_test``.

    Parameters
    ----------
    y_test
        True target values; must expose a ``.var()`` method.
    y_pred_zero_r, y_pred_lr, y_pred_rf
        Predictions of the ZeroR baseline, linear regression and random
        forest models for the same rows as ``y_test``.
    """
    labelled_predictions = (
        ("ZeroR (Baseline) Model", y_pred_zero_r),
        ("Linear Regression Model", y_pred_lr),
        ("Random Forest Model", y_pred_rf),
    )
    # Compute every score before printing anything, so a failure in any metric
    # produces no partial report.
    scores = [
        (label, mean_squared_error(y_test, predicted), r2_score(y_test, predicted))
        for label, predicted in labelled_predictions
    ]
    target_variance = y_test.var()

    final_position = len(scores) - 1
    for position, (label, mse, r2) in enumerate(scores):
        print(label)
        print(f"Mean Squared Error (MSE): {mse}")
        print(f"R-Squared (R2 Score): {r2}")
        # A blank line separates the model sections; none follows the last one.
        if position != final_position:
            print()
    print(f"Variance of the target variable: {target_variance}")

In [10]:
# for df in dfs.values():
#     # Defining features and target
#     X = df.drop(columns=['rented_price'])  # Features
#     y = df['rented_price']  # Target
#     # Defining the model and Sequential Feature Selector
#     # model = LinearRegression()
#     # sfs = SequentialFeatureSelector(model, direction='backward')
#     X_preprocessed = preprocessor.fit_transform(X)
#     # Get feature names
#     all_features = numeric_features + list(
#         preprocessor.named_transformers_['cat'].get_feature_names_out(['type'])
#     )
#     X_train, X_test, y_train, y_test = train_test_split(
#         X_preprocessed, y, test_size=0.2, random_state=42
#     )
#     # Run 0R, LR, and RF models and Display metrics
#     display_metrics(
#         y_test,
#         run_0R(X_train, X_test, y_train),
#         run_LR(X_train, X_test, y_train),
#         run_RF(X_train, X_test, y_train)
#     )
    

NameError: name 'dfs' is not defined