In [287]:
import pandas as pd
import seaborn as sns
from sklearn.feature_selection import mutual_info_regression
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import root_mean_squared_error, r2_score

In [288]:
df = pd.read_csv('../data/curated/houses_all_properties.csv')
THRESHOLD = 50

In [289]:
df = df[['address','suburb','SA2_Name', 'parking', 'type', 'num_schools', 'cost', 'beds', 'baths', 'cost/(beds+baths)',
         'closest_train_station_distance_km', 'closest_tram_station_distance_km', 'closest_hospital_distance_km',
         'closest_grocery_distance_km', 'Net_migration_2021_22', 'Net_migration_2022_23', 'ERP_per_km2_2021',
         'ERP_per_km2_2022', 'ERP_per_km2_2023', 'ERP_increase_2020_21', 'ERP_increase_2021_22', 'ERP_increase_2022_23',
         'NUMBER_OF_JOBS_PERSONS_2020-21', 'MEDIAN_INCOME_PERSONS_2020-21']]

In [290]:
df1 = df.drop(columns=['address','suburb','SA2_Name','type'])

In [291]:
df1.corr()['cost'].sort_values(ascending=False)

cost                                 1.000000
baths                                0.459979
cost/(beds+baths)                    0.440213
beds                                 0.300913
ERP_per_km2_2021                     0.188587
num_schools                          0.187166
ERP_per_km2_2022                     0.183876
ERP_per_km2_2023                     0.178395
parking                              0.164154
MEDIAN_INCOME_PERSONS_2020-21        0.086419
Net_migration_2022_23                0.082477
NUMBER_OF_JOBS_PERSONS_2020-21       0.082422
ERP_increase_2022_23                 0.064240
closest_grocery_distance_km         -0.010240
closest_train_station_distance_km   -0.016886
Net_migration_2021_22               -0.020340
ERP_increase_2021_22                -0.027717
closest_hospital_distance_km        -0.087480
closest_tram_station_distance_km    -0.095150
ERP_increase_2020_21                -0.149103
Name: cost, dtype: float64

In [292]:
# Select the column of interest (e.g., 'feature1')
column_of_interest = df['cost']

# Other features
other_features = df1.drop(columns=['cost'])

# Calculate MI between 'feature1' and other features for classification
mi_scores = []
for column in other_features.columns:
    mi = mutual_info_regression(df[[column_of_interest.name]], df[column])
    mi_scores.append((column, mi[0]))

# Convert MI scores to DataFrame for better readability
mi_df = pd.DataFrame(mi_scores, columns=['Feature', 'MI with cost'])
mi_df.sort_values('MI with cost', ascending=False)

Unnamed: 0,Feature,MI with cost
4,cost/(beds+baths),3.44926
18,MEDIAN_INCOME_PERSONS_2020-21,0.239812
10,Net_migration_2022_23,0.220815
6,closest_tram_station_distance_km,0.220367
12,ERP_per_km2_2022,0.220088
9,Net_migration_2021_22,0.217815
11,ERP_per_km2_2021,0.217676
16,ERP_increase_2022_23,0.214597
13,ERP_per_km2_2023,0.21324
15,ERP_increase_2021_22,0.211654


In [293]:
subs = df['SA2_Name'].value_counts()
subs = subs[subs >= THRESHOLD].index

len(subs)

47

In [294]:
df = df[df['SA2_Name'].isin(subs)]

In [295]:
df['suburb'].count()

np.int64(4430)

In [296]:
df['type'] = df['type'].apply(lambda x: 0 if x == 'House' else 1)   

In [297]:
# Create a mapping dictionary
mapping = {category: index for index, category in enumerate(df['SA2_Name'].unique())}

# Map the categorical variable to numerical indices
df['map_SA2_Name'] = df['SA2_Name'].map(mapping)

In [298]:
from sklearn.preprocessing import StandardScaler, Normalizer
exclude_columns = ['parking', 'type', 'num_schools', 'beds', 'baths', 'map_SA2_Name']


X = df.drop(columns=['address', 'suburb', 'SA2_Name', 'cost/(beds+baths)'])
# Select columns to scale
columns_to_scale = [col for col in X.columns if col not in exclude_columns]

# Initialize the StandardScaler
scaler = Normalizer()

# Scale the selected columns
X[columns_to_scale] = scaler.fit_transform(X[columns_to_scale])

y = df['cost']
X = X.drop(columns=['cost'])


lst = []
for i in X.columns:
    if 'closest' in i:
        lst.append(i)

# X1 = X[['beds', 'baths', 'parking', 'type', 'num_schools'] + lst]



In [299]:
df.columns

Index(['address', 'suburb', 'SA2_Name', 'parking', 'type', 'num_schools',
       'cost', 'beds', 'baths', 'cost/(beds+baths)',
       'closest_train_station_distance_km', 'closest_tram_station_distance_km',
       'closest_hospital_distance_km', 'closest_grocery_distance_km',
       'Net_migration_2021_22', 'Net_migration_2022_23', 'ERP_per_km2_2021',
       'ERP_per_km2_2022', 'ERP_per_km2_2023', 'ERP_increase_2020_21',
       'ERP_increase_2021_22', 'ERP_increase_2022_23',
       'NUMBER_OF_JOBS_PERSONS_2020-21', 'MEDIAN_INCOME_PERSONS_2020-21',
       'map_SA2_Name'],
      dtype='object')

In [300]:

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=df['map_SA2_Name'])

In [301]:
X1.columns

#X1 = X[['beds', 'baths', 'map_SA2_Name', 'parking']]

Index(['beds', 'baths', 'parking', 'type', 'num_schools',
       'closest_train_station_distance_km', 'closest_tram_station_distance_km',
       'closest_hospital_distance_km', 'closest_grocery_distance_km'],
      dtype='object')

In [302]:
model = LinearRegression()

model.fit(X_train, y_train)

y_pred = model.predict(X_test)

rmse = root_mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(root_mean_squared_error(y_test, y_pred))
print(f"R² Score: {r2}")

182.9581081712329
R² Score: 0.5474465352529613


In [303]:
y_pred_train = model.predict(X_train)
print(root_mean_squared_error(y_train, y_pred_train))
print(f"R² Score: {r2_score(y_train, y_pred_train)}")



171.82497918804472
R² Score: 0.5527927004802953


In [304]:
rf = RandomForestRegressor()
rf.fit(X_train, y_train)    
y_pred_2 = rf.predict(X_test)
y_pred_t = rf.predict(X_train)

mse = root_mean_squared_error(y_test, y_pred_2)
r2 = r2_score(y_test, y_pred_2)

print(f"Test rmse: {root_mean_squared_error(y_test, y_pred_2)}")
print(f"Test R² Score: {r2}")

print(f"Train rmse: {root_mean_squared_error(y_train, y_pred_t)}")
print(f"Train R² Score: {r2_score(y_train, y_pred_t)}")
#print(rf.score(X, y))



Test rmse: 110.41930913276477
Test R² Score: 0.8351622706589903
Train rmse: 39.79785529974337
Train R² Score: 0.9760085983768969
