In [23]:
import pandas as pd

from sklearn.preprocessing import MinMaxScaler
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import LabelEncoder


In [24]:
df = pd.read_csv('../../data/curated/individual_property_final.csv')

In [25]:
# Select relevant features and target (assuming 'Cost' is the target)
df = df.drop(columns=['Address', 'Latitude', 'Longitude', 'Postcode', 'SA2_CODE21', 'LGA_CODE24', 'Suburb'], axis=1)

categorical_columns = ['Property Type','Closest Gov Secondary School']
# Apply LabelEncoder to each categorical column
label_encoders = {}
for col in categorical_columns:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    label_encoders[col] = le
    
df_train = df[(df['Year'] >= 2015) & (df['Year'] <= 2024)]
df_predict = df[(df['Year'] >= 2025) & (df['Year'] <= 2027)]

features = df_train.drop(columns=['Cost'])
target = df_train['Cost']

# Normalize the features and target using MinMaxScaler (LSTMs work better with normalized data)
scaler = MinMaxScaler()
features_scaled = scaler.fit_transform(features)
target_scaled = scaler.fit_transform(target.values.reshape(-1, 1))

# Convert data into sequences for LSTM
def create_sequences(features, target, time_steps=10):
    X, y = [], []
    for i in range(len(features) - time_steps):
        X.append(features[i:i+time_steps])
        y.append(target[i+time_steps])
    return np.array(X), np.array(y)

# Set the number of time steps (e.g., 10 previous timesteps to predict the next one)
time_steps = 10
X, y = create_sequences(features_scaled, target_scaled, time_steps)

# Display the shape of the data
print(f'X shape: {X.shape}, y shape: {y.shape}')


X shape: (8550, 10, 21), y shape: (8550, 1)


In [26]:

# Build a simple LSTM model
model = Sequential()

# Add the LSTM layer with 50 units
model.add(LSTM(units=50, return_sequences=False, input_shape=(X.shape[1], X.shape[2])))

# Optionally, add Dropout for regularization
model.add(Dropout(0.2))

# Add a Dense layer for the output
model.add(Dense(units=1))

# Compile the model
model.compile(optimizer='adam', loss='mean_squared_error')

# Display model summary
model.summary()


Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 lstm_2 (LSTM)               (None, 50)                14400     
                                                                 
 dropout_2 (Dropout)         (None, 50)                0         
                                                                 
 dense_2 (Dense)             (None, 1)                 51        
                                                                 
Total params: 14451 (56.45 KB)
Trainable params: 14451 (56.45 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [27]:
seed=37

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=seed)

# Train the model
history = model.fit(X_train, y_train, epochs=50, batch_size=32, validation_data=(X_test, y_test))


Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


In [28]:
# Evaluate the model on the test set
test_loss = model.evaluate(X_test, y_test)
print(f'Test Loss: {test_loss}')

# Make predictions
y_pred = model.predict(X_test)

# Optionally, inverse transform the scaled predictions and true values
y_pred_inverse = scaler.inverse_transform(y_pred)
y_test_inverse = scaler.inverse_transform(y_test.reshape(-1, 1))

# Display the predictions and true values
print(f'Predicted: {y_pred_inverse.flatten()}')
print(f'Actual: {y_test_inverse.flatten()}')

Test Loss: 0.010548189282417297
Predicted: [366.28137 147.93211 412.28522 ... 664.2967  403.72528 298.73987]
Actual: [392.00242718 144.         336.         ... 640.         415.15486726
 328.25      ]


In [29]:

# Make predictions on the test set
y_pred = model.predict(X_test)

# Inverse transform the scaled predictions and true values (if you scaled the data)
y_pred_inverse = scaler.inverse_transform(y_pred)
y_test_inverse = scaler.inverse_transform(y_test.reshape(-1, 1))

# Calculate RMSE
rmse = np.sqrt(mean_squared_error(y_test_inverse, y_pred_inverse))
print(f'RMSE: {rmse}')


RMSE: 112.44549439163991


Looks bad, skip