In [1]:
import numpy as np 
import pandas as pd
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import tensorflow as tf
from sklearn.metrics import r2_score

In [2]:
data = pd.read_csv('MELBOURNE_HOUSE_PRICES_LESS.csv')

In [3]:
data = data.drop(columns=['Address', 'Suburb', 'Postcode', 'Regionname', 'SellerG', 'Propertycount']).copy()

In [4]:
data.reset_index()

Unnamed: 0,index,Rooms,Type,Price,Method,Date,Distance,CouncilArea
0,0,3,h,1490000.0,S,1/04/2017,3.0,Yarra City Council
1,1,3,h,1220000.0,S,1/04/2017,3.0,Yarra City Council
2,2,3,h,1420000.0,S,1/04/2017,3.0,Yarra City Council
3,3,3,h,1515000.0,S,1/04/2017,7.5,Moonee Valley City Council
4,4,2,h,670000.0,S,1/04/2017,10.4,Moonee Valley City Council
...,...,...,...,...,...,...,...,...
63018,63018,3,h,566000.0,S,31/03/2018,20.6,Hume City Council
63019,63019,3,h,500000.0,S,31/03/2018,20.6,Hume City Council
63020,63020,3,h,545000.0,S,31/03/2018,20.6,Hume City Council
63021,63021,3,u,,PI,31/03/2018,15.3,Whittlesea City Council


In [5]:
{column: len(data[column].unique()) for column in data.columns}

{'Rooms': 14,
 'Type': 3,
 'Price': 3418,
 'Method': 9,
 'Date': 112,
 'Distance': 180,
 'CouncilArea': 34}

In [6]:
data

Unnamed: 0,Rooms,Type,Price,Method,Date,Distance,CouncilArea
0,3,h,1490000.0,S,1/04/2017,3.0,Yarra City Council
1,3,h,1220000.0,S,1/04/2017,3.0,Yarra City Council
2,3,h,1420000.0,S,1/04/2017,3.0,Yarra City Council
3,3,h,1515000.0,S,1/04/2017,7.5,Moonee Valley City Council
4,2,h,670000.0,S,1/04/2017,10.4,Moonee Valley City Council
...,...,...,...,...,...,...,...
63018,3,h,566000.0,S,31/03/2018,20.6,Hume City Council
63019,3,h,500000.0,S,31/03/2018,20.6,Hume City Council
63020,3,h,545000.0,S,31/03/2018,20.6,Hume City Council
63021,3,u,,PI,31/03/2018,15.3,Whittlesea City Council


In [7]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 63023 entries, 0 to 63022
Data columns (total 7 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Rooms        63023 non-null  int64  
 1   Type         63023 non-null  object 
 2   Price        48433 non-null  float64
 3   Method       63023 non-null  object 
 4   Date         63023 non-null  object 
 5   Distance     63023 non-null  float64
 6   CouncilArea  63023 non-null  object 
dtypes: float64(2), int64(1), object(4)
memory usage: 3.4+ MB


In [9]:
def onehot_encode(df, column_dict):
    df = df.copy()
    for column, prefix in column_dict.items():
        dummies = pd.get_dummies(df[column], prefix=prefix)
        df =pd.concat([df, dummies], axis=1)
        df = df.drop(column, axis=1)
    return df

In [10]:
def preprocessing_data(df):
  df =df.copy()
  df = df.dropna(axis=0).reset_index(drop=True)
  df['Date'] = pd.to_datetime(df['Date'])
  column_dict = {
 'Type': 'TP',
 'Method': 'ME',
 'CouncilArea': 'CA',
  }
  df = onehot_encode(df, column_dict)
  df['Year'] = df['Date'].apply(lambda x: x.year)
  df['Month'] = df['Date'].apply(lambda x: x.month)
  df['Day'] = df['Date'].apply(lambda x: x.day)
  df = df.drop('Date', axis=1)
  
 # splitting into X and y
  y = df['Price']
  X = df.drop('Price', axis=1)
  scaler = StandardScaler()
  X = pd.DataFrame(scaler.fit_transform(X), columns=X.columns)
  return X, y

In [11]:
X, y = preprocessing_data(data)

In [12]:
X

Unnamed: 0,Rooms,Distance,TP_h,TP_t,TP_u,ME_PI,ME_S,ME_SA,ME_SP,ME_VB,...,CA_Port Phillip City Council,CA_Stonnington City Council,CA_Whitehorse City Council,CA_Whittlesea City Council,CA_Wyndham City Council,CA_Yarra City Council,CA_Yarra Ranges Shire Council,Year,Month,Day
0,-0.075861,-1.285142,0.646364,-0.338536,-0.487235,-0.373882,0.762586,-0.08714,-0.393012,-0.3402,...,-0.165485,-0.161289,-0.167385,-0.210713,-0.156232,5.974250,-0.060219,-0.053884,-1.909231,-1.448953
1,-0.075861,-1.285142,0.646364,-0.338536,-0.487235,-0.373882,0.762586,-0.08714,-0.393012,-0.3402,...,-0.165485,-0.161289,-0.167385,-0.210713,-0.156232,5.974250,-0.060219,-0.053884,-1.909231,-1.448953
2,-0.075861,-1.285142,0.646364,-0.338536,-0.487235,-0.373882,0.762586,-0.08714,-0.393012,-0.3402,...,-0.165485,-0.161289,-0.167385,-0.210713,-0.156232,5.974250,-0.060219,-0.053884,-1.909231,-1.448953
3,-0.075861,-0.689112,0.646364,-0.338536,-0.487235,-0.373882,0.762586,-0.08714,-0.393012,-0.3402,...,-0.165485,-0.161289,-0.167385,-0.210713,-0.156232,-0.167385,-0.060219,-0.053884,-1.909231,-1.448953
4,-1.134401,-0.305003,0.646364,-0.338536,-0.487235,-0.373882,0.762586,-0.08714,-0.393012,-0.3402,...,-0.165485,-0.161289,-0.167385,-0.210713,-0.156232,-0.167385,-0.060219,-0.053884,-1.909231,-1.448953
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
48428,-1.134401,3.350649,-1.547116,-0.338536,2.052397,-0.373882,-1.311328,-0.08714,2.544451,-0.3402,...,-0.165485,-0.161289,-0.167385,-0.210713,-0.156232,-0.167385,-0.060219,1.259546,-1.258706,1.773269
48429,-0.075861,-0.569906,0.646364,-0.338536,-0.487235,-0.373882,0.762586,-0.08714,-0.393012,-0.3402,...,-0.165485,-0.161289,-0.167385,-0.210713,-0.156232,-0.167385,-0.060219,1.259546,-1.258706,1.773269
48430,-0.075861,1.045999,0.646364,-0.338536,-0.487235,-0.373882,0.762586,-0.08714,-0.393012,-0.3402,...,-0.165485,-0.161289,-0.167385,-0.210713,-0.156232,-0.167385,-0.060219,1.259546,-1.258706,1.773269
48431,-0.075861,1.045999,0.646364,-0.338536,-0.487235,-0.373882,0.762586,-0.08714,-0.393012,-0.3402,...,-0.165485,-0.161289,-0.167385,-0.210713,-0.156232,-0.167385,-0.060219,1.259546,-1.258706,1.773269


In [13]:
X.mean()

Rooms                               -1.191255e-16
Distance                             2.505744e-16
TP_h                                 3.990412e-17
TP_t                                -7.239957e-17
TP_u                                -2.053889e-17
ME_PI                                4.988015e-17
ME_S                                -1.349698e-16
ME_SA                               -4.694602e-18
ME_SP                               -3.520952e-18
ME_VB                               -5.868253e-17
CA_Banyule City Council              1.701793e-17
CA_Bayside City Council              4.841309e-18
CA_Boroondara City Council           6.235019e-17
CA_Brimbank City Council            -7.790106e-17
CA_Cardinia Shire Council            3.961071e-18
CA_Casey City Council               -1.349698e-17
CA_Darebin City Council             -4.459872e-17
CA_Frankston City Council            2.523349e-17
CA_Glen Eira City Council            1.095896e-16
CA_Greater Dandenong City Council    3.095503e-17


In [14]:
y

0        1490000.0
1        1220000.0
2        1420000.0
3        1515000.0
4         670000.0
           ...    
48428     347700.0
48429     808000.0
48430     566000.0
48431     500000.0
48432     545000.0
Name: Price, Length: 48433, dtype: float64

In [15]:
{column: len(X[column].unique()) for column in X.columns}

{'Rooms': 14,
 'Distance': 176,
 'TP_h': 2,
 'TP_t': 2,
 'TP_u': 2,
 'ME_PI': 2,
 'ME_S': 2,
 'ME_SA': 2,
 'ME_SP': 2,
 'ME_VB': 2,
 'CA_Banyule City Council': 2,
 'CA_Bayside City Council': 2,
 'CA_Boroondara City Council': 2,
 'CA_Brimbank City Council': 2,
 'CA_Cardinia Shire Council': 2,
 'CA_Casey City Council': 2,
 'CA_Darebin City Council': 2,
 'CA_Frankston City Council': 2,
 'CA_Glen Eira City Council': 2,
 'CA_Greater Dandenong City Council': 2,
 'CA_Hobsons Bay City Council': 2,
 'CA_Hume City Council': 2,
 'CA_Kingston City Council': 2,
 'CA_Knox City Council': 2,
 'CA_Macedon Ranges Shire Council': 2,
 'CA_Manningham City Council': 2,
 'CA_Maribyrnong City Council': 2,
 'CA_Maroondah City Council': 2,
 'CA_Melbourne City Council': 2,
 'CA_Melton City Council': 2,
 'CA_Mitchell Shire Council': 2,
 'CA_Monash City Council': 2,
 'CA_Moonee Valley City Council': 2,
 'CA_Moorabool Shire Council': 2,
 'CA_Moreland City Council': 2,
 'CA_Murrindindi Shire Council': 2,
 'CA_Nillum

# Training

In [16]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=120)

In [17]:
# Define the model - deep neural net
number_input_features = 47
hidden_nodes_layer1 =  8
hidden_nodes_layer2 = 5

nn = tf.keras.models.Sequential()

# First hidden layer
nn.add(
    tf.keras.layers.Dense(units=hidden_nodes_layer1, input_dim=number_input_features, activation="relu")
)

# Second hidden layer
nn.add(tf.keras.layers.Dense(units=hidden_nodes_layer2, activation="relu"))

# Output layer
nn.add(tf.keras.layers.Dense(units=1, activation="relu"))

# Check the structure of the model
nn.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 8)                 384       
                                                                 
 dense_1 (Dense)             (None, 5)                 45        
                                                                 
 dense_2 (Dense)             (None, 1)                 6         
                                                                 
Total params: 435
Trainable params: 435
Non-trainable params: 0
_________________________________________________________________


In [18]:
X_train.shape

(38746, 47)

In [19]:
# Compile the model
nn.compile(loss="mse", optimizer="adam", metrics=["accuracy"])

In [20]:
# Train the model
fit_model = nn.fit(X_train, y_train, epochs=50, validation_split=0.2,
batch_size=32)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50


Epoch 49/50
Epoch 50/50


In [86]:
model_loss, model_accuracy = nn.evaluate(X_test,y_test,verbose=2)
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")

303/303 - 0s - loss: 125533315072.0000 - accuracy: 0.0000e+00 - 349ms/epoch - 1ms/step
Loss: 125533315072.0, Accuracy: 0.0


In [21]:
y_true = np.array(y_test)
y_pred = np.squeeze(nn.predict(X_test))



In [22]:
model_r2 = r2_score(y_true, y_pred)
print('Model R^2: {:.6f}'. format(model_r2))

Model R^2: 0.597738


In [27]:
nn.save('model_with_filter.h5')