#Machine learning exercise- Pricing UK Houses

Importing libraries.

In [17]:
import tensorflow as tf
from tensorflow import keras
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
import random
from sklearn import metrics
random.seed(27)

Data were shuffled on my local system through an external library written in C++ (https://github.com/alexandres/terashuf): this was done in order to avoid the indroduction of bias in the training of the Neural Newtork. Indeed data are more or less ordered by date in the original csv. Given the big dimension of the .csv file, data are read by chunks in order to make them feasible to the RAM : these chunks work as additional batches in the training of the Neural Network.

In [18]:
data_path='/content/drive/MyDrive/Homework_Amazon/pp-complete_shuffled.csv'
chunksize = 10 ** 6
index=[1, 2, 4, 6, 11]
lst=['Id', 'Price', 'Date',  
    'Postcode', 'Property_type', 'Old_new', 'Duration', 'PAON', 'SAON',
    'Street', 'Locality', 'Town', 'District', 'County', 'PPD', 'Record_status', 'unknown1', 'unknokn2'] 
colnames=['price', 'data', 'property_type', 'lease_duration', 'location']
pd.read_csv(data_path, chunksize=chunksize, names=lst, header=None)

<pandas.io.parsers.readers.TextFileReader at 0x7f7e05a44fd0>

Initializing and compiling the Neural Network: 3 hidden layers on 10 neurons each, all with RELU activation functions. Moreover an Adam optimizer is adopted using mean squared error as loss function.

In [19]:
model = keras.Sequential([
    keras.layers.Flatten(input_shape=(9,)),
    keras.layers.Dense(10, activation='relu'),
    keras.layers.Dense(10, activation='relu'),
    keras.layers.Dense(10, activation='relu'),


    keras.layers.Dense(1),
])

In [20]:
model.compile(optimizer = keras.optimizers.Adam(learning_rate=0.1),
              loss = keras.losses.MeanSquaredError(),
              metrics=['MeanSquaredError'])

Preprocessing on categorical variables: one hot encoding and extracting bool variable for 'Town' (London or not). Splitting on dataset in train and test based on the date (data after 01-01-2019 is added to the testset). The train is again splitted in training and validation with 67%-33% ratio. The neural network is then trained in every step of the loop with a new chunk.

In [None]:
i=0
for chunk in pd.read_csv(data_path, chunksize=chunksize, names=lst, header=None):
  #training preprocessing
  chunk["Date"]= pd.to_datetime(chunk["Date"])
  batch=chunk[chunk.Date<pd.datetime(2019, 1, 1)]
  batch['Property_type'] = batch['Property_type'].astype('category')
  property_type_batch =pd.get_dummies(batch.Property_type, prefix='Property')
  batch['Duration'] = batch['Duration'].astype('category')
  duration_batch=pd.get_dummies(batch.Duration, prefix='Duration')
  london_bool_batch=np.array(batch.Town=='LONDON')
  london_dummy_batch=london_bool_batch.astype(int)
  london_dummy_batch=london_dummy_batch.reshape(london_dummy_batch.shape[0], 1)
  y_batch=batch['Price'].to_numpy()
  y_batch=y_batch.reshape(y_batch.shape[0], 1)
  y_batch=y_batch.astype('float')
  X_batch=property_type_batch.to_numpy()
  X_batch=np.append(X_batch, duration_batch, axis=1)
  if(X_batch.shape[1]!=8): #adding extra column to durations dummy (if no duration_U values are present)
    X_batch=np.append(X_batch, np.zeros((X_batch.shape[0], 1)), axis=1)
    print('No duration_U in this chunk')
  X_batch=np.append(X_batch, london_dummy_batch, axis=1)
  X_batch.shape, y_batch.shape
  X_train, X_validation, y_train, y_validation = train_test_split(X_batch, y_batch, test_size=0.33, random_state=101)
  
  #test preprocessing
  test_chunk=chunk[chunk.Date>=pd.datetime(2019, 1, 1)]
  test_chunk['Property_type'] = test_chunk['Property_type'].astype('category')
  property_type_test_chunk =pd.get_dummies(test_chunk.Property_type, prefix='Property')
  duration_test_chunk=pd.get_dummies(test_chunk.Duration, prefix='Duration')
  london_bool_test_chunk=np.array(test_chunk.Town=='London')
  london_dummy_test_chunk=london_bool_test_chunk.astype(int)
  london_dummy_test_chunk=london_dummy_test_chunk.reshape(london_dummy_test_chunk.shape[0], 1)
  y_test_chunk=test_chunk['Price'].to_numpy()
  y_test_chunk=y_test_chunk.reshape(y_test_chunk.shape[0], 1)
  y_test_chunk=y_test_chunk.astype('float')
  X_test_chunk=property_type_test_chunk.to_numpy()
  X_test_chunk=np.append(X_test_chunk, duration_test_chunk, axis=1)
  X_test_chunk=np.append(X_test_chunk, np.zeros((X_test_chunk.shape[0], 1)), axis=1) #adding extra column to durations dummy (no U values are preseny since 2019)
  X_test_chunk=np.append(X_test_chunk, london_dummy_test_chunk, axis=1)
  if(i==0):  #first chunk: initialization of X_test and y_test
    X_test=X_test_chunk
    y_test=y_test_chunk
  else:
    X_test=np.append(X_test, X_test_chunk, axis=0)
    y_test=np.append(y_test, y_test_chunk, axis=0)
  #training of the model
  history=model.fit(X_train, y_train, epochs = 2, validation_data = (X_validation, y_validation), batch_size=100)
  print(i)
  i=i+1


Testing on the dataset

In [23]:
y_predicted = model.predict(X_test)
y_predicted=y_predicted.flatten()
# evaluate predictions
print('MAE:', metrics.mean_absolute_error(y_test, y_predicted))  
print('MSE:', metrics.mean_squared_error(y_test, y_predicted))  
print('RMSE:', np.sqrt(metrics.mean_squared_error(y_test, y_predicted)))

MAE: 241219.7352421483
MSE: 3411288110175.523
RMSE: 1846967.2737153529
