## Keras NN regression model

In [44]:
import pandas  as pd
import numpy   as np
import json
from time import time
from sklearn.model_selection import train_test_split
# from tensorflow.keras import utils
from keras.optimizers import Adam
from keras.models import Sequential
from keras.layers import Dense 
from keras_tuner.tuners import RandomSearch, Hyperband, BayesianOptimization
from keras_tuner import HyperModel
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

### Data load & preprocessing

In [3]:
cl_data = pd.read_csv('data/clean_data.csv')

In [4]:
y = cl_data.buy_price
X = cl_data.drop('buy_price', axis=1)

In [5]:
from sklearn.preprocessing import StandardScaler
ss = StandardScaler()
# X_train_scaled = ss.fit_transform(X_train)
# X_test_scaled = ss.transform(X_test)
X_scaled = ss.fit_transform(X)
# y_train = np.array(y_train)

In [6]:
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size = 0.2, random_state=1)

In [7]:
X_train.shape

(16466, 14)

### Build NN model & searching hyperparams

In [23]:
class ANNhypermodel(HyperModel):
    
    def __init__(self, input_shape):
        self.input_shape= input_shape
        
    def build(self, hp):
        model= Sequential()
        
        # Tune the number of units in the first Dense layer
        # Defining dense units as a close approx to the original neural network to perform a fair comparision!
        
        
        hp_units_1= hp.Int('units_1', min_value=64, max_value= 512, step=32)
        hp_units_2= hp.Int('units_2', min_value=64, max_value= 512, step=32)
        hp_units_3= hp.Int('units_3', min_value=32, max_value= 256, step=16)

        model.add(Dense(units=hp_units_1, activation='relu', input_shape= (self.input_shape,)))
        model.add(Dense(units=hp_units_2, activation='relu'))
        model.add(Dense(units=hp_units_3, activation='relu'))
        model.add(Dense(1))

        # Tune the learning rate for the optimizer 
        hp_learning_rate=hp.Float('learning_rate', min_value=1e-4, max_value=1e-2, sampling='LOG', default= 0.0005)

        model.compile(loss='mse',
                    optimizer= Adam(learning_rate=hp_learning_rate),
                    metrics= ['mae','mse']
                     )

        return model

hypermodel= ANNhypermodel(input_shape = X_train.shape[1])

In [25]:
HYPERBAND_MAX_EPOCHS = 150
EXECUTION_PER_TRIAL = 2

tuner= Hyperband(hypermodel,
                   objective= 'val_mse',
                   max_epochs=HYPERBAND_MAX_EPOCHS, #Set 100+ for good results
                   executions_per_trial=EXECUTION_PER_TRIAL,
                   directory= 'hyperband',
                   project_name='houseprices',
                   overwrite=True)

In [24]:
MAX_TRIALS = 20
tuner= RandomSearch(hypermodel,
                   objective= 'val_mse',
                   max_trials= MAX_TRIALS,
                   executions_per_trial= EXECUTION_PER_TRIAL,
                   directory= 'random_search',
                   project_name='houseprices',
                   overwrite=True)

In [31]:
print('Searching for the best params!')

t0 = time()
tuner.search(x= X_train,
             y= y_train,
             epochs=100,
             batch_size= 1024,
             validation_data= (X_test, y_test),
             verbose=0,
             callbacks= []
            )
print((time()- t0)/60, "min")

# Retreive the optimal hyperparameters
best_hps= tuner.get_best_hyperparameters(num_trials=1)[0]

# Retrieve the best model
best_model = tuner.get_best_models(num_models=1)[0]

searching for the best params!
INFO:tensorflow:Oracle triggered exit
3495.2729663848877  secs


In [46]:
best_model.get_compile_config()

{'optimizer': {'module': 'keras.optimizers.legacy',
  'class_name': 'Adam',
  'config': {'name': 'Adam',
   'learning_rate': 0.009221840725015822,
   'decay': 0.0,
   'beta_1': 0.9,
   'beta_2': 0.999,
   'epsilon': 1e-07,
   'amsgrad': False},
  'registered_name': None},
 'loss': 'mse',
 'metrics': ['mae', 'mse'],
 'loss_weights': None,
 'weighted_metrics': None,
 'run_eagerly': None,
 'steps_per_execution': None,
 'jit_compile': None}

In [43]:
best_model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 224)               3360      
                                                                 
 dense_1 (Dense)             (None, 96)                21600     
                                                                 
 dense_2 (Dense)             (None, 208)               20176     
                                                                 
 dense_3 (Dense)             (None, 1)                 209       
                                                                 
Total params: 45,345
Trainable params: 45,345
Non-trainable params: 0
_________________________________________________________________


In [33]:
y_pred = best_model.predict(X_test).ravel()



In [35]:
pred_price = best_model.predict(X_scaled).ravel()



### Get metrics & saving results

In [52]:
best_model.save("model.h5")
print("Saved model to disk")

Saved model to disk


In [42]:
dc = {
    "R2_test": r2,
    "mae": mae,
    "mae%": mae_perc,
    "rmse": rmse,    
    "R2_total": score    
}

json_object = json.dumps(dc, indent=4)

with open("data/nn_metrics.json", "w") as outfile:
    outfile.write(json_object)

In [41]:
df = create_result_df(y, pred_price)
df.to_csv('data/nn_results.csv', index=False)

In [36]:
def metrics (y_test, y_pred):
    print('Test set metrics:')
    print()
    mae = mean_absolute_error(y_test, y_pred)
    mae_perc = np.mean(abs(y_test - y_pred)*100/y_test)
    rmse = mean_squared_error(y_test, y_pred)**0.5
    r2 = r2_score(y_test, y_pred)
    
    print(f'Mean absolute error: {mae:.2f}')
    print(f'Mean absolute error (%): {mae_perc:.2f}')
    print(f'Root mean squared error: {rmse:.2f}')
    print(f'R2 score for test set: {r2:.4f}')
    print('#'*20)
    print()
    return r2, mae, mae_perc, rmse

In [40]:
def create_result_df(actual, predict):
    df = pd.DataFrame(actual)
    df.rename(columns={df.columns[0]: 'actual'}, inplace=True)
    df.insert(len(df.columns), column='predict', value=predict)
    df.insert(len(df.columns), column='difference', value=df.actual - df.predict)
    df.insert(len(df.columns), column='diff%', value=abs(df.difference)/df.actual *100 )
#     df.insert(len(df.columns), column='squared', value= df.difference**2)
    return df

In [37]:
y_test = y_test.ravel()
r2, mae, mae_perc, rmse = metrics(y_test, y_pred)
score = r2_score(y, pred_price)
print(f"R2 score for data set {score:.4f}")

Test set metrics:

Mean absolute error: 87240.87
Mean absolute error (%): 19.09
Root mean squared error: 144591.02
R2 score for test set: 0.8915
####################

R2 score for data set 0.8947
