# Machine learning program "Median House Value"


In [2]:
import tensorflow as tf
from tensorflow import keras
import pandas as pd
import matplotlib.pyplot as plt
import time
import numpy as np

# Report the TensorFlow version in use so results can be tied to a release.
print(f"Tensorflow version: {tf.__version__}")

Tensorflow version: 2.14.0


## Datasets

In [3]:
# File names of the prepared inputs, resolved relative to the working directory.
ATT_FILE = "MedianHouseValuePreparedCleanAttributes.csv"
LABEL_FILE = "MedianHouseValueOneHotEncodedClasses.csv"

# Read the attribute table first, then the label table, with pandas defaults.
attributes, label = (pd.read_csv(csv_path) for csv_path in (ATT_FILE, LABEL_FILE))

In [3]:
# Preview the first rows of the attribute table.
attributes.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,ocean_proximity
0,0.217131,-0.693943,0.411765,-0.939264,-0.887337,-0.909246,-0.892781,-0.775851,-1.0
1,-0.693227,0.177471,-0.294118,-0.917951,-0.886716,-0.954483,-0.889492,-0.591592,1.0
2,0.448207,-0.959617,0.372549,-0.830663,-0.800745,-0.893495,-0.795757,-0.558972,1.0
3,0.290837,-0.70882,-0.411765,-0.876291,-0.859094,-0.94316,-0.843776,-0.487055,-1.0
4,-0.400398,0.158342,-0.490196,-0.841854,-0.845748,-0.934135,-0.82766,-0.164687,-0.333333


In [4]:
# Preview the first rows of the label table.
label.head()

Unnamed: 0,"Cheap:[15.0, 141.3]","Averaged:[141.4, 230.2]","Expensive:[230.3, 500.0]"
0,1.0,0.0,0.0
1,0.0,0.0,1.0
2,1.0,0.0,0.0
3,0.0,1.0,0.0
4,0.0,0.0,1.0


In [4]:
# Fraction of all instances assigned to the training split.
TRAIN_RATIO = 0.8

# Split sizes: training takes the leading TRAIN_RATIO share of the rows and the
# development set takes half of whatever remains.
n_instances = len(attributes)
n_train = int(n_instances * TRAIN_RATIO)
n_dev = (n_instances - n_train) // 2

# Contiguous, unshuffled row ranges applied identically to inputs and targets.
train_rows = slice(None, n_train)
dev_rows = slice(n_train, n_train + n_dev)
x_train, t_train = attributes.values[train_rows], label.values[train_rows]
x_dev, t_dev = attributes.values[dev_rows], label.values[dev_rows]

# Network input/output widths come from the column counts of the two arrays.
INPUTS, OUTPUTS = x_train.shape[1], t_train.shape[1]

NUM_TRAINING_EXAMPLES = x_train.shape[0]
NUM_DEV_EXAMPLES = x_dev.shape[0]

print("Number of training examples: ", NUM_TRAINING_EXAMPLES)
print("Number of examples for development test: ", NUM_DEV_EXAMPLES)

Number of training examples:  16342
Number of examples for development test:  2043


## Model


In [5]:
# Width of each hidden Dense layer, listed from the input side to the output side.
n_neurons_per_hidden_layer = [500, 250, 75, 25]
# Learning rate handed to the SGD optimizer when the model is compiled.
learning_rate = 0.1

In [6]:
# Assemble the layer stack, then hand it to Sequential in one call: an input
# layer of width INPUTS, one ReLU Dense layer per configured width, and a
# softmax Dense layer of width OUTPUTS.
hidden_layers = [
  keras.layers.Dense(width, activation="relu")
  for width in n_neurons_per_hidden_layer
]
model = keras.Sequential(
  [
    keras.layers.InputLayer(input_shape=(INPUTS,)),
    *hidden_layers,
    keras.layers.Dense(OUTPUTS, activation="softmax"),
  ],
  name="my_model",
)

2023-10-24 09:27:14.417422: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:880] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
2023-10-24 09:27:14.424675: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:880] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
2023-10-24 09:27:14.425048: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:880] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
2023-10-24 09:27:14.426876: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:880] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
2023-10-24 09:27:14.427408: I tensorflow/compile

In [7]:
# Configure training: plain SGD at the configured learning rate, categorical
# cross-entropy as the loss, and categorical accuracy as the tracked metric.
sgd_optimizer = tf.keras.optimizers.SGD(learning_rate=learning_rate)
model.compile(
    optimizer=sgd_optimizer,
    loss=tf.keras.losses.categorical_crossentropy,
    metrics=["categorical_accuracy"],
)


## Training

In [8]:
# Number of passes over the training set requested from model.fit.
n_epochs = 1000
# Number of training examples per gradient update.
batch_size = 512
# Reference timestamp; the results cell subtracts it from a later perf_counter() reading.
start_time = time.perf_counter()

In [None]:
# Restart the timer immediately before training so the reported duration does
# not include idle time between running the configuration cell and this one.
start_time = time.perf_counter()

# Train silently (verbose=0), evaluating on the development split after every
# epoch; the returned History object holds the per-epoch loss and accuracy.
history = model.fit(x_train, t_train,
                    batch_size=batch_size,
                    epochs=n_epochs,
                    verbose=0,
                    validation_data=(x_dev, t_dev))

## Results

In [None]:
# Draw every recorded per-epoch series (losses and accuracies) on one set of axes.
results = pd.DataFrame(history.history)
fig, ax = plt.subplots(figsize=(8, 5))
results.plot(ax=ax)
ax.grid(True)
ax.set_xlabel("Epochs")
ax.set_ylabel("Accuracy - Mean Log Loss")
ax.set_ylim(0, 1)  # clamp the vertical range to [0, 1]
plt.show()

In [None]:
# Error rates are the complement of the last epoch's accuracy, as a percentage
# rounded to one decimal place.
train_error_pct = round((1 - results.categorical_accuracy.iloc[-1]) * 100, 1)
dev_error_pct = round((1 - results.val_categorical_accuracy.iloc[-1]) * 100, 1)

print("Error (training): ", train_error_pct, "%")
print("Error (development test): ", dev_error_pct, "%")

# Wall-clock seconds since start_time, rounded to a whole number.
elapsed_seconds = round(time.perf_counter() - start_time)
print("Time: ", elapsed_seconds, "seconds")

## **Exercises**

### **Exercise 1.**
Performance evaluation. Determine the following values: train error, test error (using the dev set), bias, variance and training time. Repeat the execution at least three times. Consider 5% as the Bayes error (human-level error).

In [9]:
# Reference error level (percent) used as the baseline for the bias estimate.
human_error = 5

# Measurements entered by hand, one entry per execution (presumably the three
# repeated runs the exercise asks for).
train_errors = [17.7, 9.5, 2.5]
test_errors = [20.2, 21.0, 20.9]
training_times = [203, 1660, 155]

# bias = train error - reference error; variance = test error - train error.
bias = [train_err - human_error for train_err in train_errors]
variance = [test_err - train_err
            for train_err, test_err in zip(train_errors, test_errors)]

# Column order of the summary table follows the order of these labels.
column_labels = ["train error", "test error", "bias", "variance", "training time (s)"]
column_values = [train_errors, test_errors, bias, variance, training_times]
data = dict(zip(column_labels, column_values))

values = pd.DataFrame(data)
values

Unnamed: 0,train error,test error,bias,variance,training time (s)
0,17.7,20.2,12.7,2.5,203
1,9.5,21.0,4.5,11.5,1660
2,2.5,20.9,-2.5,18.4,155


### **Exercise 2:**
Changing basic hyperparameters. Change the hyperparameters related to batch size, number of layers, and number of neurons. Estimate train error, test error, bias, variance and training time. Consider 5% as the Bayes error (human-level error).

#### **Change the batch size**

In [22]:
# Epochs per training run in the batch-size sweep.
n_epochs = 1000
# Batch sizes to try: the powers of two from 2**4 (16) through 2**14 (16384).
batch_sizes = [2 ** exponent for exponent in range(4, 15)]


def calc_measures(history, start_time):
  """Summarise a finished training run and print the summary.

  Args:
    history: object whose ``history`` attribute maps metric names to per-epoch
      value lists; it must contain ``categorical_accuracy`` and
      ``val_categorical_accuracy`` (e.g. the value returned by ``model.fit``).
    start_time: ``time.perf_counter()`` reading taken when training started.

  Returns:
    Tuple ``(training error, development error, elapsed time)``. The errors
    are percentages computed from the last epoch and rounded to one decimal;
    the elapsed time is in seconds, rounded to a whole number.
  """
  results = pd.DataFrame(history.history)
  # Error is the complement of the final-epoch accuracy, expressed in percent.
  error_training = round((1 - results.categorical_accuracy.values[-1:][0])*100, 1)
  error_test = round((1 - results.val_categorical_accuracy.values[-1:][0])*100, 1)
  total_time = round((time.perf_counter() - start_time))
  print ("Error (training): ",error_training, "%")
  print ("Error (development test): ",error_test, "%")
  # Report the measured duration (the `time` module itself was printed before).
  print ("Time: ",total_time,"seconds")
  return error_training, error_test, total_time

# Per-run measurements, appended in the same order as batch_sizes.
train_errors = []
test_errors = []
training_times = []

for batch_size in batch_sizes:
  # Start every run from freshly initialised weights. Re-fitting the shared
  # `model` would continue from whatever the previous runs had already learned,
  # so the batch sizes could not be compared fairly. clone_model creates new
  # layers (and therefore new weights) with the same architecture; the clone
  # must be compiled again with the same loss, optimizer and metric.
  run_model = keras.models.clone_model(model)
  run_model.compile(loss=tf.keras.losses.categorical_crossentropy,
                    optimizer=tf.keras.optimizers.SGD(learning_rate=learning_rate),
                    metrics=["categorical_accuracy"])

  # Time only the training itself, not the model construction above.
  start_time = time.perf_counter()

  history = run_model.fit(x_train, t_train,
                          batch_size = batch_size,
                          epochs=n_epochs,
                          verbose = 0,
                          validation_data = (x_dev, t_dev))

  error_training, error_test, total_time = calc_measures(history, start_time)
  train_errors.append(error_training)
  test_errors.append(error_test)
  training_times.append(total_time)




In [21]:
# bias = train error - reference error; variance = test error - train error.
bias = [train_err - human_error for train_err in train_errors]
variance = [test_err - train_err
            for train_err, test_err in zip(train_errors, test_errors)]
# Gradient updates per run. Each epoch processes ceil(n_train / batch size)
# batches, because a final partial batch still counts as one update, so the
# division is rounded up instead of producing a fractional count.
iterations = [((n_train + size - 1) // size) * n_epochs for size in batch_sizes]
data = {
    "batch size": batch_sizes,
    "N":iterations,
    "train error" : train_errors,
    "test error": test_errors,
    "bias":bias,
    "variance":variance,
    'training time (s)': training_times
    }

values = pd.DataFrame(data)
values

Unnamed: 0,batch size,N,train error,test error,bias,variance,training time (s)
0,16342,1000.0,28.5,28.8,23.5,0.3,125
