In [None]:
import tensorflow as tf
import numpy as np
from matplotlib import pyplot as plt
import pandas as pd
import seaborn as sns

# Print NumPy arrays compactly: 3 decimals, no scientific notation.
np.set_printoptions(precision=3, suppress=True)

column_names = ['MPG', 'Cylinders', 'Displacement', 'Horsepower', 'Weight',
                'Acceleration', 'Model Year', 'Origin']
# The file is space-separated and has no header row; '?' marks a missing value
# and anything after a tab on a line is ignored as a comment.
raw_dataset = pd.read_csv("./auto-mpg.data", names=column_names,
                            na_values='?', comment='\t',
                            sep=' ', skipinitialspace=True)
# Drop rows with missing values ('?' in the raw file is read as NaN). Left in
# place, a NaN would poison the Normalization statistics and every loss value.
# `.copy()` keeps the column assignment below from touching `raw_dataset`.
dataset = raw_dataset.dropna().copy()
# Turn the categorical `Origin` column (codes 1/2/3) into three indicator columns.
dataset['Origin'] = dataset['Origin'].map({1: 'USA', 2: 'Europe', 3: 'Japan'})
# dtype=float keeps the indicator columns numeric (recent pandas defaults to
# bool), so the whole feature table converts to a single float array.
dataset = pd.get_dummies(dataset, columns=['Origin'], prefix='', prefix_sep='',
                         dtype=float)
# dataset.tail()


In [None]:
# Train/test split: draw a reproducible (random_state=0) 80% sample of the rows
# for training, then keep every row whose index label was not drawn for testing.
train_dataset = dataset.sample(random_state=0, frac=0.8)
test_dataset = dataset.drop(index=train_dataset.index)

In [None]:
# See dataset "clouds"
# sns.pairplot(train_dataset[['MPG', 'Cylinders', 'Displacement', 'Weight']], diag_kind='kde')

In [None]:
# Work on copies so the split frames keep their 'MPG' column.
train_features, test_features = train_dataset.copy(), test_dataset.copy()

# `pop` removes the target column from the feature frames and returns it.
train_labels, test_labels = train_features.pop('MPG'), test_features.pop('MPG')


1. Do linear regression on one variable: `Horsepower`:

In [None]:
# Single feature: the Horsepower column as an (n_samples, 1) array.
horsepower = np.array(train_features['Horsepower']).reshape(-1, 1)

# Normalization layer; `adapt` computes its statistics from the training column.
horsepower_normalizer = tf.keras.layers.Normalization(axis=-1, input_shape=[1,])
horsepower_normalizer.adapt(horsepower)

# Linear model: the normalization layer followed by one Dense unit.
horsepower_model = tf.keras.Sequential(layers=[
    horsepower_normalizer,
    tf.keras.layers.Dense(units=1),
])

# Mean absolute error loss, Adam optimizer with learning rate 0.1.
horsepower_model.compile(
    loss='mean_absolute_error',
    optimizer=tf.optimizers.Adam(learning_rate=0.1))
# horsepower_model.summary()


In [None]:
# Train the single-input linear model: 100 epochs, no progress logging,
# validation metrics computed on 20% of the training data.
history: tf.keras.callbacks.History = horsepower_model.fit(
    x=train_features['Horsepower'],
    y=train_labels,
    validation_split=0.2,
    epochs=100,
    verbose=0)


In [None]:
# Plot history
# hist = pd.DataFrame(history.history)
# hist['epoch'] = history.epoch
# hist.tail()
def plot_loss(history: tf.keras.callbacks.History, ylim=(0, 10)):
    """Plot the per-epoch training loss (and validation loss, if recorded).

    Args:
        history: History object returned by `model.fit`; its `history` dict
            must contain 'loss' and may contain 'val_loss'.
        ylim: (low, high) limits for the y-axis. Defaults to (0, 10);
            pass None to let matplotlib choose the range.
    """
    plt.plot(history.history['loss'], label='loss')
    # 'val_loss' is only present when fit() ran with validation data.
    if 'val_loss' in history.history:
        plt.plot(history.history['val_loss'], label='val_loss')
    if ylim is not None:
        plt.ylim(ylim)
    plt.xlabel('Epoch')
    plt.ylabel('Error [MPG]')
    plt.legend()
    plt.grid(True)

plot_loss(history)


In [None]:
# Plot fit vs trainset comparison:
def plot_horsepower(x,y):
    """Draw the prediction curve (x, y) over the training data scatter.

    The scatter is read from the notebook-level `train_features['Horsepower']`
    and `train_labels`; `x` and `y` are drawn as a black line.
    """
    plt.scatter(x=train_features['Horsepower'], y=train_labels, label='Data')
    plt.plot(x, y, label='Predictions', color='k')
    plt.ylabel('MPG')
    plt.xlabel('Horsepower')
    plt.legend()

# 251 evenly spaced horsepower values from 0 to 250, and the model's output for each.
x = tf.linspace(start=0.0, stop=250, num=251)
y = horsepower_model.predict(x)
plot_horsepower(x, y)

In [None]:
# Test-set loss of each model, keyed by model name, for the final comparison.
test_results = {
    'horsepower_model': horsepower_model.evaluate(
        x=test_features['Horsepower'], y=test_labels, verbose=0),
}

2. Linear regression with multiple inputs:

In [None]:
# Normalization layer over all feature columns; statistics come from the training set.
normalizer = tf.keras.layers.Normalization(axis=-1)
normalizer.adapt(np.array(train_features))

# Linear model on every feature: the normalization layer plus one Dense unit.
linear_model = tf.keras.Sequential(layers=[
    normalizer,
    tf.keras.layers.Dense(units=1),
])
linear_model.compile(
    loss='mean_absolute_error',
    optimizer=tf.optimizers.Adam(learning_rate=0.1))

# 100 epochs, no progress logging, validation on 20% of the training data.
history = linear_model.fit(
    x=train_features,
    y=train_labels,
    validation_split=0.2,
    epochs=100,
    verbose=0)


In [None]:
# Plot training vs. validation loss per epoch. When cells are run in order,
# `history` here is the result of the `linear_model.fit` call:
plot_loss(history)

In [None]:
# Record the multi-input linear model's test-set loss alongside the others.
test_results['linear_model'] = linear_model.evaluate(
    x=test_features,
    y=test_labels,
    verbose=0)

3. DNN with one input:

In [None]:
def build_and_compile_model(norm):
    """Build and compile a small fully connected regression network.

    The model is `norm`, then two 64-unit ReLU Dense layers, then a
    single-unit Dense output. It is compiled with mean absolute error loss
    and an Adam optimizer with learning rate 0.001.

    Args:
        norm: Layer placed first in the model (callers in this notebook pass
            an already-adapted Normalization layer).

    Returns:
        The compiled `tf.keras.Sequential` model.
    """
    stack = [norm]
    stack += [tf.keras.layers.Dense(64, activation='relu') for _ in range(2)]
    stack.append(tf.keras.layers.Dense(1))

    model = tf.keras.Sequential(stack)
    model.compile(
        optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
        loss='mean_absolute_error')
    return model

In [None]:
# DNN on the single Horsepower feature, reusing the adapted horsepower normalizer.
dnn_horsepower_model = build_and_compile_model(norm=horsepower_normalizer)
history = dnn_horsepower_model.fit(
    x=train_features['Horsepower'],
    y=train_labels,
    epochs=100,
    verbose=0,
    validation_split=0.2)
plot_loss(history)


In [None]:
# Compare the DNN's fitted curve with the training data:
# predict on 251 evenly spaced horsepower values from 0 to 250.
x = tf.linspace(start=0.0, stop=250, num=251)
y = dnn_horsepower_model.predict(x)
plot_horsepower(x, y)


In [None]:
# Record the single-input DNN's test-set loss for the final comparison.
test_results['dnn_horsepower_model'] = dnn_horsepower_model.evaluate(
    x=test_features['Horsepower'],
    y=test_labels,
    verbose=0)


4. DNN with multiple inputs:

In [None]:
# DNN on all features, reusing the normalizer adapted on the full feature table.
dnn_model = build_and_compile_model(norm=normalizer)
history = dnn_model.fit(
    x=train_features,
    y=train_labels,
    epochs=100,
    verbose=0,
    validation_split=0.2)
plot_loss(history)


In [None]:
# Record the multi-input DNN's test-set loss for the final comparison.
test_results['dnn_model'] = dnn_model.evaluate(
    x=test_features,
    y=test_labels,
    verbose=0)

In [None]:
# Side-by-side comparison: one row per model, a single column holding its test loss.
pd.DataFrame(data=test_results, index=['Mean absolute error [MPG]']).transpose()

Compare test predictions with actual values:

In [None]:
# Predict on the held-out features and flatten the model output to 1-D.
test_predictions = dnn_model.predict(test_features).flatten()

# Square axes with identical limits, so perfect predictions fall on the diagonal.
lims = [0, 50]
a = plt.axes(aspect='equal')
plt.scatter(x=test_labels, y=test_predictions)
plt.ylabel('Predictions [MPG]')
plt.xlabel('True Values [MPG]')
plt.ylim(lims)
plt.xlim(lims)
# Reference line y = x; assigning to `_` hides the returned artist list in the cell output.
_ = plt.plot(lims, lims)


See error distribution:

In [None]:
# Signed error per test example: prediction minus true value.
error = test_predictions - test_labels
plt.hist(x=error, bins=25)
plt.xlabel(xlabel='Prediction Error [MPG]')
# Assigning to `_` hides the returned Text object in the cell output.
_ = plt.ylabel(ylabel='Count')


In [None]:
# Save the trained multi-input DNN for later reuse.
# The explicit `.keras` extension is required by newer Keras releases, which
# reject an extension-less path; older TF 2.x releases accept it as well.
dnn_model.save('dnn_model.keras')
