# Miles Per Gallon (MPG)

#####  This data concerns city-cycle fuel consumption in miles per gallon, to be predicted in terms of 3 multivalued discrete and 5 continuous attributes.

It has 398 Instances and 9 attributes, including the class attribute:

* mpg:                     - Continuous
* cylinders:               - multi-valued discrete
* displacement:            - Continuous
* horsepower:              - Continuous
* weight:                  - Continuous
* acceleration:            - Continuous
* model year:              - multi-valued discrete
* origin:                  - multi-valued discrete
* car name:                - string (unique for each instance)


The dataset can be found at: https://archive.ics.uci.edu/ml/datasets/auto+mpg

##### Importing necessary modules

In [1]:
import tensorflow as tf
import pandas as pd

In [2]:
# Reading the dataset
data = pd.read_csv("auto-mpg.csv")

In [3]:
# Displaying the first 5 rows of the dataset
data.head()

Unnamed: 0,mpg,cyl,displ,hp,weight,accel,yr,origin,name
0,18.0,8,307.0,130,3504,12.0,70,1,chevrolet chevelle malibu
1,15.0,8,350.0,165,3693,11.5,70,1,buick skylark 320
2,18.0,8,318.0,150,3436,11.0,70,1,plymouth satellite
3,16.0,8,304.0,150,3433,12.0,70,1,amc rebel sst
4,17.0,8,302.0,140,3449,10.5,70,1,ford torino


In [4]:
# discreption of the dataset
data.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
mpg,392.0,23.445918,7.805007,9.0,17.0,22.75,29.0,46.6
cyl,392.0,5.471939,1.705783,3.0,4.0,4.0,8.0,8.0
displ,392.0,194.41199,104.644004,68.0,105.0,151.0,275.75,455.0
hp,392.0,104.469388,38.49116,46.0,75.0,93.5,126.0,230.0
weight,392.0,2977.584184,849.40256,1613.0,2225.25,2803.5,3614.75,5140.0
accel,392.0,15.541327,2.758864,8.0,13.775,15.5,17.025,24.8
yr,392.0,75.979592,3.683737,70.0,73.0,76.0,79.0,82.0
origin,392.0,1.576531,0.805518,1.0,1.0,1.0,2.0,3.0


#####  Spliting the dataset into features and labels

In [5]:
features = data.drop("mpg", axis=1)

In [6]:
features.head()

Unnamed: 0,cyl,displ,hp,weight,accel,yr,origin,name
0,8,307.0,130,3504,12.0,70,1,chevrolet chevelle malibu
1,8,350.0,165,3693,11.5,70,1,buick skylark 320
2,8,318.0,150,3436,11.0,70,1,plymouth satellite
3,8,304.0,150,3433,12.0,70,1,amc rebel sst
4,8,302.0,140,3449,10.5,70,1,ford torino


In [7]:
features.shape

(392, 8)

In [8]:
labels = data["mpg"]

In [9]:
labels.shape

(392,)

In [10]:
labels.head()

0    18.0
1    15.0
2    18.0
3    16.0
4    17.0
Name: mpg, dtype: float64

#####  Spliting the features and labels into training (70%) and testing ( 30%) sets with a random seeding

In [11]:
from sklearn.model_selection import train_test_split

In [12]:
x_train, x_test, y_train, y_test = train_test_split(features, labels, test_size=0.3, random_state=42)

In [13]:
x_train.shape

(274, 8)

In [14]:
x_test.head()

Unnamed: 0,cyl,displ,hp,weight,accel,yr,origin,name
78,4,96.0,69,2189,18.0,72,2,renault 12 (sw)
274,4,121.0,115,2795,15.7,78,2,saab 99gle
246,4,91.0,60,1800,16.4,78,3,honda civic cvcc
55,4,91.0,70,1955,20.5,71,1,plymouth cricket
387,4,140.0,86,2790,15.6,82,1,ford mustang gl


In [15]:
 y_test.head()

78     26.0
274    21.6
246    36.1
55     26.0
387    27.0
Name: mpg, dtype: float64

In [16]:
y_train.shape

(274,)

In [17]:
x_test.head()

Unnamed: 0,cyl,displ,hp,weight,accel,yr,origin,name
78,4,96.0,69,2189,18.0,72,2,renault 12 (sw)
274,4,121.0,115,2795,15.7,78,2,saab 99gle
246,4,91.0,60,1800,16.4,78,3,honda civic cvcc
55,4,91.0,70,1955,20.5,71,1,plymouth cricket
387,4,140.0,86,2790,15.6,82,1,ford mustang gl


#### Creating the feature columns for the regressor

In [18]:
# Creating continuous feature columns
cyl = tf.feature_column.numeric_column("cyl")
displ = tf.feature_column.numeric_column("displ")
hp = tf.feature_column.numeric_column("hp")
wght = tf.feature_column.numeric_column("weight")
acc = tf.feature_column.numeric_column("accel")
yr = tf.feature_column.numeric_column("yr")
orgn = tf.feature_column.numeric_column("origin")

In [19]:
# Total number of car names
len(data["name"].unique())

301

In [20]:
# Creating Catagorical Feature columns
name = tf.feature_column.categorical_column_with_hash_bucket("name", hash_bucket_size=1000)

In [21]:
# Aggregating all the feature columns in a single list
feature_columns = [cyl, displ, hp, wght, acc, yr, orgn, name]

####  Creating input function for the tf.estimator object.

In [22]:
input_function = tf.estimator.inputs.pandas_input_fn(x=x_train, y=y_train, batch_size=10, num_epochs=1000, shuffle=True)

###  Creating a Dense Neural Network Regressor

In [23]:
regressor = tf.estimator.DNNRegressor(feature_columns=feature_columns, hidden_units=[5,5])

INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_save_checkpoints_secs': 600, '_session_config': None, '_keep_checkpoint_max': 5, '_tf_random_seed': 1, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_save_checkpoints_steps': None, '_model_dir': '/tmp/tmpiao1Wb', '_save_summary_steps': 100}


###  Training the regressor model for 1000 steps

In [None]:
'''
This will generate an error because of the categorical feature 
columns. To avoid this problem the should be converted to an 
embedding_column or indicator column
''' 
regressor.train(input_fn=input_function, steps=1000)

In [24]:
# we only have a single categorical feature column --> "name"
name_embadded = tf.feature_column.embedding_column(name, dimension=1000)

In [25]:
feature_columns = [cyl, displ, hp, wght, acc, yr, orgn, name_embadded]

In [26]:
input_function = tf.estimator.inputs.pandas_input_fn(x_train, y_train, batch_size=10, num_epochs=1000, shuffle=True)

In [27]:
# Creating a DNNRegressor with 2x5 hidden layer nodes
regressor = tf.estimator.DNNRegressor(feature_columns=feature_columns, hidden_units=[50,50,50,50])

INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_save_checkpoints_secs': 600, '_session_config': None, '_keep_checkpoint_max': 5, '_tf_random_seed': 1, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_save_checkpoints_steps': None, '_model_dir': '/tmp/tmpYxUKiv', '_save_summary_steps': 100}


In [28]:
regressor.train(input_fn=input_function, steps=1000)

INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Saving checkpoints for 1 into /tmp/tmpYxUKiv/model.ckpt.
INFO:tensorflow:loss = 1826.41, step = 1
INFO:tensorflow:global_step/sec: 191.152
INFO:tensorflow:loss = 2040.51, step = 101 (0.526 sec)
INFO:tensorflow:global_step/sec: 184.042
INFO:tensorflow:loss = 31.8453, step = 201 (0.542 sec)
INFO:tensorflow:global_step/sec: 184.946
INFO:tensorflow:loss = 215.442, step = 301 (0.540 sec)
INFO:tensorflow:global_step/sec: 195.682
INFO:tensorflow:loss = 77.679, step = 401 (0.511 sec)
INFO:tensorflow:global_step/sec: 190.108
INFO:tensorflow:loss = 566.234, step = 501 (0.528 sec)
INFO:tensorflow:global_step/sec: 195.304
INFO:tensorflow:loss = 7.20909, step = 601 (0.513 sec)
INFO:tensorflow:global_step/sec: 175.433
INFO:tensorflow:loss = 122.249, step = 701 (0.567 sec)
INFO:tensorflow:global_step/sec: 165.741
INFO:tensorflow:loss = 25.9686, step = 801 (0.613 sec)
INFO:tensorflow:global_step/sec: 187.873
INFO:tensorflow:loss = 74.978, ste

<tensorflow.python.estimator.canned.dnn.DNNRegressor at 0x7f29bbd82390>

### Predicting on a new data

In [29]:
# Creating a  prediction input function.
prediction_input_function = tf.estimator.inputs.pandas_input_fn(x=x_test, batch_size=100, num_epochs=1, shuffle=False)

In [30]:
# Predicting...
prediction = regressor.predict(prediction_input_function)

In [31]:
predictions = list(prediction)

INFO:tensorflow:Restoring parameters from /tmp/tmpYxUKiv/model.ckpt-1000


In [32]:
predictions

[{'predictions': array([ 9.4922514], dtype=float32)},
 {'predictions': array([ 11.68437481], dtype=float32)},
 {'predictions': array([ 32.62736893], dtype=float32)},
 {'predictions': array([ 8.2105751], dtype=float32)},
 {'predictions': array([ 19.6181221], dtype=float32)},
 {'predictions': array([ 27.92676735], dtype=float32)},
 {'predictions': array([ 28.95987129], dtype=float32)},
 {'predictions': array([ 9.86620998], dtype=float32)},
 {'predictions': array([ 12.58866215], dtype=float32)},
 {'predictions': array([ 5.33256769], dtype=float32)},
 {'predictions': array([ 14.54178524], dtype=float32)},
 {'predictions': array([ 20.57441711], dtype=float32)},
 {'predictions': array([ 20.65381813], dtype=float32)},
 {'predictions': array([ 28.03973007], dtype=float32)},
 {'predictions': array([ 13.99341679], dtype=float32)},
 {'predictions': array([ 36.31511307], dtype=float32)},
 {'predictions': array([ 25.22215843], dtype=float32)},
 {'predictions': array([ 28.64135933], dtype=float32)},

In [33]:
pred = [p["predictions"] for p in predictions]

### Evaluating our regressor model using root_mean_square (RMSE) Should be ~0

In [34]:
from sklearn.metrics import mean_squared_error

In [35]:
mean_squared_error(y_test, pred)**0.5

12.463884271677436

# Done!