<a href="https://colab.research.google.com/github/MouhamedB-Ndiaye/Fuel-Efficiency-Prediction/blob/main/Fuel_Efficiency_prediction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

For predicting fuel efficiency, we aim to predict the output of a continuous value,such as a price or a probability.

For this problem we use the Auto MPG dataset to create a model to predict fuel efficiency of vehicules in late 1970s and early 1980s. To do this, we provide the model with a description of many automobiles from this period.

In [8]:
import tensorflow as tf 
from tensorflow import keras
from tensorflow.keras import layers

import matplotlib.pyplot as plt 
import pandas as pd 
import seaborn as sns

import numpy as np

In [9]:
#Let's import the data using the pandas package

column_names = ['MPG', 'Cylinders', 'Displacement', 'Horsepower',
                'Weight', 'Acceleration', 'Model Year', 'Origin']
dataset = pd.read_csv("auto-mpg.csv", names=column_names,
                      na_values = "?", comment='\t',
                      sep=" ", skipinitialspace=True)

In [10]:
dataset

Unnamed: 0,MPG,Cylinders,Displacement,Horsepower,Weight,Acceleration,Model Year,Origin
0,"mpg,cylinders,displacement,horsepower,weight,a...","year,origin,car",name,,,,,
1,"18,8,307,130,3504,12,70,1,chevrolet",chevelle,malibu,,,,,
2,"15,8,350,165,3693,11.5,70,1,buick",skylark,320,,,,,
3,"18,8,318,150,3436,11,70,1,plymouth",satellite,,,,,,
4,"16,8,304,150,3433,12,70,1,amc",rebel,sst,,,,,
...,...,...,...,...,...,...,...,...
394,"27,4,140,86,2790,15.6,82,1,ford",mustang,gl,,,,,
395,"44,4,97,52,2130,24.6,82,2,vw",pickup,,,,,,
396,"32,4,135,84,2295,11.6,82,1,dodge",rampage,,,,,,
397,"28,4,120,79,2625,18.6,82,1,ford",ranger,,,,,,


In [11]:
# The "origin" column in the dataset is categorical, so to move forward
# we need to use some one-hot encoding on it

origin = dataset.pop('Origin') #Pandas Pop is the sound a column makes when it's removed from a dataset
dataset['USA'] = (origin == 1) * 1.0
dataset['Europe'] = (origin == 2) * 1.0
dataset['Japan'] = (origin == 3) * 1.0


In [13]:
# splitting the data into training and test sets:

train_dataset = dataset.sample(frac=0.8, random_state=0)
test_dataset = dataset.drop(train_dataset.index) #the drop function Remove rows or columns by specifying label names and corresponding axis, or by specifying directly index or column names. 


In [15]:
# let’s visualize the data by using the seaborn’s pair plot method:
sns.pairplot(train_dataset[["MPG", "Cylinders", "Displacement", "Weight"]], diag_kind="kde")

#sns.PairGrid(train_dataset[["MPG", "Cylinders", "Displacement", "Weight"]], size = 4)

In [16]:

train_labels = train_dataset.pop('MPG')
test_labels = test_dataset.pop('MPG')

In [17]:
# Normalization of the dataset
#It is recommended that we standardize features that use different scales and ranges. Although the model can converge without standardization of features, this makes learning more difficult and makes the resulting model dependent on the choice of units used in the input. We need to do this to project the test dataset into the same distribution the model was trained on:

def norm(x):
  return (x - np.mean(x)) / np.std(x)
normed_train_data = norm(train_dataset)
normed_test_data = norm(test_dataset)

In [18]:
# Build the Model

#Here, we will use the sequential API with two hidden layers and one output layer that will return a single value. The steps to build the model are encapsulated in a function, build_model, 
#since we will be creating a second model later:

def build_model():
  model = keras.Sequential([
    layers.Dense(64, activation=tf.nn.relu, input_shape=[len(train_dataset.keys())]),
    layers.Dense(64, activation=tf.nn.relu),
    layers.Dense(1)                      
  ])

  optimizer = tf.keras.optimizers.RMSprop(0.001)

  model.compile(loss= 'mean_squared_error',
                optimizer = optimizer,
                metrics = ['mean_absolute_error', 'mean_squared_error' ])
  return model

model = build_model()
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 64)                640       
                                                                 
 dense_1 (Dense)             (None, 64)                4160      
                                                                 
 dense_2 (Dense)             (None, 1)                 65        
                                                                 
Total params: 4,865
Trainable params: 4,865
Non-trainable params: 0
_________________________________________________________________


In [None]:
#before training the model to predict fuel efficiency let’s tray this model in the first 10 samples:

example_batch = normed_train_data[:10]
example_result = model.predict(example_batch)
example_result

ValueError: ignored

In [None]:
# Training the model to predict fuel efficiency

class PrintDot(keras.callbacks.Callback):
  def on_epoch_end(self, epoch, logs):
    if epoch % 100 == 0: print('')
    print('.', end='')

EPOCHS = 1000

history = model.fit(
  normed_train_data, train_labels,
  epochs=EPOCHS, validation_split = 0.2, verbose=0,
  callbacks=[PrintDot()])

ValueError: ignored

In [None]:
# Let's visualize the model training

def plot_history(history):
  hist = pd.DataFrame(history.history)
  hist['epoch'] = history.epoch
  
  plt.figure()
  plt.xlabel('Epoch')
  plt.ylabel('Mean Abs Error [MPG]')
  plt.plot(hist['epoch'], hist['mean_absolute_error'],
           label='Train Error')
  plt.plot(hist['epoch'], hist['val_mean_absolute_error'],
           label = 'Val Error')
  plt.ylim([0,5])
  plt.legend()
  
  plt.figure()
  plt.xlabel('Epoch')
  plt.ylabel('Mean Square Error [$MPG^2$]')
  plt.plot(hist['epoch'], hist['mean_squared_error'],
           label='Train Error')
  plt.plot(hist['epoch'], hist['val_mean_squared_error'],
           label = 'Val Error')
  plt.ylim([0,20])
  plt.legend()
  plt.show()
plot_history(history)

In [None]:
#Now, let’s update the model.fit method to stop training when the validation score does not improve.
#We’ll be using an EarlyStopping callback that tests a training condition for each epoch.

model = build_model()

# The patience parameter is the amount of epochs to check for improvement
early_stop = keras.callbacks.EarlyStopping(monitor='val_loss', patience=10)

history = model.fit(normed_train_data, train_labels, epochs=EPOCHS,
                    validation_split = 0.2, verbose=0, callbacks=[early_stop, PrintDot()])

plot_history(history)

In [None]:


loss, mae, mse = model.evaluate(normed_test_data, test_labels, verbose=0)
print("Testing set Mean Abs Error: {:5.2f} MPG".format(mae))

In [None]:
#Now, let’s make predictions on the model to predict fuel efficiency:

test_predictions = model.predict(normed_test_data).flatten()

plt.scatter(test_labels, test_predictions)
plt.xlabel('True Values [MPG]')
plt.ylabel('Predictions [MPG]')
plt.axis('equal')
plt.axis('square')
plt.xlim([0,plt.xlim()[1]])
plt.ylim([0,plt.ylim()[1]])
_ = plt.plot([-100, 100], [-100, 100])
