# Predicting Fuel Consumption in Pytorch Using Pytorch Lightning Library

In [8]:
import numpy as np
import pandas as pd
import os, sys
from pathlib import Path
from watermark import watermark
import matplotlib.pyplot as plt
%matplotlib inline

import pytorch_lightning as pl
from pytorch_lightning.callbacks.progress import TQDMProgressBar

import torch
import torch.nn as nn 
from torch.nn import functional as F
from torch.nn.functional import normalize, one_hot
from torch.utils.data import random_split

# https://pypi.org/project/torchmetrics/
# https://torchmetrics.readthedocs.io/en/latest/pages/lightning.html
# https://torchmetrics.readthedocs.io/en/latest/

from torchmetrics import MeanAbsoluteError
from torchmetrics import MeanSquaredError
from sklearn.model_selection import train_test_split 

print(watermark(packages="torch,lightning", python=True))
print("Torch CUDA available?", torch.cuda.is_available())
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

Python implementation: CPython
Python version       : 3.9.12
IPython version      : 8.5.0

torch    : 1.13.1
lightning: not installed

Torch CUDA available? False


# 1) Prepare Datasets, train and test

In [47]:
# We copy from previous sample all the data preparation steps into tensors

path_to_file = 'C:/Users/MRM/Desktop/Data_Analytics/Medium_and_PPB/Machine_Learning/Machine_Learning_Projects/Regression_Problems/Predicting_Car_Fuel_Consumption'

file = 'auto-mpg.data'

column_names = ['MPG', 'Cylinders', 'Displacement', 'Horsepower', 'Weight',
                'Acceleration', 'Model Year', 'Origin']

df = pd.read_csv(os.path.join(path_to_file, file), names=column_names,
                 na_values = "?", comment='\t',
                 sep=" ", skipinitialspace=True)
df = df.dropna()
df = df.reset_index(drop=True)

# Perform train, test split
df_train, df_test = train_test_split(df, train_size=0.8, random_state=1)

numeric_column_names = ['Cylinders', 'Displacement','Horsepower', 'Weight','Acceleration']
# Normalize Numerical Data and convert them to a tensor
# Normalize divides every element in the column by the max element in it (so it not a min-max scaling or standarization)
x_train_numeric_norm = normalize(torch.tensor(df_train[numeric_column_names].values))
x_test_numeric_norm = normalize(torch.tensor(df_test[numeric_column_names].values))

# Embeddings for Origin
total_origin = df_train['Origin'].nunique() # Result 3
origin_encoded_train = one_hot(torch.from_numpy(df_train['Origin'].values) % total_origin)
origin_encoded_test = one_hot(torch.from_numpy(df_test['Origin'].values) % total_origin)

 # Bucketize train and test
boundaries = torch.tensor([73, 76, 79])
 
v = torch.tensor(df_train['Model Year'].values)
train_year_bucketized = torch.bucketize(v, boundaries, right=True)
train_year_bucketized = train_year_bucketized.unsqueeze(1)  # Add additional dimension to allow concatenation of tensors

v = torch.tensor(df_test['Model Year'].values)
test_year_bucketized = torch.bucketize(v, boundaries, right=True)
test_year_bucketized = test_year_bucketized.unsqueeze(1) 

# Join all datasets 

x_train = torch.cat((x_train_numeric_norm, origin_encoded_train,train_year_bucketized), 1).float()
x_test = torch.cat([x_test_numeric_norm, origin_encoded_test, test_year_bucketized], 1).float()

y_train = torch.tensor(df_train['MPG'].values).float()
y_train = torch.reshape(y_train, (y_train.shape[0],1))
y_test = torch.tensor(df_test['MPG'].values).float()
y_test = torch.reshape(y_test, (y_test.shape[0],1))


In [48]:
print(x_train.shape)
print(y_train.shape)
print(x_test.shape)
print(y_test.shape)

torch.Size([313, 9])
torch.Size([313, 1])
torch.Size([79, 9])
torch.Size([79, 1])


# 2) Create DataLoader

In [49]:
from torch.utils.data import DataLoader, TensorDataset
from torch.utils.data import random_split

train_ds = TensorDataset(x_train, y_train)
test_ds = TensorDataset(x_test,y_test)
batch_size = 8
torch.manual_seed(1)
train_dl = DataLoader(train_ds, batch_size, shuffle=True)
test_dl = DataLoader(test_ds, batch_size, shuffle=True)

In [50]:
for i, batch in enumerate(train_dl, 1):
    print(f'batch{i}', 'x: ', batch[0], 'y: ', batch[1])

batch1 x:  tensor([[1.7969e-03, 4.0431e-02, 3.1896e-02, 9.9864e-01, 7.4123e-03, 0.0000e+00,
         0.0000e+00, 1.0000e+00, 1.0000e+00],
        [1.5128e-03, 5.2949e-02, 3.1391e-02, 9.9808e-01, 6.4295e-03, 0.0000e+00,
         1.0000e+00, 0.0000e+00, 1.0000e+00],
        [1.7914e-03, 5.0606e-02, 4.2545e-02, 9.9779e-01, 6.2698e-03, 1.0000e+00,
         0.0000e+00, 0.0000e+00, 0.0000e+00],
        [1.7953e-03, 7.8543e-02, 3.2539e-02, 9.9637e-01, 3.1417e-03, 0.0000e+00,
         1.0000e+00, 0.0000e+00, 1.0000e+00],
        [1.7177e-03, 9.2110e-02, 4.4659e-02, 9.9474e-01, 2.3618e-03, 0.0000e+00,
         1.0000e+00, 0.0000e+00, 0.0000e+00],
        [2.0118e-03, 4.5767e-02, 3.4200e-02, 9.9833e-01, 8.0470e-03, 1.0000e+00,
         0.0000e+00, 0.0000e+00, 3.0000e+00],
        [2.0115e-03, 4.8779e-02, 3.3693e-02, 9.9821e-01, 8.2471e-03, 1.0000e+00,
         0.0000e+00, 0.0000e+00, 2.0000e+00],
        [1.6074e-03, 9.1421e-02, 4.5208e-02, 9.9478e-01, 2.2102e-03, 0.0000e+00,
         1.0000e+00

In order to use a it is needed to implement Lightning model using LightningModule instead of the regular PyTorch

In [56]:
class MultiLayerRegressor(pl.LightningModule):
    def __init__(self, input_size = x_train.shape[1], hidden_units=(32, 16, 8)):
        super().__init__()
        
        # new PL attributes:
        self.input_size = input_size
        self.train_mae = MeanAbsoluteError()
        self.test_mae = MeanAbsoluteError()
        
        # Model similar to previous section:

        all_layers = []
        for hidden_unit in hidden_units:
            layer = torch.nn.Linear(input_size, hidden_unit)
            all_layers.append(layer)
            all_layers.append(torch.nn.ReLU())
            input_size = hidden_unit

        all_layers.append(torch.nn.Linear(hidden_units[-1], 1))
        self.model = nn.Sequential(*all_layers)

    def forward(self, x):
        x = self.model(x)
        return x

    def training_step(self, batch, batch_idx):
        x, y = batch
        logits = self(x)
        loss = F.huber_loss(self(x), y, reduction='mean', delta=1.0)
        self.train_mae.update(logits, y)
        self.log("train_hubber", loss, prog_bar=True)
        self.log('train_mae', self.train_mae.compute(), prog_bar = True)
        return loss

    def training_epoch_end(self, outs):
        self.log("train_mae", self.train_mae.compute())

    def test_step(self, batch, batch_idx):
        x, y = batch
        logits = self(x)
        loss = F.huber_loss(self(x), y, reduction='mean', delta=1.0)
        self.log("test_loss", loss, prog_bar=True)
        self.test_mae.update(logits, y)
        predictions_pred.append(logits)
        predictions_actual.append(y.data)
        self.log('test_loss',loss, prog_bar = True)
        self.log('test_mae', self.test_mae.compute(), prog_bar = True)
        return {'test_loss': loss}
     

    def configure_optimizers(self):
        optimizer = torch.optim.Adam(self.parameters(), lr=0.001)
        return optimizer
   

In [52]:
# Init our model
mpg_model = MultiLayerRegressor()

# Initialize a trainer
trainer = pl.Trainer(
    accelerator="auto",
    devices=1 if torch.cuda.is_available() else None,  # limiting got iPython runs
    max_epochs=50,
    callbacks=[TQDMProgressBar(refresh_rate=20)],
)

# Train the model ⚡
trainer.fit(mpg_model, train_dl)

GPU available: False, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs

  | Name      | Type              | Params
------------------------------------------------
0 | train_mae | MeanAbsoluteError | 0     
1 | test_mae  | MeanAbsoluteError | 0     
2 | model     | Sequential        | 993   
------------------------------------------------
993       Trainable params
0         Non-trainable params
993       Total params
0.004     Total estimated model params size (MB)
  rank_zero_warn(
  rank_zero_warn(


Epoch 49: 100%|██████████| 40/40 [00:02<00:00, 18.62it/s, loss=3.58, v_num=5, train_hubber=2.230]  

`Trainer.fit` stopped: `max_epochs=50` reached.


Epoch 49: 100%|██████████| 40/40 [00:02<00:00, 17.90it/s, loss=3.58, v_num=5, train_hubber=2.230]


In [58]:
# Test the model
# NEW !!!
# Evaluate model based on test_step

predictions_pred = []
predictions_actual = []

train_mae = trainer.test(dataloaders=train_dl)[0]["test_mae"]
test_mae = trainer.test(dataloaders=test_dl)[0]["test_mae"]
print(
    f"Train MAE {train_mae:.2f}"
    f" | Test MAE {test_mae:.2f}"
)

Restoring states from the checkpoint path at c:\Users\MRM\Desktop\Data_Analytics\Medium_and_PPB\Machine_Learning\Machine_Learning_Projects\Pytorch\lightning_logs\version_5\checkpoints\epoch=49-step=2000.ckpt
Loaded model weights from checkpoint at c:\Users\MRM\Desktop\Data_Analytics\Medium_and_PPB\Machine_Learning\Machine_Learning_Projects\Pytorch\lightning_logs\version_5\checkpoints\epoch=49-step=2000.ckpt


Testing DataLoader 0: 100%|██████████| 40/40 [00:01<00:00, 26.38it/s]

Restoring states from the checkpoint path at c:\Users\MRM\Desktop\Data_Analytics\Medium_and_PPB\Machine_Learning\Machine_Learning_Projects\Pytorch\lightning_logs\version_5\checkpoints\epoch=49-step=2000.ckpt
Loaded model weights from checkpoint at c:\Users\MRM\Desktop\Data_Analytics\Medium_and_PPB\Machine_Learning\Machine_Learning_Projects\Pytorch\lightning_logs\version_5\checkpoints\epoch=49-step=2000.ckpt



────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
       Test metric             DataLoader 0
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
        test_loss            3.446131467819214
        test_mae            3.9127731323242188
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
Testing DataLoader 0: 100%|██████████| 10/10 [00:00<00:00, 32.58it/s]
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
       Test metric             DataLoader 0
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
        test_loss            3.105956554412842
        test_mae            3.8806960582733154
────────────────────────────────────────────────