In [1]:
import torch
import numpy as np
import pandas as pd
from torch.utils import data
from torch.utils.data import Dataset, TensorDataset, DataLoader
from torch import nn


Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


The Data set is the AutoMPG Dataset


In [2]:
url = 'http://archive.ics.uci.edu/ml/machine-learning-databases/auto-mpg/auto-mpg.data'
column_names = ['MPG', 'Cylinders', 'Displacement', 'Horsepower', 'Weight',
                'Acceleration', 'Model Year', 'Origin']

raw_dataset = pd.read_csv(url, names=column_names,
                          na_values='?', comment='\t',
                          sep=' ', skipinitialspace=True)

In [3]:
dataset = raw_dataset.copy()
dataset.tail()

Unnamed: 0,MPG,Cylinders,Displacement,Horsepower,Weight,Acceleration,Model Year,Origin
393,27.0,4,140.0,86.0,2790.0,15.6,82,1
394,44.0,4,97.0,52.0,2130.0,24.6,82,2
395,32.0,4,135.0,84.0,2295.0,11.6,82,1
396,28.0,4,120.0,79.0,2625.0,18.6,82,1
397,31.0,4,119.0,82.0,2720.0,19.4,82,1


Now procede to clean the data by removing the Nan and fixing other values.


In [4]:
dataset.isna().sum()


MPG             0
Cylinders       0
Displacement    0
Horsepower      6
Weight          0
Acceleration    0
Model Year      0
Origin          0
dtype: int64

In [5]:
dataset = dataset.dropna()


Change categorical column


In [6]:
dataset['Origin'] = dataset['Origin'].map({1: 'USA', 2: 'Europe', 3: 'Japan'})
dataset = pd.get_dummies(dataset, columns=['Origin'], prefix='', prefix_sep='')
dataset.tail()

Unnamed: 0,MPG,Cylinders,Displacement,Horsepower,Weight,Acceleration,Model Year,Europe,Japan,USA
393,27.0,4,140.0,86.0,2790.0,15.6,82,False,False,True
394,44.0,4,97.0,52.0,2130.0,24.6,82,True,False,False
395,32.0,4,135.0,84.0,2295.0,11.6,82,False,False,True
396,28.0,4,120.0,79.0,2625.0,18.6,82,False,False,True
397,31.0,4,119.0,82.0,2720.0,19.4,82,False,False,True


In [7]:
train_dataset = dataset.sample(frac=0.8, random_state=0)
test_dataset = dataset.drop(train_dataset.index)
print("shape of train_dataset: ", train_dataset.shape)
print("shape of test_dataset: ", test_dataset.shape)


shape of train_dataset:  (314, 10)
shape of test_dataset:  (78, 10)


In [8]:
train_features = train_dataset.copy()
test_features = test_dataset.copy()

train_labels = train_features.pop('MPG')
test_labels = test_features.pop('MPG')
print("shape of train_features: ", train_features.shape)
print("shape of test_features: ", test_features.shape)
print("shape of train_labels: ", train_labels.shape)
print("shape of test_labels: ", test_labels.shape)

shape of train_features:  (314, 9)
shape of test_features:  (78, 9)
shape of train_labels:  (314,)
shape of test_labels:  (78,)


In [9]:
batch = 10
X =np.array(train_features.values.astype(np.float32))
X = torch.Tensor(X)
y = np.array(train_labels.values.astype(np.float32))
y = torch.Tensor(y)
y = torch.reshape(y, (-1,1))
train = TensorDataset(X,y)
train_dataloader = DataLoader(train, batch_size=batch, shuffle=True)
sample = next(iter(train_dataloader))
print("sample batch: ", sample)
print("batch len: ", len(sample))

sample batch:  [tensor([[6.0000e+00, 2.5000e+02, 1.0000e+02, 3.2820e+03, 1.5000e+01, 7.1000e+01,
         0.0000e+00, 0.0000e+00, 1.0000e+00],
        [6.0000e+00, 2.0000e+02, 8.1000e+01, 3.0120e+03, 1.7600e+01, 7.6000e+01,
         0.0000e+00, 0.0000e+00, 1.0000e+00],
        [4.0000e+00, 1.5100e+02, 9.0000e+01, 3.0030e+03, 2.0100e+01, 8.0000e+01,
         0.0000e+00, 0.0000e+00, 1.0000e+00],
        [8.0000e+00, 3.0200e+02, 1.4000e+02, 4.1410e+03, 1.4000e+01, 7.4000e+01,
         0.0000e+00, 0.0000e+00, 1.0000e+00],
        [6.0000e+00, 1.9900e+02, 9.0000e+01, 2.6480e+03, 1.5000e+01, 7.0000e+01,
         0.0000e+00, 0.0000e+00, 1.0000e+00],
        [4.0000e+00, 9.8000e+01, 6.8000e+01, 2.1550e+03, 1.6500e+01, 7.8000e+01,
         0.0000e+00, 0.0000e+00, 1.0000e+00],
        [4.0000e+00, 1.1600e+02, 9.0000e+01, 2.1230e+03, 1.4000e+01, 7.1000e+01,
         1.0000e+00, 0.0000e+00, 0.0000e+00],
        [4.0000e+00, 9.0000e+01, 4.8000e+01, 2.3350e+03, 2.3700e+01, 8.0000e+01,
         1.000

In [10]:
for X, y in train_dataloader:
    print(f"Shape of X [N, C, H, W]: {X.shape}")
    print(f"Shape of y: {y.shape} {y.dtype}")
    break

Shape of X [N, C, H, W]: torch.Size([10, 9])
Shape of y: torch.Size([10, 1]) torch.float32


In [11]:
X =np.array(test_features.values.astype(np.float32))
X = torch.Tensor(X)
y = np.array(test_labels.values.astype(np.float32))
y = torch.Tensor(y)
y = torch.reshape(y, (-1,1))
test = TensorDataset(X,y)
test_dataloader = DataLoader(test, batch_size=batch, shuffle=True)
sample = next(iter(test_dataloader))
print("sample batch: ", sample[0])
print("batch len: ", len(sample))

sample batch:  tensor([[4.0000e+00, 1.5100e+02, 8.4000e+01, 2.6350e+03, 1.6400e+01, 8.1000e+01,
         0.0000e+00, 0.0000e+00, 1.0000e+00],
        [8.0000e+00, 3.0700e+02, 1.3000e+02, 4.0980e+03, 1.4000e+01, 7.2000e+01,
         0.0000e+00, 0.0000e+00, 1.0000e+00],
        [3.0000e+00, 7.0000e+01, 9.7000e+01, 2.3300e+03, 1.3500e+01, 7.2000e+01,
         0.0000e+00, 1.0000e+00, 0.0000e+00],
        [6.0000e+00, 2.5000e+02, 1.0000e+02, 3.7810e+03, 1.7000e+01, 7.4000e+01,
         0.0000e+00, 0.0000e+00, 1.0000e+00],
        [4.0000e+00, 9.8000e+01, 6.5000e+01, 2.0450e+03, 1.6200e+01, 8.1000e+01,
         0.0000e+00, 0.0000e+00, 1.0000e+00],
        [4.0000e+00, 9.1000e+01, 6.7000e+01, 1.9650e+03, 1.5700e+01, 8.2000e+01,
         0.0000e+00, 1.0000e+00, 0.0000e+00],
        [4.0000e+00, 1.0500e+02, 7.5000e+01, 2.2300e+03, 1.4500e+01, 7.8000e+01,
         0.0000e+00, 0.0000e+00, 1.0000e+00],
        [4.0000e+00, 9.8000e+01, 6.6000e+01, 1.8000e+03, 1.4400e+01, 7.8000e+01,
         0.0000

In [12]:
for X, y in test_dataloader:
    print(f"Shape of X [N, C, H, W]: {X.shape}")
    print(f"Shape of y: {y.shape} {y.dtype}")
    break

Shape of X [N, C, H, W]: torch.Size([10, 9])
Shape of y: torch.Size([10, 1]) torch.float32


In [28]:
class NNModel(nn.Module):
    def __init__(self):
        #super(NNModel, self).__init__()
        super().__init__()
        self.norm = nn.BatchNorm1d(9)
        self.fc1 = nn.Linear(9,1)


    def forward(self, x):
        t = self.norm(x)
        t = self.fc1(t)
        return t


In [45]:
class NNModel2(nn.Module):
    def __init__(self):
        #super(NNModel, self).__init__()
        super().__init__()
        self.norm = nn.BatchNorm1d(9)
        self.fc1 = nn.Linear(9,18)
        self.fc2 = nn.Linear(18,9)
        self.fc3 = nn.Linear(9,1)
        self.relu = nn.ReLU()


    def forward(self, x):
        t = self.norm(x)
        t = self.fc1(t)
        t = self.relu(t)
        t = self.fc2(t)
        t = self.relu(t)
        t = self.fc3(t)
        return t

In [66]:
#nnet = NNModel()
nnet = NNModel2()


In [67]:
from torchinfo import summary
batch = 10
summary(nnet, input_size=(batch, 9), device='cpu', col_names=['input_size', 'output_size',
                                                              'num_params'])


Layer (type:depth-idx)                   Input Shape               Output Shape              Param #
NNModel2                                 [10, 9]                   [10, 1]                   --
├─BatchNorm1d: 1-1                       [10, 9]                   [10, 9]                   18
├─Linear: 1-2                            [10, 9]                   [10, 18]                  180
├─ReLU: 1-3                              [10, 18]                  [10, 18]                  --
├─Linear: 1-4                            [10, 18]                  [10, 9]                   171
├─ReLU: 1-5                              [10, 9]                   [10, 9]                   --
├─Linear: 1-6                            [10, 9]                   [10, 1]                   10
Total params: 379
Trainable params: 379
Non-trainable params: 0
Total mult-adds (M): 0.00
Input size (MB): 0.00
Forward/backward pass size (MB): 0.00
Params size (MB): 0.00
Estimated Total Size (MB): 0.00

In [68]:
def train(dataloader, model, loss_fn, optimizer, device):
    size = len(dataloader.dataset)
    print("size: ", size)
    model.train()
    for batch, (X, y) in enumerate(dataloader):
        X, y = X.to(device), y.to(device)
        #print("X: " , X)
        # Compute prediction error
        pred = model(X)
        loss = loss_fn(pred, y)

        # Backpropagation
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

        if batch % 10 == 0:
            loss, current = loss.item(), (batch + 1) * len(X)
            print(f"Train loss: {loss:>7f}  [{current:>5d}/{size:>5d}]")

In [69]:
def test(dataloader, model, loss_fn, device):
    size = len(dataloader.dataset)
    print("size: ", size)
    num_batches = len(dataloader)
    model.eval()
    test_loss, correct = 0, 0
    with torch.no_grad():
        for X, y in dataloader:
            X, y = X.to(device), y.to(device)
            pred = model(X)
            test_loss += loss_fn(pred, y).item()
            correct += (pred.argmax(1) == y).type(torch.float).sum().item()
    test_loss /= num_batches
    correct /= size
    print(f"Test Error: \n Accuracy: {(100*correct):>0.1f}%, Avg loss: {test_loss:>8f} \n")

In [75]:
import torch.optim as optim

criterion = nn.MSELoss()
optimizer = optim.Adam(nnet.parameters(), lr=0.001)


In [76]:
device = 'cpu'
epochs = 60
for t in range(epochs):
    print(f"Epoch {t+1}\n-------------------------------")
    train(train_dataloader, nnet, criterion, optimizer, device)
    test(test_dataloader, nnet, criterion, device)
print("Done!")

Epoch 1
-------------------------------
size:  314
Train loss: 2.985722  [   10/  314]
Train loss: 6.383417  [  110/  314]
Train loss: 8.665548  [  210/  314]
Train loss: 5.512855  [  310/  314]
size:  78
Test Error: 
 Accuracy: 0.0%, Avg loss: 7.911702 

Epoch 2
-------------------------------
size:  314
Train loss: 20.824890  [   10/  314]
Train loss: 9.538658  [  110/  314]
Train loss: 6.225008  [  210/  314]
Train loss: 12.321730  [  310/  314]
size:  78
Test Error: 
 Accuracy: 0.0%, Avg loss: 6.768775 

Epoch 3
-------------------------------
size:  314
Train loss: 4.476262  [   10/  314]
Train loss: 19.012058  [  110/  314]
Train loss: 9.495032  [  210/  314]
Train loss: 14.579503  [  310/  314]
size:  78
Test Error: 
 Accuracy: 0.0%, Avg loss: 6.792705 

Epoch 4
-------------------------------
size:  314
Train loss: 11.359545  [   10/  314]
Train loss: 33.392159  [  110/  314]
Train loss: 42.585793  [  210/  314]
Train loss: 16.360174  [  310/  314]
size:  78
Test Error: 
 Accur

In [18]:
train_labels.iloc[1]
train_features.iloc[1]

Cylinders            4
Displacement     140.0
Horsepower        88.0
Weight          2890.0
Acceleration      17.3
Model Year          79
Europe           False
Japan            False
USA               True
Name: 282, dtype: object

In [24]:
temp = train_labels.to_numpy()
temp[1]
temp2 = train_features.to_numpy()
temp2[1].shape
temp2.shape
temp2[1].shape

(9,)

Setup the data loaders


In [None]:
class MPGDataSet(Dataset):
    def __init__(self, features, labels, feature_transform=None,
                 label_transform=None):
        self.features = features
        self.labels = labels
        self.feature_transform = feature_transform
        self.label_transform = label_transform

    def __len__(self):
        return len(self.features)

    def __getitem__(self, idx):
