In [114]:
import torch
import numpy as np
import pandas as pd
from torch.utils import data
from torch.utils.data import Dataset, TensorDataset, DataLoader
from torch import nn


The Data set is the AutoMPG Dataset


In [115]:
url = 'http://archive.ics.uci.edu/ml/machine-learning-databases/auto-mpg/auto-mpg.data'
column_names = ['MPG', 'Cylinders', 'Displacement', 'Horsepower', 'Weight',
                'Acceleration', 'Model Year', 'Origin']

raw_dataset = pd.read_csv(url, names=column_names,
                          na_values='?', comment='\t',
                          sep=' ', skipinitialspace=True)

In [116]:
dataset = raw_dataset.copy()
dataset.tail()

Unnamed: 0,MPG,Cylinders,Displacement,Horsepower,Weight,Acceleration,Model Year,Origin
393,27.0,4,140.0,86.0,2790.0,15.6,82,1
394,44.0,4,97.0,52.0,2130.0,24.6,82,2
395,32.0,4,135.0,84.0,2295.0,11.6,82,1
396,28.0,4,120.0,79.0,2625.0,18.6,82,1
397,31.0,4,119.0,82.0,2720.0,19.4,82,1


Now procede to clean the data by removing the Nan and fixing other values.


In [30]:
dataset.isna().sum()


MPG             0
Cylinders       0
Displacement    0
Horsepower      6
Weight          0
Acceleration    0
Model Year      0
Origin          0
dtype: int64

In [31]:
dataset = dataset.dropna()


Change categorical column


In [32]:
dataset['Origin'] = dataset['Origin'].map({1: 'USA', 2: 'Europe', 3: 'Japan'})
dataset = pd.get_dummies(dataset, columns=['Origin'], prefix='', prefix_sep='')
dataset.tail()

Unnamed: 0,MPG,Cylinders,Displacement,Horsepower,Weight,Acceleration,Model Year,Europe,Japan,USA
393,27.0,4,140.0,86.0,2790.0,15.6,82,False,False,True
394,44.0,4,97.0,52.0,2130.0,24.6,82,True,False,False
395,32.0,4,135.0,84.0,2295.0,11.6,82,False,False,True
396,28.0,4,120.0,79.0,2625.0,18.6,82,False,False,True
397,31.0,4,119.0,82.0,2720.0,19.4,82,False,False,True


In [33]:
train_dataset = dataset.sample(frac=0.8, random_state=0)
test_dataset = dataset.drop(train_dataset.index)
print("shape of train_dataset: ", train_dataset.shape)
print("shape of test_dataset: ", test_dataset.shape)


shape of train_dataset:  (314, 10)
shape of test_dataset:  (78, 10)


In [34]:
train_features = train_dataset.copy()
test_features = test_dataset.copy()

train_labels = train_features.pop('MPG')
test_labels = test_features.pop('MPG')
print("shape of train_features: ", train_features.shape)
print("shape of test_features: ", test_features.shape)
print("shape of train_labels: ", train_labels.shape)
print("shape of test_labels: ", test_labels.shape)

shape of train_features:  (314, 9)
shape of test_features:  (78, 9)
shape of train_labels:  (314,)
shape of test_labels:  (78,)


In [84]:
batch = 10
X =np.array(train_features.values.astype(np.float32))
X = torch.Tensor(X)
y = np.array(train_labels.values.astype(np.float32))
y = torch.Tensor(y)
y = torch.reshape(y, (-1,1))
train = TensorDataset(X,y)
train_dataloader = DataLoader(train, batch_size=batch, shuffle=True)
sample = next(iter(train_dataloader))
print("sample batch: ", sample)
print("batch len: ", len(sample))

sample batch:  [tensor([[4.0000e+00, 9.0000e+01, 7.1000e+01, 2.2230e+03, 1.6500e+01, 7.5000e+01,
         1.0000e+00, 0.0000e+00, 0.0000e+00],
        [6.0000e+00, 1.5500e+02, 1.0700e+02, 2.4720e+03, 1.4000e+01, 7.3000e+01,
         0.0000e+00, 0.0000e+00, 1.0000e+00],
        [4.0000e+00, 1.2000e+02, 8.7000e+01, 2.9790e+03, 1.9500e+01, 7.2000e+01,
         1.0000e+00, 0.0000e+00, 0.0000e+00],
        [4.0000e+00, 1.0700e+02, 8.6000e+01, 2.4640e+03, 1.5500e+01, 7.6000e+01,
         1.0000e+00, 0.0000e+00, 0.0000e+00],
        [6.0000e+00, 2.3100e+02, 1.0500e+02, 3.4250e+03, 1.6900e+01, 7.7000e+01,
         0.0000e+00, 0.0000e+00, 1.0000e+00],
        [4.0000e+00, 7.1000e+01, 6.5000e+01, 1.7730e+03, 1.9000e+01, 7.1000e+01,
         0.0000e+00, 1.0000e+00, 0.0000e+00],
        [4.0000e+00, 1.1600e+02, 7.5000e+01, 2.2460e+03, 1.4000e+01, 7.4000e+01,
         1.0000e+00, 0.0000e+00, 0.0000e+00],
        [4.0000e+00, 1.2000e+02, 8.8000e+01, 3.2700e+03, 2.1900e+01, 7.6000e+01,
         1.000

In [85]:
for X, y in train_dataloader:
    print(f"Shape of X [N, C, H, W]: {X.shape}")
    print(f"Shape of y: {y.shape} {y.dtype}")
    break

Shape of X [N, C, H, W]: torch.Size([10, 9])
Shape of y: torch.Size([10, 1]) torch.float32


In [86]:
X =np.array(test_features.values.astype(np.float32))
X = torch.Tensor(X)
y = np.array(test_labels.values.astype(np.float32))
y = torch.Tensor(y)
y = torch.reshape(y, (-1,1))
test = TensorDataset(X,y)
test_dataloader = DataLoader(test, batch_size=batch, shuffle=True)
sample = next(iter(test_dataloader))
print("sample batch: ", sample[0])
print("batch len: ", len(sample))

sample batch:  tensor([[6.0000e+00, 2.0000e+02, 9.5000e+01, 3.1550e+03, 1.8200e+01, 7.8000e+01,
         0.0000e+00, 0.0000e+00, 1.0000e+00],
        [4.0000e+00, 1.0700e+02, 7.2000e+01, 2.2900e+03, 1.7000e+01, 8.0000e+01,
         0.0000e+00, 1.0000e+00, 0.0000e+00],
        [8.0000e+00, 3.5000e+02, 1.4500e+02, 4.0550e+03, 1.2000e+01, 7.6000e+01,
         0.0000e+00, 0.0000e+00, 1.0000e+00],
        [8.0000e+00, 3.5000e+02, 1.0500e+02, 3.7250e+03, 1.9000e+01, 8.1000e+01,
         0.0000e+00, 0.0000e+00, 1.0000e+00],
        [4.0000e+00, 9.8000e+01, 7.9000e+01, 2.2550e+03, 1.7700e+01, 7.6000e+01,
         0.0000e+00, 0.0000e+00, 1.0000e+00],
        [8.0000e+00, 3.9000e+02, 1.9000e+02, 3.8500e+03, 8.5000e+00, 7.0000e+01,
         0.0000e+00, 0.0000e+00, 1.0000e+00],
        [4.0000e+00, 8.9000e+01, 7.1000e+01, 1.9250e+03, 1.4000e+01, 7.9000e+01,
         1.0000e+00, 0.0000e+00, 0.0000e+00],
        [4.0000e+00, 1.5600e+02, 1.0500e+02, 2.8000e+03, 1.4400e+01, 8.0000e+01,
         0.0000

In [87]:
for X, y in test_dataloader:
    print(f"Shape of X [N, C, H, W]: {X.shape}")
    print(f"Shape of y: {y.shape} {y.dtype}")
    break

Shape of X [N, C, H, W]: torch.Size([10, 9])
Shape of y: torch.Size([10, 1]) torch.float32


In [88]:
class NNModel(nn.Module):
    def __init__(self):
        super(NNModel, self).__init__()
        self.fc1 = nn.Linear(9,1)


    def forward(self, x):
        x = self.fc1(x)
        return x


In [89]:
nnet = NNModel()

In [90]:
from torchinfo import summary
batch = 10
summary(nnet, input_size=(batch, 9), device='cpu', col_names=['input_size', 'output_size',
                                                              'num_params'])


Layer (type:depth-idx)                   Input Shape               Output Shape              Param #
NNModel                                  [10, 9]                   [10, 1]                   --
├─Linear: 1-1                            [10, 9]                   [10, 1]                   10
Total params: 10
Trainable params: 10
Non-trainable params: 0
Total mult-adds (M): 0.00
Input size (MB): 0.00
Forward/backward pass size (MB): 0.00
Params size (MB): 0.00
Estimated Total Size (MB): 0.00

In [109]:
def train(dataloader, model, loss_fn, optimizer, device):
    size = len(dataloader.dataset)
    print("size: ", size)
    #model.train()
    for batch, (X, y) in enumerate(dataloader):
        X, y = X.to(device), y.to(device)
        #print("X: " , X)
        # Compute prediction error
        pred = model(X)
        loss = loss_fn(pred, y)

        # Backpropagation
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

        if batch % 10 == 0:
            loss, current = loss.item(), (batch + 1) * len(X)
            print(f"loss: {loss:>7f}  [{current:>5d}/{size:>5d}]")

In [110]:
def test(dataloader, model, loss_fn, device):
    size = len(dataloader.dataset)
    print("size: ", size)
    num_batches = len(dataloader)
    #model.eval()
    test_loss, correct = 0, 0
    with torch.no_grad():
        for X, y in dataloader:
            X, y = X.to(device), y.to(device)
            pred = model(X)
            test_loss += loss_fn(pred, y).item()
            correct += (pred.argmax(1) == y).type(torch.float).sum().item()
    test_loss /= num_batches
    correct /= size
    print(f"Test Error: \n Accuracy: {(100*correct):>0.1f}%, Avg loss: {test_loss:>8f} \n")

In [111]:
import torch.optim as optim

criterion = nn.MSELoss()
optimizer = optim.SGD(nnet.parameters(), lr=0.03)


In [113]:
device = 'cpu'
epochs = 10
for t in range(epochs):
    print(f"Epoch {t+1}\n-------------------------------")
    train(train_dataloader, nnet, criterion, optimizer, device)
    test(test_dataloader, nnet, criterion, device)
print("Done!")

Epoch 1
-------------------------------
size:  314
loss:     nan  [   10/  314]
loss:     nan  [  110/  314]
loss:     nan  [  210/  314]
loss:     nan  [  310/  314]
size:  78
Test Error: 
 Accuracy: 0.0%, Avg loss:      nan 

Epoch 2
-------------------------------
size:  314
loss:     nan  [   10/  314]
loss:     nan  [  110/  314]
loss:     nan  [  210/  314]
loss:     nan  [  310/  314]
size:  78
Test Error: 
 Accuracy: 0.0%, Avg loss:      nan 

Epoch 3
-------------------------------
size:  314
loss:     nan  [   10/  314]
loss:     nan  [  110/  314]
loss:     nan  [  210/  314]
loss:     nan  [  310/  314]
size:  78
Test Error: 
 Accuracy: 0.0%, Avg loss:      nan 

Epoch 4
-------------------------------
size:  314
loss:     nan  [   10/  314]
loss:     nan  [  110/  314]
loss:     nan  [  210/  314]
loss:     nan  [  310/  314]
size:  78
Test Error: 
 Accuracy: 0.0%, Avg loss:      nan 

Epoch 5
-------------------------------
size:  314
loss:     nan  [   10/  314]
loss:   

In [18]:
train_labels.iloc[1]
train_features.iloc[1]

Cylinders            4
Displacement     140.0
Horsepower        88.0
Weight          2890.0
Acceleration      17.3
Model Year          79
Europe           False
Japan            False
USA               True
Name: 282, dtype: object

In [24]:
temp = train_labels.to_numpy()
temp[1]
temp2 = train_features.to_numpy()
temp2[1].shape
temp2.shape
temp2[1].shape

(9,)

Setup the data loaders


In [None]:
class MPGDataSet(Dataset):
    def __init__(self, features, labels, feature_transform=None,
                 label_transform=None):
        self.features = features
        self.labels = labels
        self.feature_transform = feature_transform
        self.label_transform = label_transform

    def __len__(self):
        return len(self.features)

    def __getitem__(self, idx):
