In [1]:
import math

import numpy as np 
import pandas as pd

import matplotlib.pyplot as plt

from skimpy import skim

In [2]:
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import explained_variance_score
from sklearn.compose import ColumnTransformer

In [3]:
import time
import copy

import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset

In [4]:
url = 'http://archive.ics.uci.edu/ml/machine-learning-databases/auto-mpg/auto-mpg.data'
column_names = ['MPG', 'Cylinders', 'Displacement', 'Horsepower', 'Weight', 'Acceleration', 'Model Year', 'Origin']
df = pd.read_csv(url, names=column_names, na_values = "?", comment='\t', sep=" ", skipinitialspace=True, dtype={'Origin': 'category'})

In [5]:
skim(df)

In [6]:
df = (
      df.dropna().reset_index(drop=True) # Remove 6 NA HorsePower rows
        # [ - 73) -> 0
        # [73 - 76) -> 1
        # [76 - 79) -> 2
        # [79 - ] -> 3
        .assign(**{'Model Year': 
                   lambda df_ : pd.cut(df_['Model Year'],  
                                       bins=[float('-inf'), 73, 76, 79, float('inf')], 
                                       labels=[0, 1, 2, 3], right=False)}))

In [7]:
df

Unnamed: 0,MPG,Cylinders,Displacement,Horsepower,Weight,Acceleration,Model Year,Origin
0,18.0,8,307.0,130.0,3504.0,12.0,0,1
1,15.0,8,350.0,165.0,3693.0,11.5,0,1
2,18.0,8,318.0,150.0,3436.0,11.0,0,1
3,16.0,8,304.0,150.0,3433.0,12.0,0,1
4,17.0,8,302.0,140.0,3449.0,10.5,0,1
...,...,...,...,...,...,...,...,...
387,27.0,4,140.0,86.0,2790.0,15.6,3,1
388,44.0,4,97.0,52.0,2130.0,24.6,3,2
389,32.0,4,135.0,84.0,2295.0,11.6,3,1
390,28.0,4,120.0,79.0,2625.0,18.6,3,1


In [8]:
df_train, df_test = train_test_split(df, train_size=0.8, shuffle=True)
df_test, df_val = train_test_split(df_test, train_size=0.5)

In [9]:
transformer = ColumnTransformer([('Scaling', StandardScaler(), slice(0, 6)), 
                                 ('OHE', OneHotEncoder(), ['Origin'])], 
                                remainder='passthrough')

In [10]:
X_train = transformer.fit_transform(df_train)
X_val = transformer.fit_transform(df_val)
X_test = transformer.fit_transform(df_test)
# Split X, y
X_train, y_train = X_train[:, 1:], X_train[:, 0]
X_val, y_val = X_val[:, 1:], X_val[:, 0]
X_test, y_test = X_test[:, 1:], X_test[:, 0]
# Cast to Tensor
X_train = torch.tensor(X_train, dtype=torch.float32)
X_val = torch.tensor(X_val, dtype=torch.float32)
X_test = torch.tensor(X_test, dtype=torch.float32)
y_train = torch.tensor(y_train, dtype=torch.float32)
y_val = torch.tensor(y_val, dtype=torch.float32)
y_test = torch.tensor(y_test, dtype=torch.float32)

In [11]:
train_dl = DataLoader(TensorDataset(X_train, y_train.reshape(-1, 1)), batch_size=64)
val_dl = DataLoader(TensorDataset(X_val, y_val.reshape(-1, 1)), batch_size=16)
test_dl = DataLoader(TensorDataset(X_test, y_test.reshape(-1, 1)), batch_size=16)
data_loader = {'train': train_dl, 'val': val_dl, 'test': test_dl}

In [12]:
def train_model(model, criterion, optimizer, data_loader, num_epochs, epoch_print=1):
    since = time.time()

    best_model_wts = copy.deepcopy(model.state_dict())
    best_acc = 0.0

    for epoch in range(1, num_epochs + 1):
        if epoch == 1 or epoch % epoch_print == 0:
            print(f'\nEpoch {epoch}/{num_epochs}')
            print('-' * 10)

        # Each epoch has a training and validation phase
        for phase in ['train', 'val']:
            if phase == 'train':
                model.train()  # Set model to training mode
            else:
                model.eval()   # Set model to evaluate mode

            running_loss = 0.0
            running_accs = []
            
            # Iterate over data.
            for inputs, targets in data_loader[phase]:

                # zero the parameter gradients
                optimizer.zero_grad()

                # forward
                # track history if only in train
                with torch.set_grad_enabled(phase == 'train'):
                    outputs = model(inputs)
                    loss = criterion(outputs, targets)

                    # backward + optimize only if in training phase
                    if phase == 'train':
                        loss.backward()
                        optimizer.step()

                # statistics
                running_loss += loss.item() * inputs.size(0)
                running_accs.append(explained_variance_score(targets.numpy(), outputs.detach().numpy())) 

            epoch_loss = running_loss / len(data_loader[phase].dataset)
            epoch_acc = np.mean(running_accs)
            
            if epoch == 1 or epoch % epoch_print == 0:
                print(f'{phase} Loss: {epoch_loss:.4f} Acc: {epoch_acc:.4f}')

            # deep copy the model
            if phase == 'val' and epoch_acc > best_acc:
                best_acc = epoch_acc
                best_model_wts = copy.deepcopy(model.state_dict())

    time_elapsed = time.time() - since
    
    print(f'\nTraining complete in {time_elapsed // 60:.0f}m {time_elapsed % 60:.0f}s')
    print(f'Best val Acc: {best_acc:4f}')

    # load best model weights
    model.load_state_dict(best_model_wts)
    return model


In [13]:
class VNN(nn.Module):
    def __init__(self):
        super().__init__()
        
        self.fc_1 = nn.Sequential(
            nn.Linear(9, 64),
            nn.LeakyReLU()
        )
        self.fc_2 = nn.Sequential(
            nn.Linear(64, 64),
            nn.LeakyReLU()
        )
        self.fc_3 = nn.Sequential(
            nn.Linear(64, 16),
            nn.LeakyReLU()
        )
        self.fc_4 = nn.Linear(16, 1)
        self.layers = nn.ModuleList([self.fc_1, self.fc_2, self.fc_3, self.fc_4])
        
    def forward(self, x):
        for layer in self.layers:
            x = layer(x)
        return x

In [14]:
model = VNN()
criterion = nn.MSELoss()
optimizer = torch.optim.AdamW(model.parameters(), lr=0.01)

In [15]:
model = train_model(model, criterion, optimizer, data_loader, 100, 20)


Epoch 1/100
----------
train Loss: 0.6906 Acc: 0.3440
val Loss: 0.5719 Acc: 0.4990

Epoch 20/100
----------
train Loss: 0.0802 Acc: 0.9197
val Loss: 0.0699 Acc: 0.9281

Epoch 40/100
----------
train Loss: 0.0626 Acc: 0.9418
val Loss: 0.0781 Acc: 0.9201

Epoch 60/100
----------
train Loss: 0.0547 Acc: 0.9494
val Loss: 0.0911 Acc: 0.9203

Epoch 80/100
----------
train Loss: 0.0395 Acc: 0.9624
val Loss: 0.0861 Acc: 0.9061

Epoch 100/100
----------
train Loss: 0.0446 Acc: 0.9592
val Loss: 0.0820 Acc: 0.9271

Training complete in 0m 2s
Best val Acc: 0.930731


In [16]:
def check_accuracy(loader, model):
    model.eval()
    
    accs = []
    with torch.no_grad():
        for x, y in loader:
            
            outputs = model(x)
            accs.append(explained_variance_score(y, outputs))
        
        print(f'{np.mean(accs):.2%}')

In [17]:
check_accuracy(data_loader['test'], model)

86.86%


In [18]:
from sklearn.svm import SVR

In [19]:
reg = SVR(C=10)
reg.fit(X_train, y_train)
explained_variance_score(y_test, reg.predict(X_test))

0.8749014159702325