In [None]:
!curl -O https://archive.ics.uci.edu/ml/machine-learning-databases/auto-mpg/auto-mpg.data

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 30286    0 30286    0     0   6570      0 --:--:--  0:00:04 --:--:--  6709


In [None]:
!ls

auto-mpg.data  sample_data


In [None]:
url = "auto-mpg.data"

In [None]:
import pandas as pd
column_names = ['MPG', 'Cylinders', 'Displacement', 'Horsepower', 'Weight', 'Acceleration', 'Model Year', 'Origin']
df = pd.read_csv(url, names=column_names, na_values = "?", comment='\t', sep=" ", skipinitialspace=True)

In [None]:
df.isnull().sum()

Unnamed: 0,0
MPG,0
Cylinders,0
Displacement,0
Horsepower,6
Weight,0
Acceleration,0
Model Year,0
Origin,0


In [None]:
df = df.dropna()

In [None]:
df.head(4)

Unnamed: 0,MPG,Cylinders,Displacement,Horsepower,Weight,Acceleration,Model Year,Origin
0,18.0,8,307.0,130.0,3504.0,12.0,70,1
1,15.0,8,350.0,165.0,3693.0,11.5,70,1
2,18.0,8,318.0,150.0,3436.0,11.0,70,1
3,16.0,8,304.0,150.0,3433.0,12.0,70,1


In [None]:
 df = df.reset_index(drop=True)

In [None]:
df.head(4)

Unnamed: 0,MPG,Cylinders,Displacement,Horsepower,Weight,Acceleration,Model Year,Origin
0,18.0,8,307.0,130.0,3504.0,12.0,70,1
1,15.0,8,350.0,165.0,3693.0,11.5,70,1
2,18.0,8,318.0,150.0,3436.0,11.0,70,1
3,16.0,8,304.0,150.0,3433.0,12.0,70,1


In [None]:
from sklearn.model_selection import train_test_split
dftrain, dftest = train_test_split(df, train_size=0.8, random_state=1)

In [None]:
train_stats = dftrain.describe().transpose()

In [None]:
train_stats

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
MPG,313.0,23.404153,7.666909,9.0,17.5,23.0,29.0,46.6
Cylinders,313.0,5.402556,1.701506,3.0,4.0,4.0,8.0,8.0
Displacement,313.0,189.51278,102.675646,68.0,104.0,140.0,260.0,455.0
Horsepower,313.0,102.929712,37.919046,46.0,75.0,92.0,120.0,230.0
Weight,313.0,2961.198083,848.602146,1613.0,2219.0,2755.0,3574.0,5140.0
Acceleration,313.0,15.704473,2.725399,8.5,14.0,15.5,17.3,24.8
Model Year,313.0,75.929712,3.675305,70.0,73.0,76.0,79.0,82.0
Origin,313.0,1.591054,0.807923,1.0,1.0,1.0,2.0,3.0


In [None]:
numeric_column_names = ['Cylinders', 'Displacement', 'Horsepower', 'Weight', 'Acceleration']
df_train_norm, df_test_norm = dftrain.copy(), dftest.copy()

In [None]:
for cols in numeric_column_names:
  mean = train_stats.loc[cols, "mean"]
  std = train_stats.loc[cols, "std"]
  df_train_norm.loc[:, cols] = (df_train_norm.loc[:, cols] - mean) / std
  df_test_norm.loc[:, cols] = (df_test_norm.loc[:, cols] - mean) / std
df_train_norm.tail(5)

Unnamed: 0,MPG,Cylinders,Displacement,Horsepower,Weight,Acceleration,Model Year,Origin
203,28.0,-3.659617,-1.854518,-2.733884,-3.490621,-5.668627,76,3
255,19.4,-2.968799,-1.841712,-2.723452,-3.489156,-5.560923,78,1
72,13.0,-2.277982,-1.834598,-2.695633,-3.487922,-5.991737,72,1
235,30.5,-3.659617,-1.854423,-2.74223,-3.490765,-5.587849,77,1
37,14.0,-2.277982,-1.830519,-2.671291,-3.487768,-6.260996,71,1


In [None]:
##bucketing
import torch
boundaries = torch.tensor([73, 76, 79])
v = torch.tensor(df_train_norm['Model Year'].values)
df_train_norm['Model Year Bucketed'] = torch.bucketize(
    v, boundaries, right=True)
v = torch.tensor(df_test_norm['Model Year'].values)
df_test_norm['Model Year Bucketed'] = torch.bucketize(
    v, boundaries, right=True)
numeric_column_names.append('Model Year Bucketed')

In [None]:
numeric_column_names

['Cylinders',
 'Displacement',
 'Horsepower',
 'Weight',
 'Acceleration',
 'Model Year Bucketed']

In [None]:
from torch.nn.functional import one_hot
total_origin = len(set(df_train_norm['Origin']))
origin_encoded = one_hot(torch.from_numpy(
    df_train_norm['Origin'].values) % total_origin)
x_train_numeric = torch.tensor(
df_train_norm[numeric_column_names].values)
x_train = torch.cat([x_train_numeric, origin_encoded], 1).float()
origin_encoded = one_hot(torch.from_numpy(
    df_test_norm['Origin'].values) % total_origin)
x_test_numeric = torch.tensor(
    df_test_norm[numeric_column_names].values)
x_test = torch.cat([x_test_numeric, origin_encoded], 1).float()

In [None]:
df_train_norm.tail(5)

Unnamed: 0,MPG,Cylinders,Displacement,Horsepower,Weight,Acceleration,Model Year,Origin,Model Year Bucketed
203,28.0,-3.659617,-1.854518,-2.733884,-3.490621,-5.668627,76,3,2
255,19.4,-2.968799,-1.841712,-2.723452,-3.489156,-5.560923,78,1,2
72,13.0,-2.277982,-1.834598,-2.695633,-3.487922,-5.991737,72,1,0
235,30.5,-3.659617,-1.854423,-2.74223,-3.490765,-5.587849,77,1,2
37,14.0,-2.277982,-1.830519,-2.671291,-3.487768,-6.260996,71,1,0


In [None]:
y_train = torch.tensor(df_train_norm["MPG"].values).float()
y_test = torch.tensor(df_test_norm["MPG"].values).float()

In [None]:
from torch.utils.data import TensorDataset, DataLoader
train_ds = TensorDataset(x_train, y_train)
batch_size = 8
torch.manual_seed(1)
train_dl = DataLoader(train_ds, batch_size, shuffle=True)

In [None]:
import torch.nn as nn
hidden_units = [8, 4]
input_size = x_train.shape[1]
all_layers = []
for hidden_unit in hidden_units:
  layer = nn.Linear(input_size, hidden_unit)
  all_layers.append(layer)
  all_layers.append(nn.ReLU())
  input_size = hidden_unit
all_layers.append(nn.Linear(hidden_units[-1], 1))
model = nn.Sequential(*all_layers)
model


Sequential(
  (0): Linear(in_features=9, out_features=8, bias=True)
  (1): ReLU()
  (2): Linear(in_features=8, out_features=4, bias=True)
  (3): ReLU()
  (4): Linear(in_features=4, out_features=1, bias=True)
)

In [None]:
loss_fn = nn.MSELoss()
optimizer = torch.optim.SGD(model.parameters(), lr=0.01)

In [None]:
torch.manual_seed(1)
num_epochs = 200
log_epochs = 20

In [None]:
for epoch in range(num_epochs):
  loss_hist_train = 0
  for x_batch, y_batch in train_dl:
    pred = model(x_batch)[:, 0]
    loss = loss_fn(pred, y_batch)
    loss.backward()
    optimizer.step()
    optimizer.zero_grad()
    loss_hist_train += loss.item()
  if epoch % log_epochs==0:
    print(f'Epoch {epoch} Loss '
    f'{loss_hist_train/len(train_dl):.4f}')

Epoch 0 Loss 150.1605
Epoch 20 Loss 63.5871
Epoch 40 Loss 63.9400
Epoch 60 Loss 61.5167
Epoch 80 Loss 63.0962
Epoch 100 Loss 64.3972
Epoch 120 Loss 60.8011
Epoch 140 Loss 58.6094
Epoch 160 Loss 58.3221
Epoch 180 Loss 59.1051


In [35]:
with torch.no_grad():
  pred = model(x_test.float())[:, 0]
  loss = loss_fn(pred, y_test)
  print(f'Test MSE: {loss.item():.4f}')
  print(f'Test MAE: {nn.L1Loss()(pred, y_test).item():.4f}')

Test MSE: 69.4262
Test MAE: 7.3347


In [36]:
!ls

auto-mpg.data  sample_data
