## Setup

UCI Data can be found [here](https://archive.ics.uci.edu/dataset/275/bike+sharing+dataset).

Notebook inspired by [Hands-On Machine Learning with Scikit-Learn and PyTorch](https://www.oreilly.com/library/view/hands-on-machine-learning/9798341607972/).

In [15]:
import torch
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader, TensorDataset
from torch import nn
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

In [16]:
dat = pd.read_csv('day.csv')

In [17]:
X = dat.drop(columns = ['cnt', 'instant','dteday'])
y = dat['cnt']

In [18]:
# create train & test splits
X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    test_size = 0.2,
                                                    random_state = 501)

# create validation split
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train,
                                                    test_size = 0.2,
                                                    random_state = 501)

In [19]:
# center and scale X data
X_train = torch.FloatTensor(X_train.to_numpy())
X_valid = torch.FloatTensor(X_val.to_numpy())
X_test = torch.FloatTensor(X_test.to_numpy())

means = X_train.mean(dim=0, keepdims=True)
stds = X_train.std(dim=0, keepdims=True)

X_train = (X_train - means) / stds
X_valid = (X_valid - means) / stds
X_test = (X_test - means) / stds

In [20]:
# center and scale y data
y_train = torch.FloatTensor(y_train.to_numpy()).reshape(-1,1)
y_valid = torch.FloatTensor(y_val.to_numpy()).reshape(-1,1)
y_test = torch.FloatTensor(y_test.to_numpy()).reshape(-1,1)

y_mean = y_train.mean(dim=0, keepdims=True)
y_std  = y_train.std(dim=0, keepdims=True)

y_train = (y_train - y_mean) / y_std
y_valid = (y_valid - y_mean) / y_std
y_test  = (y_test  - y_mean) / y_std

### Helpers

In [21]:
def train_bgd(model, optimizer, criterion, X_train, y_train, n_epochs):
  for epoch in range(n_epochs):
    y_pred = model(X_train)
    loss = criterion(y_pred, y_train) # get loss val
    loss.backward() # calc grads
    optimizer.step() # take grad desc step
    optimizer.zero_grad() # zero out grads for next pass
    print(f'Epoch {epoch + 1}, Loss: {loss.item()}')

# train function to implement mb gd
def train_mbgd(model, optimizer, criterion, train_loader, n_epochs):
  model.train() # set training mode
  for epoch in range(n_epochs):
    total_loss = 0
    for X_batch, y_batch in train_loader:
      # get batch
      X_batch, y_batch = X_batch.to(device), y_batch.to(device)
      # mod pred
      y_pred = model(X_batch)
      # calc loss and tally
      loss = criterion(y_pred, y_batch)
      total_loss += loss.item()
      # calc grads and do step
      loss.backward()
      optimizer.step()
      optimizer.zero_grad()

    mean_loss = total_loss / len(train_loader)
    if epoch % 10 == 0: # every ten epochs, print out loss
      print(f'Epoch {epoch + 1}, Loss: {mean_loss}')

## create evaluation function
def evaluate(model, data_loader, metric, aggregate = torch.mean):
  model.eval() # change model mode to evaluation (no gradient work)
  metrics = []

  with torch.no_grad():
    for X_batch, y_batch in data_loader:
      # move data to GPU / cuda
      X_batch, y_batch = X_batch.to(device), y_batch.to(device)
      y_pred = model(X_batch)
      metric_val = metric(y_pred, y_batch)
      metrics.append(metric_val)

  # retrun agg met over all batches
  return aggregate(torch.stack(metrics))

In [22]:
# set device depending on what's available
if torch.cuda.is_available():
  device = 'cuda'
elif torch.backends.mps.is_available():
  device = 'mps'
else:
  device = 'cpu'

In [23]:
n_features = X_train.shape[1] # get cols

## Define Wide and Deep Network

In [34]:
## create a custom pytorch class for a wide and deep nn
class WideAndDeep(nn.Module):
  def __init__(self, n_features):
    super().__init__() # initialize from nn module parent
    # create nn sequential stack / layers
    self.deep_stack = nn.Sequential(
        nn.Linear(n_features, 50),
        nn.ReLU(),
        nn.Linear(50,40),
        nn.ReLU()
    )
    # define model output
    self.output_layer = nn.Linear(40 + n_features, 1)

    # define forward pass
  def forward(self, X):
    deep_output = self.deep_stack(X) # pass data into stack
    wide_and_deep = torch.concat([X, deep_output], dim = 1) # output nn output plus og input
    return self.output_layer(wide_and_deep) # pass through output layer

### Create and Instance of Custom Module

In [35]:
# set seed
torch.manual_seed(501)

# create mod instance
model = WideAndDeep(n_features).to(device)

# define learning rate
learning_rate = 0.002

In [36]:
# set model training params
optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)
criterion = nn.MSELoss()
n_epochs = 100

In [37]:
train_dataset = TensorDataset(X_train, y_train)
train_loader = DataLoader(train_dataset, batch_size=32, shuffle = True)

In [38]:
train_mbgd(model, optimizer, criterion, train_loader, n_epochs)

Epoch 1, Loss: 0.8037662327289581
Epoch 11, Loss: 0.08511834293603897
Epoch 21, Loss: 0.04591403106848399
Epoch 31, Loss: 0.03313921789328257
Epoch 41, Loss: 0.026537391170859338
Epoch 51, Loss: 0.02094374137620131
Epoch 61, Loss: 0.016860316569606463
Epoch 71, Loss: 0.013902504183351993
Epoch 81, Loss: 0.01142065618187189
Epoch 91, Loss: 0.009896273693690697


In [39]:
# set up validation data loader
# don't typically use shuffle in evaluation so it's deterministic
# and the ordering is stable
valid_dataset = TensorDataset(X_valid, y_valid)
valid_loader = DataLoader(valid_dataset, batch_size=32, shuffle = False)

# RMSE on validation data
evaluate(model, valid_loader, criterion,
         aggregate = lambda metrics: torch.sqrt(torch.mean(metrics)))

tensor(0.1137)

## Wide and Deep V2

In [44]:
# create a new version which sends some features through the wide path
# and others through the deep path
class WideAndDeepV2(nn.Module):
  def __init__(self, n_features):
    super().__init__() # initialize from nn module parent
    # create nn sequential stack / layers
    self.deep_stack = nn.Sequential(
        nn.Linear(n_features - 2, 50), # adjust for deep change
        nn.ReLU(),
        nn.Linear(50,40),
        nn.ReLU()
    )
    # define model output
    self.output_layer = nn.Linear(40 + 5, 1) # adjust for wide change

    # define forward pass
  def forward(self, X):
    X_wide = X[:, :5]
    X_deep = X[:, 2:]
    deep_output = self.deep_stack(X_deep) # pass data into stack
    wide_and_deep = torch.concat([X_wide, deep_output], dim = 1) # output nn output plus og input
    return self.output_layer(wide_and_deep) # pass through output layer

In [45]:
# set seed
torch.manual_seed(501)

# create mod instance
model = WideAndDeepV2(n_features).to(device)

# define learning rate
learning_rate = 0.002

In [46]:
# set model training params
optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)
criterion = nn.MSELoss()
n_epochs = 100

train_dataset = TensorDataset(X_train, y_train)
train_loader = DataLoader(train_dataset, batch_size=32, shuffle = True)

In [47]:
train_mbgd(model, optimizer, criterion, train_loader, n_epochs)

Epoch 1, Loss: 0.8754307985305786
Epoch 11, Loss: 0.4230686555306117
Epoch 21, Loss: 0.250144832332929
Epoch 31, Loss: 0.14925698985656102
Epoch 41, Loss: 0.1075581689675649
Epoch 51, Loss: 0.08855915988485018
Epoch 61, Loss: 0.08018558671077093
Epoch 71, Loss: 0.07391772791743279
Epoch 81, Loss: 0.06863964820901552
Epoch 91, Loss: 0.06610908806324005


In [48]:
# set up validation data loader
# don't typically use shuffle in evaluation so it's deterministic
# and the ordering is stable
valid_dataset = TensorDataset(X_valid, y_valid)
valid_loader = DataLoader(valid_dataset, batch_size=32, shuffle = False)

# RMSE on validation data
evaluate(model, valid_loader, criterion,
         aggregate = lambda metrics: torch.sqrt(torch.mean(metrics)))

tensor(0.2906)

## Models with Multiple Inputs

In [49]:
class WideAndDeepV3(nn.Module):
  def __init__(self, n_features):
    super().__init__() # initialize from nn module parent
    # create nn sequential stack / layers
    self.deep_stack = nn.Sequential(
        nn.Linear(n_features - 2, 50), # adjust for deep change
        nn.ReLU(),
        nn.Linear(50,40),
        nn.ReLU()
    )
    # define model output
    self.output_layer = nn.Linear(40 + 5, 1) # adjust for wide change

    # define forward pass
  def forward(self, X_wide, X_deep):
    deep_output = self.deep_stack(X_deep) # pass data into stack
    wide_and_deep = torch.concat([X_wide, deep_output], dim = 1) # output nn output plus og input
    return self.output_layer(wide_and_deep)

In [52]:
## create custom dataset to handle returning three tensors
class WideAndDeepDataset(torch.utils.data.Dataset):
  def __init__(self, X_wide, X_deep, y):
    self.X_wide = X_wide
    self.X_deep = X_deep
    self.y = y

  def __len__(self):
    return len(self.y)

  def __getitem__(self, idx):
    input_dict = {"X_wide": self.X_wide[idx], "X_deep": self.X_deep[idx]}
    return input_dict, self.y[idx]

In [65]:
## create datasets and data loaders
train_data_named = WideAndDeepDataset(X_wide = X_train[:, :5], X_deep = X_train[:, 2:], y = y_train)
train_loader_named = DataLoader(train_data_named, batch_size=32, shuffle = True)

valid_data_named = WideAndDeepDataset(X_wide = X_valid[:, :5], X_deep = X_valid[:, 2:], y = y_valid)
valid_loader_named = DataLoader(valid_data_named, batch_size=32, shuffle = False)

test_data_named = WideAndDeepDataset(X_wide = X_test[:, :5], X_deep = X_test[:, 2:], y = y_test)
test_loader_named = DataLoader(test_data_named, batch_size=32, shuffle = False)

In [66]:
## update main loop in eval and train functions
# train function to implement mb gd
def train_mbgd(model, optimizer, criterion, train_loader, n_epochs):
  model.train() # set training mode
  for epoch in range(n_epochs):
    total_loss = 0
    for inputs, y_batch in train_loader:
      # get batch
      inputs = {name: X.to(device) for name, X in inputs.items()}
      y_batch = y_batch.to(device)
      # mod pred
      y_pred = model(X_wide = inputs['X_wide'], X_deep = inputs['X_deep'])
      # calc loss and tally
      loss = criterion(y_pred, y_batch)
      total_loss += loss.item()
      # calc grads and do step
      loss.backward()
      optimizer.step()
      optimizer.zero_grad()

    mean_loss = total_loss / len(train_loader)
    if epoch % 10 == 0: # every ten epochs, print out loss
      print(f'Epoch {epoch + 1}, Loss: {mean_loss}')

## create evaluation function
def evaluate(model, data_loader, metric, aggregate = torch.mean):
  model.eval() # change model mode to evaluation (no gradient work)
  metrics = []

  with torch.no_grad():
    for inputs, y_batch in data_loader:
      # get batch
      inputs = {name: X.to(device) for name, X in inputs.items()}
      y_batch = y_batch.to(device)
      # mod pred
      y_pred = model(X_wide = inputs['X_wide'], X_deep = inputs['X_deep'])
      metric_val = metric(y_pred, y_batch)
      metrics.append(metric_val)

  # retrun agg met over all batches
  return aggregate(torch.stack(metrics))

In [67]:
# set seed
torch.manual_seed(501)

# create mod instance
model = WideAndDeepV3(n_features).to(device)

# define learning rate
learning_rate = 0.002

# set model training params
optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)
criterion = nn.MSELoss()
n_epochs = 100

train_mbgd(model, optimizer, criterion, train_loader_named, n_epochs)

Epoch 1, Loss: 0.8754307985305786
Epoch 11, Loss: 0.4230686555306117
Epoch 21, Loss: 0.250144832332929
Epoch 31, Loss: 0.14925698985656102
Epoch 41, Loss: 0.1075581689675649
Epoch 51, Loss: 0.08855915988485018
Epoch 61, Loss: 0.08018558671077093
Epoch 71, Loss: 0.07391772791743279
Epoch 81, Loss: 0.06863964820901552
Epoch 91, Loss: 0.06610908806324005


In [68]:
# RMSE on validation data
evaluate(model, valid_loader_named, criterion,
         aggregate = lambda metrics: torch.sqrt(torch.mean(metrics)))

tensor(0.2906)

## Models with Multiple Outputs

In [69]:
## adding auxilary output to act as a form of regularization
class WideAndDeepV4(nn.Module):
  def __init__(self, n_features):
    super().__init__() # initialize from nn module parent
    # create nn sequential stack / layers
    self.deep_stack = nn.Sequential(
        nn.Linear(n_features - 2, 50), # adjust for deep change
        nn.ReLU(),
        nn.Linear(50, 40), # 50 inputs, 40 outputs
        nn.ReLU()
    )

    # define model output
    self.output_layer = nn.Linear(40 + 5, 1) # adjust for wide change

    self.aux_output_layer = nn.Linear(40, 1)

  def forward(self, X_wide, X_deep):
    deep_output = self.deep_stack(X_deep) # pass data into stack
    wide_and_deep = torch.concat([X_wide, deep_output], dim = 1) # output nn output plus og input

    main_output = self.output_layer(wide_and_deep)
    aux_output = self.aux_output_layer(deep_output)

    return main_output, aux_output

In [81]:
## update main loop in training function to accomodate aux layer
def train_mbgd(model, optimizer, criterion, train_loader, n_epochs):
  model.train() # set training mode
  for epoch in range(n_epochs):
    total_loss = 0
    for inputs, y_batch in train_loader:
      # get batch
      inputs = {name: X.to(device) for name, X in inputs.items()}
      y_batch = y_batch.to(device)
      # mod pred
      # unpack outputs from dictionary
      #y_pred, y_pred_aux = model(X_wide = inputs['X_wide'], X_deep = inputs['X_deep'])

      y_pred, y_pred_aux = model(**inputs) # easy way to unpack multiple inputs

      # get loss vals for each output
      main_loss = criterion(y_pred, y_batch)
      aux_loss = criterion(y_pred_aux, y_batch)

      loss = 0.8 * main_loss + 0.2 * aux_loss # 80/20 weight; essentially regularization
      total_loss += loss.item()
      # calc grads and do step
      loss.backward()
      optimizer.step()
      optimizer.zero_grad()

    mean_loss = total_loss / len(train_loader)
    if epoch % 10 == 0: # every ten epochs, print out loss
      print(f'Epoch {epoch + 1}, Loss: {mean_loss}')

In [82]:
## update main loop in eval function
def evaluate(model, data_loader, metric, aggregate = torch.mean):
  model.eval() # change model mode to evaluation (no gradient work)
  metrics = []

  with torch.no_grad():
    for inputs, y_batch in data_loader:
      # get batch
      inputs = {name: X.to(device) for name, X in inputs.items()}
      y_batch = y_batch.to(device)
      # mod pred
      #y_pred, _ = model(X_wide = inputs['X_wide'], X_deep = inputs['X_deep'])

      y_pred, _ = model(**inputs) # easy way to unpack multiple inputs

      metric_val = metric(y_pred, y_batch)
      metrics.append(metric_val)

  # retrun agg met over all batches
  return aggregate(torch.stack(metrics))

In [83]:
# set seed
torch.manual_seed(501)

# create mod instance
model = WideAndDeepV4(n_features).to(device)

# define learning rate
learning_rate = 0.002

# set model training params
optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)
criterion = nn.MSELoss()
n_epochs = 100

train_mbgd(model, optimizer, criterion, train_loader_named, n_epochs)

Epoch 1, Loss: 0.9135758399963378
Epoch 11, Loss: 0.6055165727933248
Epoch 21, Loss: 0.4511197785536448
Epoch 31, Loss: 0.34409328798453015
Epoch 41, Loss: 0.2715859274069468
Epoch 51, Loss: 0.22237140834331512
Epoch 61, Loss: 0.18544450104236604
Epoch 71, Loss: 0.15996836225191752
Epoch 81, Loss: 0.13978618284066519
Epoch 91, Loss: 0.12438217649857203


In [89]:
# RMSE on validation data
print('RMSE of V4 on validation data:', round(
    (evaluate(model, valid_loader_named, criterion,
         aggregate = lambda metrics: torch.sqrt(torch.mean(metrics)))).item(),
    5))

RMSE of V4 on validation data: 0.30504
