# <center> Online Deterministic Annealing for Classification </center>

<img src="tensors.jpeg" width="600">

In [3]:
%matplotlib inline
from IPython.display import HTML

import torch
import numpy as np
import torch.optim as optim
import torch.nn as nn
from torch.utils.data import Dataset, TensorDataset, DataLoader
from torch.utils.data.dataset import random_split

# Run on the GPU when PyTorch can see one; otherwise stay on the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

# Seed both global RNGs (NumPy and PyTorch) so that a fresh
# "Restart & Run All" reproduces the same numbers.
np.random.seed(42)
torch.manual_seed(42)

<torch._C.Generator at 0x19e590e9190>

Creating a Dataset

$X \sim U[0,1]$

$Y = 2X + 1 + 0.1 \epsilon,\ \epsilon \sim \mathcal{N}(0,1)$

In [4]:
# Draw the inputs first and the noise second, so the NumPy random stream
# is consumed in a fixed order.
x = np.random.rand(100, 1)
noise = np.random.randn(100, 1)
y = 1 + 2 * x + .1 * noise

# NumPy produces float64; the model parameters are float32, so cast here.
x_tensor = torch.from_numpy(x).to(dtype=torch.float32)
y_tensor = torch.from_numpy(y).to(dtype=torch.float32)

# Pair every input with its target.
dataset = TensorDataset(x_tensor, y_tensor)

# Random 80/20 split into training and validation subsets.
train_dataset, val_dataset = random_split(dataset, lengths=[80, 20])

# Mini-batches of 16 samples, served in a fixed order (no shuffling).
train_loader = DataLoader(train_dataset, batch_size=16)
val_loader = DataLoader(val_dataset, batch_size=16)

- Assume a parametrized model: 
$$ \hat Y = f(X; \theta),\ \theta \in \mathbb{R}^K $$
and initialize $\theta_i,\ i=1,\ldots,K$, randomly

In [5]:
class MyCustomModel(nn.Module):
    """Affine model ``yhat = a + b * x`` with two learnable scalars."""

    def __init__(self):
        super().__init__()
        # nn.Parameter marks its tensor as trainable by default, so the
        # standard-normal draws need no explicit requires_grad flag.
        self.a = nn.Parameter(torch.randn(1, dtype=torch.float))  # intercept
        self.b = nn.Parameter(torch.randn(1, dtype=torch.float))  # slope

    def forward(self, x):
        """Return the prediction ``a + b * x`` for the input tensor ``x``."""
        scaled = self.b * x
        return scaled + self.a
    
# Alternative using a built-in layer: model = nn.Sequential(nn.Linear(1, 1)).to(device)
model = MyCustomModel()
model = model.to(device)  # nn.Module.to moves the parameters and returns the module itself

- Determine a loss function: $J(\hat Y, Y)$, e.g. 

$$J(\hat Y, Y) = E[(\hat Y - Y)^2] \simeq \frac 1 N \sum_{i=1}^N (\hat Y(\omega_i) - Y(\omega_i))^2,\ \omega_i \in \Omega$$

In [6]:
# Mean squared error, averaged over all elements of the batch.
# Other options: nn.L1Loss, nn.CosineSimilarity, nn.CrossEntropyLoss, nn.NLLLoss, nn.KLDivLoss, etc.
loss_fn = nn.MSELoss(reduction="mean")

- Define the gradient descent rule, e.g.

$$ \theta_i(j+1) = \theta_i(j) - \eta(j) \frac \partial {\partial \theta_i} J,\ i=1,\ldots, K,\ j=1,\ldots, n$$

In [7]:
# Learning rate: the step size eta in the update rule.
lr = 0.1

# Plain stochastic gradient descent over every model parameter
# (optim.Adam, optim.Adagrad, etc. are alternatives).
optimizer = optim.SGD(params=model.parameters(), lr=lr)

def train_step(x, y):
    """Run one gradient-descent update on a single batch and return its loss.

    Uses the notebook-level ``model``, ``loss_fn`` and ``optimizer``.

    Args:
        x: batch of inputs passed to ``model``.
        y: batch of targets compared against the predictions.

    Returns:
        The batch loss as a Python float, evaluated before the parameters
        are updated.
    """
    model.train()
    # Start from clean gradients so anything left behind by an interrupted
    # or out-of-order cell run cannot leak into this update.
    optimizer.zero_grad()
    yhat = model(x)
    # PyTorch criteria take (input, target): prediction first, target second.
    # MSE is symmetric, but most other criteria are not.
    loss = loss_fn(yhat, y)
    loss.backward()
    optimizer.step()
    return loss.item()

For n epochs, do:

- For every batch of the $N$ realizations of $X$ in your training data set compute $\hat Y$, and $J(\hat Y, Y)$.

- Compute the gradients
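$$ \frac \partial {\partial \theta_i} J = 
\frac {\partial J} {\partial f} \frac {\partial f} {\partial h_m} \ldots \frac {\partial h_1} {\partial \theta_i},\ 
i=1,\ldots, K$$
(chain rule, where $h_1, \ldots, h_m$ denote the intermediate quantities between $\theta_i$ and $f$) and update the parameters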

- For every batch of the realizations of $X$ in your evaluation data set compute $\hat Y$, and $J_v(\hat Y, Y)$.

- Save the optimal parameter values (with respect to $J_v$) in a file

- Print Progress

In [8]:
n_epochs = 10 # 1000
training_losses = []    # mean training loss, one entry per epoch
validation_losses = []  # mean validation loss, one entry per epoch
loss_optimal = np.inf   # best validation loss seen so far

# Use previously saved state 
train_from_scratch = True
if not train_from_scratch:
    # map_location remaps the stored tensors onto the current device, so a
    # checkpoint written on a GPU machine still loads on a CPU-only one.
    model.load_state_dict(torch.load("state_file.pth", map_location=device))

# Parameters before training starts
print(model.state_dict())

for epoch in range(n_epochs):

    # Training pass: compute the loss and update the parameters for each batch
    batch_losses = []
    for x_batch, y_batch in train_loader:
        x_batch = x_batch.to(device)
        y_batch = y_batch.to(device)
        loss = train_step(x_batch, y_batch)
        batch_losses.append(loss)
    training_loss = np.mean(batch_losses)
    training_losses.append(training_loss)

    # Validation pass: switch to evaluation mode once per epoch
    # (train_step switches back to training mode on its next call).
    model.eval()
    with torch.no_grad():  # no gradients are needed for evaluation
        val_losses = []
        for x_val, y_val in val_loader:
            x_val = x_val.to(device)
            y_val = y_val.to(device)
            yhat = model(x_val)
            # Criterion argument order is (prediction, target).
            val_loss = loss_fn(yhat, y_val).item()
            val_losses.append(val_loss)
        # NOTE: this is a mean of per-batch means, so a smaller final batch
        # carries the same weight as a full one.
        validation_loss = np.mean(val_losses)
        validation_losses.append(validation_loss)

        # Checkpoint the parameters whenever the validation loss improves
        if validation_loss < loss_optimal:
            loss_optimal = validation_loss
            torch.save(model.state_dict(), "state_file.pth")

    # Print loss at each epoch
    print(f"[{epoch+1}] Training loss: {training_loss:.3f}\t Validation loss: {validation_loss:.3f}")

# Parameters after the last epoch (not necessarily the checkpointed best ones)
print(model.state_dict())

OrderedDict([('a', tensor([2.5952])), ('b', tensor([2.7504]))])
[1] Training loss: 1.681	 Validation loss: 0.228
[2] Training loss: 0.108	 Validation loss: 0.019
[3] Training loss: 0.014	 Validation loss: 0.009
[4] Training loss: 0.009	 Validation loss: 0.009
[5] Training loss: 0.008	 Validation loss: 0.010
[6] Training loss: 0.008	 Validation loss: 0.010
[7] Training loss: 0.008	 Validation loss: 0.010
[8] Training loss: 0.008	 Validation loss: 0.010
[9] Training loss: 0.008	 Validation loss: 0.010
[10] Training loss: 0.008	 Validation loss: 0.010
OrderedDict([('a', tensor([1.0117])), ('b', tensor([1.9685]))])
