# Variation of Hyperparameters for Analog AI Resnet-34 Model
Notebook based on the following papers:
* [Deep Residual Learning for Image Recognition](https://arxiv.org/abs/1512.03385)
* [Enabling Training of Neural Networks on Noisy Hardware](https://www.frontiersin.org/articles/10.3389/frai.2021.699148/full)

Notes:


*   The model uses tiki-taka learning
*   Modeling of the differences between hyperparameters uses wandb.ai



In [None]:
# To install the aihwkit and graphing library 
# ! pip install aihwkit
# ! pip install wandb

# Imports from PyTorch.
from torchvision.models import resnet34

# Imports from aihwkit.
from aihwkit.nn.conversion import convert_to_analog_mapped
from aihwkit.simulator.presets import TikiTakaReRamSBPreset
from aihwkit.simulator.configs.utils import MappingParameter
from aihwkit.simulator.configs import UnitCellRPUConfig
from aihwkit.simulator.configs.devices import (
    TransferCompound,
    SoftBoundsDevice)
from aihwkit.simulator.rpu_base import cuda
import wandb

Load the digital PyTorch model of ResNet-34. To learn more: [ResNet-34](https://arxiv.org/abs/1512.03385)

In [None]:
# Instantiate torchvision's ResNet-34 (no arguments are passed) and print
# the module tree so the layer structure can be inspected.
model = resnet34()
print(model)

ResNet(
  (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
  (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (relu): ReLU(inplace=True)
  (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
  (layer1): Sequential(
    (0): BasicBlock(
      (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
      (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    )
    (1): BasicBlock(
      (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
  

Function that defines the rpu_config, allowing the hyperparameters to be varied

In [None]:
def create_rpu_config(use_preset):
    """Build the RPU configuration used for the analog tiles.

    Args:
        use_preset (bool): when truthy, return aihwkit's ready-made
            ``TikiTakaReRamSBPreset``; otherwise assemble a custom
            Tiki-taka configuration from two soft-bounds devices.

    Returns:
        A ``TikiTakaReRamSBPreset`` instance, or a ``UnitCellRPUConfig``
        whose device is a ``TransferCompound``.
    """
    # Preset path: nothing to customise, hand back the stock configuration.
    if use_preset:
        from aihwkit.simulator.presets import TikiTakaReRamSBPreset

        return TikiTakaReRamSBPreset()

    # Device and chip configuration used in the RPU tile.
    tile_mapping = MappingParameter(
        max_input_size=512,        # analog tile size
        max_output_size=512,
        digital_bias=True,         # whether to use analog or digital bias
        weight_scaling_omega=0.6,
    )

    # Devices that compose the Tiki-taka compound.
    compound_devices = [
        SoftBoundsDevice(w_min=-0.3, w_max=0.3),
        SoftBoundsDevice(w_min=-0.6, w_max=0.6),
    ]

    # The Tiki-taka learning rule can be implemented using the transfer device.
    transfer_device = TransferCompound(
        unit_cell_devices=compound_devices,

        # Adjustments to the way Tiki-taka is performed.
        # NOTE(review): the original note on units_in_mbatch read
        # "batch_size=1 anyway", but load_images builds loaders with
        # batch_size=8 -- TODO confirm the intended transfer period.
        units_in_mbatch=True,
        transfer_every=2,        # every 2 batches do a transfer-read
        n_reads_per_transfer=1,  # one forward read for each transfer
        gamma=0.0,               # all SGD weight in second device
        scale_transfer_lr=True,  # in relative terms to SGD LR
        transfer_lr=1.0,         # same transfer LR as for SGD
        fast_lr=0.1,             # SGD update onto first matrix constant
        transfer_columns=True,   # transfer uses columns (not rows)
    )

    return UnitCellRPUConfig(device=transfer_device, mapping=tile_mapping)

The function below converts the digital model defined by PyTorch into an analog aihwkit variant.

In [None]:
from torch.nn import Tanh, MaxPool2d, LogSoftmax, Flatten
from aihwkit.nn import AnalogConv2d, AnalogLinear, AnalogSequential


def _create_lenet5_analog_network(rpu_config):
    """Build a LeNet-5 style network out of analog layers.

    Args:
        rpu_config: RPU configuration passed to every analog layer.

    Returns:
        AnalogSequential: two analog conv stages followed by two analog
        linear layers with 10 outputs and a final ``LogSoftmax``.
    """
    # [conv1 out, conv2 out, flattened features, hidden units]
    channel = [16, 32, 512, 128]
    model = AnalogSequential(
        AnalogConv2d(in_channels=1, out_channels=channel[0], kernel_size=5, stride=1,
                     rpu_config=rpu_config),
        Tanh(),
        MaxPool2d(kernel_size=2),
        AnalogConv2d(in_channels=channel[0], out_channels=channel[1], kernel_size=5, stride=1,
                     rpu_config=rpu_config),
        Tanh(),
        MaxPool2d(kernel_size=2),
        Tanh(),
        Flatten(),
        AnalogLinear(in_features=channel[2], out_features=channel[3], rpu_config=rpu_config),
        Tanh(),
        AnalogLinear(in_features=channel[3], out_features=10, rpu_config=rpu_config),
        LogSoftmax(dim=1)
    )

    return model


def create_analog_network(digital_model=None, rpu_config=None):
    """Create an analog network, either by conversion or as a LeNet-5.

    This file used to define ``create_analog_network`` twice with
    different signatures, so the second definition silently replaced the
    first and the two-argument call form raised ``TypeError``. Both call
    forms are supported here:

    * ``create_analog_network(digital_model, rpu_config)`` converts the
      given digital model with ``convert_to_analog_mapped``.
    * ``create_analog_network(rpu_config)`` (or ``rpu_config=...`` as a
      keyword) builds the analog LeNet-5 network.

    Args:
        digital_model: digital model to convert. When it is the only
            argument supplied it is interpreted as the RPU configuration,
            which keeps the one-argument call form working.
        rpu_config: RPU configuration for the analog tiles.

    Returns:
        The analog model.

    Raises:
        TypeError: if neither argument is supplied.
    """
    if digital_model is None and rpu_config is None:
        raise TypeError('create_analog_network() requires an rpu_config')

    if digital_model is not None and rpu_config is not None:
        # Convert the model to its analog version.
        return convert_to_analog_mapped(digital_model, rpu_config)

    # Exactly one argument was given: it is the RPU configuration.
    config = rpu_config if rpu_config is not None else digital_model
    return _create_lenet5_analog_network(config)

Below, cross entropy is used to calculate the loss, and Stochastic Gradient Descent (SGD) is used as the optimizer:

In [None]:
from torch.nn import CrossEntropyLoss

# Loss function for the notebook; it is passed as the `criterion`
# argument of training_loop further down.
# NOTE(review): the LeNet-5 style network in this file already ends in
# LogSoftmax, while CrossEntropyLoss is normally fed raw logits -- confirm
# this pairing is intended.
criterion = CrossEntropyLoss()

from aihwkit.optim import AnalogSGD

def create_analog_optimizer(model, lr=0.01):
    """Create the analog-aware optimizer.

    Args:
        model (nn.Module): model to be trained
        lr (float): learning rate handed to ``AnalogSGD``. Defaults to
            0.01, the value that was previously hard-coded here ("as in
            the paper", per the original comment).

    Returns:
        Optimizer: created analog optimizer
    """
    optimizer = AnalogSGD(model.parameters(), lr=lr)
    # Reorganise the optimizer's parameter groups for this model before use.
    optimizer.regroup_param_groups(model)

    return optimizer

We can now write the train function, which will optimize the network over the MNIST training dataset. The train_step function takes as input the images to train on, the model to train, and the criterion and optimizer to train with:

In [None]:
# NOTE: this rebinds the name `cuda`, which was imported earlier in the
# file from aihwkit.simulator.rpu_base; from here on `cuda` is torch.cuda.
from torch import device, cuda

# Use the GPU when torch reports CUDA as available, otherwise the CPU.
# train_step and test_step move every batch to this device.
DEVICE = device('cuda' if cuda.is_available() else 'cpu')
print('Running the simulation on: ', DEVICE)

def train_step(train_data, model, criterion, optimizer):
    """Run one training epoch and return its average loss.

    Args:
        train_data (DataLoader): batches of (images, labels) to train on
        model (nn.Module): model whose weights are updated
        criterion (nn.CrossEntropyLoss): criterion to compute loss
        optimizer (Optimizer): analog model optimizer

    Returns:
        train_dataset_loss: per-sample loss summed over all batches and
        divided by ``len(train_data.dataset)``
    """
    model.train()

    loss_sum = 0
    for batch_images, batch_labels in train_data:
        inputs = batch_images.to(DEVICE)
        targets = batch_labels.to(DEVICE)

        # Clear gradients left over from the previous batch.
        optimizer.zero_grad()

        # Forward pass and loss.
        batch_loss = criterion(model(inputs), targets)

        # Backward pass followed by the weight update.
        batch_loss.backward()
        optimizer.step()

        # The batch loss is weighted by the batch size so that the final
        # division by the dataset size yields a per-sample figure.
        loss_sum += batch_loss.item() * inputs.size(0)

    return loss_sum / len(train_data.dataset)

Running the simulation on:  cpu


Since training can be quite time consuming it is nice to see the evolution of the training process by testing the model capabilities on a set of images that it has not seen before (test dataset). So we write a test_step function:

In [None]:
def test_step(validation_data, model, criterion):
    """Evaluate the model on a held-out dataset.

    Args:
        validation_data (DataLoader): Validation set to perform the evaluation
        model (nn.Module): Trained model to be evaluated
        criterion (nn.CrossEntropyLoss): criterion to compute loss

    Returns:
        test_dataset_loss: per-sample loss summed over all batches and
            divided by ``len(validation_data.dataset)``
        test_dataset_error: classification error in percent
        test_dataset_accuracy: classification accuracy in percent

    Raises:
        ValueError: if ``validation_data`` yields no samples, since no
            accuracy or error can be computed in that case.
    """
    total_loss = 0
    predicted_ok = 0
    total_images = 0

    model.eval()

    for images, labels in validation_data:
        images = images.to(DEVICE)
        labels = labels.to(DEVICE)

        pred = model(images)
        loss = criterion(pred, labels)
        total_loss += loss.item() * images.size(0)

        # Index of the highest score along dim 1 is the predicted class.
        # Using the tensor method avoids relying on a module-level
        # `import torch` having been executed before this function runs.
        predicted = pred.argmax(dim=1)
        total_images += labels.size(0)
        predicted_ok += (predicted == labels).sum().item()

    if total_images == 0:
        raise ValueError('validation_data yielded no samples to evaluate')

    # Metrics are derived once, after all batches have been accumulated.
    test_dataset_loss = total_loss / len(validation_data.dataset)
    test_dataset_accuracy = predicted_ok/total_images*100
    test_dataset_error = (1-predicted_ok/total_images)*100

    return test_dataset_loss, test_dataset_error, test_dataset_accuracy

To reach satisfactory accuracy levels, the train_step will have to be repeated multiple times, so we will implement a loop over a certain number of epochs:

In [None]:
def training_loop(model, criterion, optimizer, train_data, validation_data, epochs=15, print_every=1):
    """Train for a number of epochs, periodically evaluating and printing.

    Args:
        model (nn.Module): model to be trained and evaluated
        criterion (nn.CrossEntropyLoss): criterion to compute loss
        optimizer (Optimizer): analog model optimizer
        train_data (DataLoader): data used for the training step
        validation_data (DataLoader): data used for the evaluation step
        epochs (int): number of epochs to train for
        print_every (int): evaluate and print progress once every
            ``print_every`` epochs

    Returns:
        None. The per-epoch loss and error histories are collected
        locally but are not returned.
    """
    train_history = []
    valid_history = []
    error_history = []

    for epoch in range(epochs):
        # One pass over the training data.
        train_loss = train_step(train_data, model, criterion, optimizer)
        train_history.append(train_loss)

        # Only the last epoch of each `print_every`-sized window is
        # evaluated and reported.
        if epoch % print_every != (print_every - 1):
            continue

        # Evaluation does not need gradients.
        with torch.no_grad():
            valid_loss, error, accuracy = test_step(validation_data, model, criterion)
        valid_history.append(valid_loss)
        error_history.append(error)

        print(f'Epoch: {epoch}\t'
              f'Train loss: {train_loss:.4f}\t'
              f'Valid loss: {valid_loss:.4f}\t'
              f'Test error: {error:.2f}%\t'
              f'Test accuracy: {accuracy:.2f}%\t')

We will now download the MNIST dataset and prepare the images for training and testing:

In [None]:
import os
from torchvision import datasets, transforms
# Directory (relative to the working directory) handed to datasets.MNIST in
# load_images as the dataset root; it is created up front if missing.
PATH_DATASET = os.path.join('data', 'DATASET')
os.makedirs(PATH_DATASET, exist_ok=True)

def load_images(batch_size=8):
    """Load the MNIST train and test images from torchvision datasets.

    The datasets are downloaded into ``PATH_DATASET`` when not already
    present, converted to tensors, and wrapped in data loaders.

    Args:
        batch_size (int): number of samples per batch for both loaders.
            Defaults to 8, the value that was previously hard-coded.

    Returns:
        tuple: ``(train_data, test_data)`` data loaders; only the training
        loader shuffles its samples.
    """
    # Imported locally so this function does not depend on a module-level
    # `import torch` having been executed before it is called.
    from torch.utils.data import DataLoader

    transform = transforms.Compose([transforms.ToTensor()])
    train_set = datasets.MNIST(PATH_DATASET, download=True, train=True, transform=transform)
    test_set = datasets.MNIST(PATH_DATASET, download=True, train=False, transform=transform)
    train_data = DataLoader(train_set, batch_size=batch_size, shuffle=True)
    test_data = DataLoader(test_set, batch_size=batch_size, shuffle=False)

    return train_data, test_data

Putting all the code above together to train the model:

In [None]:
import torch

# Seed PyTorch's random number generator so repeated runs start from the
# same random state.
torch.manual_seed(1)

# Load the MNIST train/test data loaders.
train_data, test_data = load_images()

# Create the rpu_config.
# NOTE(review): `use_lenet5` is passed as create_rpu_config's `use_preset`
# argument, so this single flag selects both the preset configuration and
# the LeNet-5 network below -- confirm that coupling is intended.
use_lenet5 = True
rpu_config = create_rpu_config(use_lenet5)

# Load the resnet model. It is instantiated regardless of `use_lenet5`,
# although only the else-branch below uses it.
# NOTE(review): it is built after the seed is set and before the analog
# network, so moving it presumably changes the random state seen by the
# analog network's construction -- verify before reordering.
digital_model = resnet34()

# Create the analog model and move it to the selected device.
if(use_lenet5):
  model = create_analog_network(rpu_config).to(DEVICE)
else:
  # NOTE(review): verify that create_analog_network accepts
  # (digital_model, rpu_config) at this point, and that ResNet-34's
  # 3-channel first convolution is compatible with the images the
  # single-channel LeNet-5 branch is built for.
  model = create_analog_network(digital_model, rpu_config).to(DEVICE)

# Define the analog optimizer.
optimizer = create_analog_optimizer(model)

# Train, evaluating on the test loader after each epoch (default print_every=1).
training_loop(model, criterion, optimizer, train_data, test_data)

Downloading http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz
Downloading http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz to data/DATASET/MNIST/raw/train-images-idx3-ubyte.gz


  0%|          | 0/9912422 [00:00<?, ?it/s]

Extracting data/DATASET/MNIST/raw/train-images-idx3-ubyte.gz to data/DATASET/MNIST/raw

Downloading http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz
Downloading http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz to data/DATASET/MNIST/raw/train-labels-idx1-ubyte.gz


  0%|          | 0/28881 [00:00<?, ?it/s]

Extracting data/DATASET/MNIST/raw/train-labels-idx1-ubyte.gz to data/DATASET/MNIST/raw

Downloading http://yann.lecun.com/exdb/mnist/t10k-images-idx3-ubyte.gz
Downloading http://yann.lecun.com/exdb/mnist/t10k-images-idx3-ubyte.gz to data/DATASET/MNIST/raw/t10k-images-idx3-ubyte.gz


  0%|          | 0/1648877 [00:00<?, ?it/s]

Extracting data/DATASET/MNIST/raw/t10k-images-idx3-ubyte.gz to data/DATASET/MNIST/raw

Downloading http://yann.lecun.com/exdb/mnist/t10k-labels-idx1-ubyte.gz
Downloading http://yann.lecun.com/exdb/mnist/t10k-labels-idx1-ubyte.gz to data/DATASET/MNIST/raw/t10k-labels-idx1-ubyte.gz


  0%|          | 0/4542 [00:00<?, ?it/s]

Extracting data/DATASET/MNIST/raw/t10k-labels-idx1-ubyte.gz to data/DATASET/MNIST/raw

Processing...


  return torch.from_numpy(parsed.astype(m[2], copy=False)).view(*s)


Done!
Epoch: 0	Train loss: 2.6585	Valid loss: 2.5088	Test error: 89.82%	Test accuracy: 10.18%	
Epoch: 1	Train loss: 2.7142	Valid loss: 2.7410	Test error: 89.36%	Test accuracy: 10.64%	
Epoch: 2	Train loss: 2.7374	Valid loss: 2.8049	Test error: 89.96%	Test accuracy: 10.04%	
Epoch: 3	Train loss: 2.8196	Valid loss: 2.8251	Test error: 89.83%	Test accuracy: 10.17%	
Epoch: 4	Train loss: 2.8271	Valid loss: 2.7738	Test error: 91.16%	Test accuracy: 8.84%	
Epoch: 5	Train loss: 2.9055	Valid loss: 3.1197	Test error: 90.18%	Test accuracy: 9.82%	
Epoch: 6	Train loss: 2.8759	Valid loss: 2.5989	Test error: 88.65%	Test accuracy: 11.35%	
Epoch: 7	Train loss: 2.8195	Valid loss: 2.9209	Test error: 89.95%	Test accuracy: 10.05%	
Epoch: 8	Train loss: 2.8615	Valid loss: 2.7345	Test error: 88.65%	Test accuracy: 11.35%	
Epoch: 9	Train loss: 2.9426	Valid loss: 3.1395	Test error: 89.91%	Test accuracy: 10.09%	
Epoch: 10	Train loss: 2.9133	Valid loss: 2.8123	Test error: 90.49%	Test accuracy: 9.51%	
Epoch: 11	Train l