# Prepare

In [37]:
import os
import shutil

import torch
import torch.nn as nn
import torch.nn.functional as F
import torchvision
from torch import nn, optim
from torchvision import datasets, transforms

In [38]:
class NegativePenaltySparseCategoricalCrossentropy(nn.Module):
  """Sparse categorical cross-entropy with a penalty term for non-positive samples.

  Samples whose integer label appears in `p_indices` contribute the ordinary
  categorical cross-entropy. Every other sample contributes `alpha` times a
  penalty loss computed against a multi-hot vector that marks the `p_indices`
  classes.

  Args:
    class_num: length of the multi-hot penalty label.
    p_indices: class indices that are scored with cross-entropy.
    alpha: multiplier applied to the penalty term.
    penalty_scale: divisor applied inside the penalty loss; defaults to
      `len(p_indices)` when None.
    reduction: 'none', 'mean' or 'sum' (any other value raises KeyError).
    from_where: 'logits' or 'softmax', selecting which loss helpers are used
      (any other value raises KeyError).
    eps: clipping constant forwarded to the loss helpers.
    name: accepted but not used by this class.
  """

  def __init__(self, class_num:int, p_indices:list, alpha=1.0, penalty_scale=None, reduction='mean', \
         from_where='softmax', eps=1e-10, name='negative_penalty_sparse_categorical_crossentropy'):
    super().__init__()
    reducers = {
        'none': _no_reduction_over_batch,
        'mean': _average_over_batch,
        'sum': _sum_over_batch,
    }
    cce_variants = {
        'logits': _cce_loss_from_logits,
        'softmax': _cce_loss_from_softmax,
    }
    penalty_variants = {
        'logits': _penalty_loss_from_logits,
        'softmax': _penalty_loss_from_softmax,
    }

    # Each index is wrapped in its own list before being stored.
    self.p_indices = [[idx] for idx in p_indices]
    self.alpha = alpha
    if penalty_scale is None:
      penalty_scale = float(len(p_indices))
    self.penalty_scale = penalty_scale
    self.penalty_label = _get_penalty_label(class_num, p_indices)
    self.reduction_fn = reducers[reduction]
    self.cce_loss_fn = cce_variants[from_where]
    self.penalty_loss_fn = penalty_variants[from_where]
    self.eps = eps

  def forward(self, y_pred, y_true):
    """Return the reduced loss for predictions `y_pred` and integer labels `y_true`.

    Note the argument order: predictions first, labels second. The labels are
    one-hot encoded to the width of `y_pred`'s last dimension.
    """
    one_hot_targets = F.one_hot(y_true, num_classes=y_pred.shape[-1]).float()
    per_sample = _get_losses(
        one_hot_targets, y_pred, self.p_indices, self.penalty_label, self.alpha,
        self.penalty_scale, self.eps, self.cce_loss_fn, self.penalty_loss_fn
    )
    return self.reduction_fn(per_sample)


def _get_losses(y_true, y_pred, p_indices:list, penalty_label:list, alpha:float, penalty_scale:float,
         eps:float, cce_loss_fn, penalty_loss_fn):
  batch_size = y_true.shape[0]
  # cce_loss_sample_weights
  cce_loss_sample_weights = torch.any(
      torch.transpose(torch.eq(torch.tensor(p_indices), torch.argmax(y_true, dim=-1)), 0, 1), dim=-1
  ).float()
  # cce loss
  cce_losses = cce_loss_fn(y_pred, y_true, eps)
  cce_losses = cce_loss_sample_weights * cce_losses
  # y_penalty
  y_penalty = torch.repeat_interleave(torch.unsqueeze(torch.tensor(penalty_label), dim=0), batch_size, dim=0).float()
  # penalty_loss_sample_weights
  penalty_loss_sample_weights = 1.0 - cce_loss_sample_weights
  # penalty loss
  penalty_losses = penalty_loss_fn(y_pred, y_penalty, penalty_scale, eps)
  penalty_losses = penalty_loss_sample_weights * penalty_losses
  # total loss
  losses = cce_losses + alpha * penalty_losses
  return losses


def _no_reduction_over_batch(losses):
  return losses


def _average_over_batch(losses):
  return torch.mean(losses)


def _sum_over_batch(losses):
  return torch.sum(losses)


def _cce_loss_from_logits(y_pred, y_true, eps):
  return F.cross_entropy(y_pred, y_true, reduction='none')


def _cce_loss_from_softmax(y_pred, y_true, eps):
  return torch.sum(-y_true * torch.log(torch.clip(y_pred, eps, 1.0 - eps)), dim=-1)


def _penalty_loss_from_logits(y_pred, y_penalty, penalty_scale, eps):
  return F.cross_entropy(1.0 - y_pred, y_penalty, reduction='none') / penalty_scale


def _penalty_loss_from_softmax(y_pred, y_penalty, penalty_scale, eps):
  return torch.sum(
      -y_penalty * torch.log(torch.clip(1.0 - y_pred, eps, 1.0 - eps)), dim=-1
  ) / penalty_scale


def _get_penalty_label(class_num:int, p_indices:list):
  penalty_label = [1 if i in p_indices else 0 for i in range(0, class_num)]
  return penalty_label

In [48]:
class CustomCrossEntropyLoss(nn.Module):
  """Mean categorical cross-entropy for models that output probabilities.

  Integer targets are one-hot encoded to the width of the model output, and
  the probabilities are clipped to [eps, 1 - eps] before the logarithm.
  """

  def __init__(self, eps=1e-10):
    super().__init__()
    self.eps = eps

  def forward(self, output, target):
    """Return the batch-mean cross-entropy of probabilities `output` against integer labels `target`."""
    one_hot = F.one_hot(target, num_classes=output.shape[-1]).float()
    safe_probs = torch.clamp(output, min=self.eps, max=1.0 - self.eps)
    per_sample = (-one_hot * safe_probs.log()).sum(dim=-1)
    return per_sample.mean()

In [39]:
def train(network, train_loader, optimizer, criterion, epoch, train_losses, train_counter,
          log_interval=1000, save_dir='results'):
  """Run one training epoch, logging and checkpointing every `log_interval` batches.

  Args:
    network: model to optimise; switched to training mode.
    train_loader: iterable of `(data, target)` batches exposing `.dataset`.
    optimizer: optimizer stepping `network`'s parameters.
    criterion: callable `(output, target) -> scalar loss`.
    epoch: 1-based epoch number, used for logging and the sample counter.
    train_losses: list that receives the logged loss values.
    train_counter: list that receives the number of training samples seen
      (across all epochs) at each logged step.
    log_interval: log/checkpoint whenever `batch_idx % log_interval == 0`.
    save_dir: directory for `model.pth` / `optimizer.pth`; created if missing.
  """
  network.train()
  dataset_size = len(train_loader.dataset)
  num_batches = len(train_loader)
  # Samples consumed before the current batch. Tracking the real count keeps
  # the counter correct for any batch size instead of assuming a fixed one.
  samples_seen = 0
  for batch_idx, (data, target) in enumerate(train_loader):
    optimizer.zero_grad()
    output = network(data)
    loss = criterion(output, target)
    loss.backward()
    optimizer.step()
    if batch_idx % log_interval == 0:
      loss_value = loss.item()
      print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
        epoch, samples_seen, dataset_size,
        100. * batch_idx / num_batches, loss_value))
      train_losses.append(loss_value)
      train_counter.append(samples_seen + (epoch - 1) * dataset_size)
      os.makedirs(save_dir, exist_ok=True)
      torch.save(network.state_dict(), os.path.join(save_dir, 'model.pth'))
      torch.save(optimizer.state_dict(), os.path.join(save_dir, 'optimizer.pth'))
    samples_seen += len(data)

def test(network, test_loader, criterion, test_losses):
  network.eval()
  test_loss = 0
  correct = 0
  with torch.no_grad():
    for data, target in test_loader:
      output = network(data)
      test_loss += criterion(output, target).item()
      pred = output.data.max(1, keepdim=True)[1]
      correct += pred.eq(target.data.view_as(pred)).sum()
  test_loss /= len(test_loader.dataset)
  test_losses.append(test_loss)
  print(
    '\nTest set: Avg. loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\n'.format(
    test_loss, correct, len(test_loader.dataset), 100. * correct / len(test_loader.dataset))
  )

In [40]:
# Define the model
class Net(nn.Module):
  """Small CNN for 1-channel 28x28 images with a 20-unit output layer.

  Two 5x5 convolutions (each followed by 2x2 max-pooling and ReLU) feed two
  fully connected layers. Dropout is applied after the second convolution
  and after the first fully connected layer.

  Args:
    from_where: 'logits' to return the raw output of the last layer, or
      'softmax' to return probabilities over the last dimension.

  Raises:
    ValueError: if `from_where` is neither 'logits' nor 'softmax'.
  """

  _OUTPUT_MODES = ('logits', 'softmax')

  def __init__(self, from_where:str):
    super(Net, self).__init__()
    # Fail early: an unknown mode would otherwise make forward() return None.
    if from_where not in self._OUTPUT_MODES:
      raise ValueError(
        "from_where must be one of {}, got {!r}".format(self._OUTPUT_MODES, from_where))
    self.conv1 = nn.Conv2d(1, 10, kernel_size=5)
    self.conv2 = nn.Conv2d(10, 20, kernel_size=5)
    self.conv2_drop = nn.Dropout2d()
    self.fc1 = nn.Linear(320, 50)
    self.fc2 = nn.Linear(50, 20)
    self.from_where = from_where

  def forward(self, x):
    """Return logits or probabilities of shape (batch, 20), depending on `from_where`."""
    x = F.relu(F.max_pool2d(self.conv1(x), 2))
    x = F.relu(F.max_pool2d(self.conv2_drop(self.conv2(x)), 2))
    # 20 channels * 4 * 4 spatial positions for a 28x28 input.
    x = x.view(-1, 320)
    x = F.relu(self.fc1(x))
    x = F.dropout(x, training=self.training)
    x = self.fc2(x)
    if self.from_where == 'logits':
      return x
    if self.from_where == 'softmax':
      return F.softmax(x, dim=-1)
    # Only reachable if `from_where` was changed after construction.
    raise ValueError(
      "from_where must be one of {}, got {!r}".format(self._OUTPUT_MODES, self.from_where))

In [41]:
# Load the MNIST dataset
DATA_ROOT = '/files/'
BATCH_SIZE = 32

# Both splits share one preprocessing pipeline: convert to a tensor, then
# normalise with mean 0.1307 and std 0.3081 (presumably the MNIST training-set
# statistics -- TODO confirm).
mnist_transform = torchvision.transforms.Compose(
  [
    torchvision.transforms.ToTensor(),
    torchvision.transforms.Normalize((0.1307,), (0.3081,))
  ]
)

train_loader = torch.utils.data.DataLoader(
  torchvision.datasets.MNIST(
    DATA_ROOT, train=True, download=True, transform=mnist_transform
  ),
  batch_size=BATCH_SIZE,
  shuffle=True
)

# Evaluation does not depend on sample order, so the test split is not shuffled.
test_loader = torch.utils.data.DataLoader(
  torchvision.datasets.MNIST(
    DATA_ROOT, train=False, download=True, transform=mnist_transform
  ),
  batch_size=BATCH_SIZE,
  shuffle=False
)

# Train the model on the MNIST dataset with the standard categorical cross-entropy loss

1. from_where = 'logits'

In [42]:
# Baseline: raw logits scored with PyTorch's built-in cross-entropy.
network = Net('logits')
optimizer = optim.SGD(network.parameters(), lr=0.01, momentum=0.5)
criterion = nn.CrossEntropyLoss()
# Training loop
n_epochs = 3
train_losses = []
train_counter = []
test_losses = []
test_counter = [i*len(train_loader.dataset) for i in range(n_epochs + 1)]
# Start from an empty checkpoint directory. shutil/os are used instead of
# shelling out to `rm -r`, which is not portable and whose failure went unnoticed.
if os.path.isdir('results'):
  shutil.rmtree('results')
os.makedirs('results', exist_ok=True)
for epoch in range(1, n_epochs + 1):
  train(network, train_loader, optimizer, criterion, epoch, train_losses, train_counter)
  test(network, test_loader, criterion, test_losses)


Test set: Avg. loss: 0.0044, Accuracy: 9596/10000 (96%)


Test set: Avg. loss: 0.0027, Accuracy: 9723/10000 (97%)


Test set: Avg. loss: 0.0021, Accuracy: 9785/10000 (98%)



2. from_where = 'softmax'

In [49]:
# Baseline: softmax probabilities scored with the hand-written cross-entropy.
network = Net('softmax')
optimizer = optim.SGD(network.parameters(), lr=0.01, momentum=0.5)
criterion = CustomCrossEntropyLoss()
# Training loop
n_epochs = 3
train_losses = []
train_counter = []
test_losses = []
test_counter = [i*len(train_loader.dataset) for i in range(n_epochs + 1)]
# Start from an empty checkpoint directory. shutil/os are used instead of
# shelling out to `rm -r`, which is not portable and whose failure went unnoticed.
if os.path.isdir('results'):
  shutil.rmtree('results')
os.makedirs('results', exist_ok=True)
for epoch in range(1, n_epochs + 1):
  train(network, train_loader, optimizer, criterion, epoch, train_losses, train_counter)
  test(network, test_loader, criterion, test_losses)


Test set: Avg. loss: 0.0043, Accuracy: 9577/10000 (96%)


Test set: Avg. loss: 0.0029, Accuracy: 9720/10000 (97%)


Test set: Avg. loss: 0.0022, Accuracy: 9781/10000 (98%)



# Train model on mnist dataset with the proposed 'NegativePenaltySparseCategoricalCrossentropy' loss function

1. from_where = 'logits'

In [43]:
# Proposed loss on raw logits. class_num=20 matches the 20-unit output layer
# of Net; digits 0-7 are the positive classes, so 8 and 9 only receive the
# penalty term.
network = Net('logits')
optimizer = optim.SGD(network.parameters(), lr=0.01, momentum=0.5)
criterion = NegativePenaltySparseCategoricalCrossentropy(
  class_num=20, p_indices=[0, 1, 2, 3, 4, 5, 6, 7], from_where='logits'
)
# Training loop
n_epochs = 3
train_losses = []
train_counter = []
test_losses = []
test_counter = [i*len(train_loader.dataset) for i in range(n_epochs + 1)]
# Start from an empty checkpoint directory. shutil/os are used instead of
# shelling out to `rm -r`, which is not portable and whose failure went unnoticed.
if os.path.isdir('results'):
  shutil.rmtree('results')
os.makedirs('results', exist_ok=True)
for epoch in range(1, n_epochs + 1):
  train(network, train_loader, optimizer, criterion, epoch, train_losses, train_counter)
  test(network, test_loader, criterion, test_losses)


Test set: Avg. loss: 0.0193, Accuracy: 7805/10000 (78%)


Test set: Avg. loss: 0.0169, Accuracy: 7867/10000 (79%)


Test set: Avg. loss: 0.0159, Accuracy: 7895/10000 (79%)



2. from_where = 'softmax'

In [44]:
# Proposed loss on softmax probabilities. class_num=20 matches the 20-unit
# output layer of Net; digits 0-7 are the positive classes, so 8 and 9 only
# receive the penalty term.
network = Net('softmax')
optimizer = optim.SGD(network.parameters(), lr=0.01, momentum=0.5)
criterion = NegativePenaltySparseCategoricalCrossentropy(
  class_num=20, p_indices=[0, 1, 2, 3, 4, 5, 6, 7], from_where='softmax'
)
# Training loop
n_epochs = 3
train_losses = []
train_counter = []
test_losses = []
test_counter = [i*len(train_loader.dataset) for i in range(n_epochs + 1)]
# Start from an empty checkpoint directory. shutil/os are used instead of
# shelling out to `rm -r`, which is not portable and whose failure went unnoticed.
if os.path.isdir('results'):
  shutil.rmtree('results')
os.makedirs('results', exist_ok=True)
for epoch in range(1, n_epochs + 1):
  train(network, train_loader, optimizer, criterion, epoch, train_losses, train_counter)
  test(network, test_loader, criterion, test_losses)


Test set: Avg. loss: 0.0037, Accuracy: 7774/10000 (78%)


Test set: Avg. loss: 0.0028, Accuracy: 7836/10000 (78%)


Test set: Avg. loss: 0.0025, Accuracy: 7874/10000 (79%)



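# Conclusion: accuracy dropped to approximately 80% (79%). Digits 8 and 9 are not in `p_indices`, so the model is never trained to predict them and only receives the penalty term for those samples; since they make up roughly 20% of the MNIST test set, this drop indicates that the proposed 'NegativePenaltySparseCategoricalCrossentropy' loss function worked as expected.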