# PAAI21

Image classification can be successfully solved by modern CNNs. However, there is still
a plethora of open questions regarding how those models manage to extract and model general
features for a large number of classes. A straightforward strategy is to focus on saliency, i.e.,
the area of the image that has a maximal response w.r.t. the predicted class. The aim of
this project is to complement that idea by estimating the number of salient regions that
common image classifiers react to and by measuring the consistency of those regions
with respect to the predicted class.

## Resnet18 - Training

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import random
import skimage.transform
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torchvision

from keras.optimizers import SGD
from matplotlib.pyplot import imshow
from PIL import Image
from torch.autograd import Variable
from torch import topk
from torchvision import models, datasets, transforms

In [None]:
# --------------------------------------
# Global parameters and shared variables
# --------------------------------------

# Human-readable CIFAR-10 class names; a label value is used as an index
# into this tuple
classes = (
    'plane', 'auto', 'bird', 'cat', 'deer',
    'dog', 'frog', 'horse', 'ship', 'truck',
)

# CIFAR-10 parameters
input_size, num_classes = 32, 10

# Allocate tensors on the GPU when one is available, otherwise on the CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

print(device)

In [None]:
# Per-channel normalisation shared by the training and test pipelines
normalize = transforms.Normalize(
    mean=(0.4914, 0.4822, 0.4465),
    std=(0.2023, 0.1994, 0.2010),
)

# Test pipeline: tensor conversion followed by normalisation only
test_transform = transforms.Compose([transforms.ToTensor(), normalize])

# Training pipeline: random padded crop and random horizontal flip are
# applied before tensor conversion and normalisation
train_transform = transforms.Compose([
    transforms.RandomCrop(32, padding=4),
    transforms.RandomHorizontalFlip(),
    transforms.ToTensor(),
    normalize,
])

# Download & transform the CIFAR-10 splits
train_dataset = datasets.CIFAR10(
    root="./data", train=True, transform=train_transform, download=True,
)
test_dataset = datasets.CIFAR10(
    root="./data", train=False, transform=test_transform, download=True,
)

# Batch iterators: shuffled for training, fixed order for testing
train_loader = torch.utils.data.DataLoader(
    dataset=train_dataset, batch_size=128, shuffle=True,
)
test_loader = torch.utils.data.DataLoader(
    dataset=test_dataset, batch_size=5000, shuffle=False,
)

In [None]:
class Model(torch.nn.Module):
    """ResNet-18 classifier with a 10-way output that returns log-probabilities.

    The torchvision ResNet-18 is built without pretrained weights; its first
    convolution is replaced by a 3x3, stride-1, bias-free convolution and its
    max-pooling layer by an identity, so neither of them downsamples the
    input.
    """

    def __init__(self):
        super().__init__()

        backbone = models.resnet18(pretrained=False, num_classes=10)
        backbone.conv1 = nn.Conv2d(
            in_channels=3,
            out_channels=64,
            kernel_size=3,
            stride=1,
            padding=1,
            bias=False,
        )
        backbone.maxpool = nn.Identity()
        self.resnet = backbone

    def forward(self, x):
        """Return the log-softmax (over dim 1) of the backbone output for ``x``."""
        logits = self.resnet(x)
        return F.log_softmax(logits, dim=1)

In [None]:
def train_model(model, train_loader, optimizer, epoch, verbose=False):
    """Train ``model`` for one pass over ``train_loader``.

    Args:
        model: Network whose output is passed to ``F.nll_loss``, i.e. it is
            expected to return log-probabilities.
        train_loader: Iterable yielding ``(data, target)`` batches.
        optimizer: Optimizer that updates the parameters of ``model``.
        epoch: Epoch number; only used in the verbose message.
        verbose: When true, print the average loss once the pass is finished.

    Returns:
        The mean of the per-batch losses, as a Python float.

    Raises:
        ValueError: If ``train_loader`` yields no batches.
    """
    model.train()

    # Run on whatever device the model already lives on instead of assuming
    # CUDA, so the same code also works on CPU-only machines.
    first_param = next(model.parameters(), None)
    model_device = first_param.device if first_param is not None else None

    batch_losses = []
    for data, target in train_loader:
        if model_device is not None:
            data = data.to(model_device)
            target = target.to(model_device)

        optimizer.zero_grad()
        prediction = model(data)
        loss = F.nll_loss(prediction, target)

        loss.backward()
        optimizer.step()
        batch_losses.append(loss.item())

    if not batch_losses:
        raise ValueError("train_loader produced no batches")

    # Averaged (and reported) once after the loop: doing it per batch re-sums
    # the whole list every iteration and prints one line per batch.
    avg_loss = sum(batch_losses) / len(batch_losses)
    if verbose:
        print("Training set: Epoch: {} Average Loss: {:.2f}".format(epoch, avg_loss))

    return avg_loss


def test_model(model, test_loader, verbose=False):
    """Evaluate ``model`` on every batch of ``test_loader``.

    Args:
        model: Network whose output is passed to ``F.nll_loss``, i.e. it is
            expected to return log-probabilities.
        test_loader: Iterable yielding ``(data, target)`` batches that also
            exposes the evaluated dataset as ``test_loader.dataset``.
        verbose: When true, print the average loss and the accuracy.

    Returns:
        A ``(loss, percentage_correct)`` tuple: the summed negative
        log-likelihood divided by ``len(test_loader.dataset)`` (a 0-dim
        tensor once at least one batch was processed) and the percentage of
        correctly classified samples.

    Raises:
        ValueError: If ``test_loader.dataset`` is empty.
    """
    model.eval()

    num_samples = len(test_loader.dataset)
    if num_samples == 0:
        raise ValueError("test_loader.dataset is empty")

    # Evaluate on the device the model already lives on instead of assuming
    # CUDA, so the same code also works on CPU-only machines.
    first_param = next(model.parameters(), None)
    model_device = first_param.device if first_param is not None else None

    loss = 0
    correct = 0
    with torch.no_grad():
        for data, target in test_loader:
            if model_device is not None:
                data = data.to(model_device)
                target = target.to(model_device)

            prediction = model(data)
            loss = loss + F.nll_loss(prediction, target, reduction="sum")

            # Index of the highest log-probability is the predicted class
            predicted = prediction.max(1)[1]
            correct += predicted.eq(target.view_as(predicted)).sum().item()

    # Normalise only once all batches are accumulated; dividing inside the
    # loop would shrink the running sum again on every batch.
    loss = loss / num_samples
    percentage_correct = 100.0 * correct / num_samples

    if verbose:
        print("Testing set: Average loss: {:.4f}, Accuracy: {}/{} ({:.2f}%)".format(
            loss, correct, num_samples, percentage_correct))

    return loss, percentage_correct


# Despite the ``test_`` prefix this is an evaluation helper, not a unit test;
# the flag keeps pytest from trying to collect it.
test_model.__test__ = False

In [None]:
# Hyperparameters
epochs = 50
lr = 0.1

# Epochs at which the scheduler multiplies the learning rate by gamma
milestones = [25, 40]

# Put the model on the device chosen at the top of the notebook (GPU when
# available, CPU otherwise) instead of unconditionally requiring CUDA.
model = Model()
model = model.to(device)

optimizer = torch.optim.SGD(
    model.parameters(), lr=lr, momentum=0.9, weight_decay=5e-4
)
scheduler = torch.optim.lr_scheduler.MultiStepLR(
    optimizer, milestones=milestones, gamma=0.1
)

print("Start train/test resnet18!")
for epoch in range(1, epochs + 1):
    # One training pass followed by an evaluation on the test split
    avg_loss = train_model(model, train_loader, optimizer, epoch)
    loss, percentage_correct = test_model(model, test_loader)
    print("Epoch: {} Training: Loss: {:.2f} - Testing: Loss: {:.2f} Accuracy {:.2f}%".format(
        epoch, avg_loss, loss, percentage_correct))

    # Advance the learning-rate schedule once per epoch
    scheduler.step()

# Persist only the learned weights
torch.save(model.state_dict(), "PAAI21_CIFAR10_model.pt")

# Class activation maps

## Terminology
- **Receptive field:** The receptive field of a neuron consists of all
  pixels of the input **X** that influence it.

- **Convolutional units:** A convolutional layer contains units whose receptive
  fields cover a patch of the previous layer[1].

- **Softmax:** A function that turns a vector of output scores into a probability
  distribution.

- **Global average pooling (GAP):** An operation that takes the
  average of each feature map; the resulting vector is then fed to the
  softmax layer[2].

## Summary

1. The convolutional units of several CNN layers behave as object detectors
  even when the object location is not given, so they have the ability to localize
  objects.

2. This ability is lost when fully-connected layers are used for classification.

3. We can replace fully-connected layers by GAP.

4. There is no parameter to optimize in the global average pooling,
  thus overfitting is avoided at this layer. So GAP acts as a regularizer.

5. We can modify GAP and use it in combination with a technique called
  **class activation mapping (CAM)** to retain this localization ability
  until the final layer.

6. Therefore a CNN trained on object categorization is successfully able to
  localize the discriminative regions for action classification.

7. A **class activation map (CAM)** for a particular category indicates the
  discriminative image regions used by the CNN to identify that category.

8. We can identify the importance of the image regions by projecting back
  the weights of the output layer onto the convolutional feature maps, a
  technique called **class activation mapping**.

9. Normally, we perform GAP on the convolutional feature maps and its
  output feeds a fully-connected layer that produces the final output, so a
  weighted sum of the GAP output is used to generate the final output
  (e.g. the category of something).
 
10. We can identify the importance of the image regions by projecting back
   the weights of the output layer on to the convolutional feature maps.
 
## CAM description

$M_c(x, y) = \sum_k w_k^c f_k(x,y)$
<br><br>
Where:<br>
$M_c$ is the class activation map for a class $c$<br>
$f_k(x,y)$ is the activation of unit $k$ in the last convolutional layer at spatial location $(x,y)$<br>
$F^k$ is the output of GAP on $f_k(x,y)$, i.e. $F^k=\sum_{x,y} f_k(x,y)$ (the constant averaging factor is omitted)<br>
$w_k^c$ indicates the importance of $F^k$ for class $c$


## References:
[1] https://en.wikipedia.org/wiki/Convolutional_neural_network

[2] https://arxiv.org/pdf/1312.4400v3.pdf



In [None]:
# Load the trained weights onto the device chosen at the top of the notebook
model = Model()
model.to(device)
# map_location remaps the stored tensors to that device, so the checkpoint
# can also be loaded on a machine without CUDA.
model.load_state_dict(
    torch.load("../input/paaimodel/PAAI21_CIFAR10_model.pt", map_location=device)
)
model.eval()

In [None]:
class LayerFeatures():
    """Record the output of a module by means of a forward hook.

    Every forward pass of the hooked module stores that module's output in
    ``features`` as a NumPy array. ``remove`` unregisters the hook.
    """

    # Most recently captured output; None until the hooked module has run
    features = None

    def __init__(self, m):
        self.hook = m.register_forward_hook(self.hook_fn)

    def hook_fn(self, module, input, output):
        # Drop the autograd history and move to host memory before converting
        captured = output.detach().cpu()
        self.features = captured.numpy()

    def remove(self):
        self.hook.remove()


def compute_CAM(feature_conv, class_weights):
    """Compute the class activation map of the first sample in a batch.

    The map is the weighted sum over channels of the activation maps,
    ``sum_k class_weights[k] * feature_conv[0, k]``, rescaled to [0, 1].

    Args:
        feature_conv: Array of activations indexed as
            ``[sample, channel, row, column]``; only sample 0 is used.
        class_weights: One weight per channel.

    Returns:
        A float64 array with the spatial shape of one activation map. A map
        that is constant before rescaling is returned as all zeros instead
        of NaN.
    """
    activations = np.asarray(feature_conv[0], dtype=np.float64)
    weights = np.asarray(class_weights, dtype=np.float64).reshape(-1)

    # Weighted sum over the channel axis in a single vectorised step
    cam = (weights[:, None, None] * activations).sum(axis=0)

    # Normalize the CAM to the [0, 1] range; skip the division for a flat
    # map, whose peak is 0 after the shift
    cam = cam - cam.min()
    peak = cam.max()
    if peak > 0:
        cam = cam / peak

    return cam


def get_one_random_sample(test_dataset):
    """Pick one ``(image, label)`` pair uniformly at random from a dataset.

    Args:
        test_dataset: Object exposing ``data`` and ``targets`` sequences
            that are indexed by the same sample position.

    Returns:
        A ``(img, label)`` tuple taken from ``test_dataset.data`` and
        ``test_dataset.targets`` at the same random index.

    Raises:
        ValueError: If the dataset is empty (raised by ``random.randrange``).
    """
    num_total_imgs = len(test_dataset.data)
    # randrange draws from [0, num_total_imgs). randint(1, num_total_imgs)
    # includes both ends, so it could never select sample 0 and could index
    # one past the last sample.
    random_index = random.randrange(num_total_imgs)

    return test_dataset.data[random_index], test_dataset.targets[random_index]

# Compute CAM

In [None]:
display_transform = transforms.Compose([
    transforms.Resize((32, 32))])

image, label = get_one_random_sample(test_dataset)
tensor = test_transform(image)

model.eval()

# Build a one-sample batch on the device the model lives on (rather than
# assuming CUDA) and enable gradients with respect to the input.
model_device = next(model.parameters()).device
prediction_var = tensor.unsqueeze(0).to(model_device).requires_grad_(True)

# Capture the output of the last block of layer4 during the forward pass;
# the hook is removed even if the forward pass fails.
final_layer = model._modules.get("resnet").layer4[-1]
activated_features = LayerFeatures(final_layer)
try:
    prediction = model(prediction_var)
finally:
    activated_features.remove()
pred_probabilities = F.softmax(prediction, dim=1).detach().squeeze()

# Identify the predicted class and its probability
value, index = topk(pred_probabilities, 1)
class_id = index.int()

# Weights of the final fully-connected layer for the predicted class
weight_softmax_params = list(model._modules.get('resnet').fc.parameters())
weight_softmax = np.squeeze(weight_softmax_params[0].detach().cpu().numpy())
# Index with a plain int so NumPy selects exactly one row of weights
class_weights = weight_softmax[class_id.item()]
img_cam = compute_CAM(activated_features.features, class_weights)

# The CAM has the spatial size of the captured feature maps, not of the
# image: resize it (interpolating the values) to the image resolution.
resized_cam = skimage.transform.resize(img_cam, tensor.shape[1:3])

# Visualize results
print("IMAGE CLASS: ", classes[label])
print("PREDICTION: ", classes[index.tolist()[0]])
# NOTE: the value printed here is the top-1 softmax probability
print("ACCURACY: {:.2f}%".format(value.tolist()[0] * 100))
print("Shape:", tensor.shape[1:3])

# Plot the image, the raw CAM and the resized CAM overlaid on the image.
# plt.subplots creates its own figure, so no separate plt.figure() is needed.
f, ax = plt.subplots(nrows=1, ncols=3, figsize=(16, 16))
ax[0].imshow(image)
ax[1].imshow(img_cam, alpha=0.5, cmap='jet')
ax[2].imshow(image)
ax[2].imshow(resized_cam, alpha=0.5, cmap='jet')