In [None]:
# Enable IPython's autoreload extension in mode 2 so that edits to imported
# modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2


In [None]:
from torchvision.models import *
from visualisation.core.utils import device

# Instantiate AlexNet with pretrained weights, move it to the selected device
# and print its layer structure.
model = alexnet(pretrained=True)
model = model.to(device)
print(model)

Now we need some inputs

In [None]:
# %matplotlib notebook

Now we need some input images. We are going to use three pictures: a cat, the beautiful Basilica di San Pietro, and an image with a dog and a cat.

In [None]:
import glob
import matplotlib.pyplot as plt
import numpy as np
import torch
from utils import *
from PIL import Image

# Default figure size for the plots that follow, as (width, height).
plt.rcParams["figure.figsize"] = (16, 8)

We loaded a few packages. In `utils` there are several utility functions to create the plots.

In [None]:
import glob
import matplotlib.pyplot as plt
import numpy as np

from visualisation.core.utils import device
from PIL import Image

# Collect every file under ./images and open each one with PIL.
# NOTE(review): glob.glob returns paths in arbitrary order, while the titles
# below assume the order cat, san pietro, dog_cat - confirm that the file
# names on disk actually produce that order.
image_paths = glob.glob("./images/*.*")
images = [Image.open(path) for path in image_paths]

subplot(
    images,
    title="inputs",
    rows_titles=["cat", "san pietro", "dog_cat"],
    nrows=1,
    ncols=3,
)

In [None]:
from torchvision.transforms import ToTensor, Resize, Compose, ToPILImage
from visualisation.core import *
from visualisation.core.utils import image_net_preprocessing

# Shared preprocessing pipeline: resize to 224x224, convert to a tensor, then
# apply the project's `image_net_preprocessing` transform.
preprocess = Compose([Resize((224, 224)), ToTensor(), image_net_preprocessing])

# One tensor per image, with a leading batch dimension added by unsqueeze(0),
# placed on `device`.
inputs = [preprocess(img).unsqueeze(0).to(device) for img in images]

We also define a utility function to clear the GPU cache.

In [None]:
def free(list_models: list):
    """Drop the references held by `list_models` and release cached GPU memory.

    Args:
        list_models: the models to release. When it is a real ``list`` it is
            emptied in place so that it no longer keeps the models alive. Any
            other iterable (e.g. a generator) is left untouched.

    Returns:
        None.

    Note:
        A model is only actually freed once nothing else references it (for
        example another variable, or a visualiser object built on top of it).
    """
    import gc

    # `del m` inside a for-loop only unbinds the loop variable; the list (and
    # therefore the models) would stay alive. Emptying the container is what
    # really removes the references this function can control.
    if isinstance(list_models, list):
        list_models.clear()

    # Collect unreachable objects first so their CUDA blocks return to the
    # caching allocator, then hand the cached blocks back to the driver.
    gc.collect()
    torch.cuda.empty_cache()

In [None]:
# Build the `Weights` visualiser for `model`; the following cells pass it to
# `run_vis_plot`.
vis = Weights(model, device)

In [None]:
# NOTE(review): `model_traced` is not assigned in any visible cell. It must be
# created before this cell runs (presumably by tracing `model`) - confirm that
# the notebook survives Restart & Run All.
first_layer = model_traced[0]

# Square canvas for the 4x4 grid requested below.
plt.rcParams["figure.figsize"] = 16, 16

run_vis_plot(vis, inputs[0], first_layer, ncols=4, nrows=4)

In [None]:
# Entry 2 of `model_traced`; the variable name suggests it is the first
# max-pool layer - TODO confirm against the printed model.
first_maxpool_layer = model_traced[2]
run_vis_plot(vis, inputs[0], first_maxpool_layer, ncols=4, nrows=4)

Let's try with another input, the San Pietro Basilica.

In [None]:
# Same layer as in the previous cell, now visualised for the second input.
run_vis_plot(vis, inputs[1], first_maxpool_layer, ncols=4, nrows=4)

By looking at them, these images somehow make sense; they highlight the basilica layout, but it is hard to understand what the model is actually doing. We get the idea that it is computing something correctly, but we could ask some questions, for example: is it looking at the cupola? Which are the most important features of the Basilica?

Moreover, the deeper we go the harder it becomes to even recognize the input. 

In [None]:
# Entry 6 of `model_traced`, i.e. further along than the layers used above,
# visualised for the second input.
deeper_layer = model_traced[6]
run_vis_plot(vis, inputs[1], deeper_layer, ncols=4, nrows=4)

In [None]:
# Keep a handle on the pretrained AlexNet instead of discarding it: building
# the model only to throw it away wastes the work, and a later cell calls
# `alexnet_pretrained.eval()` and builds a GradCam on it.
alexnet_pretrained = alexnet(pretrained=True).to(device)

# NOTE(review): `modules` is not assigned before this cell in the visible
# notebook; it is expected to be an iterable of models - confirm that this
# cell works on a fresh kernel.
run_vis_plot_across_models(
    modules,
    inputs[0],
    None,
    GradCam,
    "Gradcam",
    device,
    nrows=1,
    ncols=4,
    target_class=None,
    postprocessing=image_net_postprocessing,
)

# NOTE(review): `alexnet` is the model constructor, not a model instance, so
# this call releases no model; in practice it only flushes the CUDA cache.
free([alexnet])

In [None]:
from visualisation.core.utils import imshow

# NOTE(review): `module` is not assigned in any visible cell; the plot title
# below suggests a resnet34 instance - confirm.
module = module.to(device)

vis = GradCam(module, device)

# One Grad-CAM per target: None plus two explicit class indices.
# NOTE(review): presumably None lets GradCam choose the class itself - confirm.
classes = [None, 285, 453]
outs = [
    vis(inputs[0], None, postprocessing=image_net_postprocessing, target_class=c) for c in classes
]

# NOTE(review): this rebinds `images` (until now the PIL inputs) and `classes`.
# Re-running an earlier cell that reads `images` after this one will see the
# Grad-CAM outputs instead of the input pictures.
images, classes = vis_outs2images_classes(outs)

subplot(images, title="resnet34", rows_titles=classes, nrows=1, ncols=len(outs), parse=tensor2img)

Notice how similar they are to the `CAM` output. To better compare our three models, below we plot the Grad-CAM for each input with respect to each model.

In [None]:
# NOTE(review): `modules_instances` is not assigned in any visible cell; the
# `annotations` below imply the constructors alexnet, vgg16, resnet34 and
# resnet152 - confirm.
modules = (
    m(pretrained=True).to(device) for m in modules_instances
)  # lazy generator: a model is only built when it is consumed, so they are not all held in memory at once

# Grad-CAM for every input (`inputs=inputs`) and every model, with one
# annotation per model.
run_vis_plot_across_models(
    modules,
    inputs[0],
    None,
    GradCam,
    "Gradcam",
    device,
    nrows=4,
    ncols=3,
    target_class=None,
    inputs=inputs,
    idx2label=imagenet2human,
    annotations=["alexnet", "vgg16", "resnet34", "resnet152"],
    postprocessing=image_net_postprocessing,
)

# NOTE(review): if the call above consumed the generator completely, `free`
# has nothing left to iterate and only empties the CUDA cache.
free(modules)

The reader can immediately notice the difference across the models.

### Interesting region 
We talked before about localizing interesting regions. Grad-CAM can also be used to extract the class object from the image. Once we have the Grad-CAM image, we can simply use it as a mask to crop out of the input image what we want. The reader can play with the `TR` parameter to see different effects.

In [None]:
# Threshold applied by `gradcam2crop` to the min-max normalised Grad-CAM map:
# pixels whose value is not above TR are set to 0 in the crop.
TR = 0.3

In [None]:
# NOTE(review): relies on `alexnet_pretrained` having been created by an
# earlier cell - verify that it exists after Restart & Run All.
alexnet_pretrained.eval()

vis = GradCam(alexnet_pretrained, device)

# The return value is discarded; the next cell reads `vis.cam`, which this
# call presumably populates - confirm.
_ = vis(inputs[0], None, postprocessing=image_net_postprocessing)

In [None]:
import cv2


def gradcam2crop(cam, original_img, threshold=None):
    """Black out the parts of an image where the Grad-CAM activation is weak.

    Args:
        cam: CPU tensor holding the Grad-CAM map; it is converted with
            ``.numpy()`` and is not modified.
        original_img: batched, preprocessed input tensor. Only its first
            element is used. Assumed to be laid out as
            (batch, channels, height, width) - TODO confirm.
        threshold: cut-off applied to the min-max normalised map. Defaults to
            the notebook-level ``TR`` when omitted.

    Returns:
        A copy of the image produced by
        ``tensor2img(image_net_postprocessing(...))`` in which every pixel
        whose normalised activation is not above the threshold is set to 0.
    """
    if threshold is None:
        threshold = TR

    # Take the size from the image being cropped, not from the global `inputs`.
    _, _, height, width = original_img.shape

    # `.numpy()` shares memory with the tensor, so normalise out of place to
    # avoid silently rewriting the caller's Grad-CAM map.
    cam = cam.numpy()
    cam = cam - np.min(cam)
    peak = np.max(cam)
    if peak > 0:  # a constant map would otherwise be divided by zero
        cam = cam / peak

    # cv2.resize takes dsize as (width, height).
    cam = cv2.resize(cam, (width, height))
    mask = cam > threshold

    original_img = tensor2img(image_net_postprocessing(original_img[0].squeeze()))

    crop = original_img.copy()
    crop[~mask] = 0

    return crop


# Bring the Grad-CAM map and the first input to the CPU, crop the input with
# the map, and display the result.
cam_on_cpu = vis.cam.cpu()
image_on_cpu = inputs[0].cpu()
crop = gradcam2crop(cam_on_cpu, image_on_cpu)

fig = plt.figure()
plt.imshow(crop)

*et voilà*! We can also change the class again and crop the region of interest for that class.

In [None]:
# Run Grad-CAM again for target class 231; the return value is not needed,
# only the map read from `vis.cam` below.
_ = vis(inputs[0], None, target_class=231, postprocessing=image_net_postprocessing)

# Crop the first input with the new map and display it.
cam_on_cpu = vis.cam.cpu()
image_on_cpu = inputs[0].cpu()
crop = gradcam2crop(cam_on_cpu, image_on_cpu)

fig = plt.figure()
plt.imshow(crop)