In [6]:
import numpy as np

import torch
import torch.nn.functional as F
import torchvision
import torchvision.transforms as transforms

from PIL import Image
import cv2

from transformers import ResNetForImageClassification

In [7]:
#load pretrained resnet model
model = ResNetForImageClassification.from_pretrained("microsoft/resnet-50")

#define transforms to preprocess input image into format expected by model
normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                     std=[0.229, 0.224, 0.225])
#inverse transform to get normalize image back to original form for visualization
inv_normalize = transforms.Normalize(
    mean=[-0.485/0.229, -0.456/0.224, -0.406/0.255],
    std=[1/0.229, 1/0.224, 1/0.255]
)

#transforms to resize image to the size expected by pretrained model,
#convert PIL image to tensor, and
#normalize the image
transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    normalize,          
])

config.json:   0%|          | 0.00/69.6k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/102M [00:00<?, ?B/s]

In [11]:
readImg = './eagle.jpg'
img0 = Image.open(readImg).convert("RGB")

In [12]:
activation = {}
def getActivation(name):
    # the hook signature
    def hook(module, input, output):
        activation[name] = output.detach().cpu().numpy()
    return hook

In [13]:
model.resnet.encoder.stages[-1].register_forward_hook(getActivation('last_stage'))

<torch.utils.hooks.RemovableHandle at 0x720df96fa8a0>

In [15]:
params = list(model.classifier[1].parameters())

weight = np.squeeze(params[0].data.numpy())

In [16]:
def return_CAM(feature_conv, weight, class_idx):
    # generate the class -activation maps upsample to 256x256
    size_upsample = (256, 256)
    bz, nc, h, w = feature_conv.shape
    output_cam = []
    for idx in class_idx:
        beforeDot =  feature_conv.reshape((nc, h*w))
        cam = np.matmul(weight[idx], beforeDot)
        cam = cam.reshape(h, w)
        cam = cam - np.min(cam)
        cam_img = cam / np.max(cam)
        cam_img = np.uint8(255 * cam_img)
        output_cam.append(cv2.resize(cam_img, size_upsample))
    return output_cam

In [17]:
model.eval()
logit = model(transform(img0).unsqueeze(0)).logits

In [18]:
h_x = F.softmax(logit, dim=1).data.squeeze()

probs, idx = h_x.sort(0, True)
probs = probs.detach().numpy()
idx = idx.numpy()

In [19]:
CAMs = return_CAM(activation['last_stage'], weight, [idx[0]])

In [20]:
img = cv2.imread(readImg)
height, width, _ = img.shape
heatmap = cv2.applyColorMap(cv2.resize(CAMs[0],(width, height)), cv2.COLORMAP_JET)
result = heatmap * 0.5 + img * 0.5

cv2.imwrite("image_1.jpg", result)

[ WARN:0@743.390] global loadsave.cpp:848 imwrite_ Unsupported depth image for selected encoder is fallbacked to CV_8U.


True

In [21]:
model.config.id2label[idx[0]]

'bald eagle, American eagle, Haliaeetus leucocephalus'