In [1]:
import torch
from torchvision.models.detection import fasterrcnn_resnet50_fpn
from torchvision.transforms import functional as F
from PIL import Image
import warnings
warnings.filterwarnings("ignore")
from torchvision.models.detection.image_list import ImageList
import numpy as np
import pandas as pd
import os

In [2]:
model = fasterrcnn_resnet50_fpn(pretrained=True)
model.eval()

Downloading: "https://download.pytorch.org/models/fasterrcnn_resnet50_fpn_coco-258fb6c6.pth" to /root/.cache/torch/hub/checkpoints/fasterrcnn_resnet50_fpn_coco-258fb6c6.pth
100%|██████████| 160M/160M [00:00<00:00, 271MB/s]  


FasterRCNN(
  (transform): GeneralizedRCNNTransform(
      Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
      Resize(min_size=(800,), max_size=1333, mode='bilinear')
  )
  (backbone): BackboneWithFPN(
    (body): IntermediateLayerGetter(
      (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
      (bn1): FrozenBatchNorm2d(64, eps=0.0)
      (relu): ReLU(inplace=True)
      (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
      (layer1): Sequential(
        (0): Bottleneck(
          (conv1): Conv2d(64, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (bn1): FrozenBatchNorm2d(64, eps=0.0)
          (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
          (bn2): FrozenBatchNorm2d(64, eps=0.0)
          (conv3): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (bn3): FrozenBatchNorm2d(256, eps=0.0)
          (relu): ReLU(

In [3]:
def get_embeddings(model, image):
    with torch.no_grad():
        # Converting the PIL image into Pytorch tensor
        image_tensor = F.to_tensor(image).unsqueeze(0)
#         print(type(image_tensor))
#         print('image are successfully converted into tensors')
        
        # Getting Predictions from the Model - It helped me to calculate the Number of Bounding Boxes
        detections = model(image_tensor)
        
        # Extraction of Features from the Image
        features = model.backbone(image_tensor)
#         print(type(features))
#         print("Features extracted Successfully")

#         if not isinstance(features, dict):
#             raise ValueError("Backbone features are not in the expected dictionary format.")

        # Shape of Image tensors basically height and width
        image_shape = image_tensor.shape[-2:] 
        image_list = ImageList(image_tensor, [image_shape])
#         print(type(image_list))
#         print('Tensors are convereted to Image list - which is the input for RPN)

        # Passing into RPN
        proposals, _ = model.rpn(image_list, features)
#         print(type(proposals))
#         print("Proposals Retrieved Successfully")

        # Region of Interest (ROI)
        box_features = model.roi_heads.box_roi_pool(features, proposals,[image_shape] )
#         print(type(box_features))

        # Flattening
        box_features = box_features.flatten(start_dim=1)
        
        # Embedding Extraction
        box_embeddings = model.roi_heads.box_head(box_features)
#         print(type(box_embeddings))

        # Extract the Bouding Boxes from the detection
        boxes = detections[0]['boxes']
        num_boxes = boxes.shape[0]
        print(num_boxes)
        
        return box_embeddings, num_boxes

### Function to Process the Images like Applying embedding and saving the embeddings

In [4]:
def process_images(model, image_paths):
    data = []
    for image_path in image_paths:
        image = Image.open(image_path)
        embeddings, num_boxes = get_embeddings(model, image)
        print(embeddings)
        embeddings_filename = os.path.splitext(os.path.basename(image_path))[0] + '_embeddings.npy'
        np.save(embeddings_filename, embeddings.cpu().numpy())
        data.append({
            'Image': os.path.basename(image_path),
            'Num_BBoxes': num_boxes,
            'Embedding_Size': embeddings.size(),
            'Embeddings_File': embeddings_filename
        })
    return data

### Function to get the paths

In [5]:
def get_image_paths(image_folder, limit=10):
    image_paths = []
    count = 0
    for file_name in os.listdir(image_folder):
        if file_name.endswith('.jpg'):
            full_path = os.path.join(image_folder, file_name)
            image_paths.append(full_path)
            count += 1
            if count >= limit:
                break
    return image_paths

In [6]:
image_folder = '/kaggle/input/ms-coco-dataset/train2014/train2014'
image_paths = get_image_paths(image_folder)


In [7]:
image_data = process_images(model, image_paths)

52
tensor([[0.0000, 1.4023, 0.0000,  ..., 0.4037, 0.0000, 0.0000],
        [0.6489, 0.5089, 0.6012,  ..., 0.0000, 0.0000, 2.1589],
        [1.1482, 0.3465, 0.1313,  ..., 0.0000, 0.0000, 1.8975],
        ...,
        [0.0000, 0.0000, 0.0325,  ..., 0.0160, 1.8877, 0.3443],
        [0.0000, 0.0000, 0.2788,  ..., 0.4492, 1.5149, 0.4678],
        [0.0000, 0.4755, 0.0000,  ..., 0.0000, 2.1681, 0.5031]])
3
tensor([[0.0336, 0.1411, 0.2680,  ..., 0.0000, 0.0000, 1.3485],
        [0.0000, 0.0000, 0.2409,  ..., 0.0000, 0.0000, 0.6287],
        [0.0205, 0.0000, 0.6677,  ..., 0.0000, 0.8458, 1.0905],
        ...,
        [0.0000, 0.6868, 0.7615,  ..., 0.0000, 0.8355, 0.6601],
        [0.0000, 0.8652, 0.3970,  ..., 0.0000, 1.8524, 0.4940],
        [0.0000, 0.2949, 0.0000,  ..., 0.0000, 0.4014, 1.2985]])
100
tensor([[0.4606, 1.8814, 0.0000,  ..., 0.0000, 0.0665, 0.0000],
        [0.0000, 0.0000, 0.4352,  ..., 0.0000, 0.0000, 0.0000],
        [0.6684, 0.5941, 0.0480,  ..., 0.0000, 0.0000, 0.0000],
   

In [8]:
df = pd.DataFrame(image_data)
df

Unnamed: 0,Image,Num_BBoxes,Embedding_Size,Embeddings_File
0,COCO_train2014_000000263229.jpg,52,"(1000, 1024)",COCO_train2014_000000263229_embeddings.npy
1,COCO_train2014_000000381595.jpg,3,"(1000, 1024)",COCO_train2014_000000381595_embeddings.npy
2,COCO_train2014_000000147733.jpg,100,"(1000, 1024)",COCO_train2014_000000147733_embeddings.npy
3,COCO_train2014_000000559395.jpg,16,"(1000, 1024)",COCO_train2014_000000559395_embeddings.npy
4,COCO_train2014_000000374072.jpg,23,"(1000, 1024)",COCO_train2014_000000374072_embeddings.npy
5,COCO_train2014_000000233539.jpg,38,"(974, 1024)",COCO_train2014_000000233539_embeddings.npy
6,COCO_train2014_000000213863.jpg,49,"(1000, 1024)",COCO_train2014_000000213863_embeddings.npy
7,COCO_train2014_000000471409.jpg,45,"(1000, 1024)",COCO_train2014_000000471409_embeddings.npy
8,COCO_train2014_000000487632.jpg,100,"(1000, 1024)",COCO_train2014_000000487632_embeddings.npy
9,COCO_train2014_000000242092.jpg,9,"(1000, 1024)",COCO_train2014_000000242092_embeddings.npy


In [9]:
df.to_excel('data_emebeddings.xlsx', index=False)
print("Data saved")

Data saved
