# **MMDetection** Interface

In [None]:
import sys, pathlib

root = pathlib.Path().resolve().parent
src  = root / 'scenic_reasoning' / 'src'
sys.path.insert(0, str(src))

sys.path

['/Users/jdiazchao/Git/scenic-reasoning/scenic_reasoning/src',
 '/Users/jdiazchao/Git/scenic-reasoning/scenic_reasoning/src',
 '/Users/jdiazchao/Git/scenic-reasoning/scenic_reasoning/src',
 '/Users/jdiazchao/Git/scenic-reasoning/scenic_reasoning/src',
 '/Users/jdiazchao/Git/scenic-reasoning/scenic_reasoning/src',
 '/opt/miniconda3/envs/openmmlab/lib/python38.zip',
 '/opt/miniconda3/envs/openmmlab/lib/python3.8',
 '/opt/miniconda3/envs/openmmlab/lib/python3.8/lib-dynload',
 '',
 '/opt/miniconda3/envs/openmmlab/lib/python3.8/site-packages']

In [12]:
from scenic_reasoning.src.scenic_reasoning.models.bMMDetection import MMDetection

ModuleNotFoundError: No module named 'scenic_reasoning.src'

In [1]:
coco_labels = {
    -1: "undefined",
    0: "person",
    1: "bicycle",
    2: "car",
    3: "motorcycle",
    4: "airplane",
    5: "bus",
    6: "train",
    7: "truck",
    8: "boat",
    9: "traffic light",
    10: "fire hydrant",
    11: "stop sign",
    12: "parking meter",
    13: "bench",
    14: "bird",
    15: "cat",
    16: "dog",
    17: "horse",
    18: "sheep",
    19: "cow",
    20: "elephant",
    21: "bear",
    22: "zebra",
    23: "giraffe",
    24: "backpack",
    25: "umbrella",
    26: "handbag",
    27: "tie",
    28: "suitcase",
    29: "frisbee",
    30: "skis",
    31: "snowboard",
    32: "sports ball",
    33: "kite",
    34: "baseball bat",
    35: "baseball glove",
    36: "skateboard",
    37: "surfboard",
    38: "tennis racket",
    39: "bottle",
    40: "wine glass",
    41: "cup",
    42: "fork",
    43: "knife",
    44: "spoon",
    45: "bowl",
    46: "banana",
    47: "apple",
    48: "sandwich",
    49: "orange",
    50: "broccoli",
    51: "carrot",
    52: "hot dog",
    53: "pizza",
    54: "donut",
    55: "cake",
    56: "chair",
    57: "couch",
    58: "potted plant",
    59: "bed",
    60: "dining table",
    61: "toilet",
    62: "tv",
    63: "laptop",
    64: "mouse",
    65: "remote",
    66: "keyboard",
    67: "cell phone",
    68: "microwave",
    69: "oven",
    70: "toaster",
    71: "sink",
    72: "refrigerator",
    73: "book",
    74: "clock",
    75: "vase",
    76: "scissors",
    77: "teddy bear",
    78: "hair drier",
    79: "toothbrush",
}

In [2]:
from enum import Enum

class BBox_Format(Enum):
    XYXY = 'xyxy'

class ObjectDetectionResultI:
    def __init__(self, score, cls, label, bbox, image_hw, bbox_format):
        self.score = score
        self.cls = cls
        self.label = label
        self.bbox = bbox
        self.image_hw = image_hw
        self.bbox_format = bbox_format

## Implementation

In [None]:
from typing import Union, List, Tuple
from pathlib import Path

import torch
import numpy as np
import cv2

# from scenic_reasoning.interfaces.ObjectDetectionI import *
# from scenic_reasoning.utilities.coco import coco_labels
from mmdet.apis import DetInferencer


Image = Union[np.ndarray, torch.Tensor, str]


class MMDetection: # (ObjectDetectionModelI):
    def __init__(self, **kwargs) -> None:

        # TODO: Take config_file and/or checkpoint_file as input

        self.model = kwargs.get('model', 'rtmdet_tiny_8xb32-300e_coco.py')
        self.weights = kwargs.get('weights', 'rtmdet_tiny_8xb32-300e_coco_20220902_112414-78e30dcc.pth')
        self.inferencer = DetInferencer(
            model=self.model,
            weights=self.weights,
            device=kwargs.get('device', 'cpu')
        )

        self.batch_size = kwargs.get('batch_size', 1)

    def out_to_obj(self, out: dict, image_hw: Tuple[int, int]):
        obj = []
        for label, score, bbox in zip(out['labels'], out['scores'], out['bboxes']):
            obj += [
                ObjectDetectionResultI(
                    score=score,
                    cls=label,
                    label=coco_labels[label],
                    bbox=bbox,
                    image_hw=image_hw,
                    bbox_format=BBox_Format.XYXY
                )
            ]

        return obj

    def identify_for_image(
        self,
        image: Image,
        debug: bool = False,
        **kwargs
    ):

        if isinstance(image, str):
            image_hw = cv2.imread(image).shape[:2]
        else:
            if isinstance(image, torch.Tensor):
                image = image.detach().cpu().numpy()

            image = image.astype(np.uint8)
            image_hw = image.shape[:2]

        pred = self.inferencer(
            inputs=image,
            out_dir=kwargs.get('out_dir', ''),
            batch_size=1
        )['predictions'][0]

        return [self.out_to_obj(pred, image_hw)]

    def identify_for_image_batch(
        self,
        images: Union[List[Image], str],
        debug: bool = False,
        **kwargs,
    ):
        
        image_hws = []
        input_data = images

        if isinstance(images, str):
            image_dir = Path(images)
            
            image_paths = sorted([
                p for p in image_dir.iterdir() 
                if p.suffix.lower() in ['.jpg', '.jpeg', '.png', '.bmp', '.tif', '.tiff']
            ])
            if not image_paths:
                return []
            
            for path in image_paths:
                image_hws.append(cv2.imread(str(path)).shape[:2])
            
            input_data = [str(p) for p in image_paths]

        elif isinstance(images, list):
            if not images:
                return []
            
            processed_list = []
            for img_item in images:
                if isinstance(img_item, str):
                    image_hw = cv2.imread(img_item).shape[:2]
                    processed_list.append(img_item)
                elif isinstance(img_item, torch.Tensor):
                    img_np = img_item.detach().cpu().numpy().astype(np.uint8)
                    image_hw = img_np.shape[:2]
                    processed_list.append(img_np)
                elif isinstance(img_item, np.ndarray):
                    img_np = img_item.astype(np.uint8)
                    image_hw = img_np.shape[:2]
                    processed_list.append(img_np)
                else:
                    raise TypeError(f"Unsupported image type in list: {type(img_item)}")
                image_hws.append(image_hw)
            input_data = processed_list
        else:
            raise TypeError("Input must be a list of images or a path to a directory.")

        predictions = self.inferencer(
            inputs=input_data,
            out_dir=kwargs.get('out_dir', ''),
            batch_size=kwargs.get('batch_size', self.batch_size)
        )['predictions']

        return [self.out_to_obj(pred, hw) for pred, hw in zip(predictions, image_hws)]


    def to(self, device: Union[str, torch.device]):
        self.inferencer = DetInferencer(
            model=self.model,
            weights=self.weights,
            device=device
        )

## Usage

### Download models

With openmim installed, run the following to download the model
    
    mim download mmdet --config rtmdet_tiny_8xb32-300e_coco --dest .

### Initialization

In [5]:
model = MMDetection(
    model='rtmdet_tiny_8xb32-300e_coco.py',
    weights='rtmdet_tiny_8xb32-300e_coco_20220902_112414-78e30dcc.pth'
)

Loads checkpoint by local backend from path: rtmdet_tiny_8xb32-300e_coco_20220902_112414-78e30dcc.pth
The model and loaded state dict do not match exactly

unexpected key in source state_dict: data_preprocessor.mean, data_preprocessor.std



### Inference

In [6]:
def eval(results: List[List[ObjectDetectionResultI]]):
    for idx, i in enumerate(results[:2]):
        print(f"Image {idx+1}")
        print(f"    Detected {len(i)} objects")
        for j in i[:4]:
            print(f"        label={j.label}, bbox={j.bbox}, score={j.score}")
        if len(i) > 4:
            print("        ...")
    if len(results) > 2:
        print("    ...")

Support for single image inference

In [8]:
# str
eval(model.identify_for_image('demo/demo.jpg'))

# numpy.ndarray
import cv2
np_im = cv2.imread("demo/demo.jpg")
eval(model.identify_for_image(np_im))

# torch.Tensor
torch_im = torch.from_numpy(np_im)
eval(model.identify_for_image(torch_im))

Image 1
    Detected 300 objects
        label=bench, bbox=[221.37191772460938, 176.12808227539062, 456.25811767578125, 383.2401428222656], score=0.8703237771987915
        label=car, bbox=[295.3505554199219, 117.18350219726562, 378.571533203125, 150.27117919921875], score=0.7677364945411682
        label=car, bbox=[190.573486328125, 109.70985412597656, 299.5221252441406, 155.0396270751953], score=0.7427825331687927
        label=car, bbox=[431.36944580078125, 104.98468017578125, 484.879150390625, 131.94033813476562], score=0.6994597911834717
        ...


Image 1
    Detected 300 objects
        label=bench, bbox=[221.37191772460938, 176.12808227539062, 456.25811767578125, 383.2401428222656], score=0.8703237771987915
        label=car, bbox=[295.3505554199219, 117.18350219726562, 378.571533203125, 150.27117919921875], score=0.7677364945411682
        label=car, bbox=[190.573486328125, 109.70985412597656, 299.5221252441406, 155.0396270751953], score=0.7427825331687927
        label=car, bbox=[431.36944580078125, 104.98468017578125, 484.879150390625, 131.94033813476562], score=0.6994597911834717
        ...


Image 1
    Detected 300 objects
        label=bench, bbox=[221.37191772460938, 176.12808227539062, 456.25811767578125, 383.2401428222656], score=0.8703237771987915
        label=car, bbox=[295.3505554199219, 117.18350219726562, 378.571533203125, 150.27117919921875], score=0.7677364945411682
        label=car, bbox=[190.573486328125, 109.70985412597656, 299.5221252441406, 155.0396270751953], score=0.7427825331687927
        label=car, bbox=[431.36944580078125, 104.98468017578125, 484.879150390625, 131.94033813476562], score=0.6994597911834717
        ...


and batch image inference

In [9]:
# str
eval(model.identify_for_image_batch('demo', batch_size=2))

# list[str]
eval(model.identify_for_image_batch(['demo/demo.jpg', 'demo/demo2.jpg'], batch_size=2))

# list[numpy.ndarray]
np_im1 = cv2.imread("demo/demo.jpg")
np_im2 = cv2.imread("demo/demo2.jpg")
eval(model.identify_for_image_batch([np_im1, np_im2], batch_size=2))

# list[str]
torch_im1 = torch.from_numpy(np_im1)
torch_im2 = torch.from_numpy(np_im2)
eval(model.identify_for_image_batch([torch_im1, torch_im2], batch_size=2))

Image 1
    Detected 300 objects
        label=bench, bbox=[221.37191772460938, 176.12808227539062, 456.25811767578125, 383.2401428222656], score=0.8703237771987915
        label=car, bbox=[295.3505554199219, 117.18350219726562, 378.571533203125, 150.27117919921875], score=0.7677364945411682
        label=car, bbox=[190.573486328125, 109.70985412597656, 299.5221252441406, 155.0396270751953], score=0.7427825331687927
        label=car, bbox=[431.36944580078125, 104.98468017578125, 484.879150390625, 131.94033813476562], score=0.6994597911834717
        ...
Image 2
    Detected 300 objects
        label=person, bbox=[2472.887939453125, 298.7748718261719, 2928.625732421875, 1002.2025756835938], score=0.7596670389175415
        label=person, bbox=[1560.3583984375, 193.0028076171875, 1881.524169921875, 754.938232421875], score=0.7277154922485352
        label=person, bbox=[1651.1370849609375, 362.84014892578125, 2243.866943359375, 986.57421875], score=0.7058905363082886
        label=person,

Image 1
    Detected 300 objects
        label=bench, bbox=[221.37191772460938, 176.12808227539062, 456.25811767578125, 383.2401428222656], score=0.8703237771987915
        label=car, bbox=[295.3505554199219, 117.18350219726562, 378.571533203125, 150.27117919921875], score=0.7677364945411682
        label=car, bbox=[190.573486328125, 109.70985412597656, 299.5221252441406, 155.0396270751953], score=0.7427825331687927
        label=car, bbox=[431.36944580078125, 104.98468017578125, 484.879150390625, 131.94033813476562], score=0.6994597911834717
        ...
Image 2
    Detected 300 objects
        label=person, bbox=[2472.887939453125, 298.7748718261719, 2928.625732421875, 1002.2025756835938], score=0.7596670389175415
        label=person, bbox=[1560.3583984375, 193.0028076171875, 1881.524169921875, 754.938232421875], score=0.7277154922485352
        label=person, bbox=[1651.1370849609375, 362.84014892578125, 2243.866943359375, 986.57421875], score=0.7058905363082886
        label=person,

Image 1
    Detected 300 objects
        label=bench, bbox=[221.37191772460938, 176.12808227539062, 456.25811767578125, 383.2401428222656], score=0.8703237771987915
        label=car, bbox=[295.3505554199219, 117.18350219726562, 378.571533203125, 150.27117919921875], score=0.7677364945411682
        label=car, bbox=[190.573486328125, 109.70985412597656, 299.5221252441406, 155.0396270751953], score=0.7427825331687927
        label=car, bbox=[431.36944580078125, 104.98468017578125, 484.879150390625, 131.94033813476562], score=0.6994597911834717
        ...
Image 2
    Detected 300 objects
        label=person, bbox=[2472.887939453125, 298.7748718261719, 2928.625732421875, 1002.2025756835938], score=0.7596670389175415
        label=person, bbox=[1560.3583984375, 193.0028076171875, 1881.524169921875, 754.938232421875], score=0.7277154922485352
        label=person, bbox=[1651.1370849609375, 362.84014892578125, 2243.866943359375, 986.57421875], score=0.7058905363082886
        label=person,

Image 1
    Detected 300 objects
        label=bench, bbox=[221.37191772460938, 176.12808227539062, 456.25811767578125, 383.2401428222656], score=0.8703237771987915
        label=car, bbox=[295.3505554199219, 117.18350219726562, 378.571533203125, 150.27117919921875], score=0.7677364945411682
        label=car, bbox=[190.573486328125, 109.70985412597656, 299.5221252441406, 155.0396270751953], score=0.7427825331687927
        label=car, bbox=[431.36944580078125, 104.98468017578125, 484.879150390625, 131.94033813476562], score=0.6994597911834717
        ...
Image 2
    Detected 300 objects
        label=person, bbox=[2472.887939453125, 298.7748718261719, 2928.625732421875, 1002.2025756835938], score=0.7596670389175415
        label=person, bbox=[1560.3583984375, 193.0028076171875, 1881.524169921875, 754.938232421875], score=0.7277154922485352
        label=person, bbox=[1651.1370849609375, 362.84014892578125, 2243.866943359375, 986.57421875], score=0.7058905363082886
        label=person,