In [None]:
from itertools import islice
from pprint import pprint

import torch
from ultralytics.data.augment import LetterBox

from scenic_reasoning.data.ImageLoader import Bdd100kDataset
from scenic_reasoning.measurements.ObjectDetection import ObjectDetectionMeasurements
from scenic_reasoning.models.UltralyticsYolo import Yolo
from scenic_reasoning.utilities.common import get_default_device

In [10]:
# Batch sizes above 1 are not working yet, so keep this at 1.
BATCH_SIZE = 1
# Number of measurement results printed by the demo loop.
NUM_EXAMPLES_TO_SHOW = 3

In [4]:
# Letterbox target of 768x1280 (height x width).
shape_transform = LetterBox(new_shape=(768, 1280))


def transform_image_for_yolo(image: torch.Tensor):
    """Prepare a single dataset image for YOLO inference.

    Steps, in order:
      1. Move axis 0 to the end and convert to a NumPy array, i.e. the
         cv2-style layout the letterbox transform is given.
      2. Apply ``shape_transform`` (LetterBox with ``new_shape=(768, 1280)``).
      3. Convert back to a tensor and move the last axis to the front again.
      4. Cast to float32 and divide by 255.

    NOTE(review): step 1 assumes a channel-first (C, H, W) CPU tensor and
    step 4 assumes pixel values in the 0-255 range -- confirm against the
    dataset loader.

    Args:
        image: Image tensor as yielded by the dataset.

    Returns:
        A float32 tensor holding the letterboxed image scaled by 1/255.
    """
    hwc_pixels = image.permute(1, 2, 0).numpy()
    letterboxed = shape_transform(image=hwc_pixels)
    chw_tensor = torch.tensor(letterboxed).permute(2, 0, 1)
    return chw_tensor.to(torch.float32) / 255.0

In [11]:
# BDD100K frames are 720x1280; the dataset transform letterboxes each one to
# 768x1280 before it reaches the model.
bdd = Bdd100kDataset(split="val", transform=transform_image_for_yolo)

# This YOLOv5 checkpoint is used because it handles 1280-px input, whereas the
# v8 checkpoints handle 640. See the size column in
# https://docs.ultralytics.com/models/yolov5/#performance-metrics
model = Yolo(model="../yolov5x6u.pt")

# The identity collate_fn is a workaround: it avoids
# "RuntimeError: each element in list of batch should be of equal size".
measurements = ObjectDetectionMeasurements(
    model,
    bdd,
    batch_size=BATCH_SIZE,
    collate_fn=lambda x: x,
)

# The image height is 768 rather than 720 because Ultralytics reports:
#   WARNING ⚠️ imgsz=[720, 1280] must be multiple of max stride 64, updating to [768, 1280]
measurement_stream = measurements.iter_measurements(
    device=get_default_device(),
    imgsz=[768, 1280],
    # NOTE(review): presumably the letterbox padding added above and below
    # each frame, (768 - 720) / 2 = 24 px -- confirm against iter_measurements.
    bbox_offset=24,
    debug=True,
    # NOTE(review): presumably the detection confidence threshold -- confirm.
    conf=0.1,
)

# Only the first NUM_EXAMPLES_TO_SHOW results are pulled from the generator,
# so the full validation split is never evaluated here.
for results in islice(measurement_stream, NUM_EXAMPLES_TO_SHOW):
    pprint(results)


0: 768x1280 22 cars, 1 bus, 1 truck, 6 traffic lights, 99.6ms
Speed: 0.0ms preprocess, 99.6ms inference, 15.1ms postprocess per image at shape (1, 3, 768, 1280)
[{'ap_per_class': {'bus': 0,
                   'car': 0.14634146341463414,
                   'motorcycle': 0,
                   'rider': 0,
                   'traffic light': 0,
                   'traffic sign': 0,
                   'truck': 0},
  'f1': 0,
  'false_negatives': 7,
  'false_positives': 6,
  'mAP': 0.020905923344947733,
  'precision': 0.0,
  'recall': 0.0,
  'true_positives': 0}]

0: 768x1280 3 cars, 1 traffic light, 108.4ms
Speed: 0.0ms preprocess, 108.4ms inference, 14.1ms postprocess per image at shape (1, 3, 768, 1280)
[{'ap_per_class': {'car': 0, 'traffic light': 0, 'traffic sign': 0},
  'f1': 0,
  'false_negatives': 2,
  'false_positives': 3,
  'mAP': 0.0,
  'precision': 0.0,
  'recall': 0.0,
  'true_positives': 0}]

0: 768x1280 7 cars, 1 truck, 114.9ms
Speed: 0.0ms preprocess, 114.9ms inference, 9.7m