In [2]:
import matplotlib.pyplot as plt
import torchvision
from torchvision.io import read_video
from torchvision.utils import save_image
from typing import List, OrderedDict, Tuple
from torchvision.models.detection import fasterrcnn_resnet50_fpn_v2
from PIL import Image, ImageDraw, ImageFont
from torchvision import transforms
import torchinfo
import torch
from pathlib import Path
import cv2
from enum import Enum
import xml.etree.ElementTree as ET 

In [3]:
def load_frcnn_model(weights=None):
  """Build a Faster R-CNN (ResNet-50 FPN v2) detector in eval mode and print its summary.

  Args:
    weights: Forwarded to ``fasterrcnn_resnet50_fpn_v2(weights=...)``. The
      default ``None`` keeps the behaviour of the previous zero-argument call.
      NOTE(review): with ``None`` the detector presumably starts from
      untrained weights; pass e.g. ``"DEFAULT"`` for pretrained ones -
      confirm against the torchvision documentation.

  Returns:
    The model in eval mode, placed on the GPU when CUDA is available and on
    the CPU otherwise.
  """
  frcnn = fasterrcnn_resnet50_fpn_v2(weights=weights)
  # Fall back to the CPU instead of crashing on machines without CUDA.
  device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
  frcnn = frcnn.eval().to(device)
  # The summary object used to be discarded; torchinfo does not print by
  # default inside Jupyter, so render it explicitly (verbose=0 avoids a
  # duplicate print when run outside a notebook).
  print(torchinfo.summary(frcnn, verbose=0))
  return frcnn

class YoloModel(Enum):
  """Selectable YOLOv5 variants.

  Each member's value is the model identifier that ``load_yolo5_model``
  passes to ``torch.hub.load``.
  """
  nano = "yolov5n"
  small = "yolov5s"
  medium = "yolov5m"
  large = "yolov5l"
  xlarge = "yolov5x"

def load_yolo5_model(model=YoloModel.small):
  """Load a pretrained YOLOv5 network through ``torch.hub``.

  Args:
    model: ``YoloModel`` member; its ``value`` names the variant to load.

  Returns:
    Whatever ``torch.hub.load`` returns for the ``ultralytics/yolov5`` repo.
  """
  hub_repo = 'ultralytics/yolov5'
  variant_name = model.value
  return torch.hub.load(hub_repo, variant_name, pretrained=True)

def load_frame_to_pil(path: str, frame: int):
  """Decode a single frame of a video file into an RGB PIL image.

  Args:
    path: Location of the video, handed to ``cv2.VideoCapture``.
    frame: Zero-based index of the frame to fetch.

  Returns:
    The frame as a ``PIL.Image.Image`` in RGB order, or ``None`` when the
    capture reports that the read failed.

  Raises:
    Exception: if ``frame`` is not below the frame count reported by the capture.
  """
  video = cv2.VideoCapture(path)
  try:
    frame_count = int(video.get(cv2.CAP_PROP_FRAME_COUNT))
    if frame >= frame_count:
      raise Exception(f'Frame {frame} but a total of {frame_count} frames exist')
    video.set(cv2.CAP_PROP_POS_FRAMES, frame)
    # Separate name for the pixels so the ``frame`` index is not overwritten.
    ok, bgr_pixels = video.read()
  finally:
    # Release the capture on every path, including the out-of-range error above.
    video.release()

  if not ok:
    return None
  # OpenCV delivers BGR; PIL expects RGB.
  return Image.fromarray(cv2.cvtColor(bgr_pixels, cv2.COLOR_BGR2RGB))

In [4]:
from typing import Dict

class ObjectType(Enum):
  CONSTRUCTION = 1
  CYCLIST = 2
  VEHICLE = 3
  PEDESTRIAN = 4
  BUS = 5
  
class AnnotationObject:
  """One labelled, axis-aligned bounding box read from an annotation ``<object>`` element."""

  def __init__(self, object: ET.Element):
    """Parse the label and box corners out of ``object``.

    Reads the ``./name`` child for the label and the
    ``./bndbox/{xmin,xmax,ymin,ymax}`` children for the corners.

    Raises:
      KeyError: if the upper-cased label is not the name of an ``ObjectType`` member.
    """
    # Look the member up by name through the Enum API rather than the class ``__dict__``.
    self.type: ObjectType = ObjectType[object.find("./name").text.upper()]
    self.xmin = float(object.find("./bndbox/xmin").text)
    self.xmax = float(object.find("./bndbox/xmax").text)
    self.ymin = float(object.find("./bndbox/ymin").text)
    self.ymax = float(object.find("./bndbox/ymax").text)

  def __str__(self):
    """Render the box as ``(xmin, ymin, xmax, ymax)``."""
    return f'({self.xmin}, {self.ymin}, {self.xmax}, {self.ymax})'

  @property
  def area(self):
    """Width times height of the box."""
    return (self.ymax-self.ymin)*(self.xmax-self.xmin)

  def intersect_area(self, other):
    """Return the overlap area with ``other``, or ``None`` when the boxes are disjoint.

    ``other`` only needs ``xmin``/``xmax``/``ymin``/``ymax`` attributes.
    Boxes that merely touch yield an area of 0 rather than ``None``.
    """
    dx = min(other.xmax, self.xmax) - max(other.xmin, self.xmin)
    dy = min(other.ymax, self.ymax) - max(other.ymin, self.ymin)
    if (dx>=0) and (dy>=0):
      return dx*dy
    return None

  def difference(self, other):
    """Score how badly ``other`` matches this box: 0 is identical, 1 is the cap.

    The score is the area covered by exactly one of the two boxes divided by
    this box's own area, clamped to at most 1. Disjoint boxes score 1.
    """
    intersect = self.intersect_area(other)
    if intersect is None:
      return 1
    my_area = self.area
    if my_area == 0:
      # A degenerate (zero-area) box cannot be normalised; count it as a
      # full mismatch instead of dividing by zero.
      return 1
    # Computed from the corners because ``other`` may not expose ``area``.
    other_area = (other.ymax-other.ymin)*(other.xmax-other.xmin)
    score = ((my_area+other_area) - (2*intersect))/my_area
    return score if score<1 else 1

class Annotation:
  """Everything labelled in one image, parsed from its annotation XML tree."""

  def __init__(self, tree: ET.ElementTree):
    """Collect the image filename, the labelled objects and the set of their types."""
    xml_root = tree.getroot()
    self.image_filename = xml_root.find("filename").text
    # One AnnotationObject per <object> child, in document order.
    self.objects: List[AnnotationObject] = [
      AnnotationObject(node) for node in xml_root.findall("./object")
    ]
    # Distinct object types that occur in this image.
    self.types = {parsed.type for parsed in self.objects}
    

def load_canada_dataset(parent_dir="canada-dataset"):
  """Load every annotation and image of the dataset into memory.

  Args:
    parent_dir: Dataset root containing the ``Annotations`` (``*.xml``) and
      ``JPEGImages`` (``*.jpeg``) folders. Defaults to the previously
      hard-coded ``"canada-dataset"``.

  Returns:
    ``(annotations, images)``: annotations keyed by the image filename stored
    inside each XML file, and fully loaded PIL images keyed by file name.
  """
  parent_dir = Path(parent_dir)
  annotation_dir = parent_dir.joinpath("Annotations")
  image_dir = parent_dir.joinpath("JPEGImages")

  annotations: Dict[str,Annotation] = {}
  # Sorted so the dictionary order (and therefore which images a capped
  # benchmark run sees) does not depend on filesystem enumeration order.
  for annotation_file in sorted(annotation_dir.glob("*.xml")):
    parsed = Annotation(ET.parse(annotation_file))
    annotations[parsed.image_filename] = parsed

  images: Dict[str,Image.Image] = {}
  for img_file in sorted(image_dir.glob("*.jpeg")):
    # Read the pixels while the file is open, then let the context manager
    # close the handle; the loaded image stays usable afterwards.
    with Image.open(img_file) as img:
      img.load()
    images[img_file.name] = img

  return annotations, images

def show_rectangle(img: Image.Image, xmin, ymin, xmax, ymax):
  """Display a copy of ``img`` with one red rectangle drawn on it.

  Also prints the image size and the rectangle coordinates. The input image
  is left unmodified.
  """
  preview = img.copy()
  top_left, bottom_right = (xmin, ymin), (xmax, ymax)
  ImageDraw.Draw(preview).rectangle([top_left, bottom_right], outline="red", width=1)
  preview.show()
  print(preview.size)
  print(xmin, ymin, xmax, ymax)

def show_annotations(img: Image.Image, annotation: Annotation):
  """Display a copy of ``img`` with every box of ``annotation`` outlined in red.

  Each box's corner pairs are printed before it is drawn. The input image is
  left unmodified.
  """
  preview = img.copy()
  canvas = ImageDraw.Draw(preview)
  for box in annotation.objects:
    top_left = (box.xmin, box.ymin)
    bottom_right = (box.xmax, box.ymax)
    print(top_left, bottom_right)
    canvas.rectangle([top_left, bottom_right], outline="red", width=1)
  preview.show()

def apply_annotations(
  img: Image.Image, true_annotations: List[AnnotationObject], 
  predicted_annotations, other_annotations, dif_scores
):
  """Return a copy of ``img`` with ground-truth and detected boxes drawn on it.

  Args:
    img: Source image; it is copied, not modified.
    true_annotations: Ground-truth boxes, outlined in green.
    predicted_annotations: Detections of interest, outlined in orange.
    other_annotations: Remaining detections, outlined in blue.
    dif_scores: One score per entry of ``true_annotations`` (same order),
      written in red at that box's top-left corner with two decimals.

  Returns:
    The annotated copy of the image.
  """
  tmp_img = img.copy()
  draw = ImageDraw.Draw(tmp_img)

  # Same drawing order as before: truth first, then detections on top.
  layers = (
    (true_annotations, "green"),
    (predicted_annotations, "orange"),
    (other_annotations, "blue"),
  )
  for boxes, colour in layers:
    for box in boxes:
      draw.rectangle([(box.xmin,box.ymin),(box.xmax,box.ymax)], outline=colour, width=1)

  # Load the font at most once per call (it used to be reloaded for every
  # label) and only when there is something to label.
  font = None
  for i, true_annot in enumerate(true_annotations):
    if font is None:
      try:
        font = ImageFont.truetype("arial.ttf", 15)
      except OSError:
        # Arial is not installed everywhere; fall back to Pillow's built-in font.
        font = ImageFont.load_default()
    draw.text((true_annot.xmin,true_annot.ymin), f'{dif_scores[i]:.2f}', "red", font=font)
  return tmp_img

In [5]:
# Load the whole dataset once at notebook scope; test_yolo reads these two
# globals (`annotations`, `images`) directly instead of taking them as arguments.
annotations, images = load_canada_dataset()

In [6]:
import pandas as pd
import json
import time

In [16]:
def test_yolo(out_dir: str, model: YoloModel = YoloModel.small, gen_images = True, use_gpu = False, max_images = 1001):
  """Benchmark a YOLOv5 model against the loaded dataset and write ``summary.json``.

  Reads the notebook-level ``annotations`` and ``images`` dictionaries. For
  every processed image the detections labelled car/truck/bus/person are
  compared with the ground-truth boxes of the tracked types; each ground-truth
  box gets the lowest ``difference`` score over those detections, and a score
  of 1 counts as a miss.

  Args:
    out_dir: Directory for ``summary.json`` and, optionally, annotated images.
      Created (including parents) when missing.
    model: YOLOv5 variant to load.
    gen_images: When true, save one annotated JPEG per processed image, named
      ``<stem>-<avg score>-<misses>-<tracked objects>.jpeg``.
    use_gpu: Move the model to CUDA when true, otherwise to the CPU.
    max_images: Upper bound on the number of images processed; ``None``
      processes all of them. The default of 1001 matches the previous
      hard-coded ``i > 1000`` cut-off.

  Returns:
    None. Results are written to ``<out_dir>/summary.json``.
  """
  out_dir = Path(out_dir)
  out_dir.mkdir(parents=True, exist_ok=True)
  # A tuple keeps the order of the per-type keys in summary.json stable between runs.
  track_list = (ObjectType.BUS.name, ObjectType.VEHICLE.name, ObjectType.PEDESTRIAN.name, ObjectType.CYCLIST.name)
  vehicle_labels = ('car', 'truck', 'bus')
  person_labels = ('person',)

  yolo = load_yolo5_model(model)
  yolo = yolo.cuda() if use_gpu else yolo.cpu()

  detected_classes = set()
  annotation_count = len(annotations)
  processed = 0
  summary = {}
  start = time.time()
  for tot in track_list:
    summary[f'total_{tot}_misses'] = 0
    summary[f'total_{tot}_loss_sum'] = 0
    summary[f'total_{tot}_objects'] = 0
  summary['total_VEHICLE_detections'] = 0
  summary['total_objects'] = 0
  summary['total_PERSON_detections'] = 0

  for img_file, annotation in annotations.items():
    if max_images is not None and processed >= max_images:
      break
    img = images[img_file]
    yolo_out = yolo([img])
    detected_objects: pd.DataFrame = yolo_out.pandas().xyxy[0]

    # Split detections into the ones scored against ground truth and the rest
    # (the rest is only drawn on the annotated image).
    detected_interests = []
    detected_other = []
    for _, detection in detected_objects.iterrows():
      # Item access on purpose: ``detection.name`` would be the row label, not the class name.
      label = detection['name']
      detected_classes.add(label)
      if label in vehicle_labels:
        summary['total_VEHICLE_detections'] += 1
        detected_interests.append(detection)
      elif label in person_labels:
        summary['total_PERSON_detections'] += 1
        detected_interests.append(detection)
      else:
        detected_other.append(detection)

    # For every tracked ground-truth box keep the lowest difference score.
    dif_scores = []
    score_sum = 0
    miss_count = 0
    true_annotations = []
    summary['total_objects'] += len(annotation.objects)
    for true_obj in annotation.objects:
      tot = true_obj.type.name
      if tot not in track_list:
        continue
      true_annotations.append(true_obj)
      summary[f'total_{tot}_objects'] += 1
      best = 1
      for detection in detected_interests:
        dif = true_obj.difference(detection)
        if dif < best:
          best = dif
      dif_scores.append(best)
      score_sum += best
      summary[f'total_{tot}_loss_sum'] += best
      if best == 1:
        miss_count += 1
        summary[f'total_{tot}_misses'] += 1

    if processed%1000 == 0:
      print(f'Completed: {processed}/{annotation_count} ({processed/annotation_count:.2f})')
    processed += 1

    avg_score = score_sum/len(dif_scores) if len(dif_scores)>0 else 0
    if gen_images:
      annotated = apply_annotations(img, true_annotations, detected_interests, detected_other, dif_scores)
      # ``str.rstrip(".jpeg")`` strips a character set and would also eat
      # trailing '.', 'j', 'p', 'e', 'g' of the stem; take the real stem instead.
      stem = Path(img_file).stem
      annotated.save(out_dir.joinpath(f'{stem}-{avg_score:.2f}-{miss_count}-{len(dif_scores)}.jpeg'))

  total_score_sum = 0
  total_objects = 0
  total_found = 0
  for tot in track_list:
    score = summary[f'total_{tot}_loss_sum']
    objects = summary[f'total_{tot}_objects']
    misses = summary[f'total_{tot}_misses']
    summary[f'total_{tot}_avg_loss'] = score/objects if objects > 0 else 0
    summary[f'total_{tot}_pct_found'] = (objects-misses)/objects if objects > 0 else 1
    total_found += (objects-misses)
    total_score_sum += score
    total_objects += objects
  # Same "nothing to score" convention as the per-type figures above, instead
  # of a ZeroDivisionError when no tracked object was seen.
  summary['total_avg_loss'] = total_score_sum/total_objects if total_objects > 0 else 0
  summary['total_pct_found'] = total_found/total_objects if total_objects > 0 else 1
  total_time = time.time()-start
  summary['total_time'] = total_time
  # Average over the images actually processed; the loop may stop long before
  # the end of the dataset.
  summary['time_per_image'] = total_time/processed if processed > 0 else 0
  summary['detected_classes'] = sorted(detected_classes)
  with open(out_dir.joinpath('summary.json'), 'w') as outfile:
    json.dump(summary, outfile)
    
# Benchmark the small and xlarge variants on the GPU; each call writes its
# annotated images and summary.json into its own output directory.
# The medium and large runs below are currently disabled.
test_yolo('test-s', YoloModel.small, gen_images=True, use_gpu=True)
#test_yolo('test-m', YoloModel.medium, gen_images=False, use_gpu=True)
#test_yolo('test-l', YoloModel.large, gen_images=False, use_gpu=False)
test_yolo('test-xl', YoloModel.xlarge, gen_images=True, use_gpu=True)

Using cache found in C:\Users\mrhae/.cache\torch\hub\ultralytics_yolov5_master
YOLOv5  2024-11-18 Python-3.11.2 torch-2.5.1+cu118 CUDA:0 (NVIDIA GeForce RTX 2070, 8192MiB)

Fusing layers... 
YOLOv5s summary: 213 layers, 7225885 parameters, 0 gradients, 16.4 GFLOPs
Adding AutoShape... 
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):


Completed: 0/9998 (0.00)


  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with a

Completed: 1000/9998 (0.10)


Using cache found in C:\Users\mrhae/.cache\torch\hub\ultralytics_yolov5_master
YOLOv5  2024-11-18 Python-3.11.2 torch-2.5.1+cu118 CUDA:0 (NVIDIA GeForce RTX 2070, 8192MiB)

Fusing layers... 
YOLOv5x summary: 444 layers, 86705005 parameters, 0 gradients, 205.5 GFLOPs
Adding AutoShape... 
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):


Completed: 0/9998 (0.00)


  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with a

Completed: 1000/9998 (0.10)


  with amp.autocast(autocast):
  with amp.autocast(autocast):
