<a href="https://colab.research.google.com/github/MastafaF/DETR/blob/main/medium_detr_human_detection.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

<table align="center"><td>
  <a target="_blank"  href="https://github.com/ultralytics/yolov3/blob/master/tutorial.ipynb">
    <img src="https://cdn.mos.cms.futurecdn.net/xJGh6cXvC69an86AdrLD98-320-80.jpg" />View Medium article
  </a>
</td><td>
</td></table>

This notebook contains software developed by Ultralytics LLC, and **is freely available for redistribution under the GPL-3.0 license**. For more information please visit https://github.com/ultralytics/yolov3 and https://www.ultralytics.com.

This notebook has been developed and adapted by Mastafa Foufa.


In [None]:
"""
Building a simple dataset from camera footage
"""
import io
import os

import requests
from PIL import Image


URL = 'http://83.140.123.184/ImageHarvester/Images/copyright!-stureplan_1_live.jpg'
SAMPLES_DIR = "data/samples"  # folder the snapshots are written to
N_SNAPSHOTS = 19              # number of frames requested from the camera URL
REQUEST_TIMEOUT_S = 10        # seconds before a stalled request is abandoned


def fetch_snapshot(url, timeout=REQUEST_TIMEOUT_S):
    """Download one frame from `url` and return it as a fully loaded RGB PIL image.

    Args:
        url: address of the camera image.
        timeout: seconds to wait for the server before giving up.

    Returns:
        A `PIL.Image.Image` in RGB mode whose pixel data is already in memory.

    Raises:
        requests.HTTPError: if the server answers with an error status.
        requests.RequestException: on connection problems or timeout.
    """
    response = requests.get(url=url, timeout=timeout)
    # Fail loudly on 4xx/5xx instead of handing an error page to PIL
    response.raise_for_status()
    # Decode from an in-memory buffer so the image does not depend on the
    # network stream staying open; convert() also forces the pixel data to load
    return Image.open(io.BytesIO(response.content)).convert("RGB")


# Frames kept in memory; the detection cell further down iterates over img_arr
img_arr = [fetch_snapshot(URL) for _ in range(N_SNAPSHOTS)]

# The target folder does not exist on a fresh runtime, so create it first
os.makedirs(SAMPLES_DIR, exist_ok=True)
for idx, img in enumerate(img_arr):
  img.save(f"{SAMPLES_DIR}/img_stockholm_{idx}.jpg")

In [None]:
#@title Object Detection with DETR

In [None]:
"""
Object detection with DETR 
"""

import torch as th
import torchvision.transforms as T
import requests
from PIL import Image, ImageDraw, ImageFont

# Use the GPU when one is available and fall back to the CPU otherwise,
# so this cell also runs on a runtime without CUDA
DEVICE = th.device('cuda' if th.cuda.is_available() else 'cpu')

# Instantiate DETR model
model = th.hub.load('facebookresearch/detr', 'detr_resnet101', pretrained=True)
model.eval()
model = model.to(DEVICE)

# Define classes to be detected and transformations of our original images
# standard PyTorch mean-std input image normalization
# T.Normalize([mean_R, mean_G, mean_B], [std_R, std_G, std_B])
# NOTE: a later cell assigns `transform` again with an additional resize step
transform = T.Compose([
    T.ToTensor(),
    T.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
])

# Class names looked up by the predicted class index;
# 'N/A' marks indices that have no label
CLASSES = [
    'N/A', 'person', 'bicycle', 'car', 'motorcycle', 'airplane', 'bus',
    'train', 'truck', 'boat', 'traffic light', 'fire hydrant', 'N/A',
    'stop sign', 'parking meter', 'bench', 'bird', 'cat', 'dog', 'horse',
    'sheep', 'cow', 'elephant', 'bear', 'zebra', 'giraffe', 'N/A', 'backpack',
    'umbrella', 'N/A', 'N/A', 'handbag', 'tie', 'suitcase', 'frisbee', 'skis',
    'snowboard', 'sports ball', 'kite', 'baseball bat', 'baseball glove',
    'skateboard', 'surfboard', 'tennis racket', 'bottle', 'N/A', 'wine glass',
    'cup', 'fork', 'knife', 'spoon', 'bowl', 'banana', 'apple', 'sandwich',
    'orange', 'broccoli', 'carrot', 'hot dog', 'pizza', 'donut', 'cake',
    'chair', 'couch', 'potted plant', 'bed', 'N/A', 'dining table', 'N/A',
    'N/A', 'toilet', 'N/A', 'tv', 'laptop', 'mouse', 'remote', 'keyboard',
    'cell phone', 'microwave', 'oven', 'toaster', 'sink', 'refrigerator', 'N/A',
    'book', 'clock', 'vase', 'scissors', 'teddy bear', 'hair drier',
    'toothbrush'
]

In [None]:
"""
Simple detection of people:
Example following https://colab.research.google.com/github/facebookresearch/detr/blob/colab/notebooks/detr_demo.ipynb#scrollTo=Y6Jrz6xz71C0 
"""

import torch 
import matplotlib.pyplot as plt
import os
import sys
from datetime import datetime
import time

from PIL import Image
import requests
import matplotlib.pyplot as plt
import numpy as np

# Colors for visualization: RGB triplets with components in [0, 1],
# one entry per line so the palette is easy to extend or reorder
COLORS = [
    [0.000, 0.447, 0.741],
    [0.850, 0.325, 0.098],
    [0.929, 0.694, 0.125],
    [0.494, 0.184, 0.556],
    [0.466, 0.674, 0.188],
    [0.301, 0.745, 0.933],
]

# Preprocessing pipeline applied to each image before it reaches the model:
# resize, convert to a tensor, then apply the standard PyTorch
# mean-std input image normalization
transform = T.Compose(
    [
        T.Resize(size=800),
        T.ToTensor(),
        T.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
    ]
)

# Bounding-box post-processing helper
def box_cxcywh_to_xyxy(x):
    """Convert boxes from (center_x, center_y, width, height) to corner form.

    Args:
        x: tensor whose dimension 1 holds the four values
           (center_x, center_y, width, height) of each box.

    Returns:
        Tensor of the same layout holding (x_min, y_min, x_max, y_max).
    """
    center_x, center_y, width, height = x.unbind(1)
    half_width = 0.5 * width
    half_height = 0.5 * height
    corners = (
        center_x - half_width,
        center_y - half_height,
        center_x + half_width,
        center_y + half_height,
    )
    return torch.stack(corners, dim=1)

def rescale_bboxes(out_bbox, size):
    """Turn center-format boxes into corner-format boxes scaled by `size`.

    Args:
        out_bbox: boxes as (center_x, center_y, width, height) rows.
        size: (width, height) pair used as the scale factors.

    Returns:
        CPU tensor of (x_min, y_min, x_max, y_max) rows, with x values
        multiplied by the width and y values by the height.
    """
    width, height = size
    scale = torch.tensor([width, height, width, height], dtype=torch.float32)
    # Move to the CPU first: the scale tensor above is created on the CPU
    corners = box_cxcywh_to_xyxy(out_bbox).cpu()
    return corners * scale

def detect(im, model, transform, confidence_level = 0.7):
    """Run the detection model on one image and keep the confident predictions.

    Args:
        im: input image; `im.size` must be a (width, height) pair. Images
            exposing a `mode` other than "RGB" are converted to RGB first.
        model: callable returning a dict with 'pred_logits' and 'pred_boxes'.
        transform: preprocessing callable that turns `im` into a tensor.
        confidence_level: a prediction is kept when its highest class
            probability is strictly greater than this value.

    Returns:
        Tuple `(probas, bboxes)`: the class probabilities of the kept
        predictions and their boxes as (x_min, y_min, x_max, y_max) rows
        scaled to `im.size`. The boxes are on the CPU.
    """
    # The normalization in `transform` is written for 3 channels, so bring
    # grayscale / RGBA / palette images to RGB before preprocessing
    if getattr(im, "mode", "RGB") != "RGB":
        im = im.convert("RGB")

    # mean-std normalize the input image (batch-size: 1)
    img = transform(im).unsqueeze(0)

    # Send the batch to the device the model lives on rather than assuming CUDA;
    # models without parameters fall back to CUDA when it is available
    first_param = next(iter(model.parameters()), None) if hasattr(model, "parameters") else None
    if first_param is not None:
        device = first_param.device
    else:
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    img = img.to(device)

    # Inference only: skip autograd bookkeeping to save memory and time
    with torch.no_grad():
        outputs = model(img)

    # Softmax over classes for the single image in the batch; the last class
    # column is dropped (presumably DETR's "no object" class, confirm)
    probas = outputs['pred_logits'].softmax(-1)[0, :, :-1]
    # keep only predictions whose best class probability exceeds the threshold
    keep = probas.max(-1).values > confidence_level

    # convert boxes from [0; 1] to image scales
    bboxes_scaled = rescale_bboxes(outputs['pred_boxes'][0, keep], im.size)
    return probas[keep], bboxes_scaled



# url_classroom = "https://docs.openvinotoolkit.org/2020.2/person-detection-action-recognition-teacher-0002.png"
# url_camera_footage = 'http://data.goteborg.se/TrafficCamera/v0.2/CameraImage/83a5905d-7889-4bae-bd6d-0a7e67f4f6af/16'
# im = Image.open(requests.get(url_camera_footage, stream=True).raw).resize((800,600)).convert('RGB')

"""
One can define an array of images img_arr where we can store images 
"""
def plot_results(pil_img, prob, boxes, filename, show=True):
    """Draw the detected people on an image, save the figure and report the count.

    Only predictions whose most likely class is "person" are drawn and counted.

    Args:
        pil_img: image to display as the figure background.
        prob: per-prediction class probabilities (one row per prediction).
        boxes: per-prediction boxes as (x_min, y_min, x_max, y_max) rows.
        filename: path of the image file to write; its folder is created
            when missing.
        show: when True, display the figure after saving it.
    """
    from itertools import cycle

    fig, ax = plt.subplots(figsize=(16, 10))
    ax.imshow(pil_img)

    count_person = 0
    # cycle() repeats the palette indefinitely, so no prediction is dropped
    for p, (xmin, ymin, xmax, ymax), c in zip(prob, boxes.tolist(), cycle(COLORS)):
        cl = int(p.argmax())
        if CLASSES[cl] != "person":
            continue
        count_person += 1
        text = f'{CLASSES[cl]}: {float(p[cl]):0.2f}'

        ax.add_patch(plt.Rectangle((xmin, ymin), xmax - xmin, ymax - ymin,
                                   fill=False, color=c, linewidth=3))
        ax.text(xmin, ymin, text, fontsize=15,
                bbox=dict(facecolor='yellow', alpha=0.5))

    print("Number of people detected: {}".format(count_person))
    ax.axis('off')

    # The output folder does not exist on a fresh runtime
    out_dir = os.path.dirname(filename)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)
    # Save before showing: displaying a figure may release it
    fig.savefig(filename)
    if show:
        plt.show()
    # Release the figure so a long image list does not accumulate open figures
    plt.close(fig)


# In the array img_arr, we have previously stored images from camera footage
for idx, im in enumerate(img_arr):
  scores, boxes = detect(im, model, transform)
  # Saving images with 'human detection' in the folder detr_output
  plot_results(im, scores, boxes, filename = "detr_output/img_stockholm_detr_detection" + str(idx) + ".jpg")

In [None]:
#@title DETR vs YOLO

In [None]:
##### Yolo ###### 

"""
Yolo implementation from https://github.com/ultralytics/yolov3 
"""
import time
import glob
import torch
import os

from IPython.display import Image, clear_output 
# Report the PyTorch version followed by the properties of CUDA device 0,
# or the literal 'CPU' when CUDA is not available
if torch.cuda.is_available():
    print('PyTorch {} {}'.format(torch.__version__, torch.cuda.get_device_properties(0)))
else:
    print('PyTorch {} {}'.format(torch.__version__, 'CPU'))

In [None]:
# Fetch the YOLOv3 implementation, run its dataset download script, then make
# the clone the working directory.
# NOTE(review): the dataset step is large (19GB per the inline comment) and is
# presumably only needed for training/evaluation, not for running detect.py;
# confirm before running it.
!git clone https://github.com/ultralytics/yolov3  # clone
!bash yolov3/data/get_coco_dataset_gdrive.sh  # copy COCO2014 dataset (19GB)
# From here on, relative paths in later cells resolve inside yolov3/
%cd yolov3

In [None]:
"""
A simple way to apply the Pytorch implementation of Yolo on your images 
is to add your own images in the folder data/samples.
"""

# Run the repository's detection script with its default arguments
!python3 detect.py
# The original image of Zidane is in the folder data/samples 
# The image with detected objects is in the folder output
# `Image` here is IPython.display.Image (imported in the YOLO setup cell),
# which shadows the PIL `Image` used by the earlier DETR cells
Image(filename='output/zidane.jpg', width=600)