# Real-time object detection in webcam video stream using Ultralytics YOLOv8

This is a refactored version of an older, YOLOv3-compatible [notebook][older_notebook] which no longer works.

Instructions:

* Run `Runtime` > `Run all` (or press <kbd>Ctrl</kbd> <kbd>F9</kbd>) to run all cells.
* Allow access to the webcam, if asked.
* To stop the webcam capture, click the red text or the picture.

Resources:
* Ultralytics [documentation][ultralytics_doc] for object detection.
* Google Colab [documentation][ipython_display_doc] for executing JavaScript from Python for tasks such as webcam capture, etc.

Author: [Kiril Isakov][kisakov_linkedin] ([kirisakow][kirisakow_github])

[kisakov_linkedin]: https://www.linkedin.com/in/kisakov/
[kirisakow_github]: https://github.com/kirisakow
[older_notebook]: https://github.com/vindruid/yolov3-in-colab/blob/aaac930911e18826c560d3156220cedb8726b8c7/yolov3_streaming_webcam.ipynb
[ipython_display_doc]: https://colab.research.google.com/notebooks/snippets/advanced_outputs.ipynb#scrollTo=2viqYx97hPMi
[ultralytics_doc]: https://docs.ultralytics.com/tasks/detect/

## 0. Prerequisites

### 0.1. Install and initialize libraries and constants

In [None]:
# Install (or upgrade) the `ultralytics` package, from which the YOLO class is imported below.
! pip install --upgrade --quiet ultralytics

In [None]:
### 1. Mount Google Drive ###
# The model weights and the sample image used later in this notebook are read
# from paths under /content/gdrive, so the drive has to be mounted first.

from google.colab import drive

drive.mount('/content/gdrive')

In [None]:
from base64 import b64decode, b64encode
from google.colab.output import eval_js
from IPython.display import display, Javascript
from PIL import Image
from ultralytics import YOLO
from ultralytics.engine.results import Results
import io
import numpy as np

# Path to the trained model weights (located on the mounted Google Drive)
model_path = "/content/gdrive/MyDrive/Sistema_de_Visão/yolov8/Cashew_nut/Train_CashewNut/runs/detect/train/weights/best.pt"

# File names of stock YOLOv8 checkpoints; only referenced by the
# commented-out alternative below.
MODEL_NAMES = ['yolov8n.pt', 'yolov8s.pt', 'yolov8m.pt', 'yolov8l.pt', 'yolov8x.pt']
# Alternative: load a stock checkpoint instead of the custom weights:
# PRE_TRAINED_MODEL = YOLO(MODEL_NAMES[0])
PRE_TRAINED_MODEL = YOLO(model_path)
# [width, height] in pixels of the canvas the webcam frame is drawn onto
# (injected into the JavaScript of start_stream).
IMG_SHAPE = [640, 480]
# JPEG quality passed to canvas.toDataURL() when a frame is captured.
IMG_QUALITY = 0.8

### 0.2. Define JavaScript functions to capture webcam stream, and to be called by Python

In [None]:
def start_stream():
    """Inject the JavaScript that drives the webcam capture into the cell output.

    The script defines ``takePhoto(label, imgData)``, which Python calls once
    per frame. On its first call it builds the DOM (status label, <video>,
    overlay <img>, stop instruction) and opens the camera; on every call it
    updates the label and overlay, then resolves with the next frame encoded
    as a JPEG data URL. Clicking the video, the overlay or the red text sets
    the ``shutdown`` flag; ``takePhoto`` then removes the DOM, stops the
    camera track and returns an empty string.

    ``IMG_SHAPE`` and ``IMG_QUALITY`` are interpolated into the script as
    JavaScript constants.
    """
    js = Javascript(f'''
    const IMG_SHAPE = {IMG_SHAPE};
    const IMG_QUALITY = {IMG_QUALITY};
    ''' + '''
    var video;
    var div = null;
    var stream;
    var captureCanvas;
    var imgElement;
    var labelElement;

    var pendingResolve = null;
    var shutdown = false;

    function removeDom() {
        stream.getVideoTracks()[0].stop();
        video.remove();
        div.remove();
        video = null;
        div = null;
        stream = null;
        imgElement = null;
        captureCanvas = null;
        labelElement = null;
    }

    function onAnimationFrame() {
        if (!shutdown) {
            window.requestAnimationFrame(onAnimationFrame);
        }
        if (pendingResolve) {
            var result = "";
            if (!shutdown) {
                captureCanvas.getContext('2d').drawImage(video, 0, 0, IMG_SHAPE[0], IMG_SHAPE[1]);
                result = captureCanvas.toDataURL('image/jpeg', IMG_QUALITY)
            }
            var lp = pendingResolve;
            pendingResolve = null;
            lp(result);
        }
    }

    async function createDom() {
        if (div !== null) {
            return stream;
        }

        div = document.createElement('div');
        div.style.border = '2px solid black';
        div.style.padding = '3px';
        div.style.width = '100%';
        div.style.maxWidth = '600px';
        document.body.appendChild(div);

        const modelOut = document.createElement('div');
        modelOut.innerHTML = "<span>Status: </span>";
        labelElement = document.createElement('span');
        labelElement.innerText = 'No data';
        labelElement.style.fontWeight = 'bold';
        modelOut.appendChild(labelElement);
        div.appendChild(modelOut);

        video = document.createElement('video');
        video.style.display = 'block';
        video.width = div.clientWidth - 6;
        video.setAttribute('playsinline', '');
        video.onclick = () => { shutdown = true; };
        stream = await navigator.mediaDevices.getUserMedia(
            {video: { facingMode: "environment"}});
        div.appendChild(video);

        imgElement = document.createElement('img');
        imgElement.style.position = 'absolute';
        imgElement.style.zIndex = 1;
        imgElement.onclick = () => { shutdown = true; };
        div.appendChild(imgElement);

        const instruction = document.createElement('div');
        instruction.innerHTML =
            '<span style="color: red; font-weight: bold;">' +
            'When finished, click here or on the video to stop this demo</span>';
        div.appendChild(instruction);
        instruction.onclick = () => { shutdown = true; };

        video.srcObject = stream;
        await video.play();

        captureCanvas = document.createElement('canvas');
        captureCanvas.width = IMG_SHAPE[0]; //video.videoWidth;
        captureCanvas.height = IMG_SHAPE[1]; //video.videoHeight;
        window.requestAnimationFrame(onAnimationFrame);

        return stream;
    }
    async function takePhoto(label, imgData) {
        if (shutdown) {
            removeDom();
            shutdown = false;
            return '';
        }

        var preCreate = Date.now();
        stream = await createDom();

        var preShow = Date.now();
        if (label != "") {
            labelElement.innerHTML = label;
        }

        if (imgData != "") {
            var videoRect = video.getClientRects()[0];
            imgElement.style.top = videoRect.top + "px";
            imgElement.style.left = videoRect.left + "px";
            imgElement.style.width = videoRect.width + "px";
            imgElement.style.height = videoRect.height + "px";
            imgElement.src = imgData;
        }

        var preCapture = Date.now();
        var result = await new Promise((resolve, reject) => pendingResolve = resolve);
        if (shutdown || result === "") {
            // Stop was clicked while waiting for a frame: the animation loop
            // has already ended and no image was captured. Tear down now and
            // signal the end of the stream instead of returning an empty image.
            removeDom();
            shutdown = false;
            return '';
        }

        return {
            'create': preShow - preCreate,
            'show': preCapture - preShow,
            'capture': Date.now() - preCapture,
            'img': result,
        };
    }
    ''')
    display(js)

def take_photo(label, img_data):
    """Ask the browser for the next webcam frame via the JS ``takePhoto``.

    Args:
        label: Status text shown next to "Status:" above the video.
        img_data: Data URL of the overlay image to draw on top of the video,
            or an empty string to leave the overlay unchanged.

    Returns:
        Whatever ``takePhoto`` resolves to: an empty string once the stream
        has been stopped, otherwise a dict with the timing entries
        ``'create'``, ``'show'``, ``'capture'`` and the captured frame
        under ``'img'``.
    """
    import json  # local import keeps this cell self-contained

    # json.dumps produces a quoted, escaped string literal that is also valid
    # JavaScript, so quotes, backslashes or newlines in the arguments can no
    # longer break (or inject code into) the evaluated expression.
    js_call = f'takePhoto({json.dumps(label)}, {json.dumps(img_data)})'
    return eval_js(js_call)

### 0.3. Define Python functions to decode captured frames and draw detection annotations

In [None]:
def js_response_to_image(js_response) -> Image.Image:
    """Decode the frame carried by a ``takePhoto`` reply into a PIL image.

    ``js_response['img']`` is expected to be a data URL of the form
    ``<header>,<base64 payload>``; the part after the comma is base64-decoded
    and opened with PIL. A value that does not contain exactly one comma
    raises ``ValueError`` from the unpacking.
    """
    _header, payload = js_response['img'].split(',')
    raw_bytes = io.BytesIO(b64decode(payload))
    return Image.open(raw_bytes)

def turn_non_black_pixels_visible(rgba_compatible_array: np.ndarray) -> np.ndarray:
    """Set the alpha channel to 255 wherever a pixel has any non-zero channel.

    The array is modified in place and also returned. Channel index 3 is
    treated as alpha; pixels whose maximum over all channels is 0 get
    alpha 0, every other pixel gets alpha 255.
    """
    has_content = rgba_compatible_array.max(axis=2) > 0
    rgba_compatible_array[:, :, 3] = np.where(has_content, 255, 0)
    return rgba_compatible_array

def black_transparent_rgba_canvas(w, h) -> np.ndarray:
    """Return an all-zero uint8 array of shape ``(w, h, 4)``.

    Note that ``w`` becomes the first axis and ``h`` the second; the caller
    in this notebook unpacks ``orig_shape`` of a detection result into these
    two arguments.
    """
    canvas_shape = (w, h, 4)
    return np.zeros(canvas_shape, dtype=np.uint8)

def draw_annotations_on_transparent_bg(detection_result: Results) -> Image.Image:
    """Render the annotations of one detection result on a transparent image.

    A zeroed RGBA canvas sized from ``detection_result.orig_shape`` is handed
    to ``detection_result.plot`` (masks disabled). Every pixel that ends up
    non-zero is then made opaque, and the array is wrapped in an RGBA PIL
    image so it can be overlaid on the live video.
    """
    canvas = black_transparent_rgba_canvas(*detection_result.orig_shape)
    plotted = detection_result.plot(font='verdana', masks=False, img=canvas)
    return Image.fromarray(turn_non_black_pixels_visible(plotted), 'RGBA')

## 1. Perform real-time object detection in webcam stream

In [None]:
start_stream()
# Overlay (PNG data URL) drawn on top of the live video; empty until the
# first detection result is available.
img_data = ''
while True:
    js_response = take_photo('Capturing...', img_data)
    # An empty reply, or a reply that carries no frame, means the user
    # stopped the capture.
    if not js_response or not js_response.get('img'):
        break

    captured_img = js_response_to_image(js_response)

    # Run the YOLOv8 model on the frame. The PIL image is passed as-is:
    # Ultralytics treats PIL input as RGB but numpy input as BGR, so
    # converting the RGB frame with np.array() would swap red and blue.
    for detection_result in PRE_TRAINED_MODEL(
            source=captured_img,
            imgsz=640,  # inference image size
            conf=0.4,   # confidence threshold
            iou=0.4,    # IoU threshold
            verbose=False):

        # Draw the annotations for the detections
        annotations_img = draw_annotations_on_transparent_bg(detection_result)

        # Encode the annotated image as a base64 PNG data URL for display
        with io.BytesIO() as buffer:
            annotations_img.save(buffer, format='png')
            img_as_base64_str = str(b64encode(buffer.getvalue()), 'utf-8')
            img_data = f'data:image/png;base64,{img_as_base64_str}'



# Detection on a single image

In [None]:
# Import the required libraries
from PIL import Image, ImageDraw, ImageFont
import io
import numpy as np
from ultralytics import YOLO
import matplotlib.pyplot as plt

# Path to the trained model weights (located on the mounted Google Drive)
model_path = "/content/gdrive/MyDrive/Sistema_de_Visão/yolov8/Cashew_nut/Train_CashewNut/runs/detect/train/weights/best.pt"
PRE_TRAINED_MODEL = YOLO(model_path)

def load_image(image_path):
    """Open the image at ``image_path`` and return it converted to RGB mode."""
    return Image.open(image_path).convert("RGB")

def detect_and_display(image_path, model):
    """Run ``model`` on one image file and show the detections with Matplotlib.

    Each detected box is outlined (green for class index 1, red for any other
    class) and labelled with ``"<class index>: <confidence>"``.

    Args:
        image_path: Path of the image file to analyse.
        model: Callable detection model (here an Ultralytics ``YOLO``
            instance) that returns an iterable of results exposing ``boxes``.
    """
    # Load the image
    img = load_image(image_path)

    # Run detection
    results = model(img)

    # Draw the detections on a copy so the loaded image stays untouched
    img_with_detections = img.copy()
    draw = ImageDraw.Draw(img_with_detections)

    # Use Arial when available, otherwise fall back to PIL's built-in font
    try:
        font = ImageFont.truetype("arial.ttf", 16)
    except OSError:
        font = ImageFont.load_default()

    for result in results:
        for box in result.boxes:
            # Bounding-box corners. tolist() also works when the tensor lives
            # on the GPU, where a direct .numpy() call raises.
            x1, y1, x2, y2 = box.xyxy[0].tolist()
            label = int(box.cls.item())  # class index
            conf = float(box.conf.item())  # confidence

            # Pick the colour from the class
            color = "green" if label == 1 else "red"
            draw.rectangle([x1, y1, x2, y2], outline=color, width=3)

            # Label text and its filled background
            text = f"{label}: {conf:.2f}"
            text_bbox = draw.textbbox((x1, y1), text, font=font)
            text_width = text_bbox[2] - text_bbox[0]
            text_height = text_bbox[3] - text_bbox[1]
            # Place the label above the box, but keep it inside the image
            # when the box touches the top edge.
            text_top = max(y1 - text_height, 0)
            text_background = (x1, text_top, x1 + text_width, text_top + text_height)
            draw.rectangle(text_background, fill=color)
            draw.text((x1, text_top), text, fill="white", font=font)

    # Show the annotated image with Matplotlib
    plt.figure(figsize=(10, 10))
    plt.imshow(img_with_detections)
    plt.axis("off")  # hide the axes for a cleaner view
    plt.show()

# Set the image path and run the detection
image_path = "/content/gdrive/MyDrive/Sistema_de_Visão/yolov8/Cashew_nut/Train_CashewNut/data/images/val/WIN_20241001_11_41_15_Pro.jpg"  # Path of the uploaded image
detect_and_display(image_path, PRE_TRAINED_MODEL)


# Extra real-time attempt (button-triggered capture)

In [None]:
from google.colab import output
from base64 import b64decode
import IPython
from PIL import Image
import io

def take_photo():
    """Show a webcam preview with a button that captures one photo.

    The injected JavaScript opens the camera, and when the button is clicked
    it grabs the current frame, stops the camera, removes the preview and
    sends the frame (JPEG data URL) to the kernel callback
    ``notebook.process_image``.

    Note: this definition replaces the two-argument ``take_photo`` defined
    earlier in the notebook.
    """
    js = """
    async function takePhoto() {
      const div = document.createElement('div');
      const video = document.createElement('video');
      const canvas = document.createElement('canvas');
      const button = document.createElement('button');
      button.textContent = 'Tirar Foto';
      document.body.appendChild(div);
      div.appendChild(video);
      div.appendChild(button);
      const stream = await navigator.mediaDevices.getUserMedia({video: true});
      video.srcObject = stream;
      await video.play();

      // Captura a imagem ao clicar no botão
      button.onclick = () => {
        canvas.width = video.videoWidth;
        canvas.height = video.videoHeight;
        canvas.getContext('2d').drawImage(video, 0, 0);
        stream.getTracks().forEach(track => track.stop());
        div.remove();
        const dataUrl = canvas.toDataURL('image/jpeg', 0.8);
        google.colab.kernel.invokeFunction('notebook.process_image', [dataUrl], {});
      };
    }
    takePhoto();
    """
    # The JavaScript above invokes 'notebook.process_image', which only works
    # once a Python callback is registered under that name. The lambda defers
    # the lookup of process_image until the button is clicked, because that
    # function is defined in a later cell.
    output.register_callback(
        'notebook.process_image',
        lambda data_url: process_image(data_url),
    )
    display(IPython.display.Javascript(js))

# Call the function to capture the image
take_photo()


In [None]:
import numpy as np

def process_image(js_data):
    """Run the detector on a captured photo and display the annotations.

    Args:
        js_data: Data URL of the captured image (``<header>,<base64>``), as
            sent by the JavaScript of ``take_photo``.
    """
    # Local import so this cell does not depend on an earlier cell's imports
    from IPython.display import display

    # Decode the image
    img_bytes = b64decode(js_data.split(',')[1])
    img = Image.open(io.BytesIO(img_bytes)).convert('RGB')

    # Shrink the image to a cheaper size (note: this ignores the aspect ratio)
    img = img.resize((640, 640))

    # Run YOLO on the image. The PIL image is passed as-is: Ultralytics treats
    # PIL input as RGB but numpy input as BGR, so np.array(img) would swap
    # red and blue.
    # TODO: add confidence / IoU thresholds here
    results = PRE_TRAINED_MODEL(source=img)
    annotated_img = draw_annotations_on_transparent_bg(results[0])

    # Show the result inline; Image.show() would try to launch an external
    # viewer, which renders nothing in the notebook.
    display(annotated_img)

