In [None]:
# Notes

# This version includes:

# capture interval, 
# confidence threshold, 
# capturing only different gestures (i.e., the current captured gesture must be different from the previous one).

In [3]:
# This code started as a direct copy from opencv
# https://docs.opencv.org/4.x/dd/d43/tutorial_py_video_display.html

import numpy as np
import cv2 as cv
import time

# Info about YOLO import and loading the Yolo Model
# https://docs.ultralytics.com/tasks/classify/#train

# Added this to import YOLO
from ultralytics import YOLO

# Load the YOLOv8 model, this is loading our custom-trained weights for our model.
model = YOLO("best_v2.pt")
 
cap = cv.VideoCapture(0)

captured_text = []
last_captured = None

# Set up Confidence Threshold
confidence_threshold = 0.5  # Set a confidence threshold

# Set up Capture interval
capture_interval = 1  # Time in seconds between capturing gestures

# Initialize time for the capture interval
last_capture_time = time.time()  # Time of the last capture

if not cap.isOpened():
    print("Cannot open camera")
    exit()

while True:
    # Capture frame-by-frame
    ret, frame = cap.read()
 
    # if frame is read correctly ret is True
    if not ret:
        print("Can't receive frame (stream end?). Exiting ...")
        break

   
    # Set the size of the video
    ret = cap.set(cv.CAP_PROP_FRAME_WIDTH, 240)
    ret = cap.set(cv.CAP_PROP_FRAME_HEIGHT, 240)
    
    # Convert the frame to grayscale
    gray = cv.cvtColor(frame, cv.COLOR_BGR2GRAY)
    
    # Display the grayscale frame
    cv.imshow('frame', gray)
    
    # Run YOLOv8 inference on the frame
    results = model(frame)
    top_class = results[0].probs.top1  # Get the most confident prediction
    top_confidence = results[0].probs.top1conf  # Get confidence of the top-class prediction

    # Check the time interval and confidence threshold before capturing
    current_time = time.time()
    if current_time - last_capture_time >= capture_interval:
        # Only capture if confidence is greater than threshold and gesture is new
        if top_confidence >= confidence_threshold and top_class != last_captured:
            captured_text.append(top_class)
            last_captured = top_class  # Update the last captured gesture
            last_capture_time = current_time  # Reset the capture time



    # Visualize the results on the frame
    annotated_frame = results[0].plot()

    # Display the annotated frame
    cv.imshow("YOLOv8 Inference", annotated_frame)

    # Exit when 'q' key is pressed
    if cv.waitKey(1) == ord('q'):
        print("Captured top class: ", captured_text)

        # ASCII reference to convert integer classes to text (A-Z, del, nothing, space)
        translated_text = []
        for i in captured_text:
            if i < 26:
                translated_text.append(chr(i + 65))  # A-Z
            elif i == 26:
                translated_text.append("del")
            elif i == 27:
                translated_text.append("nothing")
            elif i == 28:
                translated_text.append("space")
        
        print("Translated text: ", translated_text)
        break
        
# When everything is done, release the capture
cap.release()
cv.destroyAllWindows()



0: 224x224 A 0.22, O 0.12, B 0.08, N 0.07, U 0.06, 40.3ms
Speed: 9.6ms preprocess, 40.3ms inference, 0.0ms postprocess per image at shape (1, 3, 224, 224)

0: 224x224 O 0.23, D 0.19, F 0.14, C 0.11, K 0.07, 38.0ms
Speed: 8.1ms preprocess, 38.0ms inference, 0.0ms postprocess per image at shape (1, 3, 224, 224)

0: 224x224 K 0.18, nothing 0.11, O 0.10, A 0.10, F 0.08, 31.0ms
Speed: 5.8ms preprocess, 31.0ms inference, 0.0ms postprocess per image at shape (1, 3, 224, 224)

0: 224x224 O 0.22, K 0.18, C 0.10, D 0.07, N 0.06, 58.0ms
Speed: 8.5ms preprocess, 58.0ms inference, 0.0ms postprocess per image at shape (1, 3, 224, 224)

0: 224x224 K 0.28, F 0.10, N 0.09, O 0.08, A 0.05, 48.2ms
Speed: 6.1ms preprocess, 48.2ms inference, 0.0ms postprocess per image at shape (1, 3, 224, 224)

0: 224x224 K 0.25, N 0.11, O 0.10, M 0.07, F 0.06, 42.5ms
Speed: 0.0ms preprocess, 42.5ms inference, 0.0ms postprocess per image at shape (1, 3, 224, 224)

0: 224x224 K 0.20, O 0.15, N 0.12, F 0.11, M 0.07, 46.9ms