In [None]:
# Notes

# This version includes:

# capture interval, 
# confidence threshold, 
# capturing only different gestures (i.e., the current captured gesture must be different from the previous one).

In [19]:
# This code started as a direct copy from opencv
# https://docs.opencv.org/4.x/dd/d43/tutorial_py_video_display.html

import numpy as np
import cv2 as cv
import time

# Info about YOLO import and loading the Yolo Model
# https://docs.ultralytics.com/tasks/classify/#train

# Added this to import YOLO
from ultralytics import YOLO

# Load the YOLOv8 model, this is loading our custom-trained weights for our model.
model = YOLO("best_v2.pt")
 
cap = cv.VideoCapture(0)

captured_text = []
captured_confidence = []

last_captured = None

# Set up Confidence Threshold
confidence_threshold = 0.9  # Set a confidence threshold

# Set up Capture interval
capture_interval = 1  # Time in seconds between capturing gestures

# Initialize time for the capture interval
last_capture_time = time.time()  # Time of the last capture

if not cap.isOpened():
    print("Cannot open camera")
    exit()

while True:
    # Capture frame-by-frame
    ret, frame = cap.read()
 
    # if frame is read correctly ret is True
    if not ret:
        print("Can't receive frame (stream end?). Exiting ...")
        break

   
    # Set the size of the video
   # ret = cap.set(cv.CAP_PROP_FRAME_WIDTH, 240)
   # ret = cap.set(cv.CAP_PROP_FRAME_HEIGHT, 240)
    
    # Convert the frame to grayscale
    gray = cv.cvtColor(frame, cv.COLOR_BGR2GRAY)
    
    # Display the grayscale frame
    cv.imshow('frame', gray)
    
    # Run YOLOv8 inference on the frame
    results = model(frame)
    top_class = results[0].probs.top1  # Get the most confident prediction
    top_confidence = results[0].probs.top1conf  # Get confidence of the top-class prediction

    

    # Check the time interval and confidence threshold before capturing
    current_time = time.time()
    if current_time - last_capture_time >= capture_interval:
        # Only capture if confidence is greater than threshold and gesture is new
        if top_confidence >= confidence_threshold and top_class != last_captured:
            captured_text.append(top_class)
            captured_confidence.append(top_confidence)
            
            last_captured = top_class  # Update the last captured gesture
            last_capture_time = current_time  # Reset the capture time



    # Visualize the results on the frame
    annotated_frame = results[0].plot()

    # Display the annotated frame
    cv.imshow("YOLOv8 Inference", annotated_frame)

    # Exit when 'q' key is pressed
    if cv.waitKey(1) == ord('q'):
        print("Captured top class: ", captured_text)

        # ASCII reference to convert integer classes to text (A-Z, del, nothing, space)
        translated_text = []
        for i in captured_text:
            if i < 26:
                translated_text.append(chr(i + 65))  # A-Z
            elif i == 26:
                translated_text.append("del")
            elif i == 27:
                translated_text.append("nothing")
            elif i == 28:
                translated_text.append("space")
        
        print("Translated text: ", translated_text)
        print("Confidence: ", captured_confidence)
        break
        
# When everything is done, release the capture
cap.release()
cv.destroyAllWindows()



0: 224x224 E 0.30, O 0.14, C 0.11, S 0.09, R 0.09, 23.4ms
Speed: 8.0ms preprocess, 23.4ms inference, 0.0ms postprocess per image at shape (1, 3, 224, 224)

0: 224x224 R 0.17, Z 0.15, X 0.12, S 0.09, O 0.07, 14.4ms
Speed: 8.1ms preprocess, 14.4ms inference, 0.0ms postprocess per image at shape (1, 3, 224, 224)

0: 224x224 X 0.21, R 0.16, Z 0.11, S 0.09, N 0.06, 20.0ms
Speed: 0.0ms preprocess, 20.0ms inference, 0.0ms postprocess per image at shape (1, 3, 224, 224)

0: 224x224 R 0.26, X 0.11, Z 0.10, S 0.09, N 0.06, 17.7ms
Speed: 8.0ms preprocess, 17.7ms inference, 0.0ms postprocess per image at shape (1, 3, 224, 224)

0: 224x224 O 0.13, X 0.11, N 0.10, R 0.10, Q 0.09, 20.2ms
Speed: 6.0ms preprocess, 20.2ms inference, 0.0ms postprocess per image at shape (1, 3, 224, 224)

0: 224x224 O 0.21, R 0.15, D 0.07, Q 0.07, X 0.06, 24.3ms
Speed: 0.0ms preprocess, 24.3ms inference, 2.0ms postprocess per image at shape (1, 3, 224, 224)

0: 224x224 R 0.36, O 0.09, D 0.07, Q 0.05, A 0.05, 14.2ms
Speed