In [1]:
# This code started as a direct copy from opencv
# https://docs.opencv.org/4.x/dd/d43/tutorial_py_video_display.html

import numpy as np
import cv2 as cv

# Info about YOLO import and loading the Yolo Model
# https://docs.ultralytics.com/tasks/classify/#train

# Added this to import YOLO
from ultralytics import YOLO

# Load the YOLOv8 model, this is loading our custom trained weights for our model.
model = YOLO("best_v2.pt")
 
cap = cv.VideoCapture(0)

# Stored captured text
captured_text = []

# Stores captured confidence
captured_confidence = []

# Set a threshold for the sign to register
confidence_requirement = 0.90

# Counts the number of consecutive significant signs
count = 0

# Counts the number of consecutive insignificant signs
noise_count = 0

# Keeps track of the last sign
last = None

translator = ["A","B","C","D","E","F","G","H","I","J","K","L","M","N","O","P","Q","R","S","T","U","V","W","X","Y","Z","DEL","NOTHING","SPACE"]
live_text = "Text: "


if not cap.isOpened():
    print("Cannot open camera")
    exit()
while True:
    # Capture frame-by-frame
    ret, frame = cap.read()
 
    # if frame is read correctly ret is True
    if not ret:
        print("Can't receive frame (stream end?). Exiting ...")
        break
    
    # These two lines are found here https://docs.opencv.org/4.x/dd/d43/tutorial_py_video_display.html
    # They reduce the size of the video
    
    ret = cap.set(cv.CAP_PROP_FRAME_WIDTH,240)
    ret = cap.set(cv.CAP_PROP_FRAME_HEIGHT,240)
    
    # Our operations on the frame come here
    #gray = cv.cvtColor(frame, cv.COLOR_BGR2GRAY)
    # Display the resulting frame
    #cv.imshow('frame', gray)
    
    # If you scroll to the very bottom of this link
    # https://docs.ultralytics.com/modes/predict/#thread-safe-inference
    # You will find the next 3 lines of code which I took from their and applied to this similar example
    
    # Run YOLOv8 inference on the frame
    results = model(frame)
    top_class = results[0].probs.top1
    top_confidence = results[0].probs.top1conf  # Get confidence of the top-class prediction
    
    # If the confidence of the sign is above the threshold 
    if (top_confidence >= confidence_requirement):
        
        # If the top_class is the last class
        # hence it is consecutive increase count by 1.
        if (top_class==last):
            
            count = count+1
        
        # If the top_class is not the last class
        # it is  a new class restart counter 
        else:
    
            count = 1
            
        # If there are 3 consecutive significant signs track it
        if (count == 3):
            
            
            captured_text.append(translator[top_class])
            captured_confidence.append(top_confidence)
            live_text = live_text + translator[top_class]
        
        # Set last to be the top_class
        last = top_class
        
        # Set the noise counter to 0 since this is not noise
        noise_count = 0 
    
    # If the confidence of the current sign is not enough for the threshold increase noise counter
    else:
        
        noise_count=noise_count+1
    
    # If there are three consecutive insignificant signs
    # Reset count, allowing another consecutive sign to be registerred for instance (A,A)
    # Reset the noise counter
    if noise_count == 3:
        count = 0
        noise_count = 0
    
    # Visualize the results on the frame
    #annotated_frame = results[0].plot()
    
    # Display the annotated frame
    
    # Example from: https://www.geeksforgeeks.org/python-opencv-write-text-on-video/
    cv.putText(frame,  
                live_text,  
                (10, 460),  
                cv.FONT_HERSHEY_SIMPLEX, 1,  
                (0, 255, 255),  
                2,  
                cv.LINE_4) 
    
    cv.imshow("Capture", frame)
    
    if cv.waitKey(1) == ord('q'):
        
        for i in range(len(captured_text)):
            
            print("Translated text: ", captured_text[i] , " Confidence: ", captured_confidence[i].item())
        
        break
        
        
 
# When everything done, release the capture
cap.release()
cv.destroyAllWindows()


0: 224x224 G 0.06, U 0.06, E 0.06, O 0.05, N 0.05, 13.3ms
Speed: 8.9ms preprocess, 13.3ms inference, 0.0ms postprocess per image at shape (1, 3, 224, 224)

0: 224x224 G 0.10, E 0.10, M 0.08, S 0.07, O 0.06, 6.6ms
Speed: 3.0ms preprocess, 6.6ms inference, 0.0ms postprocess per image at shape (1, 3, 224, 224)

0: 224x224 G 0.12, E 0.11, M 0.08, S 0.07, F 0.06, 6.4ms
Speed: 2.6ms preprocess, 6.4ms inference, 0.0ms postprocess per image at shape (1, 3, 224, 224)

0: 224x224 E 0.09, G 0.09, M 0.09, F 0.07, S 0.07, 6.7ms
Speed: 2.9ms preprocess, 6.7ms inference, 0.0ms postprocess per image at shape (1, 3, 224, 224)

0: 224x224 E 0.10, G 0.09, M 0.08, S 0.07, F 0.06, 7.1ms
Speed: 2.7ms preprocess, 7.1ms inference, 0.0ms postprocess per image at shape (1, 3, 224, 224)

0: 224x224 E 0.10, G 0.09, M 0.08, S 0.07, F 0.06, 8.1ms
Speed: 2.7ms preprocess, 8.1ms inference, 0.0ms postprocess per image at shape (1, 3, 224, 224)

0: 224x224 E 0.11, M 0.09, G 0.09, S 0.07, F 0.06, 9.8ms
Speed: 2.9ms pre

2024-09-30 12:01:52.072 Python[52413:6554278] +[IMKClient subclass]: chose IMKClient_Legacy
2024-09-30 12:01:52.072 Python[52413:6554278] +[IMKInputSession subclass]: chose IMKInputSession_Legacy



0: 224x224 E 0.12, M 0.09, G 0.08, A 0.06, S 0.06, 7.6ms
Speed: 3.0ms preprocess, 7.6ms inference, 0.0ms postprocess per image at shape (1, 3, 224, 224)

0: 224x224 E 0.10, M 0.09, G 0.08, S 0.06, F 0.06, 4.9ms
Speed: 2.6ms preprocess, 4.9ms inference, 0.0ms postprocess per image at shape (1, 3, 224, 224)

0: 224x224 E 0.11, G 0.10, M 0.08, S 0.07, A 0.06, 5.2ms
Speed: 2.7ms preprocess, 5.2ms inference, 0.0ms postprocess per image at shape (1, 3, 224, 224)

0: 224x224 E 0.11, G 0.08, M 0.08, S 0.07, A 0.07, 7.2ms
Speed: 2.6ms preprocess, 7.2ms inference, 0.0ms postprocess per image at shape (1, 3, 224, 224)

0: 224x224 E 0.11, G 0.08, S 0.08, M 0.07, A 0.06, 7.3ms
Speed: 2.7ms preprocess, 7.3ms inference, 0.0ms postprocess per image at shape (1, 3, 224, 224)

0: 224x224 E 0.10, G 0.08, M 0.08, S 0.07, O 0.06, 7.1ms
Speed: 2.9ms preprocess, 7.1ms inference, 0.0ms postprocess per image at shape (1, 3, 224, 224)

0: 224x224 I 0.31, Y 0.14, A 0.11, X 0.05, M 0.05, 14.3ms
Speed: 3.4ms prep