In [1]:
# This code started as a direct copy from opencv
# https://docs.opencv.org/4.x/dd/d43/tutorial_py_video_display.html

import numpy as np
import cv2 as cv

# Info about YOLO import and loading the Yolo Model
# https://docs.ultralytics.com/tasks/classify/#train

# Added this to import YOLO
from ultralytics import YOLO

# Load the YOLOv8 model, this is loading our custom trained weights for our model.
model = YOLO("best_v2.pt")
 
cap = cv.VideoCapture(0)

captured_text = []
captured_confidence = []
confidence_requirement = 0.75
count = 1
last = None

if not cap.isOpened():
    print("Cannot open camera")
    exit()
while True:
    # Capture frame-by-frame
    ret, frame = cap.read()
 
    # if frame is read correctly ret is True
    if not ret:
        print("Can't receive frame (stream end?). Exiting ...")
        break
    
    # These two lines are found here https://docs.opencv.org/4.x/dd/d43/tutorial_py_video_display.html
    # They reduce the size of the video
    
    ret = cap.set(cv.CAP_PROP_FRAME_WIDTH,240)
    ret = cap.set(cv.CAP_PROP_FRAME_HEIGHT,240)
    
    # Our operations on the frame come here
    gray = cv.cvtColor(frame, cv.COLOR_BGR2GRAY)
    # Display the resulting frame
    cv.imshow('frame', gray)
    
    # If you scroll to the very bottom of this link
    # https://docs.ultralytics.com/modes/predict/#thread-safe-inference
    # You will find the next 3 lines of code which I took from their and applied to this similar example
    
    # Run YOLOv8 inference on the frame
    results = model(frame)
    top_class = results[0].probs.top1
    top_confidence = results[0].probs.top1conf  # Get confidence of the top-class prediction
    
    if (top_confidence >= confidence_requirement):
        
        if (top_class == last):
            
            count=count+1
            
            if ((count>=4) and (appended==False)):
            
                captured_text.append(top_class)
                captured_confidence.append(top_confidence)
                appended=True
                
        else:
            count=1
            appended=False
                
        last = top_class
    
    # Visualize the results on the frame
    annotated_frame = results[0].plot()
    
    
    # Display the annotated frame
    cv.imshow("YOLOv8 Inference", annotated_frame)
    
    
    
    if cv.waitKey(1) == ord('q'):
        print("Captured top class: ", captured_text)
        
        # ASCII ref
        #https://simple.m.wikipedia.org/wiki/File:ASCII-Table-wide.svg
        
        # https://stackoverflow.com/questions/3673428/convert-int-to-ascii-and-back-in-python
        # Integer to ASCII
        
        translated_text = []
        for i in captured_text:
            
            if i<26:
                translated_text.append(chr(i+65))
            elif i==26:
                translated_text.append("del")
            elif i==27:
                translated_text.append("nothing")
            elif i==28:
                translated_text.append("space")
        
        for i in range(len(translated_text)):
            
            translated_text[i] = (translated_text[i], captured_confidence[i].item())
        
        print("Translated text: ", translated_text)
        break
        
        
 
# When everything done, release the capture
cap.release()
cv.destroyAllWindows()


0: 224x224 G 0.17, M 0.12, Y 0.11, N 0.09, E 0.08, 22.1ms
Speed: 25.7ms preprocess, 22.1ms inference, 0.0ms postprocess per image at shape (1, 3, 224, 224)

0: 224x224 G 0.10, U 0.09, S 0.08, X 0.07, V 0.06, 7.3ms
Speed: 4.0ms preprocess, 7.3ms inference, 0.0ms postprocess per image at shape (1, 3, 224, 224)

0: 224x224 G 0.10, U 0.10, S 0.08, V 0.07, W 0.06, 6.4ms
Speed: 4.6ms preprocess, 6.4ms inference, 0.0ms postprocess per image at shape (1, 3, 224, 224)

0: 224x224 G 0.10, U 0.09, S 0.08, W 0.07, V 0.07, 6.9ms
Speed: 3.9ms preprocess, 6.9ms inference, 0.0ms postprocess per image at shape (1, 3, 224, 224)

0: 224x224 G 0.12, U 0.08, W 0.08, V 0.06, S 0.06, 7.2ms
Speed: 4.6ms preprocess, 7.2ms inference, 0.0ms postprocess per image at shape (1, 3, 224, 224)

0: 224x224 G 0.10, U 0.09, S 0.07, W 0.07, V 0.07, 11.0ms
Speed: 6.2ms preprocess, 11.0ms inference, 0.0ms postprocess per image at shape (1, 3, 224, 224)

0: 224x224 G 0.13, U 0.08, W 0.07, Y 0.06, V 0.06, 17.8ms
Speed: 8.5ms