In [22]:
import cv2
from ultralytics import YOLO
from ultralytics.utils.plotting import Annotator, colors
import numpy as np

In [23]:
model = YOLO("Hand_alphabet.pt")
names = model.model.names

In [24]:
cap = cv2.VideoCapture(0)
if not cap.isOpened():
    print("Cannot access the webcam.")
    exit()

last_letter = None
letter_buffer = []
no_detection_counter = 0
SAME_LETTER_THRESHOLD = 5  
NO_DETECTION_THRESHOLD = 50  

while True:
    ret, frame = cap.read()
    
    if not ret:
        print("Failed to grab frame")
        break

    results = model(frame)
    boxes = results[0].boxes.xyxy.cpu().numpy()  
    clss = results[0].boxes.cls.cpu().numpy().astype(int) 

    annotator = Annotator(frame, line_width=2)

    current_letter = None
    if len(clss) > 0:

        current_letter = names[clss[0]]
        no_detection_counter = 0

        if current_letter == last_letter:
            SAME_LETTER_THRESHOLD -= 1
            if SAME_LETTER_THRESHOLD <= 0:
                if not letter_buffer or letter_buffer[-1] != current_letter:
                    letter_buffer.append(current_letter)
                SAME_LETTER_THRESHOLD = 10 
        else:
            SAME_LETTER_THRESHOLD = 10 
    else:
        no_detection_counter += 1

    if no_detection_counter >= NO_DETECTION_THRESHOLD and letter_buffer:
        word = ''.join(letter_buffer)
        print(f"Recognized word: {word}")
        letter_buffer = [] 
        no_detection_counter = 0

    last_letter = current_letter

    for box, cls in zip(boxes, clss):
        annotator.box_label(box, label=names[cls], color=colors(cls))

    output_frame = annotator.result()

    current_word = ''.join(letter_buffer)
    cv2.putText(output_frame, f"Current: {current_word}", (10, 30), 
                cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 0), 2)

    cv2.imshow("Sign Language Detection", output_frame)

    if cv2.waitKey(1) & 0xFF == ord('q'):
        break

cap.release()
cv2.destroyAllWindows()



0: 480x640 (no detections), 12.8ms
Speed: 1.0ms preprocess, 12.8ms inference, 1.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 (no detections), 12.5ms
Speed: 2.0ms preprocess, 12.5ms inference, 1.3ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 (no detections), 10.5ms
Speed: 1.0ms preprocess, 10.5ms inference, 2.1ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 (no detections), 11.5ms
Speed: 1.0ms preprocess, 11.5ms inference, 1.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 (no detections), 11.6ms
Speed: 1.1ms preprocess, 11.6ms inference, 1.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 (no detections), 12.3ms
Speed: 1.0ms preprocess, 12.3ms inference, 0.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 (no detections), 11.9ms
Speed: 0.0ms preprocess, 11.9ms inference, 1.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 (no detections), 12.1ms
Speed: 0.0ms preprocess, 12.1ms i