In [None]:
import cv2
import numpy as np
import tensorflow as tf
from ultralytics import YOLO
import pickle

# Real-time webcam demo: YOLOv8 proposes bounding boxes, each box is cropped
# and passed to a Keras classifier, and confident predictions are drawn on the
# frame. Press ESC in the preview window to quit.

# Minimum classifier confidence required before a prediction is drawn.
CONFIDENCE_THRESHOLD = 0.6
# Spatial size each crop is resized to before being fed to the classifier.
CLASSIFIER_INPUT_SIZE = (224, 224)
# Key code returned by cv2.waitKey for the ESC key.
ESC_KEY = 27
WINDOW_TITLE = "Sign Language Detection"
BOX_COLOR = (0, 255, 0)

yolo = YOLO("yolov8n.pt")
COCO_CLASSES = yolo.names

mobilenet_model = tf.keras.models.load_model(r"C:\Users\moham\Downloads\Robotech\Summer_Training\Project\Note_Book\Small_Data_model\YOLO\Small_Model_Final2.h5")

# SECURITY NOTE: pickle.load can execute arbitrary code embedded in the file.
# Only load label files you created yourself or otherwise trust.
with open(r"C:\Users\moham\Downloads\Robotech\Summer_Training\Project\Note_Book\Small_Data_model\YOLO\label_dict.pkl", "rb") as f:
    # label_dict is indexed below with the classifier's argmax class index.
    label_dict = pickle.load(f)

cap = cv2.VideoCapture(0)

try:
    if not cap.isOpened():
        # The loop below exits on the first failed read; say why.
        print("Could not open webcam (device 0).")

    while True:
        ret, frame = cap.read()
        if not ret:
            break

        frame_h, frame_w = frame.shape[:2]

        # verbose=False stops YOLO from printing a log line for every frame.
        # NOTE(review): every detected object (not only hands) is classified;
        # yolov8n.pt is the stock checkpoint — confirm this is intended.
        for r in yolo(frame, stream=True, verbose=False):
            for box in r.boxes:
                x1, y1, x2, y2 = (int(v) for v in box.xyxy[0].cpu().numpy())

                # Clamp to the frame: a negative coordinate would otherwise be
                # interpreted by the slice below as "count from the far edge"
                # and produce an empty or wrong crop.
                x1 = min(max(x1, 0), frame_w)
                x2 = min(max(x2, 0), frame_w)
                y1 = min(max(y1, 0), frame_h)
                y2 = min(max(y2, 0), frame_h)

                hand_img = frame[y1:y2, x1:x2]
                if hand_img.size == 0:
                    continue

                # NOTE(review): OpenCV frames are BGR; if the classifier was
                # trained on RGB images a cv2.cvtColor(..., cv2.COLOR_BGR2RGB)
                # is needed here — confirm against the training pipeline.
                hand_resized = cv2.resize(hand_img, CLASSIFIER_INPUT_SIZE)
                hand_resized = hand_resized.astype("float32") / 255.0
                # Add the leading batch axis expected by predict().
                hand_resized = np.expand_dims(hand_resized, axis=0)

                # verbose=0 suppresses the per-call Keras progress bar.
                preds = mobilenet_model.predict(hand_resized, verbose=0)
                predicted_idx = int(np.argmax(preds))
                confidence = float(np.max(preds))

                if confidence <= CONFIDENCE_THRESHOLD:
                    continue

                letter = label_dict[predicted_idx]
                caption = f"{letter}: {confidence:.2f}"

                # Keep the caption on-screen when the box touches the top edge.
                text_y = max(y1 - 10, 20)
                cv2.putText(frame, caption, (x1, text_y),
                            cv2.FONT_HERSHEY_SIMPLEX, 0.8, BOX_COLOR, 2)
                cv2.rectangle(frame, (x1, y1), (x2, y2), BOX_COLOR, 2)

        cv2.imshow(WINDOW_TITLE, frame)

        if cv2.waitKey(1) & 0xFF == ESC_KEY:
            break
finally:
    # Always free the camera and close the window, even if inference raises
    # or the notebook cell is interrupted.
    cap.release()
    cv2.destroyAllWindows()





0: 480x640 1 person, 12.2ms
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1s/step
Speed: 2.5ms preprocess, 12.2ms inference, 4.8ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 person, 7.0ms
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 106ms/step
Speed: 2.3ms preprocess, 7.0ms inference, 2.2ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 person, 16.2ms
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 84ms/step
Speed: 2.1ms preprocess, 16.2ms inference, 1.5ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 person, 15.2ms
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 63ms/step
Speed: 1.9ms preprocess, 15.2ms inference, 5.1ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 person, 8.9ms
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 53ms/step
Speed: 3.2ms preprocess, 8.9ms inference, 1.8ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 p