# In [1]:
import cv2
import mediapipe as mp
import numpy as np
import torch
import os
import pyttsx3
from torch import nn
from torchvision import transforms

class ConvNet(nn.Module):
    """Small three-stage CNN classifier.

    Each stage is Conv -> ReLU -> BatchNorm -> Conv -> ReLU [-> MaxPool] -> Dropout.
    Only the first two stages down-sample (2x2 max-pool each); the third keeps
    the spatial size. ``fc1`` is sized for a 64-channel 12x12 feature map,
    which is what a 50x50 input produces after the two poolings
    (50 -> 25 -> 12).

    The sub-module names and the position of every layer inside each
    ``nn.Sequential`` determine the ``state_dict`` keys, so they must not be
    reordered if existing checkpoints are to keep loading.

    Args:
        num_classes: Number of output logits. Defaults to 31.
        in_channels: Number of channels of the input image. Defaults to 3.
    """

    def __init__(self, num_classes=31, in_channels=3):
        super().__init__()
        self.layer1 = nn.Sequential(
            nn.Conv2d(in_channels, 16, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.BatchNorm2d(16),
            nn.Conv2d(16, 16, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2),
            nn.Dropout(0.25),
        )
        self.layer2 = nn.Sequential(
            nn.Conv2d(16, 32, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.BatchNorm2d(32),
            nn.Conv2d(32, 32, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2),
            nn.Dropout(0.25),
        )
        # No pooling in the last stage: spatial size stays unchanged.
        self.layer3 = nn.Sequential(
            nn.Conv2d(32, 64, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.BatchNorm2d(64),
            nn.Conv2d(64, 64, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.Dropout(0.25),
        )
        self.flatten = nn.Flatten()
        self.fc1 = nn.Linear(64 * 12 * 12, 128)
        self.fc2 = nn.Linear(128, num_classes)

    def forward(self, x):
        """Return raw (un-normalised) class logits for a batch of images.

        Args:
            x: Tensor of shape (batch, in_channels, H, W) whose H and W reduce
                to 12x12 after the two poolings (e.g. 50x50).

        Returns:
            Tensor of shape (batch, num_classes).
        """
        x = self.layer1(x)
        x = self.layer2(x)
        x = self.layer3(x)
        x = self.flatten(x)
        # NOTE(review): there is no activation between fc1 and fc2; kept as-is
        # so outputs stay identical for already-trained weights.
        x = self.fc1(x)
        x = self.fc2(x)
        return x

# Instantiate the network, then restore its trained weights from disk.
# map_location forces every tensor onto the CPU regardless of where the
# checkpoint was saved.
model = ConvNet()
state_dict = torch.load(
    r'model_best_arabic_15_last.ckpt',
    map_location=torch.device('cpu'),
)
model.load_state_dict(state_dict)

# Inference only: switch dropout / batch-norm to evaluation behaviour.
model.eval()

# Class names; the list position is used as the class index when decoding
# a prediction.
arabic_letters = [
    'Ain', 'Al', 'Alef', 'Beh', 'Dad', 'Dal', 'Feh', 'Ghain',
    'Hah', 'Heh', 'Jeem', 'Kaf', 'Khah', 'Laa', 'Lam', 'Meem',
    'Noon', 'Qaf', 'Reh', 'Sad', 'Seen', 'Sheen', 'Tah', 'Teh',
    'Teh_Marbuta', 'Thal', 'Theh', 'Waw', 'Yeh', 'Zah', 'Zain',
]
labels = arabic_letters

# Shorthand for the MediaPipe hand-tracking solution module.
mp_hands = mp.solutions.hands

# Video source: capture device with index 0.
cap = cv2.VideoCapture(0)

# Preprocessing applied to each cropped hand image before classification:
# ndarray -> PIL image -> 50x50 resize -> float tensor -> per-channel normalise.
_preprocessing_steps = [
    transforms.ToPILImage(),
    transforms.Resize((50, 50)),
    transforms.ToTensor(),
    transforms.Normalize(
        mean=[0.485, 0.456, 0.406],
        std=[0.229, 0.224, 0.225],
    ),
]
transform = transforms.Compose(_preprocessing_steps)

def hand_roi_bounds(x_coords, y_coords, image_width, image_height, box_size=(250, 250)):
    """Return a fixed-size pixel box centred on a hand, clamped to the image.

    Args:
        x_coords: Normalised landmark x coordinates (fractions of image width).
        y_coords: Normalised landmark y coordinates (fractions of image height).
        image_width: Frame width in pixels.
        image_height: Frame height in pixels.
        box_size: (width, height) of the desired box in pixels.

    Returns:
        ``(min_x, min_y, max_x, max_y)`` as ints, every value inside
        ``[0, image_width]`` / ``[0, image_height]``. When the hand centre lies
        far enough outside the frame the box collapses to zero width or
        height; callers should skip such boxes.
    """
    center_x = int(np.mean(x_coords) * image_width)
    center_y = int(np.mean(y_coords) * image_height)
    half_w = box_size[0] // 2
    half_h = box_size[1] // 2

    # Clamp BOTH edges on BOTH sides: a negative upper bound would otherwise
    # be interpreted by NumPy slicing as "count from the end".
    min_x = min(max(0, center_x - half_w), image_width)
    min_y = min(max(0, center_y - half_h), image_height)
    max_x = max(min(image_width, center_x + half_w), 0)
    max_y = max(min(image_height, center_y + half_h), 0)
    return min_x, min_y, max_x, max_y


def main():
    """Run the webcam loop: detect hands, classify each crop, speak and draw.

    Uses the module-level ``cap``, ``transform``, ``mp_hands``, ``model`` and
    ``labels``. Press ``q`` in the preview window to quit. The capture device
    and the preview window are released even if an error occurs.

    Raises:
        RuntimeError: If the capture device could not be opened.
    """
    max_failed_reads = 30  # consecutive failed grabs tolerated before giving up
    font = cv2.FONT_HERSHEY_TRIPLEX
    font_scale = 1
    text_color = (0, 255, 255)
    box_color = (0, 0, 255)
    thickness = 2

    failed_reads = 0
    last_spoken = None  # last letter sent to the speech engine

    try:
        if not cap.isOpened():
            raise RuntimeError('Could not open the video capture device.')

        # Create the speech engine once instead of once per hand per frame.
        engine = pyttsx3.init()

        with mp_hands.Hands() as hands:
            while True:
                ret, frame = cap.read()
                if not ret:
                    # Tolerate transient drops, but do not spin forever on a
                    # dead camera (the quit key is never polled on this path).
                    failed_reads += 1
                    if failed_reads >= max_failed_reads:
                        print('Camera stopped delivering frames; exiting.')
                        break
                    continue
                failed_reads = 0

                frame = cv2.flip(frame, 1)
                image_height, image_width = frame.shape[:2]
                # Kept free of overlays; all model crops are taken from here.
                frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)

                # Detect hands
                results = hands.process(frame_rgb)

                if not results.multi_hand_landmarks:
                    # Hand left the view: allow the same letter to be
                    # announced again when it reappears.
                    last_spoken = None

                for hand_landmarks in results.multi_hand_landmarks or ():
                    x_coords = [lm.x for lm in hand_landmarks.landmark]
                    y_coords = [lm.y for lm in hand_landmarks.landmark]
                    min_x, min_y, max_x, max_y = hand_roi_bounds(
                        x_coords, y_coords, image_width, image_height
                    )
                    if max_x <= min_x or max_y <= min_y:
                        continue  # box fell outside the frame; nothing to classify

                    # Crop from the clean RGB frame so the rectangle/text drawn
                    # for this or a previous hand never reaches the classifier.
                    roi = np.ascontiguousarray(frame_rgb[min_y:max_y, min_x:max_x])
                    img = transform(roi).unsqueeze(0)

                    with torch.no_grad():
                        outputs = model(img)
                        _, predicted = torch.max(outputs, 1)
                        confidence = torch.nn.functional.softmax(outputs, dim=1).max().item() * 100

                    predicted_char = labels[predicted.item()]

                    # runAndWait() blocks the video loop, so only speak when
                    # the recognised letter actually changes.
                    if predicted_char != last_spoken:
                        engine.say(predicted_char)
                        engine.runAndWait()
                        last_spoken = predicted_char

                    # Draw a red rectangle around the ROI
                    cv2.rectangle(frame, (min_x, min_y), (max_x, max_y), box_color, thickness)
                    msg = f'{predicted_char}, Conf: {confidence:.1f}%'
                    # Keep the caption on-screen when the box touches the top edge.
                    text_y = max(min_y - 20, 30)
                    cv2.putText(frame, msg, (min_x, text_y), font, font_scale, text_color, thickness)

                cv2.imshow('frame', frame)

                if cv2.waitKey(10) & 0xFF == ord('q'):
                    break
    finally:
        cap.release()
        cv2.destroyAllWindows()


# True both when executed as a script and when run inside a notebook cell.
if __name__ == '__main__':
    main()


