# In [3]:
import cv2
import numpy as np
import torch
import os
import pyttsx3
from torch import nn
from torchvision import transforms

class ConvNet(nn.Module):
    """Three-stage convolutional classifier that outputs 31 class scores.

    Every stage is Conv -> ReLU -> BatchNorm -> Conv -> ReLU followed by
    dropout (p=0.25); the first two stages additionally halve the spatial
    size with a 2x2 max-pool.  The flattened feature map is then passed
    through two linear layers (no activation in between).

    ``fc1`` expects 64 * 12 * 12 features, i.e. a 12x12 map after the two
    pooling steps, which corresponds to 3-channel inputs whose height and
    width are between 48 and 51 pixels.
    """

    def __init__(self):
        super().__init__()

        # The stages are created in this order so that parameter names
        # (layer1.0, layer1.2, ...) and initialisation order stay stable.
        self.layer1 = self._make_stage(3, 16, pool=True)
        self.layer2 = self._make_stage(16, 32, pool=True)
        self.layer3 = self._make_stage(32, 64, pool=False)

        self.flatten = nn.Flatten()
        self.fc1 = nn.Linear(64 * 12 * 12, 128)
        self.fc2 = nn.Linear(128, 31)

    @staticmethod
    def _make_stage(in_channels, out_channels, pool):
        """Build one convolutional stage as an ``nn.Sequential``.

        Args:
            in_channels: Number of channels entering the stage.
            out_channels: Number of channels produced by both convolutions.
            pool: When true, insert a 2x2 max-pool before the dropout.

        Returns:
            The assembled ``nn.Sequential`` stage.
        """
        modules = [
            nn.Conv2d(in_channels, out_channels, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.BatchNorm2d(out_channels),  # Batch normalization
            nn.Conv2d(out_channels, out_channels, kernel_size=3, padding=1),
            nn.ReLU(),
        ]
        if pool:
            modules.append(nn.MaxPool2d(kernel_size=2, stride=2))
        modules.append(nn.Dropout(0.25))  # Dropout
        return nn.Sequential(*modules)

    def forward(self, x):
        """Return the raw (un-normalised) class scores for the batch ``x``."""
        for stage in (self.layer1, self.layer2, self.layer3):
            x = stage(x)
        features = self.flatten(x)
        return self.fc2(self.fc1(features))


import torch

# Read the saved parameters, mapping every tensor onto the CPU.
# NOTE(review): torch.load relies on pickle-based deserialisation, so only
# checkpoints from a trusted source should be loaded here.
state_dict = torch.load(
    r"C:\Users\rasha\Downloads\model_best_arabic_after_augmantation.ckpt",
    map_location="cpu",
)

# Build a freshly initialised network and overwrite its parameters with the
# saved ones.
model = ConvNet()
model.load_state_dict(state_dict)

# Inference only: switch the model to evaluation mode.
model.eval()









# Class names, listed in the order used to index the model's predictions
# (31 entries).
arabic_letters = [
    'Ain', 'Al', 'Alef', 'Beh', 'Dad', 'Dal', 'Feh', 'Ghain',
    'Hah', 'Heh', 'Jeem', 'Kaf', 'Khah', 'Laa', 'Lam', 'Meem',
    'Noon', 'Qaf', 'Reh', 'Sad', 'Seen', 'Sheen', 'Tah', 'Teh',
    'Teh_Marbuta', 'Thal', 'Theh', 'Waw', 'Yeh', 'Zah', 'Zain',
]

# Alias under which the prediction loop looks the names up.
labels = arabic_letters

# Video capture setup (device index 0).
cap = cv2.VideoCapture(0)

# Preprocessing applied to every cropped frame region:
# ndarray -> PIL image -> 50x50 resize -> float tensor -> per-channel
# normalisation with the mean/std values below.
_preprocess_steps = [
    transforms.ToPILImage(),
    transforms.Resize((50, 50)),
    transforms.ToTensor(),
    transforms.Normalize(
        mean=[0.485, 0.456, 0.406],
        std=[0.229, 0.224, 0.225],
    ),
]
transform = transforms.Compose(_preprocess_steps)

# Real-time recognition loop: grab a webcam frame, classify the fixed
# 200x200 region of interest, speak and overlay the predicted label, and
# stop when 'q' is pressed or the camera stops delivering frames.

# Create the speech engine once; initialising it is loop-invariant work.
engine = pyttsx3.init()
# Label announced most recently.  runAndWait() blocks until the utterance
# finishes, so repeating the same label on every frame would freeze the
# preview; speech is therefore limited to changes of the prediction.
last_spoken = None

# Overlay text settings (constant for the whole session).
font = cv2.FONT_HERSHEY_TRIPLEX
fontScale = 1
color = (0, 255, 255)
thickness = 2

try:
    while True:
        ret, frame = cap.read()
        if not ret:
            # No frame was returned (camera missing, unplugged or stream
            # ended); stop instead of handing None to cv2.flip.
            break

        # Flip the frame horizontally to invert the camera
        frame = cv2.flip(frame, 1)

        # Copy the region of interest *before* drawing the guide box so the
        # red border pixels never become part of the model input.
        roi = frame[100:300, 100:300].copy()
        cv2.rectangle(frame, (100, 100), (300, 300), (0, 0, 255), 5)
        #cv2.imshow('roi', roi)

        # NOTE(review): OpenCV delivers frames in BGR channel order while
        # ToPILImage interprets arrays as RGB -- confirm this matches how
        # the training images were loaded before adding a colour conversion.
        img = transform(roi).unsqueeze(0)

        with torch.no_grad():
            outputs = model(img)
            _, predicted = torch.max(outputs, 1)
            confidence = torch.nn.functional.softmax(outputs, dim=1).max().item() * 100

        predicted_char = labels[predicted.item()]
        if predicted_char != last_spoken:
            engine.say(predicted_char)
            engine.runAndWait()
            last_spoken = predicted_char

        msg = f'{predicted_char}, Conf: {confidence:.1f}%'
        cv2.putText(frame, msg, (80, 80), font, fontScale, color, thickness)
        cv2.imshow('frame', frame)

        if cv2.waitKey(10) & 0xFF == ord('q'):
            break
finally:
    # Always free the camera and close the preview window, even when an
    # exception interrupts the loop.
    cap.release()
    cv2.destroyAllWindows()
