In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torchvision
from torchvision import transforms
import os


class ImageClassificationBase(nn.Module):
    """Mixin-style base class adding train/validation helpers to a classifier.

    Subclasses only need to implement ``forward``; every helper here calls
    ``self(images)`` and scores the result with cross-entropy.
    """

    @staticmethod
    def _batch_accuracy(outputs, labels):
        """Return the fraction of rows in ``outputs`` whose argmax equals ``labels``.

        Kept as a private helper so the class is self-contained: the notebook
        does not define a module-level ``accuracy`` function, so calling one
        from ``validation_step`` raised ``NameError``.

        Returns a 0-dim float tensor so that results can be combined with
        ``torch.stack`` in ``validation_epoch_end``.
        """
        _, preds = torch.max(outputs, dim=1)
        return (preds == labels).float().mean()

    def training_step(self, batch):
        """Compute the cross-entropy loss for one ``(images, labels)`` batch."""
        images, labels = batch
        out = self(images)                  # Generate predictions
        loss = F.cross_entropy(out, labels) # Calculate loss
        return loss

    def validation_step(self, batch):
        """Return detached loss and accuracy for one ``(images, labels)`` batch.

        Returns:
            dict with keys ``'val_loss'`` and ``'val_acc'`` (both tensors).
        """
        images, labels = batch
        out = self(images)                          # Generate predictions
        loss = F.cross_entropy(out, labels)         # Calculate loss
        acc = self._batch_accuracy(out, labels)     # Calculate accuracy
        return {'val_loss': loss.detach(), 'val_acc': acc}

    def validation_epoch_end(self, outputs):
        """Average the per-batch dicts produced by ``validation_step``.

        Returns:
            dict with keys ``'val_loss'`` and ``'val_acc'`` as Python floats.
        """
        batch_losses = [x['val_loss'] for x in outputs]
        epoch_loss = torch.stack(batch_losses).mean()   # Combine losses
        batch_accs = [x['val_acc'] for x in outputs]
        epoch_acc = torch.stack(batch_accs).mean()      # Combine accuracies
        return {'val_loss': epoch_loss.item(), 'val_acc': epoch_acc.item()}

    def epoch_end(self, epoch, result):
        """Print the validation metrics in ``result`` for the given epoch."""
        print("Epoch [{}], val_loss: {:.4f}, val_acc: {:.4f}".format(epoch, result['val_loss'], result['val_acc']))

In [2]:
class ASLModel(ImageClassificationBase):
    """Six-layer CNN followed by a three-layer MLP head producing 29 logits.

    The layers live in a single ``nn.Sequential`` stored as ``self.network``.
    Their order must not change: checkpoint keys are derived from each
    layer's position in the sequence.
    """

    def __init__(self):
        super().__init__()

        def conv3x3(in_channels, out_channels):
            # 3x3 convolution, stride 1, padding 1 -> spatial size is preserved.
            return nn.Conv2d(in_channels, out_channels, kernel_size=3, stride=1, padding=1)

        # Shape comments assume a 3 x 32 x 32 input, which is what the
        # 512*4*4 input size of the first Linear layer implies.
        layers = [
            conv3x3(3, 64),
            nn.ReLU(),

            conv3x3(64, 128),
            nn.ReLU(),
            nn.MaxPool2d(2, 2),       # -> 128 x 16 x 16

            conv3x3(128, 128),
            nn.ReLU(),
            nn.Dropout(0.25),

            conv3x3(128, 256),
            nn.ReLU(),
            nn.MaxPool2d(2, 2),       # -> 256 x 8 x 8

            conv3x3(256, 512),
            nn.ReLU(),
            conv3x3(512, 512),
            nn.ReLU(),
            nn.MaxPool2d(2, 2),       # -> 512 x 4 x 4

            nn.Flatten(),
            nn.Linear(512 * 4 * 4, 1024),
            nn.ReLU(),
            nn.Linear(1024, 512),
            nn.ReLU(),
            nn.Linear(512, 29),
        ]
        self.network = nn.Sequential(*layers)

    def forward(self, xb):
        """Run the batch ``xb`` through the whole network and return the logits."""
        logits = self.network(xb)
        return logits

In [3]:
# Checkpoint location. Can be overridden with the BSL_MODEL_PATH environment
# variable; otherwise falls back to the original local path.
MODEL_PATH = os.environ.get("BSL_MODEL_PATH", r"C:\Users\humay\Jupyter Workspace\BSL CNN 2.pth")

model = ASLModel()
# This notebook only runs inference: eval mode disables the nn.Dropout layer,
# which would otherwise randomly zero activations on every prediction.
model.eval()
# NOTE: torch.load unpickles the file -- only load checkpoints from a trusted source.
model.load_state_dict(torch.load(MODEL_PATH, map_location=torch.device('cpu')))

<All keys matched successfully>

In [4]:
import numpy as np
import cv2
#import tensorflow as tf
#import keras

# Maps a predicted class index to the Bengali character shown on screen.
# The table has 61 entries (keys 0-60), while ASLModel's final layer emits
# only 29 logits, so predictions made in this notebook can only reach keys 0-28.
# NOTE(review): keys 52 and 55 both map to 'হ' -- confirm whether one of them
# should be a different character.
word_dict = {0:'অ', 1:'আ', 2:'া', 3:'ই', 4:'ি', 5:'ঈ', 6:'ী', 7:'উ', 8:'ু', 9:'ঊ', 10:'ূ', 11:'ঋ', 12:'ৃ', 13:'এ', 14:'ে', 15:'ঐ', 16:'ৈ', 17:'ও', 18:'ো', 19:'ঔ', 20:'ৌ', 21:'ক', 22:'খ', 23:'গ', 24:'ঘ', 25:'ঙ', 26:'চ', 27:'ছ', 28:'জ', 29:'ঝ', 30:'ঞ', 31:'ট', 32:'ঠ', 33:'ড', 34:'ঢ', 35:'ণ', 36:'ত', 37:'থ', 38:'দ', 39:'ধ', 40:'ন', 41:'প', 42:'ফ', 43:'ব', 44:'ভ', 45:'ম', 46:'য', 47:'র', 48:'ল', 49:'শ', 50:'ষ', 51:'স', 52:'হ', 53:'ড়', 54:'ঢ়', 55:'হ', 56:'য়', 57:'ৎ', 58:'ঃ', 59:'ং', 60:'ঁ'}


In [5]:
def predict(input, model):
    """Return the predicted class index for the first sample in ``input``.

    The forward pass runs with gradient tracking disabled and, when ``model``
    is an ``nn.Module``, in eval mode so that layers such as Dropout do not
    randomise the prediction. The model's previous train/eval mode is restored
    before returning.

    Args:
        input: batch tensor accepted by ``model``; only row 0 is reported.
            (The name shadows the ``input`` builtin but is kept so existing
            keyword callers continue to work.)
        model: callable mapping ``input`` to a 2-D tensor of per-class scores.

    Returns:
        0-dim integer tensor holding the argmax class of the first sample.
    """
    is_module = isinstance(model, torch.nn.Module)
    was_training = model.training if is_module else False

    if is_module:
        model.eval()
    try:
        # Inference only: skip building the autograd graph.
        with torch.no_grad():
            predictions = model(input)
    finally:
        if is_module:
            model.train(was_training)

    _, preds = torch.max(predictions, dim=1)
    return preds[0]

In [6]:
# Running background model shared by the background-subtraction helpers;
# stays None until the first frame has been seen.
background = None
# Weight passed to cv2.accumulateWeighted for each new frame.
accumulated_weight = 0.5


def cal_accum_avg(frame, accumulated_weight):
    """Fold ``frame`` into the module-level running ``background`` average.

    The first call just stores a float copy of ``frame`` as the background.
    Every later call blends ``frame`` into the existing background in place
    via ``cv2.accumulateWeighted``. Always returns None.
    """
    global background

    if background is not None:
        cv2.accumulateWeighted(frame, background, accumulated_weight)
        return None

    # First frame: seed the background model with a float copy.
    seed = frame.copy()
    background = seed.astype("float")
    return None

In [7]:
def is_hand_found(frame, threshold=25):
    """Report whether ``frame`` differs from the stored background.

    The module-level ``background`` is cast to uint8, differenced against
    ``frame`` and binarised at ``threshold``; the function returns True when
    the binary image contains at least one contour.

    Args:
        frame: image to compare against ``background``.
            # assumes same shape as the background and dtype uint8 -- TODO confirm
        threshold: absolute-difference value above which a pixel is treated
            as foreground.

    Returns:
        bool: True if any contour is found, False otherwise.
    """
    diff = cv2.absdiff(background.astype("uint8"), frame)
    _, thresholded = cv2.threshold(diff, threshold, 255, cv2.THRESH_BINARY)

    # findContours returns (image, contours, hierarchy) on OpenCV 3 and
    # (contours, hierarchy) on other major versions. Indexing with [-2] picks
    # the contour list in both cases, instead of branching on the version
    # string and leaving ``contours`` unbound for anything other than 3 or 4.
    # RETR_TREE retrieves all contours (not just external ones); only their
    # presence matters here.
    contours = cv2.findContours(thresholded, cv2.RETR_TREE, cv2.CHAIN_APPROX_NONE)[-2]

    return len(contours) > 0

In [None]:
# Live webcam demo: crop a fixed region of interest (ROI) from every frame,
# classify it with `model`, and overlay the predicted label on the preview.
# Press 'q' in the preview window to quit.

# ROI in pixel coordinates of the captured frame.
# NOTE: the horizontal names are swapped relative to image coordinates --
# ROI_right (150) is the smaller x value and ROI_left (350) the larger one.
# The values and names are kept as they were; the slicing below accounts for it.
ROI_top = 100
ROI_bottom = 300
ROI_right = 150
ROI_left = 350

# Per-channel normalisation, hoisted out of the loop because it never changes:
# (pixel - 127.5) / 127.5 maps [0, 255] to [-1, 1].
MEAN = 255 * torch.tensor([0.5, 0.5, 0.5])
STD = 255 * torch.tensor([0.5, 0.5, 0.5])

cap = cv2.VideoCapture(0)
#cap.set(3, 1280)
#cap.set(4, 720)
num_frames = 0

try:
    if not cap.isOpened():
        raise RuntimeError("Could not open the webcam (device index 0)")

    while True:
        # Capture frame-by-frame
        ret, frame = cap.read()
        if not ret or frame is None:
            # The camera stopped delivering frames; leave the loop instead of
            # crashing on a None frame.
            break

        #flipping the frame
        #frame = cv2.flip(frame, 1)

        frame_copy = frame.copy()

        # Draw the ROI outline on the preview copy only.
        cv2.rectangle(frame_copy, (ROI_left, ROI_top), (ROI_right, ROI_bottom), (255,0,0), 2)

        # Crop from the untouched frame so the blue outline drawn above does
        # not end up in the pixels fed to the model.
        crop_frame = frame[ROI_top:ROI_bottom, ROI_right:ROI_left]
        crop_frame = cv2.resize(crop_frame, (32, 32))

        # Background-subtraction gating (cal_accum_avg on a blurred grayscale
        # ROI for the first 70 frames, then is_hand_found) is currently
        # disabled; every frame is classified.

        # NOTE(review): OpenCV delivers BGR channel order. If the model was
        # trained on RGB images the channels need converting first -- confirm
        # against the training pipeline before enabling the line below.
        #gesture = cv2.cvtColor(crop_frame, cv2.COLOR_BGR2RGB)

        # HWC uint8 array -> CHW float32 tensor, then normalise per channel.
        x = torch.from_numpy(crop_frame).type(torch.float32).permute(2, 0, 1)
        x = (x - MEAN[:, None, None]) / STD[:, None, None]

        # Add the batch dimension: (1, colour, height, width).
        with torch.no_grad():
            pred = predict(x.unsqueeze(0), model)

        # NOTE(review): OpenCV's Hershey fonts render unsupported (non-ASCII)
        # symbols as question marks, so the Bengali label is unlikely to show
        # correctly here; drawing it needs a TrueType-capable text renderer.
        cv2.putText(frame_copy, word_dict[pred.item()], (170, 45), cv2.FONT_HERSHEY_SIMPLEX, 1, (0,0,255), 2)

        # Display the annotated preview frame
        cv2.putText(frame_copy, "Place handsign in region of interest...", (10, 20), cv2.FONT_ITALIC, 0.5, (51,255,51), 1)
        cv2.imshow("Sign Recognition", frame_copy)

        # incrementing the number of frames for tracking
        num_frames += 1

        if cv2.waitKey(1) & 0xFF == ord('q'):
            break
finally:
    # Always release the camera and close the windows, even if a frame fails.
    cap.release()
    cv2.destroyAllWindows()
