# Data 690 Practical AI: Final Project
Jessica Conroy
## Notebook 3: Run Detection App
Contents:
- <b>Notebook 1</b> in this series collects the dataset for the project. 
- <b>Notebook 2</b> takes the next step by preparing and transforming the data for use in a pytorch model. It includes the training of the model with updatable constants for testing different parameters and configurations.
- <b>Notebook 3</b> This is the code used in the App.py script below, but with additional documentation added
- <b>App.py</b> represents the 3rd piece of the project which will launch the sign language detector. 

### Sources
https://docs.opencv.org/4.x/dd/d43/tutorial_py_video_display.html

https://learnopencv.com/cropping-an-image-using-opencv/

https://www.analyticsvidhya.com/blog/2021/07/building-a-hand-tracking-system-using-opencv/

https://www.codegrepper.com/code-examples/python/python+to+read+text+aloud\

https://python.tutorialink.com/create-a-rectangle-around-all-the-points-returned-from-mediapipe-hand-landmark-detection-just-like-cv2-boundingrect-does/


In [1]:
!pip install pyttsx3



In [2]:
import torch
import torchvision
import torchvision.transforms as transforms
from torch.utils.data import Dataset, DataLoader
from torch.optim import Adam
from torchvision.models import resnet50
from torch.nn import CrossEntropyLoss
from torch.nn import Dropout
from torch.nn import Identity
from torch.nn import Linear
from torch.nn import Module
from torch.nn import ReLU
from torch.nn import Sequential
from torch.nn import Sigmoid
from torch.nn import Flatten
import torch.nn as nn
from tqdm import tqdm
from sklearn.model_selection import train_test_split
import cv2
import glob
import numpy
import random
import xml.etree.ElementTree as ET

%matplotlib inline
%config InlineBackend.figure_format = 'retina'

import matplotlib.pyplot as plt
import os

import time

os.getcwd()

'C:\\Users\\15856\\Data 690 AI\\Final Project'

In [3]:
train_data_path = './data/collectedimgs/train' 
test_data_path = './data/collectedimgs/test'


train_image_paths = []
train_annotation_paths = []
test_image_paths = []
test_annotations_paths = []

train_annotation_paths = glob.glob(train_data_path+"/*.xml")
train_annotation_paths = sorted(train_annotation_paths)
train_image_paths = glob.glob(train_data_path+"/*.jpg")
train_image_paths = sorted(train_image_paths)

test_annotations_paths = glob.glob(test_data_path+"/*.xml")
test_annotations_paths = sorted(test_annotations_paths)
test_image_paths = glob.glob(test_data_path+"/*.jpg")
test_image_paths = sorted(test_image_paths)

classes = []
for path in train_image_paths:
    filename = path.split('\\')[-1]
    class_name = filename.split('.')[0]
    if class_name == 'ThankYou':
        class_name = 'thank you'
    if class_name in classes:
        continue
    else:
        classes.append(class_name)

print(classes)

['thank you', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'hello', 'i', 'j', 'k', 'l', 'm', 'my', 'n', 'name', 'no', 'o', 'p', 'project', 'q', 'r', 's', 't', 'this', 'u', 'v', 'w', 'x', 'y', 'yes', 'z']


In [4]:
#Uncomment to select only a few classes to train

my_classes = ['ThankYou', 'hello','my', 'name', 'project','this', 'j', 'e', 's']

train_annotation_paths_lite = []
test_annotation_paths_lite = []

train_image_paths_lite = []
test_image_paths_lite = []

for i,path in enumerate(train_image_paths):
    filename = path.split('\\')[-1]
    class_name = filename.split('.')[0]
    for CN in my_classes:
        if CN == class_name:
            train_image_paths_lite.append(path)
            train_annotation_paths_lite.append(train_annotation_paths[i])

for i,path in enumerate(test_image_paths):
    filename = path.split('\\')[-1]
    class_name = filename.split('.')[0]
    for CN in my_classes:
        if CN == class_name:
            test_image_paths_lite.append(path)
            test_annotation_paths_lite.append(test_annotations_paths[i])

classes = []
for path in train_annotation_paths_lite:
    filename = path.split('\\')[-1]
    class_name = filename.split('.')[0]
    if class_name == 'ThankYou':
        class_name = 'thank you'
    if class_name in classes:
        continue
    else:
        classes.append(class_name)
        
print(classes)

['thank you', 'e', 'hello', 'j', 'my', 'name', 'project', 's', 'this']


In [5]:
idx_to_class = {i:j for i, j in enumerate(classes)}
class_to_idx = {value:key for key,value in idx_to_class.items()}
idx_to_class

{0: 'thank you',
 1: 'e',
 2: 'hello',
 3: 'j',
 4: 'my',
 5: 'name',
 6: 'project',
 7: 's',
 8: 'this'}

In [6]:
device = "cuda" if torch.cuda.is_available() else "cpu"
MEAN = [0.485, 0.456, 0.406]
STD = [0.229, 0.224, 0.225]

In [7]:
class ObjectClassifier(Module):
    def __init__(self, baseModel, numClasses):
        super(ObjectClassifier, self).__init__()
        # initialize the base model and the number of classes
        self.baseModel = baseModel
        self.numClasses = numClasses

        # build the classifier head to predict the class labels
        self.classifier = Sequential(
            Linear(2048, 512),
            ReLU(),
            Dropout(),
            Linear(512, 512),
            ReLU(),
            Dropout(),
            Linear(512, self.numClasses)
            )
        # set the classifier of our base model to produce outputs
        # from the last convolution block
        self.baseModel.fc = Identity()
    def forward(self, x):
    # pass the inputs through the base model and then obtain
    # predictions from two different branches of the network
        features = self.baseModel(x)
        classLogits = self.classifier(features)
    # return the outputs as a tuple
        return (classLogits)

In [8]:
resnet = resnet50(pretrained=True)
# rcnn = torchvision.models.detection.fasterrcnn_resnet50_fpn(pretrained=True)
# freeze all ResNet50 layers so they will *not* be updated during the
# training process
for param in resnet.parameters():
    param.requires_grad = False
    
ObjectClassifier = ObjectClassifier(resnet, len(idx_to_class))
# ObjectClassifier.load_state_dict(torch.load("classifier_200epochs.pth", map_location=torch.device('cpu')))


In [9]:
checkpoint_dict = torch.load('classification_200epochs_croppedTI_checkpoint (2).pth', map_location=device)
ObjectClassifier.load_state_dict(checkpoint_dict['model_state_dict'])
ObjectClassifier.eval()

ObjectClassifier(
  (baseModel): ResNet(
    (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
    (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (relu): ReLU(inplace=True)
    (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
    (layer1): Sequential(
      (0): Bottleneck(
        (conv1): Conv2d(64, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
        (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (conv3): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (bn3): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (relu): ReLU(inplace=True)
        (downsample): Sequential(

In [10]:
Transform = transforms.Compose([
    transforms.ToPILImage(),
    transforms.ToTensor(),
    transforms.Normalize(mean=MEAN, std=STD)
])
import pyttsx3, time 
import cv2
import mediapipe as mp

mphands = mp.solutions.hands
hands = mphands.Hands()
mp_drawing = mp.solutions.drawing_utils

cap = cv2.VideoCapture(0)

while True:
    ret, frame = cap.read()
    
    h, w, c = frame.shape
    framergb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
    result = hands.process(framergb)
    hand_landmarks = result.multi_hand_landmarks

    if hand_landmarks:
        if len(hand_landmarks) == 2: #When there are two hands in the view, I want a single box containing both hands
            x_max = 0
            y_max = 0
            x_min = w
            y_min = h
            combined = []
            for handLMs in hand_landmarks:
                for lm in handLMs.landmark: #create combined list of hand landmarks
                    combined.append(lm)
            for lm in combined:
                x, y = int(lm.x * w), int(lm.y * h)
                if x > x_max:
                    x_max = x
                if x < x_min:
                    x_min = x
                if y > y_max:
                    y_max = y
                if y < y_min:
                    y_min = y
                    
            #Crop based on identified hands and pas through model for prediction
            
            try:
                image = frame[x_max+10:y_max+10, x_min-10:y_min-10] #crop to just hands
    #             print(image)
    #             try:
                image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
                image = cv2.resize(image, (224, 224))
                image = Transform(image)
                image = image.unsqueeze(0)
                print(image)
                # determine the class label with the largest predicted
                # probability
                prediction = ObjectClassifier(image)
    #             print(prediction)
                prediction = torch.nn.Softmax(dim=1)(prediction)
    #             print(prediction)
                i = prediction.argmax(dim=-1).cpu()
                label = idx_to_class[i.item()]
            except:
                image = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
                image = cv2.resize(image, (224, 224))
                image = Transform(image)
                image = image.unsqueeze(0)
#                 print(image)
                # determine the class label with the largest predicted
                # probability
                prediction = ObjectClassifier(image)
    #             print(prediction)
                prediction = torch.nn.Softmax(dim=1)(prediction)
    #             print(prediction)
                i = prediction.argmax(dim=-1).cpu()
                label = idx_to_class[i.item()]
                
            cv2.rectangle(frame, (x_min, y_min), (x_max, y_max), (0, 255, 0), 2)
                #https://stackoverflow.com/questions/56108183/python-opencv-cv2-drawing-rectangle-with-text
            cv2.putText(frame, label, (x_min, y_min-10),cv2.FONT_HERSHEY_COMPLEX, .5, (0,255,0),1)
            engine = pyttsx3.init() 
            engine.say(label) 
            engine.runAndWait()
#             mp_drawing.draw_landmarks(frame, handLMs, mphands.HAND_CONNECTIONS)
        else:
            for handLMs in hand_landmarks:
                x_max = 0
                y_max = 0
                x_min = w
                y_min = h
                for lm in handLMs.landmark:
                    x, y = int(lm.x * w), int(lm.y * h)
                    if x > x_max:
                        x_max = x
                    if x < x_min:
                        x_min = x
                    if y > y_max:
                        y_max = y
                    if y < y_min:
                        y_min = y
                        
                #Crop based on identified hands and pas through model for prediction
                image = frame[x_max-10:y_max+10, x_min-10:y_min+10] #crop to just hands
                print(image)
                try:
                    image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
                    image = cv2.resize(image, (224, 224))
                    image = Transform(image)
                    image = image.unsqueeze(0)
                #     print(image)
                    # determine the class label with the largest predicted
                    # probability
                    prediction = ObjectClassifier(image)
        #             print(prediction)
                    prediction = torch.nn.Softmax(dim=1)(prediction)
        #             print(prediction)
                    i = prediction.argmax(dim=-1).cpu()
                    label = idx_to_class[i.item()]
                except:
                    label = 'No Sign'
                cv2.rectangle(frame, (x_min, y_min), (x_max, y_max), (0, 255, 0), 2)
                #https://stackoverflow.com/questions/56108183/python-opencv-cv2-drawing-rectangle-with-text
                cv2.putText(frame, label, (x_min, y_min-10),cv2.FONT_HERSHEY_COMPLEX, .5, (0,255,0),1)
                if label != 'No Sign':
                    engine = pyttsx3.init() 
                    engine.say(label) 
                    engine.runAndWait()
#                 mp_drawing.draw_landmarks(frame, handLMs, mphands.HAND_CONNECTIONS)
    # show the output image 
    cv2.imshow("Output", frame)
    if cv2.waitKey(1) == ord('q'):
        break
cap.release()
cv2.destroyAllWindows()

[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[[[28 23  5]
  [37 31 13]
  [43 36 18]
  ...
  [21 21 10]
  [20 20  9]
  [14 14  3]]

 [[37 29 16]
  [41 32 20]
  [42 34 21]
  ...
  [20 20  7]
  [18 19  6]
  [15 15  4]]

 [[40 32 15]
  [42 32 15]
  [42 30 12]
  ...
  [22 20  9]
  [21 19  7]
  [19 17  6]]

 ...

 [[16 19  7]
  [10 13  1]
  [13 16  2]
  ...
  [16 13  6]
  [16 15  6]
  [10 11  0]]

 [[15 15  4]
  [12 13  1]
  [ 9 13  0]
  ...
  [18 19  7]
  [19 18  9]
  [13 12  5]]

 [[17 20  9]
  [12 15  2]
  [11 14  0]
  ...
  [17 18  6]
  [13 13  2]
  [15 15  4]]]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[

[[[178 193 216]
  [182 197 221]
  [185 200 224]
  ...
  [  3   3   0]
  [  0   1   0]
  [  0   1   0]]

 [[173 192 216]
  [184 198 224]
  [186 200 227]
  ...
  [  0   1   0]
  [  0   2   0]
  [  0   1   0]]

 [[171 192 215]
  [178 198 221]
  [180 198 221]
  ...
  [  0   2   0]
  [  0   2   0]
  [  0   1   0]]

 ...

 [[140 143 152]
  [138 140 149]
  [136 139 148]
  ...
  [255 255 255]
  [255 255 255]
  [255 255 255]]

 [[138 142 150]
  [139 142 150]
  [138 140 149]
  ...
  [255 255 255]
  [255 255 255]
  [255 255 255]]

 [[137 140 149]
  [141 144 153]
  [140 143 150]
  ...
  [255 255 255]
  [255 255 255]
  [255 255 255]]]
tensor([[[[1.6838, 1.7865, 1.8550,  ..., 1.2214, 1.2899, 1.2899],
          [1.7009, 1.7865, 1.8550,  ..., 1.2214, 1.2899, 1.2899],
          [1.7180, 1.7865, 1.8379,  ..., 1.2728, 1.3242, 1.3242],
          ...,
          [0.4508, 0.4679, 0.5022,  ..., 2.2489, 2.2489, 2.2489],
          [0.4337, 0.4679, 0.4851,  ..., 2.2489, 2.2489, 2.2489],
          [0.4337, 0.4679

[[[109  95  86]
  [116 102  92]
  [122 107  96]
  ...
  [108 104  84]
  [105 101  81]
  [104  99  79]]

 [[108  95  83]
  [117 104  92]
  [120 105  93]
  ...
  [110 106  84]
  [103  99  75]
  [ 96  96  67]]

 [[107  94  80]
  [115 102  89]
  [118 105  90]
  ...
  [106 105  82]
  [103 104  79]
  [101 102  76]]

 ...

 [[ 97 111 135]
  [ 97 112 136]
  [100 115 138]
  ...
  [ 15  18   5]
  [ 15  18   4]
  [ 15  18   4]]

 [[104 115 138]
  [102 115 138]
  [101 115 139]
  ...
  [ 11  14   0]
  [ 12  15   1]
  [ 12  15   1]]

 [[110 120 144]
  [109 122 145]
  [109 122 146]
  ...
  [ 13  17   0]
  [ 16  21   2]
  [ 16  21   2]]]
[[[127 115  96]
  [131 119  97]
  [129 116  92]
  ...
  [115 107  94]
  [118 107  94]
  [115 102  89]]

 [[130 116  97]
  [131 117  98]
  [130 116  97]
  ...
  [114 108  95]
  [113 106  93]
  [113 105  92]]

 [[134 116  97]
  [132 116  95]
  [128 115  91]
  ...
  [119 110  98]
  [116 107  95]
  [118 109  98]]

 ...

 [[ 96 107 117]
  [ 94 105 119]
  [ 99 109 126]
  ..

[[[110 104  78]
  [105  99  75]
  [103 100  78]
  ...
  [114 109  92]
  [113 108  90]
  [118 113  95]]

 [[107 102  76]
  [105 100  73]
  [104 100  73]
  ...
  [113 108  90]
  [113 108  92]
  [116 109  96]]

 [[108 103  76]
  [100  95  68]
  [103  97  71]
  ...
  [114 106  93]
  [121 115  99]
  [125 120 102]]

 ...

 [[134 137 150]
  [136 140 151]
  [130 134 145]
  ...
  [  3  11   0]
  [  3  11   0]
  [  0   8   0]]

 [[141 144 157]
  [140 145 156]
  [137 141 152]
  ...
  [  6  14   2]
  [  6  14   1]
  [  3  10   0]]

 [[141 147 160]
  [138 147 158]
  [137 147 156]
  ...
  [ 17  19  12]
  [ 14  19   9]
  [ 11  18   6]]]
[[[119 107  83]
  [114 100  79]
  [109  95  76]
  ...
  [122 114  94]
  [127 115  96]
  [130 116  98]]

 [[121 108  84]
  [120 107  86]
  [120 108  89]
  ...
  [122 114  95]
  [128 116  99]
  [128 115 101]]

 [[118 106  80]
  [118 106  81]
  [121 108  84]
  ...
  [125 117  97]
  [127 115  96]
  [130 117  99]]

 ...

 [[119 129 144]
  [124 133 150]
  [126 133 151]
  ..