# Data 690 Practical AI: Final Project
Jessica Conroy
## Notebook 3: Run Detection App
Contents:
- <b>Notebook 1</b> in this series collects the dataset for the project. 
- <b>Notebook 2</b> takes the next step by preparing and transforming the data for use in a pytorch model. It includes the training of the model with updatable constants for testing different parameters and configurations.
- <b>Notebook 3</b> This is the code used in the App.py script below, but with additional documentation added
- <b>App.py</b> represents the 3rd piece of the project which will launch the sign language detector. 

### Sources
https://docs.opencv.org/4.x/dd/d43/tutorial_py_video_display.html

https://learnopencv.com/cropping-an-image-using-opencv/

https://www.analyticsvidhya.com/blog/2021/07/building-a-hand-tracking-system-using-opencv/

https://www.codegrepper.com/code-examples/python/python+to+read+text+aloud\

https://python.tutorialink.com/create-a-rectangle-around-all-the-points-returned-from-mediapipe-hand-landmark-detection-just-like-cv2-boundingrect-does/


In [10]:
!pip install pyttsx3

Collecting pyttsx3
  Downloading pyttsx3-2.90-py3-none-any.whl (39 kB)
Collecting pypiwin32
  Downloading pypiwin32-223-py3-none-any.whl (1.7 kB)
Installing collected packages: pypiwin32, pyttsx3
Successfully installed pypiwin32-223 pyttsx3-2.90


In [1]:
import torch
import torchvision
import torchvision.transforms as transforms
from torch.utils.data import Dataset, DataLoader
from torch.optim import Adam
from torchvision.models import resnet50
from torch.nn import CrossEntropyLoss
from torch.nn import Dropout
from torch.nn import Identity
from torch.nn import Linear
from torch.nn import Module
from torch.nn import ReLU
from torch.nn import Sequential
from torch.nn import Sigmoid
from torch.nn import Flatten
import torch.nn as nn
from tqdm import tqdm
from sklearn.model_selection import train_test_split
import cv2
import glob
import numpy
import random
import xml.etree.ElementTree as ET

%matplotlib inline
%config InlineBackend.figure_format = 'retina'

import matplotlib.pyplot as plt
import os

import time

os.getcwd()

'C:\\Users\\15856\\Data 690 AI\\Final Project'

In [2]:
train_data_path = './data/collectedimgs/train' 
test_data_path = './data/collectedimgs/test'


train_image_paths = []
train_annotation_paths = []
test_image_paths = []
test_annotations_paths = []

train_annotation_paths = glob.glob(train_data_path+"/*.xml")
train_annotation_paths = sorted(train_annotation_paths)
train_image_paths = glob.glob(train_data_path+"/*.jpg")
train_image_paths = sorted(train_image_paths)

test_annotations_paths = glob.glob(test_data_path+"/*.xml")
test_annotations_paths = sorted(test_annotations_paths)
test_image_paths = glob.glob(test_data_path+"/*.jpg")
test_image_paths = sorted(test_image_paths)

classes = []
for path in train_image_paths:
    filename = path.split('\\')[-1]
    class_name = filename.split('.')[0]
    if class_name == 'ThankYou':
        class_name = 'thank you'
    if class_name in classes:
        continue
    else:
        classes.append(class_name)

print(classes)

['thank you', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'hello', 'i', 'j', 'k', 'l', 'm', 'my', 'n', 'name', 'no', 'o', 'p', 'project', 'q', 'r', 's', 't', 'this', 'u', 'v', 'w', 'x', 'y', 'yes', 'z']


In [3]:
#Uncomment to select only a few classes to train

my_classes = ['ThankYou', 'hello','my', 'name', 'project','this', 'j', 'e', 's']

train_annotation_paths_lite = []
test_annotation_paths_lite = []

train_image_paths_lite = []
test_image_paths_lite = []

for i,path in enumerate(train_image_paths):
    filename = path.split('\\')[-1]
    class_name = filename.split('.')[0]
    for CN in my_classes:
        if CN == class_name:
            train_image_paths_lite.append(path)
            train_annotation_paths_lite.append(train_annotation_paths[i])

for i,path in enumerate(test_image_paths):
    filename = path.split('\\')[-1]
    class_name = filename.split('.')[0]
    for CN in my_classes:
        if CN == class_name:
            test_image_paths_lite.append(path)
            test_annotation_paths_lite.append(test_annotations_paths[i])

classes = []
for path in train_annotation_paths_lite:
    filename = path.split('\\')[-1]
    class_name = filename.split('.')[0]
    if class_name == 'ThankYou':
        class_name = 'thank you'
    if class_name in classes:
        continue
    else:
        classes.append(class_name)
        
print(classes)

['thank you', 'e', 'hello', 'j', 'my', 'name', 'project', 's', 'this']


In [4]:
idx_to_class = {i:j for i, j in enumerate(classes)}
class_to_idx = {value:key for key,value in idx_to_class.items()}
idx_to_class

{0: 'thank you',
 1: 'e',
 2: 'hello',
 3: 'j',
 4: 'my',
 5: 'name',
 6: 'project',
 7: 's',
 8: 'this'}

In [5]:
device = "cuda" if torch.cuda.is_available() else "cpu"
MEAN = [0.485, 0.456, 0.406]
STD = [0.229, 0.224, 0.225]

In [6]:
class ObjectClassifier(Module):
    def __init__(self, baseModel, numClasses):
        super(ObjectClassifier, self).__init__()
        # initialize the base model and the number of classes
        self.baseModel = baseModel
        self.numClasses = numClasses

        # build the classifier head to predict the class labels
        self.classifier = Sequential(
            Linear(2048, 512),
            ReLU(),
            Dropout(),
            Linear(512, 512),
            ReLU(),
            Dropout(),
            Linear(512, self.numClasses)
            )
        # set the classifier of our base model to produce outputs
        # from the last convolution block
        self.baseModel.fc = Identity()
    def forward(self, x):
    # pass the inputs through the base model and then obtain
    # predictions from two different branches of the network
        features = self.baseModel(x)
        classLogits = self.classifier(features)
    # return the outputs as a tuple
        return (classLogits)

In [7]:
resnet = resnet50(pretrained=True)
# rcnn = torchvision.models.detection.fasterrcnn_resnet50_fpn(pretrained=True)
# freeze all ResNet50 layers so they will *not* be updated during the
# training process
for param in resnet.parameters():
    param.requires_grad = False
    
ObjectClassifier = ObjectClassifier(resnet, len(idx_to_class))
# ObjectClassifier.load_state_dict(torch.load("classifier_200epochs.pth", map_location=torch.device('cpu')))


In [8]:
checkpoint_dict = torch.load('classification_200epochs_croppedTI_checkpoint (2).pth', map_location=device)
ObjectClassifier.load_state_dict(checkpoint_dict['model_state_dict'])
ObjectClassifier.eval()

ObjectClassifier(
  (baseModel): ResNet(
    (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
    (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (relu): ReLU(inplace=True)
    (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
    (layer1): Sequential(
      (0): Bottleneck(
        (conv1): Conv2d(64, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
        (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (conv3): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (bn3): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (relu): ReLU(inplace=True)
        (downsample): Sequential(

In [12]:
Transform = transforms.Compose([
    transforms.ToPILImage(),
    transforms.ToTensor(),
    transforms.Normalize(mean=MEAN, std=STD)
])
import pyttsx3, time 
import cv2
import mediapipe as mp

mphands = mp.solutions.hands
hands = mphands.Hands()
mp_drawing = mp.solutions.drawing_utils

cap = cv2.VideoCapture(0)

while True:
    ret, frame = cap.read()
    
    h, w, c = frame.shape
    framergb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
    result = hands.process(framergb)
    hand_landmarks = result.multi_hand_landmarks

    if hand_landmarks:
        if len(hand_landmarks) == 2: #When there are two hands in the view, I want a single box containing both hands
            x_max = 0
            y_max = 0
            x_min = w
            y_min = h
            combined = []
            for handLMs in hand_landmarks:
                for lm in handLMs.landmark: #create combined list of hand landmarks
                    combined.append(lm)
            for lm in combined:
                x, y = int(lm.x * w), int(lm.y * h)
                if x > x_max:
                    x_max = x
                if x < x_min:
                    x_min = x
                if y > y_max:
                    y_max = y
                if y < y_min:
                    y_min = y
                    
            #Crop based on identified hands and pas through model for prediction
            
            try:
                image = frame[x_max+10:y_max+10, x_min-10:y_min-10] #crop to just hands
    #             print(image)
    #             try:
                image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
                image = cv2.resize(image, (224, 224))
                image = Transform(image)
                image = image.unsqueeze(0)
                print(image)
                # determine the class label with the largest predicted
                # probability
                prediction = ObjectClassifier(image)
    #             print(prediction)
                prediction = torch.nn.Softmax(dim=1)(prediction)
    #             print(prediction)
                i = prediction.argmax(dim=-1).cpu()
                label = idx_to_class[i.item()]
            except:
                image = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
                image = cv2.resize(image, (224, 224))
                image = Transform(image)
                image = image.unsqueeze(0)
#                 print(image)
                # determine the class label with the largest predicted
                # probability
                prediction = ObjectClassifier(image)
    #             print(prediction)
                prediction = torch.nn.Softmax(dim=1)(prediction)
    #             print(prediction)
                i = prediction.argmax(dim=-1).cpu()
                label = idx_to_class[i.item()]
                
            cv2.rectangle(frame, (x_min, y_min), (x_max, y_max), (0, 255, 0), 2)
                #https://stackoverflow.com/questions/56108183/python-opencv-cv2-drawing-rectangle-with-text
            cv2.putText(frame, label, (x_min, y_min-10),cv2.FONT_HERSHEY_COMPLEX, .5, (0,255,0),1)
            engine = pyttsx3.init() 
            engine.say(label) 
            engine.runAndWait()
#             mp_drawing.draw_landmarks(frame, handLMs, mphands.HAND_CONNECTIONS)
        else:
            for handLMs in hand_landmarks:
                x_max = 0
                y_max = 0
                x_min = w
                y_min = h
                for lm in handLMs.landmark:
                    x, y = int(lm.x * w), int(lm.y * h)
                    if x > x_max:
                        x_max = x
                    if x < x_min:
                        x_min = x
                    if y > y_max:
                        y_max = y
                    if y < y_min:
                        y_min = y
                        
                #Crop based on identified hands and pas through model for prediction
                image = frame[x_max-10:y_max+10, x_min-10:y_min+10] #crop to just hands
                print(image)
                try:
                    image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
                    image = cv2.resize(image, (224, 224))
                    image = Transform(image)
                    image = image.unsqueeze(0)
                #     print(image)
                    # determine the class label with the largest predicted
                    # probability
                    prediction = ObjectClassifier(image)
        #             print(prediction)
                    prediction = torch.nn.Softmax(dim=1)(prediction)
        #             print(prediction)
                    i = prediction.argmax(dim=-1).cpu()
                    label = idx_to_class[i.item()]
                except:
                    label = 'No Sign'
                cv2.rectangle(frame, (x_min, y_min), (x_max, y_max), (0, 255, 0), 2)
                #https://stackoverflow.com/questions/56108183/python-opencv-cv2-drawing-rectangle-with-text
                cv2.putText(frame, label, (x_min, y_min-10),cv2.FONT_HERSHEY_COMPLEX, .5, (0,255,0),1)
                engine = pyttsx3.init() 
                engine.say(label) 
                engine.runAndWait()
#                 mp_drawing.draw_landmarks(frame, handLMs, mphands.HAND_CONNECTIONS)
    # show the output image 
    cv2.imshow("Output", frame)
    if cv2.waitKey(1) == ord('q'):
        break
cap.release()
cv2.destroyAllWindows()

[]
[]
[[[ 30  33  20]
  [ 35  38  24]
  [ 36  39  23]
  ...
  [111 131 175]
  [122 141 185]
  [125 145 186]]

 [[ 32  36  22]
  [ 35  38  24]
  [ 34  36  22]
  ...
  [109 128 173]
  [124 143 187]
  [127 147 191]]

 [[ 35  37  18]
  [ 36  38  19]
  [ 36  38  19]
  ...
  [113 135 181]
  [125 144 190]
  [129 148 194]]

 ...

 [[116 124 128]
  [120 129 138]
  [121 130 142]
  ...
  [181 189 222]
  [183 189 222]
  [184 190 223]]

 [[124 132 132]
  [126 137 138]
  [122 132 140]
  ...
  [185 195 223]
  [184 192 218]
  [189 196 223]]

 [[133 137 135]
  [132 139 140]
  [126 134 143]
  ...
  [191 199 225]
  [190 197 224]
  [192 200 226]]]
[]
[]
[]
[]
[[[ 77  88  95]
  [101 118 132]
  [128 145 165]
  ...
  [204 212 255]
  [212 223 255]
  [207 222 255]]

 [[ 92  97 102]
  [110 120 135]
  [137 149 169]
  ...
  [208 215 255]
  [208 217 255]
  [206 215 255]]

 [[ 98 101  99]
  [117 120 136]
  [142 150 171]
  ...
  [219 222 255]
  [216 221 255]
  [208 215 255]]

 ...

 [[126 136 158]
  [124 137 159]
  

[[[  3   2   0]
  [  3   2   0]
  [  5   2   2]
  ...
  [251 255 255]
  [247 255 255]
  [244 255 255]]

 [[  3   2   0]
  [  2   1   0]
  [  3   0   0]
  ...
  [249 255 255]
  [249 254 255]
  [249 253 255]]

 [[  8   6   3]
  [  6   3   4]
  [  1   0   2]
  ...
  [249 253 255]
  [249 253 255]
  [249 253 255]]

 ...

 [[145 153 169]
  [145 153 169]
  [148 156 171]
  ...
  [ 19  18   4]
  [ 21  20   7]
  [ 22  23   9]]

 [[147 155 170]
  [147 155 170]
  [148 156 171]
  ...
  [ 19  19   1]
  [ 22  22   2]
  [ 22  22   1]]

 [[147 155 170]
  [144 152 167]
  [146 153 169]
  ...
  [ 16  16   0]
  [ 16  16   0]
  [ 17  17   0]]]
[[[140 150 150]
  [142 153 154]
  [140 151 153]
  ...
  [160 156 172]
  [162 158 174]
  [161 159 176]]

 [[147 153 154]
  [149 155 157]
  [145 154 156]
  ...
  [155 154 170]
  [160 156 172]
  [161 159 176]]

 [[146 152 152]
  [147 154 155]
  [149 155 157]
  ...
  [152 153 169]
  [154 155 171]
  [153 154 170]]

 ...

 [[148 157 166]
  [147 156 166]
  [146 154 165]
  ..

[[[119 136 149]
  [118 135 152]
  [117 133 153]
  ...
  [236 248 255]
  [232 246 255]
  [224 240 255]]

 [[115 131 145]
  [117 134 151]
  [115 134 153]
  ...
  [236 249 255]
  [234 249 255]
  [232 249 255]]

 [[112 131 145]
  [111 133 146]
  [114 135 154]
  ...
  [236 250 255]
  [236 251 255]
  [236 251 255]]

 ...

 [[203 204 224]
  [205 205 227]
  [210 210 231]
  ...
  [ 33  35  24]
  [ 28  34  20]
  [ 26  34  20]]

 [[198 200 217]
  [203 203 222]
  [209 209 228]
  ...
  [ 21  29  17]
  [ 20  28  15]
  [ 23  30  16]]

 [[199 202 217]
  [204 205 223]
  [209 207 226]
  ...
  [ 20  28  16]
  [ 23  30  18]
  [ 23  30  18]]]
[[[133 148 168]
  [131 146 170]
  [130 148 171]
  ...
  [113 123 172]
  [ 86 100 151]
  [ 76  91 144]]

 [[129 150 170]
  [127 147 168]
  [129 146 170]
  ...
  [113 120 170]
  [ 82  94 148]
  [ 71  85 139]]

 [[128 141 166]
  [125 137 164]
  [127 139 164]
  ...
  [119 129 172]
  [ 78  87 134]
  [ 61  70 120]]

 ...

 [[214 214 230]
  [217 213 229]
  [215 209 225]
  ..

[[[208 224 255]
  [211 227 255]
  [211 227 255]
  ...
  [244 249 255]
  [244 248 255]
  [244 249 255]]

 [[214 225 255]
  [214 225 255]
  [213 224 255]
  ...
  [244 250 255]
  [244 250 255]
  [244 250 255]]

 [[213 223 255]
  [216 225 255]
  [217 224 255]
  ...
  [242 251 255]
  [243 250 255]
  [244 250 255]]

 ...

 [[211 218 255]
  [217 221 255]
  [222 226 255]
  ...
  [244 250 255]
  [244 250 255]
  [244 250 255]]

 [[206 216 254]
  [209 218 255]
  [217 225 255]
  ...
  [244 250 255]
  [244 250 255]
  [244 250 255]]

 [[204 215 248]
  [205 216 249]
  [211 219 253]
  ...
  [244 250 255]
  [244 250 255]
  [244 250 255]]]
[[[ 2  5  0]
  [ 1  4  0]
  [ 1  4  0]
  ...
  [43 35 22]
  [41 32 20]
  [40 31 18]]

 [[ 6  6  0]
  [ 2  4  0]
  [ 0  3  0]
  ...
  [40 31 19]
  [43 35 22]
  [42 34 21]]

 [[ 2  5  0]
  [ 0  3  0]
  [ 1  4  0]
  ...
  [35 28 13]
  [43 35 22]
  [42 35 20]]

 ...

 [[22 22 24]
  [37 39 44]
  [59 64 72]
  ...
  [ 9  7  2]
  [ 8  7  2]
  [ 6  5  0]]

 [[11 10 12]
  [32 3