In [1]:
!pip install tensorflow opencv-python mediapipe sklearn matplotlib jupyter



In [2]:
import cv2
import numpy as np
import os
# Tell the Intel OpenMP runtime to tolerate a second copy of itself being loaded.
# The variable is spelled KMP_DUPLICATE_LIB_OK; the previous key (…_LTBL_OK) was a typo
# and therefore had no effect.
os.environ['KMP_DUPLICATE_LIB_OK']='True'
import time
import mediapipe as mp
from matplotlib import pyplot as plt

In [3]:
# Short aliases for the two MediaPipe solution modules used throughout the notebook.
mp_drawing = mp.solutions.drawing_utils   # helpers that draw landmarks onto a frame
mp_holistic = mp.solutions.holistic       # the Holistic model and its connection sets

In [4]:
def media_pipe_detection(image, model):
    """Run a MediaPipe model on a single OpenCV frame.

    Args:
        image: Frame in BGR channel order (it is converted with ``COLOR_BGR2RGB``).
        model: Object exposing ``process(rgb_image)``; the notebook passes a
            ``Holistic`` instance.

    Returns:
        A tuple ``(bgr_image, results)``: the frame converted back to BGR and
        whatever ``model.process`` returned for it.
    """
    rgb_frame = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

    # The frame is read-only only for the duration of the model call.
    rgb_frame.flags.writeable = False
    results = model.process(rgb_frame)
    rgb_frame.flags.writeable = True

    return cv2.cvtColor(rgb_frame, cv2.COLOR_RGB2BGR), results

In [5]:
def draw_landmarks(image, results):
    """Draw the left- and right-hand landmarks from ``results`` onto ``image``.

    Face and pose landmarks are intentionally not drawn; only the two hands are
    rendered, each with thin black points and connections.

    Args:
        image: Frame passed straight to ``mp_drawing.draw_landmarks``.
        results: Object exposing ``left_hand_landmarks`` and
            ``right_hand_landmarks``.
    """
    # One spec is used for both the landmark points and the connecting lines.
    black = mp_drawing.DrawingSpec(color=(0,0,0), thickness=2, circle_radius=1)

    for hand in (results.left_hand_landmarks, results.right_hand_landmarks):
        mp_drawing.draw_landmarks(image, hand, mp_holistic.HAND_CONNECTIONS, black, black)

In [6]:
# Live preview: show the webcam feed with hand landmarks drawn on it until 'q' is pressed.
cap = cv2.VideoCapture(0)

try:
    with mp_holistic.Holistic(min_detection_confidence=0.5, min_tracking_confidence=0.5) as holistic:
        while cap.isOpened():
            ret, frame = cap.read()
            # A failed read is reported through `ret`; stop instead of passing an
            # unusable frame to cvtColor.
            if not ret:
                break

            image, results = media_pipe_detection(frame, holistic)

            draw_landmarks(image, results)

            cv2.imshow('OpenCV Feed', image)
            if cv2.waitKey(10) & 0xFF == ord('q'):
                break
finally:
    # Always free the camera and close the window, even if a frame raised.
    cap.release()
    cv2.destroyAllWindows()

In [7]:
def extractPoints(r):
    """Flatten the hand landmarks of one detection result into a single vector.

    Args:
        r: Object exposing ``right_hand_landmarks`` and ``left_hand_landmarks``;
            each is either falsy (hand not detected) or has a ``landmark``
            sequence whose items carry ``x``, ``y`` and ``z``.

    Returns:
        1-D array: the right hand's ``x, y, z`` values followed by the left
        hand's. A hand that was not detected contributes 63 zeros.
    """
    def _flatten_hand(hand):
        # Missing hand -> fixed-size zero block so the vector layout stays stable.
        if not hand:
            return np.zeros(63)
        coords = [[point.x, point.y, point.z] for point in hand.landmark]
        return np.array(coords).flatten()

    right = _flatten_hand(r.right_hand_landmarks)
    left = _flatten_hand(r.left_hand_landmarks)
    return np.concatenate([right, left])

# Set up folder

In [8]:
# Root folder for the recorded keypoint arrays.
DATA_PATH = "MP_Data"

# Vocabulary of signs to recognise.
signs = np.array([
    'Me',
    'Student',
    'Good morning',
    'Help me',
    'Thank you',
    'Hospital',
    'Doctor',
])

# Number of sequences recorded per sign, and number of frames in each sequence.
sqs, sqLength = 50, 30

In [9]:
# Create DATA_PATH/<sign>/<sequence index>/ for every sign and sequence.
for sign in signs:
    for sq in range(sqs):
        # exist_ok=True makes re-running the cell a no-op, while real failures
        # (permissions, a file in the way, ...) still surface; the former bare
        # `except: pass` silently hid them.
        os.makedirs(os.path.join(DATA_PATH, sign, str(sq)), exist_ok=True)

In [10]:
# Record `sqs` sequences of `sqLength` frames for every sign and store the hand
# keypoints of each frame as DATA_PATH/<sign>/<sequence>/<frame>.npy.
# Press 'q' in the preview window to abort the whole collection.
cap = cv2.VideoCapture(0)
stop_requested = False  # set when 'q' is pressed or the camera stops delivering frames

try:
    with mp_holistic.Holistic(min_detection_confidence=0.5, min_tracking_confidence=0.5) as holistic:
        for sign in signs:
            for sq in range(sqs):
                sq_dir = os.path.join(DATA_PATH, sign, str(sq))

                # Skip sequences that are already fully recorded. A partially
                # recorded one (e.g. after an aborted run) is recorded again from
                # frame 0 so that no frame file is left missing for the loader.
                if all(os.path.isfile(os.path.join(sq_dir, "{}.npy".format(n))) for n in range(sqLength)):
                    continue

                for frameNum in range(sqLength):
                    npyPath = os.path.join(sq_dir, str(frameNum))

                    ret, frame = cap.read()
                    # A failed read is reported through `ret`; stop collecting
                    # instead of passing an unusable frame to cvtColor.
                    if not ret:
                        stop_requested = True
                        break

                    image, results = media_pipe_detection(frame, holistic)

                    draw_landmarks(image, results)

                    if frameNum == 0:
                        # The banner sits one line above the status text so the two do not overlap.
                        cv2.putText(image, "Start collection", (120,150), cv2.FONT_HERSHEY_SIMPLEX, 1, (0,255,0), 4, cv2.LINE_AA)
                        delay_ms = 2000  # give the signer time to get ready
                    else:
                        delay_ms = 10
                    cv2.putText(image, "Collecting frames for {} Video Number {}".format(sign, sq), (120,200), cv2.FONT_HERSHEY_SIMPLEX, 1, (0,255,0), 4, cv2.LINE_AA)

                    points = extractPoints(results)

                    np.save(npyPath, points)

                    # Show the frame *before* waiting so the pause displays the
                    # current image (and the start banner) rather than a stale one.
                    cv2.imshow('OpenCV Feed', image)

                    if cv2.waitKey(delay_ms) & 0xFF == ord('q'):
                        stop_requested = True
                        break

                # Propagate the stop request out of all three loops; a plain
                # `break` in the frame loop would only skip to the next sequence.
                if stop_requested:
                    break
            if stop_requested:
                break
finally:
    # Always free the camera and close the window, even if a frame raised.
    cap.release()
    cv2.destroyAllWindows()

In [11]:
# Map every sign name to its integer class index, i.e. its position in `signs`.
labelMap = dict(zip(signs, range(len(signs))))

In [12]:
labelMap

{'Me': 0,
 'Student': 1,
 'Good morning': 2,
 'Help me': 3,
 'Thank you': 4,
 'Hospital': 5,
 'Doctor': 6}

In [13]:
# Load the recorded keypoints: `sequences` gets one list of per-frame arrays per
# recorded sequence, `labels` the matching class index from `labelMap`.
sequences = []
labels = []
for sign in signs:
    sign_dir = os.path.join(DATA_PATH, sign)
    for sq in range(sqs):
        sequences.append([
            np.load(os.path.join(sign_dir, str(sq), str(frameNum) + ".npy"))
            for frameNum in range(sqLength)
        ])
        labels.append(labelMap[sign])

In [14]:
from sklearn.model_selection import train_test_split
from tensorflow.keras.utils import to_categorical

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [15]:
# Stack the nested lists of frame vectors into a single feature array.
X = np.asarray(sequences)

In [16]:
# One-hot encode the integer class labels, then cast the matrix to integers.
one_hot_labels = to_categorical(labels)
y = one_hot_labels.astype(int)

In [17]:
# Hold out 5% of the sequences for testing. The fixed seed makes the split, and
# every metric computed from it, reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.05, random_state=42)

In [18]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense
from tensorflow.keras.callbacks import TensorBoard

In [19]:
# TensorBoard callback that writes its logs to the "Logs" folder.
log_dir = "Logs"
tb_callback = TensorBoard(log_dir=log_dir)

In [20]:
# Sequence classifier: three stacked LSTM layers over inputs of shape (30, 126),
# followed by two dense layers and a softmax with one unit per sign.
model = Sequential([
    LSTM(64, return_sequences=True, activation='relu', input_shape=(30,126)),
    LSTM(128, return_sequences=True, activation='relu'),
    # Last recurrent layer returns only its final output.
    LSTM(64, return_sequences=False, activation='relu'),
    Dense(64, activation='relu'),
    Dense(32, activation='relu'),
    Dense(signs.shape[0], activation='softmax'),
])

In [21]:
X.shape

(350, 30, 126)

In [21]:
# Multi-class setup: categorical cross-entropy loss, tracked with categorical accuracy.
model.compile(
    loss='categorical_crossentropy',
    optimizer='Adam',
    metrics=['categorical_accuracy'],
)

In [23]:
#model.fit(X_train, y_train, epochs=80, callbacks=[tb_callback])

In [24]:
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm (LSTM)                  (None, 30, 64)            48896     
_________________________________________________________________
lstm_1 (LSTM)                (None, 30, 128)           98816     
_________________________________________________________________
lstm_2 (LSTM)                (None, 64)                49408     
_________________________________________________________________
dense (Dense)                (None, 64)                4160      
_________________________________________________________________
dense_1 (Dense)              (None, 32)                2080      
_________________________________________________________________
dense_2 (Dense)              (None, 7)                 231       
Total params: 203,591
Trainable params: 203,591
Non-trainable params: 0
_________________________________________________________________


In [25]:
# Model output for the held-out sequences in X_test.
# NOTE(review): in top-to-bottom order this runs before `model.load_weights(...)`
# further down, while the `model.fit` cell is commented out. On a fresh
# "Restart & Run All" these predictions presumably come from untrained weights;
# confirm the intended cell order.
res = model.predict(X_test)

In [26]:
signs[np.argmax(res[3])]

'Hospital'

In [27]:
signs[np.argmax(y_test[3])]

'Student'

In [28]:
#model.save('ModelHermes')

In [22]:
# Load previously saved weights from 'ModelHermes' into the model.
# NOTE(review): this cell sits after the first `model.predict` call in the file
# although its execution count (22) is lower; presumably it belongs directly
# after `model.compile`. Confirm and move.
model.load_weights('ModelHermes')

In [30]:
from sklearn.metrics import multilabel_confusion_matrix, accuracy_score

In [31]:
# Model output for the *training* sequences (X_train).
# NOTE(review): metrics computed from this describe the fit on training data,
# not generalisation; use X_test / y_test for a held-out estimate if that is
# what is intended.
yhat = model.predict(X_train)

In [32]:
# Collapse the one-hot targets and the per-class model outputs to plain lists of
# integer class indices (index of the largest entry in each row).
yhat = yhat.argmax(axis=1).tolist()
ytrue = y_train.argmax(axis=1).tolist()

In [33]:
# NOTE(review): without an `axis` argument this is the index of the largest
# value in the flattened X_test array, not a class prediction. Presumably a
# leftover experiment; confirm and remove.
np.argmax(X_test)

62938

In [34]:
ytrue

[4,
 0,
 5,
 4,
 2,
 0,
 6,
 1,
 3,
 3,
 6,
 6,
 4,
 4,
 5,
 4,
 6,
 2,
 1,
 2,
 4,
 4,
 4,
 3,
 1,
 0,
 2,
 4,
 2,
 4,
 1,
 0,
 3,
 0,
 2,
 4,
 6,
 0,
 0,
 1,
 2,
 4,
 4,
 3,
 0,
 5,
 1,
 0,
 6,
 1,
 0,
 1,
 0,
 3,
 0,
 6,
 3,
 4,
 0,
 0,
 1,
 5,
 3,
 1,
 1,
 4,
 2,
 1,
 0,
 5,
 2,
 5,
 6,
 6,
 3,
 5,
 1,
 6,
 4,
 2,
 4,
 2,
 5,
 3,
 1,
 6,
 1,
 3,
 6,
 1,
 2,
 5,
 3,
 1,
 4,
 0,
 5,
 5,
 5,
 5,
 5,
 5,
 0,
 2,
 3,
 6,
 6,
 2,
 5,
 4,
 3,
 6,
 4,
 2,
 4,
 4,
 4,
 5,
 0,
 2,
 6,
 0,
 3,
 5,
 3,
 0,
 6,
 0,
 4,
 3,
 0,
 1,
 2,
 5,
 3,
 0,
 4,
 6,
 6,
 5,
 0,
 3,
 5,
 3,
 2,
 3,
 6,
 2,
 4,
 3,
 4,
 0,
 6,
 4,
 4,
 2,
 3,
 2,
 4,
 6,
 6,
 6,
 3,
 5,
 6,
 0,
 5,
 3,
 1,
 3,
 4,
 6,
 1,
 1,
 6,
 5,
 6,
 0,
 5,
 1,
 1,
 4,
 1,
 4,
 3,
 1,
 4,
 6,
 5,
 5,
 0,
 2,
 4,
 0,
 5,
 4,
 6,
 3,
 5,
 5,
 3,
 4,
 3,
 1,
 1,
 3,
 2,
 6,
 4,
 0,
 3,
 4,
 4,
 1,
 2,
 5,
 2,
 0,
 1,
 1,
 6,
 3,
 3,
 2,
 4,
 6,
 1,
 2,
 6,
 2,
 6,
 1,
 4,
 1,
 3,
 0,
 4,
 2,
 2,
 2,
 5,
 1,
 3,
 6,
 2,
 0,
 3,
 4,
 3,
 2,


In [35]:
multilabel_confusion_matrix(ytrue, yhat)

array([[[284,   0],
        [  1,  47]],

       [[286,   0],
        [  1,  45]],

       [[285,   0],
        [  0,  47]],

       [[283,   1],
        [  0,  48]],

       [[284,   0],
        [  1,  47]],

       [[282,   0],
        [  0,  50]],

       [[285,   2],
        [  0,  45]]])

In [28]:
# Accuracy of the predicted class indices against the true ones.
# NOTE(review): the NameError recorded below presumably comes from running this
# cell (execution count 28) before the `sklearn.metrics` import cell (count 30);
# re-run the notebook in order to refresh the output.
accuracy_score(ytrue,yhat)

NameError: name 'accuracy_score' is not defined

In [40]:
# Test in real time: classify a sliding window of webcam frames and show the
# recognised signs as a running sentence. Press 'q' to stop.
sequence = []      # most recent per-frame keypoint vectors (at most sqLength of them)
sentence = []      # accepted signs, newest last
threshold = 0.96   # minimum model output for a prediction to be accepted

cap = cv2.VideoCapture(0)

try:
    with mp_holistic.Holistic(min_detection_confidence=0.5, min_tracking_confidence=0.5) as holistic:
        while cap.isOpened():
            ret, frame = cap.read()
            # A failed read is reported through `ret`; stop instead of passing an
            # unusable frame to cvtColor.
            if not ret:
                break

            image, results = media_pipe_detection(frame, holistic)

            draw_landmarks(image, results)

            # Add the new frame to the window; once the window holds sqLength
            # frames (the length of the recorded sequences), run the recognition.
            keypoints = extractPoints(results)
            sequence.append(keypoints)
            sequence = sequence[-sqLength:]

            if len(sequence) == sqLength:
                res = model.predict(np.expand_dims(sequence, axis=0))[0]
                best = np.argmax(res)
                print(signs[best])

                # Accept only confident predictions, and do not repeat the sign
                # that is already at the end of the sentence.
                if res[best] > threshold and (not sentence or signs[best] != sentence[-1]):
                    sentence.append(signs[best])

            # Keep the banner short: only the five most recent signs.
            if len(sentence) > 5:
                sentence = sentence[-5:]

            # The banner spans the actual frame width instead of an assumed 640 px.
            cv2.rectangle(image, (0,0), (image.shape[1], 40), (245, 117, 16), -1)
            cv2.putText(image, ' '.join(sentence), (3, 30), cv2.FONT_HERSHEY_SIMPLEX, 1, (255,255,255), 2, cv2.LINE_AA)

            cv2.imshow('OpenCV Feed', image)
            if cv2.waitKey(10) & 0xFF == ord('q'):
                break
finally:
    # Always free the camera and close the window, even if a frame raised.
    cap.release()
    cv2.destroyAllWindows()

Doctor
Doctor
Doctor
Doctor
Doctor
Doctor
Doctor
Doctor
Doctor
Doctor
Doctor
Doctor
Doctor
Doctor
Doctor
Doctor


In [38]:
# Real-time recognition with human feedback. Each confident prediction is shown
# for up to 5 seconds; pressing 't' confirms it, which appends the sign to the
# sentence and stores the window as a new training sequence. Any other key, or
# no key, discards it. Press 'q' in the normal preview to stop.
sequence = []     # most recent per-frame keypoint vectors (at most sqLength of them)
sentence = []     # confirmed signs, newest last
threshold = 0.7   # minimum model output for a prediction to be shown

cap = cv2.VideoCapture(0)

try:
    with mp_holistic.Holistic(min_detection_confidence=0.5, min_tracking_confidence=0.5) as holistic:
        while cap.isOpened():
            ret, frame = cap.read()
            # A failed read is reported through `ret`; stop instead of passing an
            # unusable frame to cvtColor.
            if not ret:
                break

            image, results = media_pipe_detection(frame, holistic)

            draw_landmarks(image, results)

            # Add the new frame to the window; once the window holds sqLength
            # frames (the length of the recorded sequences), run the recognition.
            keypoints = extractPoints(results)
            sequence.append(keypoints)
            sequence = sequence[-sqLength:]

            if len(sequence) == sqLength:
                res = model.predict(np.expand_dims(sequence, axis=0))[0]
                best = np.argmax(res)

                if res[best] > threshold:
                    predicted_sign = signs[best]
                    cv2.rectangle(image, (0,0), (image.shape[1], 40), (245, 117, 16), -1)
                    cv2.putText(image, predicted_sign, (3, 30), cv2.FONT_HERSHEY_SIMPLEX, 1, (255,255,255), 2, cv2.LINE_AA)
                    cv2.imshow('OpenCV Feed', image)
                    print(predicted_sign)

                    # Mask to the low byte like every other waitKey check in the notebook.
                    if cv2.waitKey(5000) & 0xFF == ord('t'):
                        # The prediction was confirmed as correct.
                        sentence.append(predicted_sign)

                        # Store the window with the same layout as the recorded
                        # data: DATA_PATH/<sign>/<sequence index>/<frame>.npy.
                        # The new index follows the highest existing numeric folder.
                        # NOTE: the loading cell only reads sequences 0..sqs-1, so
                        # `sqs` must be raised for these sequences to be used.
                        sign_dir = os.path.join(DATA_PATH, predicted_sign)
                        os.makedirs(sign_dir, exist_ok=True)
                        taken = [
                            int(name) for name in os.listdir(sign_dir)
                            if name.isdigit() and os.path.isdir(os.path.join(sign_dir, name))
                        ]
                        thepath = os.path.join(sign_dir, str(max(taken) + 1 if taken else 0))
                        os.makedirs(thepath)
                        for frameNum, h in enumerate(sequence):
                            np.save(os.path.join(thepath, str(frameNum)), h)

                    # Confirmed or rejected, start over with an empty window so the
                    # same frames are not classified again.
                    sequence = []

            # Keep the banner short: only the five most recent signs.
            if len(sentence) > 5:
                sentence = sentence[-5:]

            # The banner spans the actual frame width instead of an assumed 640 px.
            cv2.rectangle(image, (0,0), (image.shape[1], 40), (245, 117, 16), -1)
            cv2.putText(image, ' '.join(sentence), (3, 30), cv2.FONT_HERSHEY_SIMPLEX, 1, (255,255,255), 2, cv2.LINE_AA)

            cv2.imshow('OpenCV Feed', image)
            if cv2.waitKey(10) & 0xFF == ord('q'):
                break
finally:
    # Always free the camera and close the window, even if a frame raised.
    cap.release()
    cv2.destroyAllWindows()