## 1. Import and Install Dependencies

In [None]:
%pip install --user tensorflow==2.6.0 tensorflow-gpu==2.6.0 opencv-python mediapipe sklearn matplotlib

In [None]:
import cv2
import numpy as np
import os
from matplotlib import pyplot as plt
import time
import mediapipe as mp

## 2. Keypoints using Mediapipe Holistic

In [None]:
# Short aliases for the MediaPipe sub-modules used throughout the notebook.
mp_holistic = mp.solutions.holistic # Holistic model
mp_drawing = mp.solutions.drawing_utils # Drawing utilities

In [None]:
def mediapipe_detection(image, model):
    """Run a MediaPipe model on one BGR frame.

    Args:
        image: Frame in OpenCV's BGR channel order.
        model: Object exposing ``process(rgb_image)`` (e.g. a Holistic instance).

    Returns:
        Tuple ``(bgr_image, results)``: the frame converted back to BGR and
        whatever ``model.process`` returned.
    """
    rgb_frame = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
    # The array is flagged read-only for the duration of the inference call.
    rgb_frame.flags.writeable = False
    results = model.process(rgb_frame)
    rgb_frame.flags.writeable = True
    bgr_frame = cv2.cvtColor(rgb_frame, cv2.COLOR_RGB2BGR)
    return bgr_frame, results

def draw_styled_landmarks(image, results):
    """Draw pose and hand landmarks (with their connections) onto ``image`` in place.

    Face-mesh drawing is intentionally not performed; only the pose, the left
    hand and the right hand are rendered, each with its own colour pair.

    Args:
        image: BGR frame that is drawn on.
        results: Detection results exposing ``pose_landmarks``,
            ``left_hand_landmarks`` and ``right_hand_landmarks``.
    """
    # (landmark list, connection set, landmark colour, connection colour)
    layers = (
        (results.pose_landmarks, mp_holistic.POSE_CONNECTIONS, (80,22,10), (80,44,121)),
        (results.left_hand_landmarks, mp_holistic.HAND_CONNECTIONS, (121,22,76), (121,44,250)),
        (results.right_hand_landmarks, mp_holistic.HAND_CONNECTIONS, (245,117,66), (245,66,230)),
    )
    for landmark_list, connections, point_color, line_color in layers:
        point_spec = mp_drawing.DrawingSpec(color=point_color, thickness=2, circle_radius=4)
        line_spec = mp_drawing.DrawingSpec(color=line_color, thickness=2, circle_radius=2)
        mp_drawing.draw_landmarks(image, landmark_list, connections, point_spec, line_spec)

def extract_keypoints(results):
    """Flatten the pose and hand landmarks of one frame into a single feature vector.

    Layout of the returned 1-D array (258 values):
        [0:132]   pose       - 33 landmarks * (x, y, z, visibility)
        [132:195] left hand  - 21 landmarks * (x, y, z)
        [195:258] right hand - 21 landmarks * (x, y, z)

    A part that was not detected in the frame is filled with zeros so the
    vector length is always the same. Hand landmarks carry no ``visibility``
    value, which is why they contribute only three numbers each. Face landmarks
    are deliberately not part of the vector.

    Args:
        results: Detection results exposing ``pose_landmarks``,
            ``left_hand_landmarks`` and ``right_hand_landmarks``; each is either
            falsy or has a ``landmark`` sequence.

    Returns:
        numpy.ndarray of shape (258,).
    """
    def _flatten(landmark_list, fields, n_landmarks):
        # Zero block when the body part is missing from this frame.
        if not landmark_list:
            return np.zeros(n_landmarks * len(fields))
        return np.array(
            [[getattr(point, field) for field in fields] for point in landmark_list.landmark]
        ).flatten()

    pose = _flatten(results.pose_landmarks, ('x', 'y', 'z', 'visibility'), 33)
    lh = _flatten(results.left_hand_landmarks, ('x', 'y', 'z'), 21)
    rh = _flatten(results.right_hand_landmarks, ('x', 'y', 'z'), 21)
    # The original concatenation referenced an undefined `face` array and raised NameError.
    return np.concatenate([pose, lh, rh])

## 4. Setup folders for collection

In [None]:
# Root folder of the exported keypoint dataset (numpy arrays), one sub-folder per action.
# (A previous 'MP_Data' assignment was dead code: it was overwritten immediately.)
DATA_PATH = os.path.join('data', 'train')

# Candidate sign-language words.
wlasl = ['book', 'drink', 'computer', 'before', 'chair', 'go', 'clothes', 'who', 'candy', 'cousin','deaf', 'fine', 'help', 'no', 'thin', 'walk', 'year', 'yes', 'all', 'black', 'cool', 'finish', 'hot', 'like', 'many', 'mother', 'now', 'orange', 'table', 'thanksgiving', 'what', 'woman', 'bed', 'blue', 'bowling', 'can', 'dog', 'white', 'wrong', 'accident', 'apple', 'bird', 'change', 'color', 'corn', 'cow', 'dance', 'dark', 'doctor']
# Actions that we try to detect. To add a class, widen the slice below; the names
# are used as folder names, so they must match the folders under data\WLASL_train.
actions = np.array(sorted(wlasl[0:10]))

# Videos are going to be 30 frames in length
sequence_length = 30

# The number of videos per action is not a fixed constant here (the old
# `no_sequences` / `start_folder` settings are disabled); it is taken from the
# files found on disk for each action.

In [None]:
# For every action, collect the frame count of each video in the dataset and
# print it together with the number of files found.
for action in actions:
    fps_list = []  # despite the name, holds the number of frames of each video
    # Point os.walk at the folder that contains this action's videos.
    for root, dirs, files in os.walk(r"C:\deep-learning\HKMU\Extrat_keypoints\data\WLASL_train\{}".format(action)):
        for file in files:
            if file.endswith('.mp4'):
                path = os.path.join(root, file)
                video = cv2.VideoCapture(path)
                try:
                    # CAP_PROP_FRAME_COUNT is property id 7, the value the original passed literally.
                    no_fps = video.get(cv2.CAP_PROP_FRAME_COUNT)
                finally:
                    # Release the file handle / decoder; it was leaked before.
                    video.release()
                fps_list.append(no_fps)
        # len(files) is the per-action file count that the folder-creation cell uses as
        # the number of sequences. It is only meaningful inside this os.walk loop;
        # outside it `files` would refer to the last directory visited.
        print(action, "'s # of videos: ", len(files))
        print("The frames that each video contains: ", fps_list,'\n')

In [None]:
# Create one output folder per source video of every action:
# DATA_PATH/<action>/<index>, with index running from 0 to (number of files - 1).
for action in actions:
    for root, dirs, files in os.walk(r"C:\deep-learning\HKMU\Extrat_keypoints\data\WLASL_train\{}".format(action)):
        for sequence in range(len(files)):
            # exist_ok makes re-running the cell harmless without hiding real
            # failures (permissions, invalid path), which a bare `except: pass` did.
            os.makedirs(os.path.join(DATA_PATH, action, str(sequence)), exist_ok=True)

## 5. Collect Keypoint Values for Training and Testing

In [None]:
# Record keypoints from the webcam: for every action and every prepared sequence
# folder, capture `sequence_length` frames and store one .npy file per frame.
cap = cv2.VideoCapture(0)
stop_requested = False  # set when 'q' is pressed or the camera stops delivering frames
try:
    # Set mediapipe model
    with mp_holistic.Holistic(min_detection_confidence=0.5, min_tracking_confidence=0.5) as holistic:
        # Loop through actions
        for action in actions:
            # `no_sequences` was never defined. np.save below needs the folder
            # DATA_PATH/<action>/<sequence> to exist, so record exactly as many
            # sequences as there are folders for this action.
            # NOTE(review): assumes the folders are named 0..n-1 — confirm.
            no_sequences = len(os.listdir(os.path.join(DATA_PATH, action)))
            # Loop through sequences aka videos
            for sequence in range(no_sequences):
                # Loop through video length aka sequence length
                for frame_num in range(sequence_length):

                    # Read feed; `ret` is False when no frame could be grabbed
                    ret, frame = cap.read()
                    if not ret:
                        stop_requested = True
                        break

                    # Make detections
                    image, results = mediapipe_detection(frame, holistic)

                    # Draw landmarks
                    draw_styled_landmarks(image, results)

                    # Wait logic: announce the start of every new sequence and pause briefly
                    if frame_num == 0:
                        cv2.putText(image, 'START COLLECTION', (120,200),
                                   cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 0), 4, cv2.LINE_AA)
                    cv2.putText(image, 'Collecting frmaes for {} video Number {}'.format(action,sequence), (15,12),
                               cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 0, 255), 1, cv2.LINE_AA)
                    # Show to screen
                    cv2.imshow("OpenCV feed", image)
                    if frame_num == 0:
                        cv2.waitKey(2000)

                    # Export keypoints
                    keypoints = extract_keypoints(results)
                    npy_path = os.path.join(DATA_PATH, action, str(sequence), str(frame_num))
                    np.save(npy_path, keypoints)

                    # Break gracefully
                    if cv2.waitKey(10) & 0xFF == ord('q'):
                        stop_requested = True
                        break
                # A plain `break` only leaves the innermost loop; propagate the stop outwards.
                if stop_requested:
                    break
            if stop_requested:
                break
finally:
    # Always free the camera and close the preview window, even after an error.
    cap.release()
    cv2.destroyAllWindows()

## 6. Preprocess Data and Create Labels and Features

In [None]:
from sklearn.model_selection import train_test_split
from tensorflow.python.keras.utils.np_utils import to_categorical ##与Up主不一致的新的import方法
import tensorflow as tf

In [None]:
def label_data(base_data_path, actions, sequence_length):
    """Load the stored keypoint sequences and their integer class labels.

    Expected layout on disk: ``base_data_path/<action>/<i>/<frame>.npy`` where
    ``i`` runs from 0 to (number of entries in the action folder - 1) and
    ``frame`` runs from 0 to ``sequence_length - 1``.

    Args:
        base_data_path: Root folder of the dataset split.
        actions: Iterable of action names; position in the iterable is the label.
        sequence_length: Number of frame files to read per sequence.

    Returns:
        Tuple ``(sequences, labels)``: a list with one list of per-frame arrays
        per sequence, and the parallel list of integer labels. The label map is
        printed as a side effect.
    """
    label_map = {name: index for index, name in enumerate(actions)}
    sequences = []
    labels = []
    for action in actions:
        action_dir = os.path.join(base_data_path, action)
        n_sequences = len(os.listdir(action_dir))
        for seq_index in range(n_sequences):
            seq_dir = os.path.join(action_dir, str(seq_index))
            frames = [
                np.load(os.path.join(seq_dir, "{}.npy".format(frame_index)))
                for frame_index in range(sequence_length)
            ]
            sequences.append(frames)
            labels.append(label_map[action])
    print(label_map)
    return sequences, labels

In [None]:
# Root folders of the keypoint dataset splits passed to label_data().
train_data_path = "data/train"
val_data_path = "data/val"

In [None]:
# Load both splits; each call returns (list of frame sequences, list of integer labels).
train_sequences, train_labels = label_data(train_data_path, actions, sequence_length)
val_sequences, val_labels = label_data(val_data_path, actions, sequence_length)

In [None]:
# One-hot encode the integer labels.
# num_classes is fixed to the number of actions so that both splits get the same
# width as the model's softmax output; without it the width is inferred from the
# largest label present, which differs when a split lacks the last class.
num_classes = len(actions)
train_labels_categorical = to_categorical(train_labels, num_classes=num_classes).astype(int)
val_labels_categorical = to_categorical(val_labels, num_classes=num_classes).astype(int)

In [None]:
# Stack the per-sequence frame lists into arrays; labels are the one-hot matrices from above.
X_train, y_train = np.array(train_sequences),train_labels_categorical
X_val, y_val = np.array(val_sequences), val_labels_categorical

## 7. Build and Train LSTM Neural Network

In [None]:
from tensorflow.python.keras.models import Sequential # allow us to build a sequential neural network
from tensorflow.python.keras.layers import LSTM, Dense,Dropout # LSTM: temoporal component, Dense: a normal fully connected layer
from tensorflow.python.keras.callbacks import TensorBoard, ModelCheckpoint # allow us to perform some logging inside, trace and monitor our model as its training
import datetime

In [None]:
# Timestamped TensorBoard log folder: Logs/LSTM/<YYYY-MM-DD-HH-MM>-L2
now = datetime.datetime.now()
run_stamp = now.strftime("%Y-%m-%d-%H-%M")
log_dir = os.path.join('Logs', "LSTM", run_stamp + "-L2")
tb_callback = TensorBoard(log_dir=log_dir)
print(run_stamp)

In [None]:
# dropout_rate = 0.5

# model = Sequential()
# model.add(LSTM(64, return_sequences=True, activation='relu', input_shape=(30, 258))) # 64 LSTM units not layers 30 frames with 1662 param
# model.add(Dropout(dropout_rate))
# model.add(LSTM(128, return_sequences=True, activation='relu'))
# model.add(Dropout(dropout_rate))
# model.add(LSTM(64, return_sequences=False, activation='relu'))
# model.add(Dropout(dropout_rate))
# model.add(Dense(64, activation='relu'))
# model.add(Dropout(dropout_rate))
# model.add(Dense(64, activation='relu'))
# model.add(Dropout(dropout_rate))
# model.add(Dense(32, activation='relu'))
# model.add(Dropout(dropout_rate))
# model.add(Dense(actions.shape[0], activation='softmax'))

In [None]:
# Use the same Keras implementation as the layers/models imported above instead of
# mixing in the standalone `keras` package.
from tensorflow.python.keras.regularizers import l2

reg_strength = 0.001  # L2 penalty applied to every kernel
# Values per frame: 33 pose landmarks * 4 + 2 hands * 21 landmarks * 3 = 258
n_features = 258

# Three stacked LSTM layers (64/128/64 units) followed by dense layers and a
# softmax over the actions. Input: `sequence_length` frames of `n_features` values.
model = Sequential()
model.add(LSTM(64, return_sequences=True, activation='relu', input_shape=(sequence_length, n_features), kernel_regularizer=l2(reg_strength)))
model.add(LSTM(128, return_sequences=True, activation='relu', kernel_regularizer=l2(reg_strength)))
# Last recurrent layer returns only its final output, which feeds the dense head.
model.add(LSTM(64, return_sequences=False, activation='relu', kernel_regularizer=l2(reg_strength)))
model.add(Dense(64, activation='relu', kernel_regularizer=l2(reg_strength)))
model.add(Dense(64, activation='relu', kernel_regularizer=l2(reg_strength)))
model.add(Dense(32, activation='relu', kernel_regularizer=l2(reg_strength)))
model.add(Dense(actions.shape[0], activation='softmax', kernel_regularizer=l2(reg_strength)))

In [None]:
# Multi-class problem with one-hot labels, hence categorical cross-entropy.
model.compile(optimizer='adam',  
loss='categorical_crossentropy', 
metrics = ['accuracy']) # multi-class loss; a two-class problem would use binary_crossentropy

In [None]:
# Set up the ModelCheckpoint callback: write the weights (not the full model) to
# checkpoint_filepath whenever the monitored validation loss reaches a new minimum.
# NOTE: the callback only has an effect when it is passed to model.fit(callbacks=[...]).
checkpoint_filepath = ("models/LSTM/best_model_weights.h5")
checkpoint_callback = ModelCheckpoint(
    filepath=checkpoint_filepath,
    monitor="val_loss",
    mode="min",
    save_best_only=True,
    save_weights_only=True,
    verbose=1
)

In [None]:
# Make sure the checkpoint target folder exists before the first save is attempted.
os.makedirs(os.path.dirname(checkpoint_filepath), exist_ok=True)

# Train with TensorBoard logging and best-weights checkpointing. The checkpoint
# callback was configured earlier but had never been passed to fit, so it was inert.
model.fit(
    X_train,
    y_train,
    epochs=200,
    callbacks=[tb_callback, checkpoint_callback],
    validation_data=(X_val, y_val))

In [None]:
# Print the layer-by-layer architecture and parameter counts.
model.summary()

## 8. Make Predictions

In [None]:
# Predict class probabilities for the validation sequences.
# `X_test` is never defined in this notebook; the held-out split is X_val.
res = model.predict(X_val)

In [None]:
# Predicted action for the first sample: name of the class with the highest probability.
actions[np.argmax(res[0])]

In [None]:
# Ground-truth action of the first validation sample, decoded from its one-hot label.
# `y_test` is never defined in this notebook; the held-out labels are y_val.
actions[np.argmax(y_val[0])]

## 9. Save Weights

In [None]:
# Save the trained model to an HDF5 file in the working directory.
model.save('action_220_5.h5')

In [None]:
# Remove the in-memory model. NOTE: after this cell `model` is undefined and must be
# recreated before any later cell can use it.
del model

In [None]:
# `model` was deleted in the previous cell, so `model.load_weights(...)` raised
# NameError. Recreate the model object from the saved file instead.
# NOTE(review): assumes 'action_220_3.h5' was written by model.save() (architecture
# plus weights), like 'action_220_5.h5' — confirm. Also confirm that loading
# the _3 file rather than the freshly saved _5 file is intentional.
from tensorflow.python.keras.models import load_model

model = load_model('action_220_3.h5')

## 10. Evaluation using Confusion Matrix and Accuracy

In [None]:
from sklearn.metrics import multilabel_confusion_matrix, accuracy_score

In [None]:
# Predict class probabilities for the training sequences.
# NOTE(review): metrics computed from this describe fit on the training data, not
# generalization — consider evaluating on X_val / y_val as well.
yhat = model.predict(X_train)

In [None]:
# Convert one-hot labels and predicted probabilities to lists of class indices.
# NOTE: `yhat` is overwritten here, so this cell can only be run once after the
# prediction cell; re-running it alone fails.
ytrue = np.argmax(y_train, axis = 1).tolist()
yhat = np.argmax(yhat, axis = 1).tolist()

In [None]:
# Per-class (one-vs-rest) confusion matrices for the predictions.
multilabel_confusion_matrix(ytrue,yhat)

In [None]:
# Fraction of samples whose predicted class equals the true class.
accuracy_score(ytrue,yhat)

## 11. Test in Real Time

In [None]:
# Bar colours (BGR); reused cyclically when there are more classes than colours.
colors = [(245,117,16), (117,245,16), (16,117,245)]
def prob_viz(res, actions, input_frame, colors):
    """Overlay one horizontal probability bar per class on a copy of the frame.

    Args:
        res: Sequence of class probabilities, one per action.
        actions: Sequence of action names, parallel to ``res``.
        input_frame: Image to annotate; it is copied, not modified.
        colors: Non-empty sequence of colour tuples. Colours are cycled, so it
            may be shorter than ``res`` (indexing it directly raised IndexError
            for the fourth class onwards).

    Returns:
        The annotated copy of ``input_frame``.
    """
    output_frame = input_frame.copy()
    for num, prob in enumerate(res):
        bar_color = colors[num % len(colors)]
        # Bar length is the probability scaled to 0..100 pixels; rows are 40 px apart.
        cv2.rectangle(output_frame, (0,60+num*40), (int(prob*100), 90+num*40), bar_color, -1)
        cv2.putText(output_frame, actions[num], (0, 85+num*40), cv2.FONT_HERSHEY_SIMPLEX, 1, (255,255,255), 2, cv2.LINE_AA)

    return output_frame

In [None]:
# 1. New detection variables
sequence = []     # rolling window of the most recent keypoint vectors
sentence = []     # recently recognised actions shown in the top banner
threshold = 0.95  # minimum probability for a prediction to be accepted

cap = cv2.VideoCapture(0)
try:
    # Set mediapipe model
    with mp_holistic.Holistic(min_detection_confidence=0.5, min_tracking_confidence=0.5) as holistic:
        while cap.isOpened():

            # Read feed; `ret` is False when no frame could be grabbed, in which
            # case `frame` is None and must not be passed on.
            ret, frame = cap.read()
            if not ret:
                break

            # Make detections
            image, results = mediapipe_detection(frame, holistic)

            # Draw landmarks
            draw_styled_landmarks(image, results)

            # 2. Prediction logic: keep only the latest `sequence_length` frames
            keypoints = extract_keypoints(results)
            sequence.append(keypoints)
            sequence = sequence[-sequence_length:]

            if len(sequence) == sequence_length:
                res = model.predict(np.expand_dims(sequence, axis=0))[0]
                predicted = actions[np.argmax(res)]
                print(predicted)

                # 3. Viz logic: append a confident prediction unless it repeats the last word
                if res[np.argmax(res)] > threshold:
                    if not sentence or predicted != sentence[-1]:
                        sentence.append(predicted)

                # Show at most the five latest words
                if len(sentence) > 5:
                    sentence = sentence[-5:]

                # Viz probabilities
                image = prob_viz(res, actions, image, colors)

            cv2.rectangle(image, (0,0), (640, 40), (245, 117, 16), -1)
            cv2.putText(image, ' '.join(sentence), (3,30),
                           cv2.FONT_HERSHEY_SIMPLEX, 1, (255, 255, 255), 2, cv2.LINE_AA)

            # Show to screen
            cv2.imshow("OpenCV feed", image)

            # Break gracefully
            if cv2.waitKey(10) & 0xFF == ord('q'):
                break
finally:
    # Always free the camera and close the preview window, even after an error.
    cap.release()
    cv2.destroyAllWindows()

In [None]:
# Shape of a single input sample: (frames, features).
# `X_test` is never defined in this notebook; the held-out split is X_val.
X_val[0].shape