In [1]:
from tensorflow_docs.vis import embed
from tensorflow import keras
from imutils import paths
import tensorflowjs as tfjs

import matplotlib.pyplot as plt
import tensorflow as tf
import pandas as pd
import numpy as np
import imageio
import cv2
import os
import math
from keras.preprocessing import image
from keras.utils import np_utils

In [2]:
# ? define hyperparameters
# -- frame / feature geometry ------------------------------------------------
# Side length (pixels) that every frame is resized to; also the input size of
# the feature extractor. (500 was tried as an alternative value.)
IMG_SIZE = 224
# Width of the per-frame feature vector stored in the feature arrays.
NUM_FEATURES = 2048
# Number of time steps each video is padded/truncated to.
# (20 was tried as an alternative value.)
MAX_SEQ_LENGTH = 500

# -- training ----------------------------------------------------------------
# NOTE(review): BATCH_SIZE is not referenced by any cell visible here — confirm
# whether it is used elsewhere.
BATCH_SIZE = 64
EPOCHS = 100

In [3]:
# ? data preparation
# Each CSV is read into a DataFrame; later cells use its "video_name" and
# "classification" columns. These are the 3-class splits.
train_df = pd.read_csv("./train-1.csv")
test_df = pd.read_csv("./test-1.csv")

# Report the size of each split (one line per split).
print(
    f"Total videos for training: {len(train_df)}",
    f"Total videos for testing: {len(test_df)}",
    sep="\n",
)

Total videos for training: 188
Total videos for testing: 126


In [4]:
def crop_center_square(frame):
    """Return the largest centred square region of an image array.

    Args:
        frame: Array whose first two axes are (height, width); any trailing
            axes (e.g. channels) are kept untouched.

    Returns:
        A slice of `frame` whose height and width both equal
        min(height, width), taken around the centre.
    """
    height, width = frame.shape[:2]
    side = min(height, width)
    # Offsets of the square's top-left corner; zero along the shorter axis.
    top = height // 2 - side // 2
    left = width // 2 - side // 2
    return frame[top : top + side, left : left + side]

def load_video(path, max_frames=0, resize=(IMG_SIZE, IMG_SIZE)):
    """Decode a video file into an array of square, resized frames.

    Args:
        path: Location of the video, handed to `cv2.VideoCapture`.
        max_frames: Stop once this many frames have been collected. With the
            default of 0 the count never matches, so every frame is read.
        resize: Target size passed to `cv2.resize`.

    Returns:
        `np.array` built from the list of processed frames. If no frame could
        be read the list is empty, so the result is an empty array.
    """
    capture = cv2.VideoCapture(path)
    collected = []
    try:
        grabbed, image = capture.read()
        while grabbed:
            squared = crop_center_square(image)
            resized = cv2.resize(squared, resize)
            # Reverse the channel order (OpenCV decodes frames as BGR, so this
            # yields RGB).
            collected.append(resized[:, :, [2, 1, 0]])

            if len(collected) == max_frames:
                break
            grabbed, image = capture.read()
    finally:
        # Always release the capture handle, even if processing raised.
        capture.release()
    return np.array(collected)

def build_feature_extractor():
    """Wrap ImageNet-pretrained InceptionV3 as a per-frame feature extractor.

    The returned model takes images of shape (IMG_SIZE, IMG_SIZE, 3), applies
    InceptionV3's own `preprocess_input`, and outputs the network's
    average-pooled features (the classification head is not included).

    Returns:
        A `keras.Model` named "feature_extractor".
    """
    frame_shape = (IMG_SIZE, IMG_SIZE, 3)

    backbone = keras.applications.InceptionV3(
        weights="imagenet",
        include_top=False,
        pooling="avg",
        input_shape=frame_shape,
    )

    # Preprocessing is baked into the graph so callers can pass raw frames.
    raw_frames = keras.Input(frame_shape)
    scaled_frames = keras.applications.inception_v3.preprocess_input(raw_frames)
    pooled_features = backbone(scaled_frames)

    return keras.Model(raw_frames, pooled_features, name="feature_extractor")

In [5]:
# Instantiate the per-frame feature extractor once; later cells call its
# `predict` method.
feature_extractor = build_feature_extractor()

# Lookup layer turning class-name strings into integer ids. The vocabulary is
# the sorted set of unique labels in the training CSV, with no out-of-vocabulary
# slot reserved.
label_processor = keras.layers.StringLookup(
    vocabulary=np.unique(train_df["classification"]),
    num_oov_indices=0,
)

# Show the class order; position in this list is the integer label.
print(label_processor.get_vocabulary())

['BodyWeightSquats', 'PullUps', 'PushUps']


In [7]:
def prepare_all_videos(df, root_dir):
    """Extract padded per-frame features, masks and labels for every video in `df`.

    Args:
        df: DataFrame with a "video_name" column (paths relative to
            `root_dir`) and a "classification" column (class-name strings).
        root_dir: Directory that each "video_name" entry is joined onto.

    Returns:
        `((frame_features, frame_masks), labels)` where

        * `frame_features` is float32 with shape
          (len(df), MAX_SEQ_LENGTH, NUM_FEATURES); rows past a video's last
          frame stay zero.
        * `frame_masks` is bool with shape (len(df), MAX_SEQ_LENGTH);
          True = real frame, False = padding.
        * `labels` is `label_processor` applied to the class names as a column
          vector, converted with `.numpy()`.

    A video that yields no frames keeps an all-zero feature row and an
    all-False mask row.
    """
    num_samples = len(df)
    video_paths = df["video_name"].values.tolist()
    labels = df["classification"].values
    labels = label_processor(labels[..., None]).numpy()

    # `frame_masks` and `frame_features` are what we will feed to our sequence model.
    # `frame_masks` will contain a bunch of booleans denoting if a timestep is
    # masked with padding or not.
    frame_masks = np.zeros(shape=(num_samples, MAX_SEQ_LENGTH), dtype="bool")
    frame_features = np.zeros(
        shape=(num_samples, MAX_SEQ_LENGTH, NUM_FEATURES), dtype="float32"
    )

    for idx, path in enumerate(video_paths):
        # Progress indicator: position of the video currently being processed.
        print(idx)

        # Only the first MAX_SEQ_LENGTH frames are ever used, so stop decoding
        # there instead of loading the whole video into memory.
        frames = load_video(os.path.join(root_dir, path), max_frames=MAX_SEQ_LENGTH)
        length = min(MAX_SEQ_LENGTH, len(frames))
        if length == 0:
            # Nothing could be read: leave this row as zeros / fully masked.
            continue

        # One batched call per video; `predict` is designed for batches and is
        # very slow when invoked once per frame.
        frame_features[idx, :length, :] = feature_extractor.predict(
            frames[:length], verbose=0
        )
        frame_masks[idx, :length] = True  # True = not masked, False = masked

    return (frame_features, frame_masks), labels

In [8]:
# Run feature extraction for both splits (3-class dataset folders).
# For the many-class dataset the roots were "../../../../CNN_DATASET_1/train"
# and "../../../../CNN_DATASET_1/test" instead.
train_data, train_labels = prepare_all_videos(train_df, "./train-1")
test_data, test_labels = prepare_all_videos(test_df, "./test-1")

# Each *_data value is a (frame_features, frame_masks) pair.
print(
    f"Frame features in train set: {train_data[0].shape}",
    f"Frame masks in train set: {train_data[1].shape}",
    sep="\n",
)

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116


In [9]:
# ? sequence model
# Utility for our sequence model.
def get_sequence_model():
    """Build and compile the GRU classifier that consumes per-frame features.

    Inputs (in order): a (MAX_SEQ_LENGTH, NUM_FEATURES) feature sequence and a
    boolean (MAX_SEQ_LENGTH,) mask. Output: a softmax over the classes in
    `label_processor`'s vocabulary.

    Returns:
        A compiled `keras.Model` (sparse categorical cross-entropy loss, Adam
        optimizer, accuracy metric).
    """
    num_classes = len(label_processor.get_vocabulary())

    features_in = keras.Input((MAX_SEQ_LENGTH, NUM_FEATURES))
    mask_in = keras.Input((MAX_SEQ_LENGTH,), dtype="bool")

    # The mask is passed explicitly to the first recurrent layer; see
    # https://keras.io/api/layers/recurrent_layers/gru/ for what `mask` does.
    hidden = keras.layers.GRU(16, return_sequences=True)(features_in, mask=mask_in)

    # Remaining layers are applied one after another to the previous output.
    head_layers = (
        keras.layers.GRU(8),
        keras.layers.Dropout(0.4),
        keras.layers.Dense(8, activation="relu"),
        keras.layers.Dense(num_classes, activation="softmax"),
    )
    for layer in head_layers:
        hidden = layer(hidden)

    rnn_model = keras.Model([features_in, mask_in], hidden)
    rnn_model.compile(
        loss="sparse_categorical_crossentropy",
        optimizer="adam",
        metrics=["accuracy"],
    )
    return rnn_model

# Utility for running experiments.
def run_experiment():
    """Train a fresh sequence model, restore its best weights and score it.

    Uses the module-level `train_data` / `train_labels` for fitting and
    `test_data` / `test_labels` for the final evaluation, and prints the test
    accuracy as a percentage.

    Returns:
        `(history, seq_model)`: the object returned by `fit` and the model
        with the checkpointed weights loaded.
    """
    weights_path = "./tmp/video_classifier"
    # Keep only the weights of the best epoch seen so far on disk.
    best_weights_saver = keras.callbacks.ModelCheckpoint(
        weights_path, save_weights_only=True, save_best_only=True, verbose=1
    )

    model = get_sequence_model()

    train_features, train_masks = train_data[0], train_data[1]
    # NOTE(review): per the Keras docs, `validation_split` holds out the *last*
    # 30% of the samples before shuffling — confirm the training CSV is not
    # ordered by class.
    history = model.fit(
        [train_features, train_masks],
        train_labels,
        validation_split=0.3,
        epochs=EPOCHS,
        callbacks=[best_weights_saver],
    )

    # Evaluate with the checkpointed weights rather than the last epoch's.
    model.load_weights(weights_path)
    test_features, test_masks = test_data[0], test_data[1]
    _, accuracy = model.evaluate([test_features, test_masks], test_labels)
    print(f"Test accuracy: {round(accuracy * 100, 2)}%")

    return history, model

In [10]:
# Train and evaluate the sequence model. The training history is kept (e.g. for
# plotting loss curves) instead of being bound to `_`, which in IPython/Jupyter
# is the "last output" variable and stops being updated once user code assigns it.
history, sequence_model = run_experiment()

Epoch 1/100
Epoch 00001: val_loss improved from inf to 1.08147, saving model to ./tmp\video_classifier
Epoch 2/100
Epoch 00002: val_loss improved from 1.08147 to 1.07754, saving model to ./tmp\video_classifier
Epoch 3/100
Epoch 00003: val_loss improved from 1.07754 to 1.04099, saving model to ./tmp\video_classifier
Epoch 4/100
Epoch 00004: val_loss improved from 1.04099 to 1.03481, saving model to ./tmp\video_classifier
Epoch 5/100
Epoch 00005: val_loss improved from 1.03481 to 0.99911, saving model to ./tmp\video_classifier
Epoch 6/100
Epoch 00006: val_loss improved from 0.99911 to 0.95733, saving model to ./tmp\video_classifier
Epoch 7/100
Epoch 00007: val_loss improved from 0.95733 to 0.92639, saving model to ./tmp\video_classifier
Epoch 8/100
Epoch 00008: val_loss improved from 0.92639 to 0.90190, saving model to ./tmp\video_classifier
Epoch 9/100
Epoch 00009: val_loss improved from 0.90190 to 0.86232, saving model to ./tmp\video_classifier
Epoch 10/100
Epoch 00010: val_loss improv

Epoch 28/100
Epoch 00028: val_loss improved from 0.43733 to 0.42724, saving model to ./tmp\video_classifier
Epoch 29/100
Epoch 00029: val_loss improved from 0.42724 to 0.42665, saving model to ./tmp\video_classifier
Epoch 30/100
Epoch 00030: val_loss improved from 0.42665 to 0.40600, saving model to ./tmp\video_classifier
Epoch 31/100
Epoch 00031: val_loss improved from 0.40600 to 0.39215, saving model to ./tmp\video_classifier
Epoch 32/100
Epoch 00032: val_loss did not improve from 0.39215
Epoch 33/100
Epoch 00033: val_loss did not improve from 0.39215
Epoch 34/100
Epoch 00034: val_loss improved from 0.39215 to 0.37416, saving model to ./tmp\video_classifier
Epoch 35/100
Epoch 00035: val_loss improved from 0.37416 to 0.37358, saving model to ./tmp\video_classifier
Epoch 36/100
Epoch 00036: val_loss did not improve from 0.37358
Epoch 37/100
Epoch 00037: val_loss improved from 0.37358 to 0.37024, saving model to ./tmp\video_classifier
Epoch 38/100
Epoch 00038: val_loss improved from 0.3

Epoch 55/100
Epoch 00055: val_loss improved from 0.15397 to 0.14400, saving model to ./tmp\video_classifier
Epoch 56/100
Epoch 00056: val_loss improved from 0.14400 to 0.13763, saving model to ./tmp\video_classifier
Epoch 57/100
Epoch 00057: val_loss improved from 0.13763 to 0.12983, saving model to ./tmp\video_classifier
Epoch 58/100
Epoch 00058: val_loss improved from 0.12983 to 0.12396, saving model to ./tmp\video_classifier
Epoch 59/100
Epoch 00059: val_loss improved from 0.12396 to 0.12198, saving model to ./tmp\video_classifier
Epoch 60/100
Epoch 00060: val_loss did not improve from 0.12198
Epoch 61/100
Epoch 00061: val_loss improved from 0.12198 to 0.11627, saving model to ./tmp\video_classifier
Epoch 62/100
Epoch 00062: val_loss improved from 0.11627 to 0.09706, saving model to ./tmp\video_classifier
Epoch 63/100
Epoch 00063: val_loss improved from 0.09706 to 0.08744, saving model to ./tmp\video_classifier
Epoch 64/100
Epoch 00064: val_loss improved from 0.08744 to 0.08247, sav

Epoch 83/100
Epoch 00083: val_loss improved from 0.04532 to 0.03976, saving model to ./tmp\video_classifier
Epoch 84/100
Epoch 00084: val_loss improved from 0.03976 to 0.03716, saving model to ./tmp\video_classifier
Epoch 85/100
Epoch 00085: val_loss improved from 0.03716 to 0.03353, saving model to ./tmp\video_classifier
Epoch 86/100
Epoch 00086: val_loss did not improve from 0.03353
Epoch 87/100
Epoch 00087: val_loss did not improve from 0.03353
Epoch 88/100
Epoch 00088: val_loss did not improve from 0.03353
Epoch 89/100
Epoch 00089: val_loss did not improve from 0.03353
Epoch 90/100
Epoch 00090: val_loss did not improve from 0.03353
Epoch 91/100
Epoch 00091: val_loss did not improve from 0.03353
Epoch 92/100
Epoch 00092: val_loss did not improve from 0.03353
Epoch 93/100
Epoch 00093: val_loss did not improve from 0.03353
Epoch 94/100
Epoch 00094: val_loss did not improve from 0.03353
Epoch 95/100
Epoch 00095: val_loss did not improve from 0.03353
Epoch 96/100
Epoch 00096: val_loss d

In [11]:
# ? inference
def prepare_single_video(frames):
    """Turn one video's frames into padded model inputs.

    Args:
        frames: Array of frames with the frame index on the first axis, as
            returned by `load_video`. May be empty.

    Returns:
        `(frame_features, frame_mask)` where `frame_features` is float32 with
        shape (1, MAX_SEQ_LENGTH, NUM_FEATURES) and `frame_mask` is bool with
        shape (1, MAX_SEQ_LENGTH). Only the first
        min(MAX_SEQ_LENGTH, number of frames) steps are filled / marked True;
        the rest stay zero / False.
    """
    frame_mask = np.zeros(shape=(1, MAX_SEQ_LENGTH,), dtype="bool")
    frame_features = np.zeros(shape=(1, MAX_SEQ_LENGTH, NUM_FEATURES), dtype="float32")

    length = min(MAX_SEQ_LENGTH, len(frames))
    if length > 0:
        # One batched call for the whole clip; `predict` is designed for
        # batches and is very slow when invoked once per frame.
        frame_features[0, :length, :] = feature_extractor.predict(
            frames[:length], verbose=0
        )
        frame_mask[0, :length] = True  # True = not masked, False = masked

    return frame_features, frame_mask

def sequence_prediction(path):
    """Classify one video and print the class probabilities, highest first.

    Args:
        path: Video location; it is joined onto the current working directory
            before loading.

    Returns:
        The frames loaded from the video (as returned by `load_video`).
    """
    class_vocab = label_processor.get_vocabulary()

    absolute_path = os.path.join(os.getcwd(), path)
    print('inside: ', absolute_path)

    frames = load_video(absolute_path)
    frame_features, frame_mask = prepare_single_video(frames)
    # The model returns one row per input sample; there is exactly one here.
    probabilities = sequence_model.predict([frame_features, frame_mask])[0]

    # argsort is ascending, so reverse it to list the most likely class first.
    ranked_classes = np.argsort(probabilities)[::-1]
    for class_index in ranked_classes:
        print(f"  {class_vocab[class_index]}: {probabilities[class_index] * 100:5.2f}%")
    return frames

In [17]:
# Default: sample a random clip listed in the test CSV.
# NOTE: this value is overwritten by the explicit assignment further down.
test_video = np.random.choice(test_df["video_name"].values.tolist())
# Alternative clips that were tried (uncomment one to use it):
# test_video = 'squat_2.mp4'
# test_video = 'wrong_squat.mp4'
# test_video = 'pushup2.mp4' # rotated video so that it does not look like wall push-ups
# test_video = 'pushup.mp4' # not rotated video, higher probability on body weight squats for some reason. I guess the orientation of the video is a big factor
# test_video = 'pull-ups.mp4'
test_video = 'pike_pushups.mp4' # pike push ups, just for fun
print(f"Test video path: {test_video}")
# The clip is looked up under the relative "test_videos" directory.
path = os.path.join("test_videos", test_video)
print('outside: ', path)
# Prints the ranked class probabilities and keeps the loaded frames.
test_frames = sequence_prediction(path)

Test video path: pike_pushups.mp4
outside:  new_dataset\unary_test\new_test\pike_pushups.mp4
inside:  C:\Users\kevin\Desktop\ay2021-2022-1st-sem-cmsc190-sp1-KevinGines1\code\squat_tensorflow_model\cnn_model\new_dataset\unary_test\new_test\pike_pushups.mp4
  PushUps: 96.84%
  BodyWeightSquats:  2.04%
  PullUps:  1.12%


In [18]:
# Export the trained sequence model. Only the GRU classifier is saved here: its
# inputs are the per-frame feature sequence and the mask, so the separate
# `feature_extractor` is not part of this export.
sequence_model.save('./cnn_model_exports/cnn_model_3class')



INFO:tensorflow:Assets written to: ./cnn_model_exports/cnn_model_3class\assets


INFO:tensorflow:Assets written to: ./cnn_model_exports/cnn_model_3class\assets
