In [16]:
import tensorflow as tf
from tensorflow.keras import layers
from tensorflow import keras
import numpy as np

import os

In [17]:
import sys
print(sys.version)

3.9.1 (default, Dec 11 2020, 09:29:25) [MSC v.1916 64 bit (AMD64)]


In [18]:
import tensorflow as tf
from glob import glob
import pandas as pd
import numpy as np

class TFRecordReader:
    def __init__(self, dir, batch_size, classes, shuffle_buffer_size=512, isRGB=True):
        self.batch_size = batch_size
        self.dir = dir
        self.isRGB = isRGB
        self.shuffle_buffer_size = shuffle_buffer_size
        # label_df = pd.read_csv(label_dir, sep=";", names=["gesture"])
        # self.label_encoder = preprocessing.LabelEncoder()
        # self.label_encoder.fit(label_df["gesture"])
        # print("No of classes:", len(self.label_encoder.classes_))

        self.label_processor = tf.keras.layers.StringLookup(
            num_oov_indices=0, vocabulary=classes,
            mask_token=None 
        )
        self.classes = self.label_processor.vocabulary_size()

       

        self.num_segments = 8
        self.new_length = 4
        self.num_frames = 36
    
    def encode_label(self, label):
        
        label = self.label_processor(label).numpy()
  
        return label

    def parse_tfr_element(self, element):
        data = {
            'n_clips': tf.io.FixedLenFeature([], tf.int64),
            'height': tf.io.FixedLenFeature([], tf.int64),
            'width': tf.io.FixedLenFeature([], tf.int64),
            'depth': tf.io.FixedLenFeature([], tf.int64),
            'raw_image': tf.io.FixedLenFeature([], tf.string),
            'label': tf.io.FixedLenFeature([], tf.string),
        }

        content = tf.io.parse_single_example(element, data)

        n_clips = content['n_clips']
        height = content['height']
        width = content['width']
        depth = content['depth']
        raw_image = content['raw_image']
        label = content['label']

        # get out 'feature' --our image and reshape appropriatey
        feature = tf.io.parse_tensor(raw_image, out_type=tf.uint8)
        feature = tf.reshape(feature, shape=[n_clips, height, width, depth])
        if not self.isRGB:
            feature = tf.image.rgb_to_grayscale(feature)
        feature = tf.image.convert_image_dtype(feature, tf.float32)

        label = tf.py_function(self.encode_label, inp=(label,), Tout=tf.int32)

        return (feature, label)

    def get_dataset(self,  pattern:str="*_jester.tfrecords"):

        files = glob(self.dir+pattern, recursive=False)

        dataset = tf.data.TFRecordDataset(files, compression_type="ZLIB")
        dataset = dataset.shuffle(self.shuffle_buffer_size)

        dataset = dataset.map(lambda x: self.parse_tfr_element(x), num_parallel_calls=tf.data.AUTOTUNE)
        # dataset = dataset.filter(lambda x, y: tf.py_function(filter, inp=(x,y), Tout=tf.bool ) )
        # dataset = dataset.map(self._sample_indices)

        dataset = dataset.batch(self.batch_size)
        dataset = dataset.repeat()
        
        dataset = dataset.prefetch(tf.data.AUTOTUNE)
        return dataset







    def _non_random_indices(self):
        if self.num_frames > self.num_segments + self.new_length - 1:
            tick = (self.num_frames - self.new_length + 1) / float(self.num_segments)
            offsets = np.array([int(tick / 2.0 + tick * x) for x in range(self.num_segments)])
        else:
            offsets = np.zeros((self.num_segments,))
        return offsets + 1
    
    def _random_shift_indices(self):
        average_duration = (self.num_frames - self.new_length + 1) // self.num_segments
        if average_duration > 0:
            offsets = np.multiply(list(range(self.num_segments)), average_duration) + np.random.randint(average_duration,
                                                                                                        size=self.num_segments)
        elif self.num_frames > self.num_segments:
            offsets = np.sort(np.random.randint(self.num_frames - self.new_length + 1, size=self.num_segments))
        else:
            offsets = np.zeros((self.num_segments,))
        return offsets+1

    
    def _get_sampled_indices(self):
        new_indices = []
        #  indices = offsets
        # indices =  _random_shift()
        indices =  self._non_random_indices()
        # print(indices)
        for seg_ind in indices:
            p = int(seg_ind)
            for i in range(self.new_length):
                # seg_imgs = self._load_image(record.path, p)
                # images.extend(seg_imgs)
                new_indices.append(p)
                if p < self.num_frames:
                    p += 1
        return new_indices


    def _sample_indices(self, feature, label):
        indices = self._get_sampled_indices()
        # print("wtf")
        # print(indices)
        return tf.gather(feature, indices=indices), label
        # print(new_inp.shape)
        # print(new_inp)
        # return new_inp
    



In [19]:
class TubeletEmbedding(layers.Layer):
    def __init__(self, embed_dim, patch_size, **kwargs):
        super().__init__(**kwargs)
        self.embed_dim = embed_dim
        self.patch_size = patch_size
        self.projection = layers.Conv3D(
            filters=embed_dim,
            kernel_size=patch_size,
            strides=patch_size,
            padding="VALID",
        )
        self.flatten = layers.Reshape(target_shape=(-1, embed_dim))

    def call(self, videos):
        projected_patches = self.projection(videos)
        flattened_patches = self.flatten(projected_patches)
        return flattened_patches
    
    def get_config(self):
        config = super(TubeletEmbedding, self).get_config()
        config.update({
            "embed_dim": self.embed_dim,
            "patch_size": self.patch_size
        })
        return config
    
    @classmethod
    def from_config(cls, config):
        return cls(**config)

In [20]:
class PositionalEncoder(layers.Layer):
    def __init__(self, embed_dim, **kwargs):
        super().__init__(**kwargs)
        self.embed_dim = embed_dim

    def build(self, input_shape):
        _, num_tokens, _ = input_shape
        self.num_tokens = num_tokens
        self.position_embedding = layers.Embedding(
            input_dim=num_tokens, output_dim=self.embed_dim
        )

    def call(self, encoded_tokens):
        # Encode the positions and add it to the encoded tokens
        positions = tf.range(start=0, limit=self.num_tokens, delta=1)

        encoded_positions = self.position_embedding(positions)
        encoded_tokens = encoded_tokens + encoded_positions
        return encoded_tokens
    def get_config(self):
        return {
            "embed_dim": self.embed_dim,
        }
    @classmethod
    def from_config(cls, config):
        return cls(**config)

In [21]:
# DATA
DATASET_NAME = "jester"
BATCH_SIZE = 32
AUTO = tf.data.AUTOTUNE
INPUT_SHAPE = (36, 100, 100, 1) 


# OPTIMIZER
LEARNING_RATE = 1e-4
WEIGHT_DECAY = 1e-5

# TRAINING
EPOCHS = 60

# TUBELET EMBEDDING
PATCH_SIZE = (8, 8, 8)
NUM_PATCHES = (INPUT_SHAPE[0] // PATCH_SIZE[0]) ** 2

# ViViT ARCHITECTURE
LAYER_NORM_EPS = 1e-6
PROJECTION_DIM = 128
NUM_HEADS = 8
NUM_LAYERS = 8

In [22]:
def create_vivit_classifier(
    tubelet_embedder,
    positional_encoder,
    input_shape=INPUT_SHAPE,
    transformer_layers=NUM_LAYERS,
    num_heads=NUM_HEADS,
    embed_dim=PROJECTION_DIM,
    layer_norm_eps=LAYER_NORM_EPS,
    num_classes=2,
):
    # Get the input layer
    inputs = layers.Input(shape=input_shape)
    # Create patches.
    patches = tubelet_embedder(inputs)
    # Encode patches.
    encoded_patches = positional_encoder(patches)

    # Create multiple layers of the Transformer block.
    for _ in range(transformer_layers):
        # Layer normalization and MHSA
        x1 = layers.LayerNormalization(epsilon=1e-6)(encoded_patches)
        attention_output = layers.MultiHeadAttention(
            num_heads=num_heads, key_dim=embed_dim // num_heads, dropout=0.1
        )(x1, x1)

        # Skip connection
        x2 = layers.Add()([attention_output, encoded_patches])

        # Layer Normalization and MLP
        x3 = layers.LayerNormalization(epsilon=1e-6)(x2)
        x3 = keras.Sequential(
            [
                layers.Dense(units=embed_dim * 4, activation=tf.nn.gelu),
                layers.Dense(units=embed_dim, activation=tf.nn.gelu),
            ]
        )(x3)

        # Skip connection
        encoded_patches = layers.Add()([x3, x2])

    # Layer normalization and Global average pooling.
    representation = layers.LayerNormalization(epsilon=layer_norm_eps)(encoded_patches)
    representation = layers.GlobalAvgPool1D()(representation)

    # Classify outputs.
    outputs = layers.Dense(units=num_classes, activation="softmax")(representation)

    # Create the Keras model.
    model = keras.Model(inputs=inputs, outputs=outputs)
    return model

In [23]:
filter_ges = [
"Swiping Up", "Swiping Down", "Swiping Left", "Swiping Right",
"Zooming In With Two Fingers","Zooming Out With Two Fingers",
"Zooming In With Full Hand","Zooming Out With Full Hand","Turning Hand Clockwise",
"Turning Hand Counterclockwise","Thumb Up","Thumb Down","Shaking Hand",
"Stop Sign","Doing other things","No gesture"
]

In [24]:
BATCH_SIZE = 8
NUM_CLASSES = len(filter_ges)

train_ds_gen = TFRecordReader(dir="../input/20bn-jester-tfrecord-16-classes/train/",batch_size=BATCH_SIZE,shuffle_buffer_size=32,classes=filter_ges, isRGB=False)
train_ds = train_ds_gen.get_dataset()

val_ds_gen = TFRecordReader(dir="../input/20bn-jester-tfrecord-16-classes/validation/",batch_size=BATCH_SIZE, shuffle_buffer_size=32,classes=filter_ges, isRGB=False)
val_ds = val_ds_gen.get_dataset()

print(f"TRAIN Class:{train_ds_gen.classes} VAL Class:{val_ds_gen.classes}")

TRAIN Class:16 VAL Class:16


In [25]:
model = create_vivit_classifier(
        tubelet_embedder=TubeletEmbedding(
            embed_dim=PROJECTION_DIM, patch_size=PATCH_SIZE
        ),
        positional_encoder=PositionalEncoder(embed_dim=PROJECTION_DIM),
        num_classes=NUM_CLASSES
    )
model.summary()

Model: "model_1"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_2 (InputLayer)           [(None, 36, 100, 10  0           []                               
                                0, 1)]                                                            
                                                                                                  
 tubelet_embedding_1 (TubeletEm  (None, 576, 128)    65664       ['input_2[0][0]']                
 bedding)                                                                                         
                                                                                                  
 positional_encoder_1 (Position  (None, 576, 128)    73728       ['tubelet_embedding_1[0][0]']    
 alEncoder)                                                                                 

In [26]:
# Instantiate an optimizer to train the model.
optimizer = tf.keras.optimizers.Adam(learning_rate=LEARNING_RATE)
# Instantiate a loss function.
# loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)

# Prepare the metrics.
train_acc_metric = tf.keras.metrics.SparseCategoricalAccuracy()
train_top5_acc_metric = keras.metrics.SparseTopKCategoricalAccuracy(k=5)
# val_acc_metric = tf.keras.metrics.SparseCategoricalAccuracy()
filepath ="./checkpoint"
checkpoint = keras.callbacks.ModelCheckpoint(
        filepath, save_weights_only=True, save_best_only=True, verbose=1
    )

In [27]:
model.compile(optimizer=optimizer, loss="sparse_categorical_crossentropy", metrics=[train_acc_metric, train_top5_acc_metric])

In [28]:
# Classes 16
steps_per_epoch = np.ceil(71724/ train_ds_gen.batch_size)
val_steps_per_epoch = np.ceil(9036/ val_ds_gen.batch_size)
# Classees3
# steps_per_epoch = np.ceil(12524/ train_ds_gen.batch_size)
# val_steps_per_epoch = np.ceil(1513/ val_ds_gen.batch_size)

# model.fit(
#         train_ds,
#         validation_data=val_ds,
#         steps_per_epoch = steps_per_epoch,
#         validation_steps=val_steps_per_epoch,
#         callbacks=[checkpoint],
#         epochs=10
# )

In [29]:
# model.save( os.path.join(os.getcwd(),  "saved_model", "classes16_model.h5"))

In [34]:
# loaded_model = tf.keras.models.load_model("./classes16_model (1)")
loaded_model = tf.keras.models.load_model(os.path.join(os.getcwd(), "classes16_model (1)"))

ValueError: Unknown layer: TubeletEmbedding. Please ensure this object is passed to the `custom_objects` argument. See https://www.tensorflow.org/guide/keras/save_and_serialize#registering_the_custom_object for details.

In [None]:
images, labels  = next(iter(val_ds))

In [None]:

pred = loaded_model.predict(images)

pred_gesture = np.argmax(pred, axis=1)

In [None]:
print(pred_gesture)

In [None]:
import matplotlib.pyplot as plt
filter_ges = [
"Swiping Up", "Swiping Down", "Swiping Left", "Swiping Right",
"Zooming In With Two Fingers","Zooming Out With Two Fingers",
"Zooming In With Full Hand","Zooming Out With Full Hand","Turning Hand Clockwise",
"Turning Hand Counterclockwise","Thumb Up","Thumb Down","Shaking Hand",
"Stop Sign","Doing other things","No gesture"
]

for i in range(8):
    fig, axs = plt.subplots(1, 6)
    for j in range(6):

        axs[ j].imshow(images[i][(j+1)*6-1])
        axs[j].axis("off")
        # axs[ j].set_title(f'{filter_ges[pred_gesture[i]]}')
    plt.title(f'Predicted:: {filter_ges[pred_gesture[i]]}         Label::{filter_ges[labels[i]]}')
 
    plt.show()

