In [None]:
# Installs the `tensorflow_docs` package (imported in the next cell) straight from GitHub; no version is pinned.
!pip install -q git+https://github.com/tensorflow/docs

In [None]:
from tensorflow_docs.vis import embed
from tensorflow.keras import layers
from tensorflow import keras

import matplotlib.pyplot as plt
import tensorflow as tf
import pandas as pd
import numpy as np
# import imageio
# import cv2
import os

In [None]:
# Hyperparams
MAX_SEQ_LENGTH = 20  # default `max_frames` of `load_frames`: frames kept per sample
NUM_FEATURES = 1024  # NOTE(review): reassigned to N_PTS*N_DIMS before the model is built; 1024 looks unused
IMG_SIZE = 128  # NOTE(review): not referenced in the visible cells -- confirm before removing

EPOCHS = 5  # NOTE(review): the visible training call passes its own `epochs` value

In [None]:
import os
import json
import numpy as np
import pandas as pd
from tqdm import tqdm
import tensorflow as tf
from tqdm import tqdm
import json
import gc
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from pathlib import Path

# Location of the dataset: the training index CSV lives one directory above the notebook.
DATA_ROOT = Path('..')
DF_TRAIN = DATA_ROOT / 'train.csv'
train = pd.read_csv(DF_TRAIN)

# Quick sanity check of the index size.
print("train.shape = {} rows, {} cols".format(*train.shape))


class CFG:
    """Notebook-wide settings, accessed as class attributes (e.g. ``CFG.data_path``)."""

    # --- data layout ---
    # Prefix prepended to every data file path (CSV index, JSON label map, parquet files).
    data_path = "../"
    # Number of parquet rows that make up one frame; used to reshape the flat table.
    rows_per_frame = 543
    # Number of target classes.
    num_classes = 250

    # --- run switches ---
    # Guards the cell that loads or builds the X / y training arrays.
    is_training = True
    # NOTE(review): the two flags below are not read in the visible cells.
    quick_experiment = False
    use_aggregation_dataset = True

def load_relevant_data_subset_with_imputation(pq_path):
    """Read the x/y/z columns of a parquet file as a float32 array, with NaNs set to 0.

    The flat table is reshaped to ``(n_frames, CFG.rows_per_frame, 3)``, where
    ``n_frames`` is the row count divided by ``CFG.rows_per_frame``.
    """
    coord_cols = ['x', 'y', 'z']
    # Zero-impute missing coordinates before reshaping.
    table = pd.read_parquet(pq_path, columns=coord_cols).replace(np.nan, 0)
    frame_count = int(len(table) / CFG.rows_per_frame)
    cube_shape = (frame_count, CFG.rows_per_frame, len(coord_cols))
    return table.values.reshape(cube_shape).astype(np.float32)

def load_relevant_data_subset(pq_path):
    """Read the x/y/z columns of a parquet file as a float32 array; NaNs are kept as-is.

    The flat table is reshaped to ``(n_frames, CFG.rows_per_frame, 3)``, where
    ``n_frames`` is the row count divided by ``CFG.rows_per_frame``.
    """
    wanted = ['x', 'y', 'z']
    raw = pd.read_parquet(pq_path, columns=wanted)
    per_frame = CFG.rows_per_frame
    stacked = raw.values.reshape(int(len(raw) / per_frame), per_frame, len(wanted))
    return stacked.astype(np.float32)

def read_dict(file_path):
    """Parse the JSON file at ``file_path`` (a leading ``~`` is expanded) and return its content."""
    with open(os.path.expanduser(file_path), "r") as handle:
        return json.load(handle)

In [None]:
# Reload the training index and attach a class label to every row.
train = pd.read_csv(f"{CFG.data_path}train.csv")
# sign -> prediction index, as stored in the JSON map
label_index = read_dict(f"{CFG.data_path}sign_to_prediction_index_map.json")
# Inverse lookup: prediction index -> sign
index_label = {label_index[key]: key for key in label_index}
# Direct dict indexing: a sign missing from the map raises KeyError instead of yielding NaN.
train["label"] = train["sign"].map(lambda sign: label_index[sign])

In [None]:
# Load the first two training samples to sanity-check the loader output.
path = f"{CFG.data_path}{train.iloc[0].path}"
data0 = load_relevant_data_subset_with_imputation(path)
path = f"{CFG.data_path}{train.iloc[1].path}"
data1 = load_relevant_data_subset_with_imputation(path)


# Each array is (n_frames, CFG.rows_per_frame, 3); n_frames depends on the file's row count.
data0.shape, data1.shape

In [None]:
def load_frames(
    frames_data,
    max_frames: int = MAX_SEQ_LENGTH,
    crop_method: str = 'nearest',
):
    '''Bring frames data of shape (n_frames, pts, dims) to a fixed number of frames.

    - Clips with fewer than `max_frames` frames are padded at the end with
      all-zero frames.
    - Clips with `max_frames` frames or more are reduced by `crop_method`:
        * 'nearest' (default): resample along the frame axis with
          nearest-neighbour interpolation (`tf.image.resize`), so the kept
          frames are spread over the whole clip.
        * 'cut': keep only the first `max_frames` frames.

    The per-frame shape (pts, dims) and the dtype are taken from the input
    instead of being hard-coded, so every branch returns the input dtype.

    Args:
        frames_data: array-like of shape (n_frames, pts, dims).
        max_frames: number of frames in the result.
        crop_method: 'nearest' or 'cut'; only consulted when the clip has to
            be shortened.

    Returns:
        numpy array of shape (max_frames, pts, dims).

    Raises:
        ValueError: if the clip must be shortened and `crop_method` is unknown.
    '''
    frames_data = np.asarray(frames_data)
    n_frames = len(frames_data)

    if n_frames < max_frames:
        # Zero frames matching the input's per-frame shape and dtype; a float64
        # pad would silently upcast the whole result.
        pad_shape = (max_frames - n_frames,) + tuple(frames_data.shape[1:])
        padding = np.zeros(pad_shape, dtype=frames_data.dtype)
        return np.concatenate((frames_data, padding))

    if crop_method == 'nearest':
        # The clip is treated as an image (height=frames, width=pts, channels=dims);
        # only the height changes, the width is kept at its original size.
        return tf.image.resize(
            frames_data,
            (max_frames, frames_data.shape[1]),
            method='nearest',
        ).numpy()
    if crop_method == 'cut':
        return frames_data[:max_frames]
    # ValueError is a subclass of Exception, so existing handlers keep working.
    raise ValueError(f'{crop_method=} not found')

In [None]:
def compress_frames(frames):
    '''Flatten every frame of a clip: (n_frames, pts, dims) --> (n_frames, pts*dims).'''
    flat_shape = (frames.shape[0], frames.shape[1] * frames.shape[2])
    return frames.reshape(flat_shape)

In [None]:
# Smoke test of the preprocessing pipeline on one sample: fix the frame count, then flatten each frame.
MAX_FRAMES = 20
frames_reduced = load_frames(data1, max_frames=MAX_FRAMES)
data_resize = compress_frames(frames_reduced)
# Displays (MAX_FRAMES, pts*dims) for the sample.
data_resize.shape

In [None]:
# Layout of one preprocessed sample: MAX_FRAMES frames x (N_PTS * N_DIMS) flattened coordinates.
MAX_FRAMES = 20
N_PTS = 543
N_DIMS = 3
# Cache files for the preprocessed arrays (read from / written to the working directory).
X_npy_fname = f'X-{MAX_FRAMES}_frames-pts_flattened.npy'
y_npy_fname = 'y.npy'


if CFG.is_training:
    try:
        # Fast path: reuse the arrays cached by a previous run.
        X = np.load(X_npy_fname)
        y = np.load(y_npy_fname)
    except (OSError, ValueError, EOFError):
        # Cache missing or unreadable -> rebuild from the parquet files. Only load
        # failures are caught, so Ctrl-C and out-of-memory errors still surface.
        # The loader returns float32, so float32 storage is lossless and uses half
        # the memory of numpy's default float64.
        X = np.zeros((len(train), MAX_FRAMES, N_PTS*N_DIMS), dtype=np.float32)
        y = np.zeros((len(train),))
        num_frames = np.zeros(len(train))
        for i in tqdm(range(len(train))):
            path = f"{CFG.data_path}{train.iloc[i].path}"
            data = load_relevant_data_subset_with_imputation(path)
            # Fix the clip length (zero-pad or nearest-neighbour resample), then flatten each frame.
            frames_reduced = load_frames(data, max_frames=MAX_FRAMES)
            data_resize = compress_frames(frames_reduced)
            X[i] = data_resize
            y[i] = train.iloc[i].label
            num_frames[i] = data.shape[0]
        # Save number of frames of each training sample for data analysis.
        train["num_frames"] = num_frames
        print(train["num_frames"].describe())
        # Written to the working directory, not back to CFG.data_path.
        train.to_csv("train.csv", index=False)
        np.save(X_npy_fname, X)
        np.save(y_npy_fname, y)

    print(X.shape, y.shape)

In [None]:
# Hold out 20% of the samples for validation. `stratify=y` keeps the class proportions
# the same in both splits and the fixed `random_state` makes the split reproducible.
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=27, stratify=y)
print(X_train.shape, y_train.shape, X_val.shape, y_val.shape)

# Building Model

> Reference: https://keras.io/examples/vision/video_transformers/

## Transformer-Based Model

In [None]:
class PositionalEmbedding(tf.keras.layers.Layer):
    """Adds a learned position embedding to every frame of a sequence.

    Args:
        sequence_length: number of distinct positions the embedding table holds
            (`input_dim` of the inner `Embedding`).
        output_dim: size of each position vector; it is added to the inputs, so
            it has to be broadcast-compatible with their last dimension.
        **kwargs: forwarded to `tf.keras.layers.Layer`.
    """

    def __init__(self, sequence_length, output_dim, **kwargs):
        super().__init__(**kwargs)
        self.position_embeddings = tf.keras.layers.Embedding(
            input_dim=sequence_length,
            output_dim=output_dim,
        )
        self.sequence_length = sequence_length
        self.output_dim = output_dim

    def call(self, inputs):
        """Return `inputs` plus the embedding of each frame index."""
        # The inputs are of shape: `(batch_size, frames, num_features)`
        length = tf.shape(inputs)[1]
        positions = tf.range(start=0, limit=length, delta=1)
        embedded_positions = self.position_embeddings(positions)
        # (frames, output_dim) broadcasts over the batch axis.
        return inputs + embedded_positions

    def compute_mask(self, inputs, mask=None):
        """Mark a frame as valid when any of its features is non-zero.

        All-zero frames are therefore masked out; an incoming `mask` is ignored.
        """
        return tf.reduce_any(tf.cast(inputs, "bool"), axis=-1)

    def get_config(self):
        """Return the constructor arguments so the layer can be serialized and re-created."""
        config = super().get_config()
        config.update(
            {
                "sequence_length": self.sequence_length,
                "output_dim": self.output_dim,
            }
        )
        return config

In [None]:
class TransformerEncoder(tf.keras.layers.Layer):
    """Single Transformer encoder block: self-attention and a feed-forward
    projection, each followed by a residual connection and layer normalization.

    Args:
        embed_dim: feature size of the inputs; also used as the attention
            `key_dim` and as the output size of the feed-forward projection.
        dense_dim: hidden size of the feed-forward projection (GELU activation).
        num_heads: number of attention heads.
        **kwargs: forwarded to `tf.keras.layers.Layer`.
    """

    def __init__(self, embed_dim, dense_dim, num_heads, **kwargs):
        super().__init__(**kwargs)
        self.embed_dim = embed_dim
        self.dense_dim = dense_dim
        self.num_heads = num_heads
        self.attention = tf.keras.layers.MultiHeadAttention(
            num_heads=num_heads, key_dim=embed_dim, dropout=0.3
        )
        self.dense_proj = tf.keras.Sequential([
            tf.keras.layers.Dense(dense_dim, activation=tf.nn.gelu),
            tf.keras.layers.Dense(embed_dim),
        ])
        self.layernorm_1 = tf.keras.layers.LayerNormalization()
        self.layernorm_2 = tf.keras.layers.LayerNormalization()

    def call(self, inputs, mask=None):
        """Apply the encoder block; `mask` is an optional (batch, frames) validity mask."""
        if mask is not None:
            # (batch, frames) -> (batch, 1, frames) so the same key mask is
            # broadcast over every query position.
            mask = mask[:, tf.newaxis, :]

        attention_output = self.attention(inputs, inputs, attention_mask=mask)
        proj_input = self.layernorm_1(inputs + attention_output)
        proj_output = self.dense_proj(proj_input)
        return self.layernorm_2(proj_input + proj_output)

    def get_config(self):
        """Return the constructor arguments so the layer can be serialized and re-created."""
        config = super().get_config()
        config.update(
            {
                "embed_dim": self.embed_dim,
                "dense_dim": self.dense_dim,
                "num_heads": self.num_heads,
            }
        )
        return config

## Utility functions for training

In [None]:
# Hyperparams
MAX_SEQ_LENGTH = 20  # frames per sample fed to the model
NUM_FEATURES = N_PTS*N_DIMS  # flattened coordinates per frame (replaces the earlier 1024)

EPOCHS = 5  # NOTE(review): the visible training call passes its own `epochs` value
def get_compiled_model(
    sequence_length=MAX_FRAMES,
    embed_dim=NUM_FEATURES,
    dense_dim=4,
    num_heads=1,
    num_classes=CFG.num_classes,
):
    """Build and compile the Transformer classifier.

    Architecture: positional embedding -> one Transformer encoder block ->
    global max pooling over frames -> dropout (0.5) -> softmax over classes.

    The arguments are honoured as passed (they used to be overwritten inside
    the function); the defaults give the same model as before.

    Args:
        sequence_length: number of positions in the positional-embedding table.
        embed_dim: per-frame feature size.
        dense_dim: hidden size of the encoder's feed-forward projection.
        num_heads: number of attention heads.
        num_classes: size of the softmax output.

    Returns:
        A `keras.Model` compiled with Adam, sparse categorical cross-entropy
        and the accuracy metric.
    """
    inputs = keras.Input(shape=(None, None))
    x = PositionalEmbedding(
        sequence_length, embed_dim, name="frame_position_embedding"
    )(inputs)
    x = TransformerEncoder(embed_dim, dense_dim, num_heads, name="transformer_layer")(x)
    x = layers.GlobalMaxPooling1D()(x)
    x = layers.Dropout(0.5)(x)
    outputs = layers.Dense(num_classes, activation="softmax")(x)
    model = keras.Model(inputs, outputs)

    model.compile(
        optimizer="adam", loss="sparse_categorical_crossentropy", metrics=["accuracy"]
    )
    return model

In [None]:
def run_experiment(
    train_data,
    train_labels,
    validation_data = None,
    validation_split: float = 0.2,
    model_path: str = 'temp',
    epochs: int = 10,
    monitor_metric: str = 'val_accuracy',
    patience: int = 4,
):
    """Train a freshly compiled model with checkpointing and early stopping.

    Args:
        train_data: training inputs passed to `model.fit` as `x`.
        train_labels: training targets passed to `model.fit` as `y`.
        validation_data: optional hold-out set, e.g. `(x_val, y_val)`. When
            given it is used for validation and for the final evaluation.
        validation_split: fraction of the training data used for validation;
            only applied when `validation_data` is None.
        model_path: where the best weights are checkpointed.
        epochs: maximum number of epochs.
        monitor_metric: metric watched by both the checkpoint and the
            early-stopping callback, so they agree on what "best" means.
        patience: epochs without improvement before training stops.

    Returns:
        `(model, history)`: the model with the best checkpointed weights
        loaded, and the `History` object returned by `model.fit`.
    """
    callbacks = [
        keras.callbacks.ModelCheckpoint(
            model_path,
            # Without this the checkpoint would track its default `val_loss`
            # while early stopping tracks `monitor_metric`.
            monitor=monitor_metric,
            save_weights_only=True,
            save_best_only=True,
        ),
        tf.keras.callbacks.EarlyStopping(
            monitor=monitor_metric,
            patience=patience,
            restore_best_weights=True,
        ),
    ]

    model = get_compiled_model()
    # `summary()` prints by itself and returns None.
    model.summary()

    fit_params = dict(
        x=train_data,
        y=train_labels,
        epochs=epochs,
        callbacks=callbacks,  # flat list of callbacks
    )
    # Exactly one validation source is handed to `fit`.
    if validation_data is not None:
        fit_params['validation_data'] = validation_data
    else:
        fit_params['validation_split'] = validation_split
    history = model.fit(**fit_params)

    model.load_weights(model_path)
    # Evaluate on the hold-out set that was passed in, not on notebook globals.
    # With `validation_split` there is no separate set to evaluate on, so the
    # per-epoch validation metrics in `history` are the only report.
    if validation_data is not None:
        if isinstance(validation_data, (tuple, list)):
            # (x, y) or (x, y, sample_weight): evaluate on x and y only.
            eval_args = tuple(validation_data[:2])
        else:
            # e.g. a dataset object that already yields (x, y) batches
            eval_args = (validation_data,)
        _, accuracy = model.evaluate(*eval_args)
        print(f'Validation accuracy: {accuracy:.2%}')

    return model, history

In [None]:
# Train with the explicit hold-out set. `epochs=200` acts as an upper bound because
# `run_experiment` installs an EarlyStopping callback (default patience of 4 epochs).
model, history = run_experiment(
    train_data=X_train,
    train_labels=y_train,
    validation_data=(X_val, y_val),
    model_path='model_test-enc.h5',
    epochs=200,
)