In [1]:
import tensorflow as tf
from tensorflow.keras import Model
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings("ignore")

tf.random.set_seed(42)
type_train_ds = tf.data.experimental.load("/kaggle/input/cmi-tf-datasets/type_train_ds")
type_valid_ds = tf.data.experimental.load("/kaggle/input/cmi-tf-datasets/type_valid_ds")
gesture_train_ds = tf.data.experimental.load("/kaggle/input/cmi-tf-datasets/gesture_train_ds")
gesture_valid_ds = tf.data.experimental.load("/kaggle/input/cmi-tf-datasets/gesture_valid_ds")
test_ds = tf.data.experimental.load("/kaggle/input/cmi-tf-datasets/test_ds")

2025-07-23 04:07:16.678121: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1753243636.901600      13 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1753243636.959250      13 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-07-23 04:07:33.729321: E external/local_xla/xla/stream_executor/cuda/cuda_driver.cc:152] failed call to cuInit: INTERNAL: CUDA error: Failed call to cuInit: UNKNOWN ERROR (303)


In [2]:
# print("Num GPUs Available: ", len(tf.config.list_physical_devices('GPU')))

In [3]:
print(len(type_train_ds))
print(len(type_valid_ds))
print(len(gesture_train_ds))
print(len(gesture_valid_ds))

print(len(test_ds))

380
95
380
95
1


In [4]:
for i, (inputs, targets) in enumerate(type_train_ds):
    if i==0:
        print(inputs[0].shape)
        print(targets.shape)
        break

(16, 100, 8, 8, 5)
(16, 1)


In [5]:
from tensorflow.keras.layers import Conv2D, Conv1D, BatchNormalization, TimeDistributed
from tensorflow.keras.layers import MaxPool2D, GlobalMaxPool2D, MaxPool1D, Dropout
from tensorflow.keras.layers import SimpleRNN, LSTM, GRU, Bidirectional, Flatten, Dense
from tensorflow.keras.regularizers import L1, L2, L1L2
from tensorflow.keras import Sequential

# from keras.src import tree
# from keras.src.trainers.data_adapters import data_adapter_utils
# from keras.saving import register_keras_serializable

# @register_keras_serializable("RNNModel")
class RNNModel(Model):
    def __init__(self, kernel_size2d=3, kernel_size1d=3, filters_2d=[16, 32], dropout=0.2, 
                 filters_1d=[16, 32], rnn_hidden_size=32, mode="RNN", bidirectional=False,
                 hidden_size=64, regularizer="l1", l1_penalty=0.1, l2_penalty=0.1, 
                 binary=True, **kwargs):
        super().__init__(**kwargs)
        self.hidden_size = hidden_size
        
        self.conv2d_nets = []
        for i in range(len(filters_2d)):
            conv2d_net = Sequential([
                TimeDistributed(Conv2D(filters_2d[i], kernel_size2d, padding="same", activation="relu")),
                TimeDistributed(BatchNormalization()),
                TimeDistributed(MaxPool2D(pool_size=3, strides=2, padding="same")),
                TimeDistributed(Dropout(dropout))
            ], name=f"conv2d_net_{i}")
            self.conv2d_nets.append(conv2d_net)
        self.global_maxpool = TimeDistributed(GlobalMaxPool2D(), name="global_maxpool")

        self.conv1d_nets = []
        for i in range(len(filters_1d)):
            conv1d_net = Sequential([
                Conv1D(filters_1d[i], kernel_size1d, padding="same", activation="relu"),
                BatchNormalization(),
                MaxPool1D(pool_size=3, strides=2, padding="same"),
                Dropout(dropout)
            ], name=f"conv1d_net_{i}")
            self.conv1d_nets.append(conv1d_net)

        if mode == "RNN":
            self.rnn = SimpleRNN(rnn_hidden_size)
        elif mode == "LSTM":
            self.rnn = LSTM(rnn_hidden_size)
        else:
            self.rnn = GRU(rnn_hidden_size)
        
        if bidirectional:
            self.rnn = Bidirectional(self.rnn, name=mode)

        self.flatten = Flatten()

        if regularizer == "l1":
            self.dense0 = Dense(hidden_size, activation="relu", 
                                kernel_regularizer=L1(l1_penalty),
                                bias_regularizer=L1(l1_penalty))
            self.dense1_0 = Dense(1, activation="sigmoid",
                                  kernel_regularizer=L1(l1_penalty),
                                  bias_regularizer=L1(l1_penalty))
            self.dense1_1 = Dense(18, activation="softmax", 
                                  kernel_regularizer=L1(l1_penalty),
                                  bias_regularizer=L1(l1_penalty))
        elif regularizer == "l2":
            self.dense0 = Dense(hidden_size, activation="relu", 
                                kernel_regularizer=L2(l2_penalty),
                                bias_regularizer=L2(l2_penalty))
            self.dense1_0 = Dense(1, activation="sigmoid", 
                                  kernel_regularizer=L2(l2_penalty),
                                  bias_regularizer=L2(l2_penalty))
            self.dense1_1 = Dense(18, activation="softmax", 
                                  kernel_regularizer=L2(l2_penalty),
                                  bias_regularizer=L2(l2_penalty))
        elif regularizer == "l1l2":
            self.dense0 = Dense(hidden_size, activation="relu", 
                                kernel_regularizer=L1L2(l1_penalty, l2_penalty),
                                bias_regularizer=L1L2(l1_penalty, l2_penalty))
            self.dense1_0 = Dense(1, activation="sigmoid", 
                                  kernel_regularizer=L1L2(l1_penalty, l2_penalty),
                                  bias_regularizer=L1L2(l1_penalty, l2_penalty))
            self.dense1_1 = Dense(18, activation="softmax", 
                                  kernel_regularizer=L1L2(l1_penalty, l2_penalty),
                                  bias_regularizer=L1L2(l1_penalty, l2_penalty))
        else:
            self.dense0 = Dense(hidden_size, activation="relu")
            self.dense1_0 = Dense(1, activation="sigmoid")
            self.dense1_1 = Dense(18, activation="softmax")

        self.binary = binary

    def build(self, input_shapes, training=False):
        shape = input_shapes[0]
        for i in range(len(self.conv2d_nets)):
            self.conv2d_nets[i].build(shape)
            shape = self.conv2d_nets[i].compute_output_shape(shape)
        self.global_maxpool.build(shape)
        shape = self.global_maxpool.compute_output_shape(shape)
        shape = (shape[0], shape[1], shape[2] + input_shapes[1][2])

        for i in range(len(self.conv1d_nets)):
            self.conv1d_nets[i].build(shape)
            shape = self.conv1d_nets[i].compute_output_shape(shape)

        self.rnn.build(shape)
        shape = self.rnn.compute_output_shape(shape)

        self.flatten.build(shape)
        shape = self.flatten.compute_output_shape(shape)

        shape = (shape[0], shape[1] + input_shapes[2][1])
        self.dense0.build(shape)
        shape = self.dense0.compute_output_shape(shape)

        self.dense1_0.build(shape)
        self.dense1_1.build(shape)
        

    def call(self, inputs, training=False):
        image_out = inputs[0]
        for i in range(len(self.conv2d_nets)):
            image_out = self.conv2d_nets[i](image_out, training=training)
        image_out = self.global_maxpool(image_out)   # (batch, timestep, filters)

        out = tf.concat([image_out, inputs[1]], axis=-1)
        for i in range(len(self.conv1d_nets)):
            out = self.conv1d_nets[i](out, training=training)
        
        out = self.rnn(out)        # (batch, downsampled_timestep, rnn_hidden_size)
        out = self.flatten(out)    # (batch, downsampled_timestep*rnn_hidden_size)
        
        out = tf.concat([out, inputs[2]], axis=-1)
        out = self.dense0(out)
        if self.binary:
            out = self.dense1_0(out)
        else:
            out = self.dense1_1(out)
        return out

    def set_binary(self):
        self.binary = True
        if self.dense1_0.build == False:
            self.dense1_0.build(input_shape=(None, self.hidden_size))

    def set_multi(self):
        self.binary = False
        if self.dense1_1.build == False:
            self.dense1_1.build(input_shape=(None, self.hidden_size))

    def freeze_conv_timeseries(self):
        for i in range(len(self.conv2d_nets)):
            self.conv2d_nets[i].trainable = False
        for i in range(len(self.conv1d_nets)):
            self.conv1d_nets[i].trainable = False
        self.rnn.trainable = False

In [6]:
for i, (inputs, targets) in enumerate(type_train_ds):
    if i == 0:
        print(inputs[0].shape)
        print(inputs[1].shape)
        print(inputs[2].shape)
        break

(16, 100, 8, 8, 5)
(16, 100, 12)
(16, 14)


In [7]:
import json

with open('/kaggle/input/cmi-tf-datasets/sample_weight.json') as f:
    sample_weight_dict = json.load(f)

type_sample_weight = sample_weight_dict["type_sample_weight"]
type_class_weight = {0: type_sample_weight[0], 1: type_sample_weight[1]}

gesture_sample_weight = sample_weight_dict["gesture_sample_weight"]
gesture_class_weight = {i: gesture_sample_weight[i] for i in range(len(gesture_sample_weight))}

In [8]:
def add_type_sample_weight(x, y):
    y_int = tf.cast(y, tf.int32)
    weight = tf.gather([type_class_weight[0], type_class_weight[1]], y_int)
    return x, y, weight

def add_gesture_sample_weight(x, y):
    y_arg = tf.argmax(y)
    weight = tf.gather(gesture_sample_weight, y_arg)
    return x, y, weight

In [9]:
type_train_ds = type_train_ds.unbatch().map(add_type_sample_weight).batch(16)
type_valid_ds = type_valid_ds.unbatch().map(add_type_sample_weight).batch(16)

In [10]:
# gesture_train_ds = gesture_train_ds.unbatch().map(add_gesture_sample_weight).batch(16)
# gesture_valid_ds = gesture_valid_ds.unbatch().map(add_gesture_sample_weight).batch(16)

In [11]:
try:
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver(tpu='local')
    tf.config.experimental_connect_to_cluster(tpu)
    tf.tpu.experimental.initialize_tpu_system(tpu)
    strategy = tf.distribute.TPUStrategy(tpu)
    print("✅ TPU initialized.")
except Exception as e:
    print("❌ TPU not available, using default strategy.")
    strategy = tf.distribute.get_strategy()
    print(e)

❌ TPU not available, using default strategy.
TPUs not found in the cluster. Failed in initialization: Could not satisfy device specification '/job:localhost/replica:0/task:0/device:TPU_SYSTEM:0'. enable_soft_placement=0. Supported device types [CPU]. All available devices [/job:localhost/replica:0/task:0/device:CPU:0]. [Op:__inference__tpu_init_fn_161]


In [12]:
from tensorflow.keras.losses import BinaryCrossentropy, CategoricalCrossentropy, CategoricalFocalCrossentropy
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.metrics import BinaryAccuracy, F1Score, AUC

# kernel_size2d = 3
# kernel_size1d = 3
# filters_2d = [16, 32]
# dropout = 0.5
# filters_1d = [64, 128]
# rnn_hidden_size = 128
# mode = "GRU"
# bidirectional = True
# hidden_size = 256
# regularizer = "l2"
# l1_panalty = 1e-4
# l2_penalty = 1e-4
# binary = True

model = RNNModel(kernel_size2d=3, kernel_size1d=3, filters_2d=[16, 32], dropout=0.2,
                 filters_1d=[64, 128, 256], rnn_hidden_size=256, mode="GRU", bidirectional=True,
                 hidden_size=512, regularizer="l1l2", l1_penalty=1e-4, l2_penalty=1e-4, binary=True)
model.build(input_shapes=((None, 100, 8, 8, 5), (None, 100, 12), (None, 14)))
model.compile(loss=BinaryCrossentropy(),
              optimizer=Adam(learning_rate=0.0005),
              metrics=["accuracy", "auc"])

# with strategy.scope():
#     model = RNNModel(kernel_size2d=3, kernel_size1d=3, filters_2d=[16, 32], dropout=0.5,
#                      filters_1d=[64, 128], rnn_hidden_size=128, mode="GRU", bidirectional=True,
#                      hidden_size=256, regularizer="l2", l1_penalty=1e-5, l2_penalty=1e-5, binary=True)
#     model.build(input_shapes=((None, 100, 8, 8, 5), (None, 100, 12), (None, 14)))
#     model.compile(loss=BinaryCrossentropy(),
#                   optimizer=Adam(learning_rate=0.0001),
#                   metrics=["accuracy", "auc"])
model.summary()

In [13]:
from tensorflow.keras.callbacks import ModelCheckpoint
checkpoint_filepath = "/kaggle/working/best_binary_model.weights.h5"
checkpoint_callback = ModelCheckpoint(
    filepath=checkpoint_filepath,
    monitor="val_loss",
    save_best_only=True,
    save_weights_only=True,
    mode="min",
    verbose=1
)

history = model.fit(type_train_ds, epochs=20, 
                    validation_data=type_valid_ds,
                    callbacks=[checkpoint_callback])
model.load_weights(checkpoint_filepath)

Epoch 1/20
    380/Unknown [1m382s[0m 606ms/step - accuracy: 0.7499 - auc: 0.8302 - loss: 1.6002
Epoch 1: val_loss improved from inf to 1.08778, saving model to /kaggle/working/best_binary_model.weights.h5
[1m380/380[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m404s[0m 664ms/step - accuracy: 0.7500 - auc: 0.8303 - loss: 1.5996 - val_accuracy: 0.8855 - val_auc: 0.9445 - val_loss: 1.0878
Epoch 2/20
[1m380/380[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 609ms/step - accuracy: 0.8822 - auc: 0.9512 - loss: 0.9979
Epoch 2: val_loss improved from 1.08778 to 0.85259, saving model to /kaggle/working/best_binary_model.weights.h5
[1m380/380[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m236s[0m 621ms/step - accuracy: 0.8822 - auc: 0.9512 - loss: 0.9977 - val_accuracy: 0.8953 - val_auc: 0.9570 - val_loss: 0.8526
Epoch 3/20
[1m380/380[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 632ms/step - accuracy: 0.9201 - auc: 0.9727 - loss: 0.7099
Epoch 3: val_loss improved from 0

In [14]:
from tensorflow.keras.metrics import AUC
result = model.evaluate(type_valid_ds)

[1m95/95[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 45ms/step - accuracy: 0.9528 - auc: 0.9901 - loss: 0.2647


In [15]:
conf_tensor = tf.zeros((2, 2), dtype=tf.int32)
for i, (inputs, labels, sample_weight) in enumerate(type_valid_ds):
    labels_pred = model.predict(inputs, verbose=0)
    labels_pred[labels_pred < 0.5] = 0
    labels_pred[labels_pred >= 0.5] = 1
    conf_tensor += tf.math.confusion_matrix(tf.reshape(labels, -1), tf.reshape(labels_pred, -1))
conf_tensor

<tf.Tensor: shape=(2, 2), dtype=int32, numpy=
array([[518,  48],
       [ 25, 928]], dtype=int32)>

In [16]:
conf_tensor

<tf.Tensor: shape=(2, 2), dtype=int32, numpy=
array([[518,  48],
       [ 25, 928]], dtype=int32)>

In [17]:
precision = conf_tensor[1][1] / (conf_tensor[1][0] + conf_tensor[1][1])
recall = conf_tensor[1][1] / (conf_tensor[0][1] + conf_tensor[1][1])
inv_f1 = (1/precision + 1/recall)/2
f1 = 1/inv_f1
print(precision)
print(recall)
print(f1)

tf.Tensor(0.9737670514165793, shape=(), dtype=float64)
tf.Tensor(0.9508196721311475, shape=(), dtype=float64)
tf.Tensor(0.9621565578019698, shape=(), dtype=float64)


In [18]:
model.set_multi()
# model.freeze_conv_timeseries()

In [19]:
from tensorflow.keras.metrics import CategoricalAccuracy
model.compile(loss=CategoricalFocalCrossentropy(alpha=gesture_sample_weight, 
                                                gamma=3,
                                                label_smoothing=0.2),
              optimizer=Adam(learning_rate=0.0005),
              metrics=["accuracy"])

# with strategy.scope():
#     model.load_weights(checkpoint_filepath)
#     model.compile(loss=CategoricalCrossentropy(),
#                   optimizer=Adam(learning_rate=0.0005),
#                   metrics=["accuracy"])

In [20]:
checkpoint_filepath = "/kaggle/working/cmi_best_model.weights.h5"
checkpoint_callback = ModelCheckpoint(
    filepath=checkpoint_filepath,
    monitor="val_accuracy",
    save_best_only=True,
    save_weights_only=True,
    mode="max",
    verbose=1
)

history = model.fit(gesture_train_ds, epochs=50, 
                    validation_data=gesture_valid_ds,
                    callbacks=[checkpoint_callback])
model.load_weights(checkpoint_filepath)

Epoch 1/50
[1m380/380[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 512ms/step - accuracy: 0.1260 - loss: 0.1563
Epoch 1: val_accuracy improved from -inf to 0.18038, saving model to /kaggle/working/cmi_best_model.weights.h5
[1m380/380[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m379s[0m 575ms/step - accuracy: 0.1261 - loss: 0.1562 - val_accuracy: 0.1804 - val_loss: 0.1288
Epoch 2/50
[1m380/380[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 531ms/step - accuracy: 0.2130 - loss: 0.1251
Epoch 2: val_accuracy improved from 0.18038 to 0.22712, saving model to /kaggle/working/cmi_best_model.weights.h5
[1m380/380[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m206s[0m 543ms/step - accuracy: 0.2130 - loss: 0.1251 - val_accuracy: 0.2271 - val_loss: 0.1172
Epoch 3/50
[1m380/380[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 536ms/step - accuracy: 0.2612 - loss: 0.1135
Epoch 3: val_accuracy improved from 0.22712 to 0.26860, saving model to /kaggle/working/cmi_best_mo

In [21]:
import json

with open('/kaggle/input/cmi-tf-datasets/mapping.json') as f:
    mapping_dict = json.load(f)

gesture_mapping = mapping_dict["gesture_mapping"]
inv_gesture_mapping = {value: key for key, value in gesture_mapping.items()}

num2gesture = np.vectorize(lambda x: inv_gesture_mapping[x])

In [22]:
non_target_gestures = ["Drink from bottle/cup", "Glasses on/off", "Pull air toward your face",
                       "Pinch knee/leg skin", "Scratch knee/leg skin", "Write name on leg",
                       "Text on phone", "Feel around in tray and pull out an object",
                       "Write name in air", "Wave hello"]

def map_non_target(y_ind):
    y_pred = inv_gesture_mapping[y_ind]
    if y_ind == 3:
        y_ind = 2
    elif y_ind == 4:
        y_ind = 3
    elif y_ind == 6:
        y_ind = 4
    elif y_ind == 7:
        y_ind = 5
    elif y_ind == 9:
        y_ind = 6
    elif y_ind == 10:
        y_ind = 7
    if y_pred in non_target_gestures:
        y_ind = 8
    return y_ind

vectorize_map_non_target = np.vectorize(map_non_target)

In [23]:
conf_tensor = np.zeros((9, 9), dtype=np.int32)
for i, (inputs, labels) in enumerate(gesture_valid_ds):
    labels_pred = model.predict(inputs, verbose=0)
    labels_pred = tf.argmax(labels_pred, axis=-1).numpy()
    labels_pred = vectorize_map_non_target(labels_pred)
    labels_true = tf.argmax(labels, axis=-1).numpy()
    labels_true = vectorize_map_non_target(labels_true)
    conf_tensor += tf.math.confusion_matrix(labels_true, labels_pred, num_classes=9)

In [24]:
precisions = []
recalls = []
f1s = []

for i in range(9):
    column = conf_tensor[i, :]
    row = conf_tensor[:, i]
    precision = column[i] / tf.math.reduce_sum(column)
    recall = row[i] / tf.math.reduce_sum(row)
    inv_f1 = (1/precision + 1/recall)/2
    f1 = 1/inv_f1
    precisions.append(precision)
    recalls.append(recall)
    f1s.append(f1)
print(f"F1 score mean: {np.round(np.mean(f1s), 3)}")

F1 score mean: 0.519


In [25]:
# import os
# save_path = os.path.join("/kaggle/working/", "cmi_model.weights.h5")
# model.save_weights(save_path)

In [26]:
new_model = RNNModel(kernel_size2d=3, kernel_size1d=3, filters_2d=[16, 32], dropout=0.2,
                     filters_1d=[64, 128, 256], rnn_hidden_size=256, mode="GRU", bidirectional=True,
                     hidden_size=512, regularizer="l1l2", l1_penalty=1e-4, l2_penalty=1e-4, binary=True)
new_model.build(input_shapes=((None, 100, 8, 8, 5), (None, 100, 12), (None, 14)))
new_model.load_weights(checkpoint_filepath)
new_model.summary()