# Transformer model

This Jupyter notebook trains a transformer model that translates Hong Kong Sign Language into a sequence of glosses.

## To Readers:

## Notes
- A batch size of 32 is a **HARDWARE LIMIT**. On the project machines, a tensor with a batch size of 64 cannot be created.
- The cache should NOT be put on the mounted Windows drive; otherwise, reading the keypoint files takes an eternity.
    - Yes, I know it takes up space on the C drive, but I have no choice: I'm not going to accept 3 hours per epoch.

List of trials:
- 

In [None]:
# Which split is used as the training source: "train" or "dev"
MODE = "train"

# set to True if you want to cache Y values extracted from the split file
CACHE_Y = True

# set to True if you want to pre-generate and cache the batched data for training.
CACHE_BATCH = False

# set to True if you want to use cached batch data to train. this will cause every epoch to train from the same data
# if you wish to train from transformed data every epoch, set this to False. this will replace the data input of model fitting process with a generator
USE_CACHE_BATCH = True

# set to True if you want to apply weighting while generating
GENERATE_WEIGHT = False

# set to True if you want to apply weighting to cache data. use if you have generated non-weighted data
USE_WEIGHT = False

# set to True if you do not want to use transformation. applies to cache data only
NO_TRANSFORM = False

# model parameters config
# NOTE: HEAD_SIZE is passed as the second positional argument (nhead) of the
# PyTorch transformer layers in SignLanguageTransformer.
HEAD_SIZE = 16
NUM_LAYERS = 2
DROPOUT = 0.1
D_MODEL = 512
D_FF = 512

EPOCH = 128

# Suffix of the result directory name; empty when no weighting is enabled,
# which leaves a trailing underscore in RESULT_DIR.
weighted_suffix = "weighted" if GENERATE_WEIGHT or USE_WEIGHT else ""

# linux path
# MODEL_DIR = f"../model/trans_{MODE}_{D_MODEL}_{HEAD_SIZE}_{NUM_LAYERS}_{D_FF}_{DROPOUT}_{weighted_suffix}"
# RESULT_DIR = f"../results/trans_{MODE}_{D_MODEL}_{HEAD_SIZE}_{NUM_LAYERS}_{D_FF}_{DROPOUT}_{weighted_suffix}"
# CACHE_DIR = f"../dataset/tvb-hksl-news/keypoint_mediapipe_feat_select/" # dir structure: {CACHE_DIR}/{date}/{name}.npy

# windows path
# (plain strings: these two paths contain no placeholders, so no f-prefix is needed)
MODEL_DIR = "..\\..\\model"
RESULT_DIR = f"..\\..\\results\\trans_{MODE}_{D_MODEL}_{HEAD_SIZE}_{NUM_LAYERS}_{D_FF}_{DROPOUT}_{weighted_suffix}"
CACHE_DIR = "F:\\dataset\\tvb-hksl-news\\keypoints_mediapipe_feat_select"  # dir structure: {CACHE_DIR}/{date}/{name}.npy

# MODEL_PATH = f"../model/train_model.keras"
# ENCODER_PATH = f"../model/train_encoder.keras"
# DECODER_PATH = f"../model/train_decoder.keras"

## Declaration of Save Paths and Import of Libraries

In [None]:
import os

# Log-level environment variables for TensorFlow's C++ runtime and glog, both set to "3".
os.environ.update({"TF_CPP_MIN_LOG_LEVEL": "3", "GLOG_minloglevel": "3"})

# only override for specific file generation
# (with exist_ok=False, os.makedirs raises if the directory already exists)
make_dir_override = True
if make_dir_override:
    print("Warning: Overriding existing directories. Files may be overwritten.")
os.makedirs(MODEL_DIR, exist_ok=make_dir_override)
os.makedirs(RESULT_DIR, exist_ok=make_dir_override)
# the keypoint cache is always allowed to exist already
os.makedirs(CACHE_DIR, exist_ok=True)

# tensorflow paths
# MODEL_PATH = f"{MODEL_DIR}/model.keras"
# ENCODER_PATH = f"{MODEL_DIR}/encoder.keras"
# DECODER_PATH = f"{MODEL_DIR}/decoder.keras"

# pytorch paths (linux separators)
# MODEL_PATH = f"{MODEL_DIR}/model.pt"
# ENCODER_PATH = f"{MODEL_DIR}/encoder.pt"
# DECODER_PATH = f"{MODEL_DIR}/decoder.pt"

# pytorch paths (windows separators)
MODEL_PATH = MODEL_DIR + "\\model.pt"
ENCODER_PATH = MODEL_DIR + "\\encoder.pt"
DECODER_PATH = MODEL_DIR + "\\decoder.pt"

# RESULT_FILE_NAME = f"{RESULT_DIR}/result.csv"
# HISTORY_FILE_NAME = f"{RESULT_DIR}/history.csv"
# ACC_PLOT_FILE_NAME = f"{RESULT_DIR}/acc_plot.png"
# LOSS_PLOT_FILE_NAME = f"{RESULT_DIR}/loss_plot.png"

RESULT_FILE_NAME = RESULT_DIR + "\\result.csv"
HISTORY_FILE_NAME = RESULT_DIR + "\\history.csv"
ACC_PLOT_FILE_NAME = RESULT_DIR + "\\acc_plot.png"
LOSS_PLOT_FILE_NAME = RESULT_DIR + "\\loss_plot.png"

# print all finalized file paths, one "LABEL: path" line each
print(
    *(
        f"{label} {path}"
        for label, path in (
            ("MODEL_PATH:", MODEL_PATH),
            ("ENCODER_PATH:", ENCODER_PATH),
            ("DECODER_PATH:", DECODER_PATH),
            ("RESULT_FILE_NAME:", RESULT_FILE_NAME),
            ("HISTORY_FILE_NAME:", HISTORY_FILE_NAME),
            ("ACC_PLOT_FILE_NAME:", ACC_PLOT_FILE_NAME),
            ("LOSS_PLOT_FILE_NAME:", LOSS_PLOT_FILE_NAME),
        )
    ),
    sep="\n",
)

In [None]:
import pandas as pd
import numpy as np
import json
import math
import copy
import matplotlib.pyplot as plt

In [None]:
# import tensorflow as tf
# import keras
# from keras.layers import Input, Dense, Dropout, LayerNormalization
# from keras.models import Model
# assert len(tf.config.list_physical_devices('GPU')) > 0, "CUDA not available. Please enable CUDA in GPU settings."
# keras.mixed_precision.set_global_policy('mixed_float16')

import torch
from torch import nn
from torch.utils.data import DataLoader, Dataset
from torch.optim import Adam
from torch.optim.lr_scheduler import LambdaLR
from torch.nn import functional as F
# Prefer the GPU when PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print("Device:", device)

## Dataset Preparation

Parses the split files first

In [None]:
class tvb_hksl_split_parser():
    """Load one split file and derive per-sample gloss and frame metadata.

    The file is read as a ``|``-delimited CSV that must provide the columns
    ``id`` (``<prefix>/<first_frame>-<last_frame>``) and ``glosses``
    (space-separated gloss tokens). The resulting DataFrame is kept in
    ``self.train_info`` (the attribute name is the same for every split) and
    is extended with:

    - ``glosses_tokenized``: token list wrapped in ``<START>`` ... ``<END>``
    - ``frames``: the part of ``id`` after the slash
    - ``length``: number of frames, ``last - first + 1``
    - ``glosses_length``: number of tokens in ``glosses_tokenized``
    """

    def __init__(self, file: str):
        self.file = file
        self.train_info = pd.read_csv(self.file, delimiter="|")
        # tokenize the glosses and wrap them in <START> / <END> markers
        self.train_info["glosses_tokenized"] = (
            self.train_info["glosses"]
            .str.split(' ')
            .apply(lambda tokens: ["<START>", *tokens, "<END>"])
        )
        # self.train_info["date"] = self.train_info["id"].str.split('/').apply(lambda x: x[0])
        self.train_info["frames"] = self.train_info["id"].str.split('/').apply(lambda x: x[1])
        # frame range is inclusive on both ends, hence the +1
        self.train_info["length"] = self.train_info["frames"].str.split('-').apply(lambda x: int(x[1]) - int(x[0]) + 1)
        self.train_info["glosses_length"] = self.train_info["glosses_tokenized"].apply(len)

    def get_train_id(self) -> pd.Series:
        """Return the sample ids, using backslashes as separator on Windows."""
        if os.name == "nt": # for windows system only
            # regex=False: plain substring replacement, independent of the
            # pandas-version default and safe for the backslash replacement
            return self.train_info["id"].str.replace("/", "\\", regex=False)
        return self.train_info["id"]

    # def get_train_date(self) -> pd.Series:
    #     return self.train_info["date"]

    # def get_train_frames(self) -> pd.Series:
    #     return self.train_info["frames"]

    # def get_train_length(self) -> pd.Series:
    #     return self.train_info["length"]

    def get_train_glosses_tokenized(self) -> pd.Series:
        """Return the per-sample token lists (including <START>/<END>)."""
        return self.train_info["glosses_tokenized"]

    def get_max_length(self) -> int:
        """Return the largest frame count over all samples."""
        return self.train_info["length"].max()

    def get_max_glosses_length(self) -> int:
        """Return the largest token count over all samples."""
        return self.train_info["glosses_length"].max()

    def pad_train_glosses_tokenized(self, max_length: int) -> pd.Series:
        """Right-pad every token list with ``<END>`` up to ``max_length``.

        Lists already at least ``max_length`` long are left unchanged (they
        are not truncated). ``glosses_length`` is refreshed afterwards.
        """
        self.train_info["glosses_tokenized"] = self.train_info["glosses_tokenized"].apply(
            lambda tokens: tokens + ["<END>"] * (max_length - len(tokens))
        )
        self.train_info["glosses_length"] = self.train_info["glosses_tokenized"].apply(len)
        return self.train_info["glosses_tokenized"]

    def get_word_dict(self) -> dict:
        """Map every token of this split to an index in first-seen order."""
        word_dict = {}
        for tokens in self.train_info["glosses_tokenized"]:
            for token in tokens:
                # setdefault only inserts unseen tokens, so indices stay dense
                word_dict.setdefault(token, len(word_dict))
        return word_dict

Generate the word dictionary here.

In [None]:
# linux paths
# train_parser = tvb_hksl_split_parser("../dataset/tvb-hksl-news/split/train.csv")
# test_parser = tvb_hksl_split_parser("../dataset/tvb-hksl-news/split/test.csv")
# dev_parser = tvb_hksl_split_parser("../dataset/tvb-hksl-news/split/dev.csv")

# windows paths
train_parser = tvb_hksl_split_parser(r"F:\dataset\tvb-hksl-news\split\train.csv")
test_parser = tvb_hksl_split_parser(r"F:\dataset\tvb-hksl-news\split\test.csv")
dev_parser = tvb_hksl_split_parser(r"F:\dataset\tvb-hksl-news\split\dev.csv")

# Word dictionary: the reserved tokens come first so that their ids are fixed
# (<END>=0, <START>=1, <X>=2, <BAD>=3, <MUMBLE>=4, <STOP>=5).
word_dict = {
    token: index
    for index, token in enumerate(
        ("<END>", "<START>", "<X>", "<BAD>", "<MUMBLE>", "<STOP>")
    )
}
# word_dict["<UNK>"] = len(word_dict)

# every other token gets the next free id, in first-seen order over train, test, dev
for parser in (train_parser, test_parser, dev_parser):
    for glosses in parser.get_train_glosses_tokenized():
        for word in glosses:
            word_dict.setdefault(word, len(word_dict))

# save the word dictionary (token -> id)
with open("../data/word_dict.json", "w") as f:
    json.dump(word_dict, f)

# save the reverse word dictionary (id -> token); JSON stores the integer keys as strings
reverse_word_dict = dict(zip(word_dict.values(), word_dict.keys()))
with open("../data/reverse_word_dict.json", "w+") as f:
    json.dump(reverse_word_dict, f)

In [None]:
# Frequency map: token id -> number of occurrences over the train, test and dev splits.
# Every id starts at 0, so tokens that never occur keep a count of 0.
token_freq = dict.fromkeys(word_dict.values(), 0)
total_word_count = 0
for parser in (train_parser, test_parser, dev_parser):
    for glosses in parser.get_train_glosses_tokenized():
        for word in glosses:
            token_freq[word_dict[word]] += 1
        # one count per token of this sample
        total_word_count += len(glosses)
print(token_freq)
print(len(token_freq))

In [None]:
# generate a weighting list indexed by token id
# NOTE(review): the stated intent is "lower frequency words have higher weight",
# but tf * log(V / tf) increases with tf while tf < V / e, i.e. over the whole
# range tf <= 1 once the vocabulary has 3 or more entries. Confirm this is the
# weighting that is actually wanted.

# basic inverse frequency weighting
# weighting_list = [1 / token_freq[word] for word in token_freq]
# weighting_list = [x / max(weighting_list) for x in weighting_list]

# tf-idf weighting
tf_list = np.array([token_freq[word] / total_word_count for word in token_freq]) # freq / total for each word
# Tokens that never occur (e.g. reserved tokens that are only pre-registered)
# have tf == 0; log(V / 0) is inf and 0 * inf is NaN, which would end up in the
# exported JSON. Their idf is left at 0 so that their weight is 0, the limit of
# tf * log(V / tf) as tf -> 0.
seen_mask = tf_list > 0
idf_list = np.zeros_like(tf_list)
idf_list[seen_mask] = np.log(len(token_freq) / tf_list[seen_mask]) # log(vocab size / tf) for each seen word
weighting_list = tf_list * idf_list

# make sure that <START> and <END> have full weight
weighting_list[word_dict["<START>"]] = 1
weighting_list[word_dict["<END>"]] = 1
print(weighting_list)

# export weighting list
# convert numpy array to list
weighting_list = weighting_list.tolist()
with open("../data/weighting_list.json", "w") as f:
    json.dump(weighting_list, f)

In [None]:
# sample preprocessing
# train_parser.rare_sample_reduction(token_freq)
# test_parser.rare_sample_reduction(token_freq)
# dev_parser.rare_sample_reduction(token_freq)

# pick the split that acts as the training source for this run
if MODE == "train":
    actual_train_parser = train_parser
elif MODE == "dev":
    actual_train_parser = dev_parser
else:
    # fail here with a clear message instead of a NameError further down
    raise ValueError(f"Unsupported MODE {MODE!r}; expected 'train' or 'dev'.")

# if a sample in test_parser contains a word that the training source does not, remove that sample
# this is to prevent the model from being evaluated on words that are not in the training set
# supposedly, this should not happen, but just in case
parser_word_dict = actual_train_parser.get_word_dict()
fully_known = test_parser.train_info["glosses_tokenized"].apply(
    lambda tokens: all(token in parser_word_dict for token in tokens)
)
test_parser.train_info = test_parser.train_info[fully_known]

# assert that all remaining words in test_parser are also in the training source
test_word_dict = test_parser.get_word_dict()
assert all(word in parser_word_dict for word in test_word_dict)

# finally, print the number of samples in each parser
print(f"train_parser: {len(train_parser.train_info)}")
print(f"test_parser: {len(test_parser.train_info)}")
print(f"dev_parser: {len(dev_parser.train_info)}")

We do not need to generate the decoder input and encoder input separately, thankfully.

Still, we have to pad each list of tokens to the same length and replace the tokens with their numeric ids.

In [None]:
# Longest gloss sequence per split.
# NOTE(review): the "train" figure comes from actual_train_parser (which is
# dev_parser when MODE == "dev"), while the padding below is applied to
# train_parser itself - confirm this is intended for MODE == "dev".
train_gloss_max_length = actual_train_parser.get_max_glosses_length()
test_gloss_max_length = test_parser.get_max_glosses_length()
dev_gloss_max_length = dev_parser.get_max_glosses_length()

print(f"train_gloss_max_length: {train_gloss_max_length}")
print(f"test_gloss_max_length: {test_gloss_max_length}")
print(f"dev_gloss_max_length: {dev_gloss_max_length}")

# pad the glosses of every split to the overall maximum length
max_gloss_length = max(
    (train_gloss_max_length, test_gloss_max_length, dev_gloss_max_length)
)
for parser in (train_parser, test_parser, dev_parser):
    parser.pad_train_glosses_tokenized(max_gloss_length)

## NOTE: Skipping image generation code here for now since the keypoints are cached

## Keypoint Preprocessing

In [None]:
def get_mediapipe_keypoints_face_sublist() -> list[list[int]]:
    """Return the selected face-mesh landmark groups, one index list per region.

    The selection is, in order: lips, left eye, left eyebrow, right eye and
    right eyebrow. The nose and face-oval groups are defined but not selected.

    NOTE: the landmark indices are HARD-CODED based on the visualization of the face mesh.
    reference: https://github.com/LearningnRunning/py_face_landmark_helper/blob/main/mediapipe_helper/config.py
    image: https://raw.githubusercontent.com/google/mediapipe/master/mediapipe/modules/face_geometry/data/canonical_face_model_uv_visualization.png
    related stack overflow post: https://stackoverflow.com/questions/74901522/can-mediapipe-specify-which-parts-of-the-face-mesh-are-the-lips-or-nose-or-eyes
    """
    regions = {
        "lips": [0, 267, 269, 270, 13, 14, 17, 402, 146, 405, 409, 415, 291, 37, 39, 40, 178, 308, 181, 310, 311, 312, 185, 314, 317, 318, 61, 191, 321, 324, 78, 80, 81, 82, 84, 87, 88, 91, 95, 375],
        "left_eye": [384, 385, 386, 387, 388, 390, 263, 362, 398, 466, 373, 374, 249, 380, 381, 382],
        "left_eyebrow": [293, 295, 296, 300, 334, 336, 276, 282, 283, 285],
        "right_eye": [160, 33, 161, 163, 133, 7, 173, 144, 145, 246, 153, 154, 155, 157, 158, 159],
        "right_eyebrow": [65, 66, 70, 105, 107, 46, 52, 53, 55, 63],
        "nose": [1, 2, 4, 5, 6, 19, 275, 278, 294, 168, 45, 48, 440, 64, 195, 197, 326, 327, 344, 220, 94, 97, 98, 115],
        "oval": [132, 389, 136, 10, 397, 400, 148, 149, 150, 21, 152, 284, 288, 162, 297, 172, 176, 54, 58, 323, 67, 454, 332, 338, 93, 356, 103, 361, 234, 109, 365, 379, 377, 378, 251, 127],
    }
    # add "nose" / "oval" here to include those regions as well
    selected = ("lips", "left_eye", "left_eyebrow", "right_eye", "right_eyebrow")
    return [regions[name] for name in selected]


# all selected face landmark indices, flattened in region order
STATIC_FACE_KEYPOINT_INDEX = [
    landmark
    for region in get_mediapipe_keypoints_face_sublist()
    for landmark in region
]

def get_mediapipe_keypoints_index() -> tuple[list[int], list[list[int]]]:
    """Return the flat feature indices that are kept from a raw keypoint vector.

    The raw vector is addressed as ``33*4`` pose values, followed by ``468*3``
    face values, then ``21*3`` values for each hand.

    Returns:
        A 2-tuple ``(all_indices, [POSE, FACE, LEFT_HAND, RIGHT_HAND])`` where
        ``all_indices`` is the concatenation of the four per-part index lists.
    """
    pose_width = 33 * 4
    face_width = 468 * 3
    hand_width = 21 * 3

    # Pose: keep only the first two of every four values (x, y).
    # discard Z due to documentation https://github.com/google-ai-edge/mediapipe/blob/master/docs/solutions/holistic.md
    # (variant that keeps three of the four values: i % 4 != 3)
    POSE = [i for i in range(pose_width) if i % 4 < 2]

    # Face: every third entry of STATIC_FACE_KEYPOINT_INDEX is shifted past the
    # pose block and expanded to three consecutive flat indices.
    # NOTE(review): the entries are landmark numbers, yet they are used as flat
    # offsets (not multiplied by 3) and two out of three are skipped. This looks
    # unintended, but changing it would alter the feature layout of any data
    # already cached with this index - confirm before touching.
    FACE_UNPROCESSED = [pose_width + i for i in STATIC_FACE_KEYPOINT_INDEX]
    FACE = [i for start in FACE_UNPROCESSED[::3] for i in range(start, start + 3)]
    # for x, y only: use range(start, start + 2) instead

    # Hands: all x, y, z values of both hands.
    left_start = pose_width + face_width
    right_start = left_start + hand_width
    LEFT_HAND = list(range(left_start, right_start))
    RIGHT_HAND = list(range(right_start, right_start + hand_width))
    # for x, y only: additionally filter with i % 3 != 2

    KEYPOINTS_INDEX = POSE + FACE + LEFT_HAND + RIGHT_HAND
    return KEYPOINTS_INDEX, [POSE, FACE, LEFT_HAND, RIGHT_HAND]


# flat index list of every selected feature, computed once at import time
STATIC_KEYPOINTS_INDEX = get_mediapipe_keypoints_index()[0]

def _augment_points(points, rotation, offsets, scale):
    """Rotate, translate and scale an (N, D) array of points; returns a new array."""
    # points are row vectors, multiplied by the matrix on the right
    moved = np.dot(points, rotation)
    moved += offsets
    # scaling is applied to x and y only, around the centre (0.5, 0.5)
    moved[:, :2] = scale * (moved[:, :2] - 0.5) + 0.5
    return moved


def preprocess_keypoints(keypoints, angle=0, tx=0, ty=0, tz=0, scale=1):
    """Apply one rotation / translation / scaling to a selected-feature keypoint array.

    Args:
        keypoints: array whose last axis is laid out as
            [pose (x, y) | face (x, y, z) | left hand (x, y, z) | right hand (x, y, z)],
            with segment sizes taken from ``get_mediapipe_keypoints_index()``.
            A single frame (1-D) or any stack of frames is accepted; the same
            transform is applied to every frame.
            NOTE(review): assumes the features are on the last axis - confirm
            against the cached ``.npy`` files.
        angle: rotation in the x-y plane, in degrees.
        tx, ty, tz: translation added to x, y and z (pose has no z component).
        scale: scaling factor for x and y.

    Returns:
        A transformed copy; the input array is not modified.
    """
    parts = get_mediapipe_keypoints_index()[1]

    # Convert to radians exactly once. The conversion used to be applied a second
    # time before the 3-D matrix was built, so face and hands were rotated by a
    # far smaller angle than the pose.
    theta = np.radians(angle)
    cos_t, sin_t = np.cos(theta), np.sin(theta)
    rotation_xy = np.array([
        [cos_t, -sin_t],
        [sin_t, cos_t]
    ])
    # same in-plane rotation, z left untouched
    rotation_xyz = np.array([
        [cos_t, -sin_t, 0],
        [sin_t, cos_t, 0],
        [0, 0, 1]
    ])
    offsets_xy = np.array([tx, ty])
    offsets_xyz = np.array([tx, ty, tz])
    # pose is 2-D; face, left hand and right hand are 3-D
    transforms = [(rotation_xy, offsets_xy)] + [(rotation_xyz, offsets_xyz)] * 3

    keypoints = keypoints.copy()
    start = 0
    for part, (rotation, offsets) in zip(parts, transforms):
        stop = start + len(part)
        segment = keypoints[..., start:stop]
        # regroup the flattened coordinates into one row per point
        points = segment.reshape(-1, rotation.shape[0])
        transformed = _augment_points(points, rotation, offsets, scale)
        keypoints[..., start:stop] = transformed.reshape(segment.shape)
        start = stop
    return keypoints

## Keypoint Generator

Create a dataset by inheriting `torch.utils.data.Dataset`. The earlier `keras.utils.Sequence` generator is kept below, commented out, for reference.

In [None]:
# Preparation: get the largest length of sequences of x
train_max_length = train_parser.get_max_length()
test_max_length = test_parser.get_max_length()
dev_max_length = dev_parser.get_max_length()

X_max_length = max(train_max_length, test_max_length, dev_max_length)
print("Max length of sequences of X:", X_max_length)

In [None]:
# Implementation for Tensorflow

# import keras
# class CachedKeypointGenerator(keras.utils.Sequence):
#     def __init__(self, x_dir, parser: tvb_hksl_split_parser, batch_size=32):
#         self.x_dir = x_dir
#         self.batch_size = batch_size
#         self.list_x_files = sorted(os.listdir(self.x_dir))
#         if NO_TRANSFORM:
#             self.list_x_files = [file for file in self.list_x_files if file.startswith("control")]
#         self.total_files = len(self.list_x_files)
#         self.batch_index = 0
#         self.y = parser.get_train_glosses_tokenized().to_list()
#         self.y = [[word_dict[word] for word in words] for words in self.y]

#     def __len__(self):
#         return self.total_files

#     def __getitem__(self, idx: int):
#         batch_x = np.load(os.path.join(self.x_dir, self.list_x_files[idx]), mmap_mode="r")
#         batch_y = self.y[idx:idx + self.batch_size]
#         # one-hot encode the y values
#         batch_y = keras.utils.to_categorical(batch_y, num_classes=len(word_dict))
#         return batch_x, batch_y
    
# train_keypoint_generator = CachedKeypointGenerator(os.path.join(CACHE_DIR, "x"), train_parser)
# first_batch = train_keypoint_generator.__getitem__(0)
# X_shape = first_batch[0].shape
# Y_shape = np.array(first_batch[1]).shape
# print(first_batch[0])
# print(first_batch[1])
# del first_batch
# print(type(X_shape), X_shape)
# print(type(Y_shape), Y_shape)

In [None]:
# Implementation for PyTorch
class KeypointDataset(Dataset):
    def __init__(self, x_dir, parser: tvb_hksl_split_parser):
        self.x_dir = x_dir
        # path of each file is x_dir + get_train_id() + ".npy"
        self.list_x_files = parser.get_train_id().to_list()
        self.total_files = len(self.list_x_files)

        self.y = parser.get_train_glosses_tokenized().to_list()
        self.y = [[word_dict[word] for word in words] for words in self.y]

    def __len__(self):
        return self.total_files
    
    def __getitem__(self, idx: int):
        x = np.load(os.path.join(self.x_dir, self.list_x_files[idx] + ".npy"), mmap_mode="r")
        x = preprocess_keypoints(
            x,
            angle=np.random.uniform(-10, 10),
            tx=np.random.uniform(-0.1, 0.1),
            ty=np.random.uniform(-0.1, 0.1),
            tz=np.random.uniform(-0.1, 0.1),
            scale=np.random.uniform(0.8, 1.2)
        )
        y = self.y[idx]
        # one-hot encode the y values
        y = torch.nn.functional.one_hot(torch.tensor(y), num_classes=len(word_dict)).float()
        return x, y
    
# One dataset and one loader per split, all reading from the same cache directory.
# NOTE(review): the dev and test loaders shuffle as well - confirm that this is
# wanted for evaluation.
train_keypoint_dataset = KeypointDataset(x_dir=CACHE_DIR, parser=train_parser)
train_keypoint_loader = DataLoader(dataset=train_keypoint_dataset, batch_size=32, shuffle=True)

dev_keypoint_dataset = KeypointDataset(x_dir=CACHE_DIR, parser=dev_parser)
dev_keypoint_loader = DataLoader(dataset=dev_keypoint_dataset, batch_size=32, shuffle=True)

test_keypoint_dataset = KeypointDataset(x_dir=CACHE_DIR, parser=test_parser)
test_keypoint_loader = DataLoader(dataset=test_keypoint_dataset, batch_size=32, shuffle=True)

## Model Definition

Create a transformer model with self-attention

In [None]:
class SignLanguageTransformer(nn.Module):
    """Encoder-decoder transformer followed by a linear projection to ``num_classes``.

    Args:
        d_model: feature size expected by the encoder and decoder layers.
        head_size: passed to the layers as their number of attention heads.
        num_layers: number of stacked layers in both the encoder and the decoder.
        d_ff: width of the feed-forward sub-layers.
        dropout: dropout value of the layers.
        num_classes: output size of the final linear layer.

    NOTE(review): ``forward`` feeds ``src`` and ``tgt`` straight into the
    transformer stacks - there is no input projection, embedding, positional
    encoding or mask here, so both inputs must already have ``d_model`` features.
    """

    def __init__(self, d_model, head_size, num_layers, d_ff, dropout, num_classes):
        super().__init__()

        self.encoder_layer = nn.TransformerEncoderLayer(
            d_model, nhead=head_size, dim_feedforward=d_ff, dropout=dropout
        )
        self.transformer_encoder = nn.TransformerEncoder(self.encoder_layer, num_layers=num_layers)

        self.decoder_layer = nn.TransformerDecoderLayer(
            d_model, nhead=head_size, dim_feedforward=d_ff, dropout=dropout
        )
        self.transformer_decoder = nn.TransformerDecoder(self.decoder_layer, num_layers=num_layers)

        self.fc = nn.Linear(d_model, num_classes)

    def forward(self, src, tgt):
        """Return the per-position outputs of the final layer for ``tgt`` given ``src``."""
        # The layers are built without batch_first, so the first two axes of the
        # 3-D inputs are swapped on the way in and swapped back on the way out.
        swap_first_two = (1, 0, 2)
        memory = self.transformer_encoder(src.permute(*swap_first_two))
        decoded = self.transformer_decoder(tgt.permute(*swap_first_two), memory)
        logits = self.fc(decoded)
        return logits.permute(*swap_first_two)

In [None]:
# # failed tensorflow model

# import tensorflow as tf
# from keras.layers import Input, Dense, Embedding, LayerNormalization, Dropout
# from keras.models import Model
# from keras.optimizers import Adam
# from keras.losses import SparseCategoricalCrossentropy
# from keras.metrics import SparseCategoricalAccuracy

# # Positional Encoding
# def positional_encoding(position, d_model):
#     angle_rads = get_angles(np.arange(position)[:, np.newaxis], np.arange(d_model)[np.newaxis, :], d_model)
#     angle_rads[:, 0::2] = np.sin(angle_rads[:, 0::2])
#     angle_rads[:, 1::2] = np.cos(angle_rads[:, 1::2])
#     pos_encoding = angle_rads[np.newaxis, ...]
#     return tf.cast(pos_encoding, dtype=tf.float32)

# def get_angles(pos, i, d_model):
#     angle_rates = 1 / np.power(10000, (2 * (i // 2)) / np.float32(d_model))
#     return pos * angle_rates

# # Scaled Dot-Product Attention
# def scaled_dot_product_attention(q, k, v, mask):
#     matmul_qk = tf.matmul(q, k, transpose_b=True)
#     dk = tf.cast(tf.shape(k)[-1], tf.float32)
#     scaled_attention_logits = matmul_qk / tf.math.sqrt(dk)
#     if mask is not None:
#         scaled_attention_logits += (mask * -1e9)
#     attention_weights = tf.nn.softmax(scaled_attention_logits, axis=-1)
#     output = tf.matmul(attention_weights, v)
#     return output, attention_weights

# # Multi-Head Attention
# class MultiHeadAttention(tf.keras.layers.Layer):
#     def __init__(self, d_model, num_heads):
#         super(MultiHeadAttention, self).__init__()
#         self.num_heads = num_heads
#         self.d_model = d_model
#         assert d_model % self.num_heads == 0
#         self.depth = d_model // self.num_heads
#         self.wq = Dense(d_model)
#         self.wk = Dense(d_model)
#         self.wv = Dense(d_model)
#         self.dense = Dense(d_model)

#     def split_heads(self, x, batch_size):
#         x = tf.reshape(x, (batch_size, -1, self.num_heads, self.depth))
#         return tf.transpose(x, perm=[0, 2, 1, 3])

#     def call(self, v, k, q, mask):
#         batch_size = tf.shape(q)[0]
#         q = self.wq(q)
#         k = self.wk(k)
#         v = self.wv(v)
#         q = self.split_heads(q, batch_size)
#         k = self.split_heads(k, batch_size)
#         v = self.split_heads(v, batch_size)
#         scaled_attention, _ = scaled_dot_product_attention(q, k, v, mask)
#         scaled_attention = tf.transpose(scaled_attention, perm=[0, 2, 1, 3])
#         concat_attention = tf.reshape(scaled_attention, (batch_size, -1, self.d_model))
#         output = self.dense(concat_attention)
#         return output

# # Point-wise Feed Forward Network
# def point_wise_feed_forward_network(d_model, dff):
#     return tf.keras.Sequential([
#         Dense(dff, activation='relu'),
#         Dense(d_model)
#     ])

# # Encoder Layer
# class EncoderLayer(tf.keras.layers.Layer):
#     def __init__(self, d_model, num_heads, dff, rate=0.1):
#         super(EncoderLayer, self).__init__()
#         self.mha = MultiHeadAttention(d_model, num_heads)
#         self.ffn = point_wise_feed_forward_network(d_model, dff)
#         self.layernorm1 = LayerNormalization(epsilon=1e-6)
#         self.layernorm2 = LayerNormalization(epsilon=1e-6)
#         self.dropout1 = Dropout(rate)
#         self.dropout2 = Dropout(rate)

#     def call(self, x, training, mask):
#         attn_output = self.mha(x, x, x, mask)
#         attn_output = self.dropout1(attn_output, training=training)
#         out1 = self.layernorm1(x + attn_output)
#         ffn_output = self.ffn(out1)
#         ffn_output = self.dropout2(ffn_output, training=training)
#         out2 = self.layernorm2(out1 + ffn_output)
#         return out2

# # Decoder Layer
# class DecoderLayer(tf.keras.layers.Layer):
#     def __init__(self, d_model, num_heads, dff, rate=0.1):
#         super(DecoderLayer, self).__init__()
#         self.mha1 = MultiHeadAttention(d_model, num_heads)
#         self.mha2 = MultiHeadAttention(d_model, num_heads)
#         self.ffn = point_wise_feed_forward_network(d_model, dff)
#         self.layernorm1 = LayerNormalization(epsilon=1e-6)
#         self.layernorm2 = LayerNormalization(epsilon=1e-6)
#         self.layernorm3 = LayerNormalization(epsilon=1e-6)
#         self.dropout1 = Dropout(rate)
#         self.dropout2 = Dropout(rate)
#         self.dropout3 = Dropout(rate)

#     def call(self, x, enc_output, training, look_ahead_mask, padding_mask):
#         attn1, attn_weights_block1 = self.mha1(x, x, x, look_ahead_mask)
#         attn1 = self.dropout1(attn1, training=training)
#         out1 = self.layernorm1(attn1 + x)
#         attn2, attn_weights_block2 = self.mha2(enc_output, enc_output, out1, padding_mask)
#         attn2 = self.dropout2(attn2, training=training)
#         out2 = self.layernorm2(attn2 + out1)
#         ffn_output = self.ffn(out2)
#         ffn_output = self.dropout3(ffn_output, training=training)
#         out3 = self.layernorm3(ffn_output + out2)
#         return out3, attn_weights_block1, attn_weights_block2

# # Encoder
# class Encoder(tf.keras.layers.Layer):
#     def __init__(self, num_layers, d_model, num_heads, dff, input_vocab_size, maximum_position_encoding, rate=0.1):
#         super(Encoder, self).__init__()
#         self.d_model = d_model
#         self.num_layers = num_layers
#         self.embedding = Dense(d_model)
#         self.pos_encoding = positional_encoding(maximum_position_encoding, self.d_model)
#         self.enc_layers = [EncoderLayer(d_model, num_heads, dff, rate) for _ in range(num_layers)]
#         self.dropout = Dropout(rate)

#     def call(self, x, training, mask):
#         seq_len = tf.shape(x)[1]
#         x = self.embedding(x)
#         x *= tf.math.sqrt(tf.cast(self.d_model, tf.float32))
#         x += self.pos_encoding[:, :seq_len, :]
#         x = self.dropout(x, training=training)
#         for i in range(self.num_layers):
#             x = self.enc_layers[i](x, training, mask)
#         return x

# # Decoder
# class Decoder(tf.keras.layers.Layer):
#     def __init__(self, num_layers, d_model, num_heads, dff, target_vocab_size, maximum_position_encoding, rate=0.1):
#         super(Decoder, self).__init__()
#         self.d_model = d_model
#         self.num_layers = num_layers
#         self.embedding = Embedding(target_vocab_size, d_model)
#         self.pos_encoding = positional_encoding(maximum_position_encoding, self.d_model)
#         self.dec_layers = [DecoderLayer(d_model, num_heads, dff, rate) for _ in range(num_layers)]
#         self.dropout = Dropout(rate)

#     def call(self, x, enc_output, training, look_ahead_mask, padding_mask):
#         seq_len = tf.shape(x)[1]
#         attention_weights = {}
#         x = self.embedding(x)
#         x *= tf.math.sqrt(tf.cast(self.d_model, tf.float32))
#         x += self.pos_encoding[:, :seq_len, :]
#         x = self.dropout(x, training=training)
#         for i in range(self.num_layers):
#             x, block1, block2 = self.dec_layers[i](x, enc_output, training, look_ahead_mask, padding_mask)
#             attention_weights[f'decoder_layer{i+1}_block1'] = block1
#             attention_weights[f'decoder_layer{i+1}_block2'] = block2
#         return x, attention_weights

# # Transformer
# class Transformer(tf.keras.Model):
#     def __init__(self, num_layers, d_model, num_heads, dff, input_vocab_size, target_vocab_size, pe_input, pe_target, rate=0.1):
#         super(Transformer, self).__init__()
#         self.encoder = Encoder(num_layers, d_model, num_heads, dff, input_vocab_size, pe_input, rate)
#         self.decoder = Decoder(num_layers, d_model, num_heads, dff, target_vocab_size, pe_target, rate)
#         self.final_layer = Dense(target_vocab_size)

#     def call(self, inp, tar, training, enc_padding_mask, look_ahead_mask, dec_padding_mask):
#         enc_output = self.encoder(inp, training, enc_padding_mask)
#         dec_output, attention_weights = self.decoder(tar, enc_output, training, look_ahead_mask, dec_padding_mask)
#         final_output = self.final_layer(dec_output)
#         return final_output, attention_weights

# # Hyperparameters
# num_layers = 4
# d_model = 128
# num_heads = 8
# dff = 512
# input_vocab_size = X_shape[2]
# target_vocab_size = len(word_dict)
# pe_input = 1000
# pe_target = 1000
# dropout_rate = 0.1

# # Create the model
# transformer = Transformer(num_layers, d_model, num_heads, dff, input_vocab_size, target_vocab_size, pe_input, pe_target, dropout_rate)

# # Compile the model
# transformer.compile(optimizer=Adam(learning_rate=0.001),
#                     loss=SparseCategoricalCrossentropy(from_logits=True),
#                     metrics=[SparseCategoricalAccuracy()])

# # Summary of the model
# transformer.summary()

In [None]:
# import tensorflow as tf
# from keras.layers import Input, Dense, Dropout, LayerNormalization, TimeDistributed, MultiHeadAttention
# from keras.models import Model

# input_shape = (None, None, None)  # (batch_size, max_num_frames, num_features)
# inputs = Input(shape=input_shape[1:])

# # Self-attention mechanism
# attention_output = MultiHeadAttention(num_heads=8, key_dim=inputs.shape[-1])(inputs, inputs)
# attention_output = LayerNormalization(epsilon=1e-6)(attention_output)
# attention_output = Dropout(0.1)(attention_output)

# # Add a feed-forward network
# ffn_output = Dense(2048, activation='relu')(attention_output)
# ffn_output = Dense(inputs.shape[-1])(ffn_output)
# ffn_output = LayerNormalization(epsilon=1e-6)(ffn_output)
# ffn_output = Dropout(0.1)(ffn_output)

# # Add a TimeDistributed layer for output
# outputs = TimeDistributed(Dense(len(word_dict), activation='softmax'))(ffn_output) 

# # Define the model
# model = Model(inputs=inputs, outputs=outputs)

# # Compile the model
# model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# # Print the model summary
# model.summary()

## Training

In [None]:
# # Initialize the loss function
# criterion = nn.CrossEntropyLoss()
# criterion = criterion.cuda()

# # Initialize the optimizer
# optimizer = optim.Adam(model.parameters(), lr=0.0001)

# # Initialize the learning rate scheduler
# scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=10, gamma=0.1)

# # Initialize the history dictionary
# history = {
#     "train_loss": [],
#     "train_acc": [],
#     "val_loss": [],
#     "val_acc": []
# }

# # Set model to training mode
# model.train()

In [None]:
# # Training loop
# for epoch in range(EPOCH):
#     # Initialize the epoch's loss
#     epoch_loss = 0
#     epoch_correct = 0
#     epoch_total = 0
    
#     # Train the model
#     for i, (x, y) in enumerate(train_keypoint_generator):
#         # Convert the data to PyTorch tensors and move to GPU
#         x = torch.from_numpy(x).type(torch.FloatTensor).to("cuda")
#         y = torch.from_numpy(np.array(y)).type(torch.LongTensor).to("cuda")
        
#         # Zero the gradients
#         optimizer.zero_grad()
        
#         # Forward pass
#         output = model(x, y)
        
#         # Calculate the loss
#         loss = criterion(output.view(-1, len(word_dict)), y.view(-1))
#         epoch_loss += loss.item()
        
#         # Calculate the accuracy
#         _, predicted = torch.max(output, 2)
#         epoch_total += y.size(0) * y.size(1)
#         epoch_correct += (predicted == y).sum().item()
        
#         # Backward pass
#         loss.backward()
        
#         # Update the weights
#         optimizer.step()
        
#         # Print statistics
#         print(f"Epoch {epoch + 1}, Batch {i + 1}, Loss: {loss.item()}")
    
#     # Calculate the epoch loss and accuracy
#     epoch_loss /= len(train_keypoint_generator)
#     epoch_acc = epoch_correct / epoch_total
#     print(f"Epoch {epoch + 1}, Loss: {epoch_loss}, Accuracy: {epoch_acc}")
    
#     # Update the history dictionary
#     history["train_loss"].append(epoch_loss)
#     history["train_acc"].append(epoch_acc)
    
#     # Skip validation for now

# # Save the model
# torch.save(model.state_dict(), MODEL_PATH)

In [None]:
# # Create masks

# class MaksedKeypointGenerator(keras.utils.Sequence):
#     def __init__(self, cached_keypoint_generator: CachedKeypointGenerator):
#         self.cached_keypoint_generator = cached_keypoint_generator

#     def __len__(self):
#         return self.cached_keypoint_generator.__len__()
    
#     def create_padding_mask(self, seq):
#         seq = tf.cast(tf.math.equal(seq, 0), tf.float32)
#         return seq[:, tf.newaxis, tf.newaxis, :]
    
#     def create_look_ahead_mask(self, size):
#         mask = 1 - tf.linalg.band_part(tf.ones((size, size)), -1, 0)
#         return mask
    
#     def __getitem__(self, idx: int):
#         x, y = self.cached_keypoint_generator.__getitem__(idx)
        
#         # Create padding masks
#         enc_padding_mask = self.create_padding_mask(x)
#         dec_padding_mask = self.create_padding_mask(x)
#         look_ahead_mask = self.create_look_ahead_mask(y.shape[1])

#         return x, y, enc_padding_mask, look_ahead_mask, dec_padding_mask
    
#     def on_epoch_end(self):
#         self.cached_keypoint_generator.on_epoch_end()

# train_masked_keypoint_generator = MaksedKeypointGenerator(train_keypoint_generator)
# first_batch = train_masked_keypoint_generator.__getitem__(0)

In [None]:
# Build the model from the hyper-parameters in the config cell; the output
# vocabulary size is the number of entries in word_dict.
model = SignLanguageTransformer(D_MODEL, HEAD_SIZE, NUM_LAYERS, D_FF, DROPOUT, len(word_dict))

# loss function: cross-entropy over the gloss vocabulary
# optimizer: Adam
criterion = nn.CrossEntropyLoss()
optimizer = Adam(model.parameters(), lr=0.0001)

# Per-epoch metrics. Exactly one value is appended to each list per completed
# epoch (train metrics after the training pass, val metrics after validation).
history = {
    "train_loss": [],
    "train_acc": [],
    "val_loss": [],
    "val_acc": []
}

if device == "cuda":
    model = model.cuda()

# Loop-invariant: number of classes used to flatten the model output.
vocab_size = len(word_dict)

# training loop
# use dev set for validation
# NOTE(review): the EPOCH constant from the config cell is not used here;
# confirm whether 256 or EPOCH is the intended number of epochs.
num_epochs = 256
for epoch in range(num_epochs):
    # ---- training pass ----
    model.train()
    epoch_loss = 0
    epoch_correct = 0
    epoch_total = 0
    for i, (x, y) in enumerate(train_keypoint_loader):
        if device == "cuda":
            x = x.cuda()
            y = y.cuda()
        optimizer.zero_grad()
        # NOTE(review): the model receives the same y the loss is computed
        # against; confirm the model shifts/masks the targets internally.
        output = model(x, y)
        # Flatten to (tokens, vocab) vs (tokens,) for CrossEntropyLoss.
        loss = criterion(output.view(-1, vocab_size), y.view(-1))
        epoch_loss += loss.item()
        # Token-level accuracy: argmax over the last (vocabulary) axis.
        _, predicted = torch.max(output, 2)
        epoch_total += y.size(0) * y.size(1)
        epoch_correct += (predicted == y).sum().item()
        loss.backward()
        optimizer.step()
        print(f"Epoch {epoch + 1}, Batch {i + 1}, Loss: {loss.item()}")
    # Mean of the per-batch losses; accuracy is over every target position.
    epoch_loss /= len(train_keypoint_loader)
    epoch_acc = epoch_correct / epoch_total
    print(f"Epoch {epoch + 1}, Loss: {epoch_loss}, Accuracy: {epoch_acc}")
    history["train_loss"].append(epoch_loss)
    history["train_acc"].append(epoch_acc)

    # ---- validation pass ----
    model.eval()
    epoch_loss = 0
    epoch_correct = 0
    epoch_total = 0
    # No gradients are needed for validation; disabling autograd avoids
    # building a graph for every dev batch and keeps GPU memory usage down.
    with torch.no_grad():
        for x, y in dev_keypoint_loader:
            if device == "cuda":
                x = x.cuda()
                y = y.cuda()
            output = model(x, y)
            loss = criterion(output.view(-1, vocab_size), y.view(-1))
            epoch_loss += loss.item()
            _, predicted = torch.max(output, 2)
            epoch_total += y.size(0) * y.size(1)
            epoch_correct += (predicted == y).sum().item()
    epoch_loss /= len(dev_keypoint_loader)
    epoch_acc = epoch_correct / epoch_total
    print(f"Epoch {epoch + 1}, Loss: {epoch_loss}, Accuracy: {epoch_acc}")
    history["val_loss"].append(epoch_loss)
    history["val_acc"].append(epoch_acc)

In [None]:
import csv

# save the model
torch.save(model.state_dict(), MODEL_PATH)

# save the history
# RESULT_FILE_NAME = f"{RESULT_DIR}/result.csv"
# HISTORY_FILE_NAME = f"{RESULT_DIR}/history.csv"

# One CSV row per completed epoch: [epoch number (1-based), train loss,
# train accuracy, validation loss, validation accuracy].
# zip() stops at the shortest list, so if training was interrupted the cell
# still writes every fully recorded epoch instead of raising IndexError.
rows = [
    [epoch_number, train_loss, train_acc, val_loss, val_acc]
    for epoch_number, (train_loss, train_acc, val_loss, val_acc) in enumerate(
        zip(
            history["train_loss"],
            history["train_acc"],
            history["val_loss"],
            history["val_acc"],
        ),
        start=1,
    )
]

# newline="" is required by the csv module; without it every row is followed
# by a blank line on Windows.
with open(HISTORY_FILE_NAME, "w", newline="") as f:
    writer = csv.writer(f)
    writer.writerow(["Epoch", "Train Loss", "Train Accuracy", "Validation Loss", "Validation Accuracy"])
    writer.writerows(rows)

In [None]:
# Imported here so this cell does not depend on the history cell having run.
import csv

# test the model
if device == "cuda":
    model = model.cuda()

model.eval()
test_loss = 0
test_correct = 0
test_total = 0

# Decoded glosses for every target position, flattened across all batches.
result_dict = {
    "prediction": [],
    "actual": []
}

# Invert word_dict (gloss -> index) once instead of rebuilding key/value lists
# and doing a linear search for every token. setdefault keeps the FIRST gloss
# for a duplicated index, which matches the previous list.index() behaviour.
# An index missing from word_dict now raises KeyError (previously ValueError).
index_to_word = {}
for gloss, index in word_dict.items():
    index_to_word.setdefault(index, gloss)

vocab_size = len(word_dict)

# Evaluation needs no gradients; disabling autograd saves memory and time.
with torch.no_grad():
    for x, y in test_keypoint_loader:
        if device == "cuda":
            x = x.cuda()
            y = y.cuda()
        output = model(x, y)
        loss = criterion(output.view(-1, vocab_size), y.view(-1))
        test_loss += loss.item()
        # Token-level prediction: argmax over the last (vocabulary) axis.
        _, predicted = torch.max(output, 2)
        test_total += y.size(0) * y.size(1)
        test_correct += (predicted == y).sum().item()

        # tolist() yields plain Python ints for the dictionary lookup.
        decoded_predicted = [index_to_word[index] for index in predicted.view(-1).cpu().tolist()]
        decoded_actual = [index_to_word[index] for index in y.view(-1).cpu().tolist()]
        result_dict["prediction"].extend(decoded_predicted)
        result_dict["actual"].extend(decoded_actual)

# Mean of the per-batch losses; accuracy is over every target position.
test_loss /= len(test_keypoint_loader)
test_acc = test_correct / test_total
print(f"Test Loss: {test_loss}, Test Accuracy: {test_acc}")

# save the result
rows = [
    [predicted_gloss, actual_gloss]
    for predicted_gloss, actual_gloss in zip(result_dict["prediction"], result_dict["actual"])
]

# newline="" is required by the csv module; without it every row is followed
# by a blank line on Windows.
with open(RESULT_FILE_NAME, "w", newline="") as f:
    writer = csv.writer(f)
    writer.writerow(["Prediction", "Actual"])
    writer.writerows(rows)