## Preprocessing

In [1]:
import pandas as pd
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
import torch.optim as optim



In [2]:
# Mount Google Drive at /content/drive so that the dataset pickle and the model
# checkpoints referenced below (all under /content/drive/MyDrive) are reachable.
# NOTE(review): google.colab is presumably only importable inside a Colab
# runtime, so this cell needs replacing when running elsewhere.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
# Load the combined dataset from Drive. Unpickling can execute arbitrary code,
# so this file must come from a trusted source.
data = pd.read_pickle("/content/drive/MyDrive/Colab Notebooks/AAI-521/Final Project/Models/combined_dataset.pkl")

# Partition the rows according to the value stored in the 'split' column.
train_data = data.loc[data['split'].eq("train")]
val_data = data.loc[data['split'].eq('val')]
test_data = data.loc[data['split'].eq('test')]

In [4]:
# Pull the model inputs (padded_keypoints) and targets (label_index) out of one split
def extract_data(split_data):
    """Return ``(keypoints, labels)`` for one split of the dataset.

    Args:
        split_data: DataFrame with a 'padded_keypoints' column holding one
            equally-shaped array per row and a 'label_index' column.

    Returns:
        keypoints: the per-row arrays stacked along a new leading axis
            (for this dataset: [num_samples, 90, 33, 3]).
        labels: the raw values of the 'label_index' column ([num_samples]).
    """
    per_sample_arrays = list(split_data['padded_keypoints'])
    stacked_keypoints = np.stack(per_sample_arrays)
    label_values = split_data['label_index'].values
    return stacked_keypoints, label_values

def _as_tensors(keypoints_np, labels_np):
    """Wrap one split's NumPy arrays as (float32 keypoints, int64 labels) tensors."""
    return (
        torch.tensor(keypoints_np, dtype=torch.float32),
        torch.tensor(labels_np, dtype=torch.long),
    )


# Extract each split from its DataFrame and convert it to PyTorch tensors in one step
train_keypoints, train_labels = _as_tensors(*extract_data(train_data))
val_keypoints, val_labels = _as_tensors(*extract_data(val_data))
test_keypoints, test_labels = _as_tensors(*extract_data(test_data))

# Report the resulting tensor shapes for every split
print(f"Train data: {train_keypoints.shape}, {train_labels.shape}")
print(f"Val data: {val_keypoints.shape}, {val_labels.shape}")
print(f"Test data: {test_keypoints.shape}, {test_labels.shape}")

Train data: torch.Size([8313, 90, 33, 3]), torch.Size([8313])
Val data: torch.Size([2253, 90, 33, 3]), torch.Size([2253])
Test data: torch.Size([1414, 90, 33, 3]), torch.Size([1414])


In [5]:
class PoseDataset(Dataset):
    """Dataset of pose-keypoint clips and their gesture labels.

    Each item is a ``(keypoint, label)`` pair where ``keypoint`` is one clip with
    all per-frame keypoint coordinates flattened into a single feature vector,
    i.e. ``[num_frames, num_keypoints * coords]`` (``[90, 99]`` for the data used
    in this notebook).

    Args:
        keypoints: tensor of shape [num_samples, num_frames, num_keypoints, coords].
        labels: tensor of shape [num_samples].

    Raises:
        ValueError: if ``keypoints`` and ``labels`` contain a different number
            of samples.
    """

    def __init__(self, keypoints, labels):
        if len(keypoints) != len(labels):
            raise ValueError(
                f"keypoints and labels must have the same number of samples, "
                f"got {len(keypoints)} and {len(labels)}"
            )
        self.keypoints = keypoints  # Shape: [num_samples, num_frames, num_keypoints, coords]
        self.labels = labels        # Shape: [num_samples]

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        keypoint = self.keypoints[idx]  # Shape: [num_frames, num_keypoints, coords]

        # Flatten every frame to one feature vector. The frame count is taken from
        # the clip itself instead of a hard-coded 90, and reshape (unlike view)
        # also accepts non-contiguous tensors.
        keypoint = keypoint.reshape(keypoint.shape[0], -1)

        label = self.labels[idx]        # Scalar label (gesture index)

        return keypoint, label




In [6]:
# Mini-batch size shared by all three loaders
batch_size = 32

# Wrap each split's tensors in a PoseDataset
train_dataset = PoseDataset(keypoints=train_keypoints, labels=train_labels)
val_dataset = PoseDataset(keypoints=val_keypoints, labels=val_labels)
test_dataset = PoseDataset(keypoints=test_keypoints, labels=test_labels)

# Only the training loader shuffles; validation and test keep dataset order
train_loader = DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(dataset=val_dataset, batch_size=batch_size, shuffle=False)
test_loader = DataLoader(dataset=test_dataset, batch_size=batch_size, shuffle=False)

In [7]:
# Sanity check: fetch only the first training batch and report its shapes
first_batch = next(iter(train_loader), None)
if first_batch is not None:
    inputs, labels = first_batch
    print(f"Input shape: {inputs.shape}")
    print(f"Labels shape: {labels.shape}")


Input shape: torch.Size([32, 90, 99])
Labels shape: torch.Size([32])


In [8]:
class EarlyStopping:
    """Signal that training should stop once the validation loss stalls.

    Call the instance with the latest validation loss after every epoch. A loss
    counts as an improvement only when it undercuts the best loss seen so far by
    more than ``min_delta``. After ``patience`` consecutive calls without an
    improvement, ``stop_training`` is set to True.

    Args:
        patience: number of consecutive non-improving calls tolerated.
        min_delta: minimum decrease of the loss that counts as an improvement.
    """

    def __init__(self, patience=5, min_delta=0):
        self.patience, self.min_delta = patience, min_delta
        self.best_loss = float('inf')   # lowest validation loss accepted so far
        self.counter = 0                # consecutive calls without improvement
        self.stop_training = False

    def __call__(self, val_loss):
        improvement = self.best_loss - val_loss
        if improvement > self.min_delta:
            # Real progress: remember the new best loss and restart the countdown.
            self.best_loss = val_loss
            self.counter = 0
            return

        self.counter += 1
        if self.counter >= self.patience:
            self.stop_training = True

# TimeSformer

In [None]:
import torch
import torch.optim as optim
from sklearn.model_selection import ParameterGrid

In [9]:
class TimeSformer(nn.Module):
    """Attention-based classifier over sequences of flattened pose keypoints.

    Pipeline: learned positional embedding -> two multi-head self-attention
    passes -> position-wise feed-forward network -> mean over frames -> linear
    classifier.

    Args:
        num_classes: number of output classes (size of the logit vector).
        input_dim: features per frame; also the attention embedding size.
        num_frames: maximum number of frames per clip (length of the learned
            positional embedding).
        num_heads: attention heads. ``nn.MultiheadAttention`` requires
            ``input_dim`` to be divisible by ``num_heads``; note that the
            default 8 does NOT divide the default ``input_dim`` of 99, so a
            compatible value (e.g. 3, 9 or 11) must be passed explicitly.
        num_layers: accepted for interface compatibility but currently unused -
            the network always consists of the single block described above.
        hidden_dim: width of the feed-forward network's hidden layer.

    Notes:
        Both attention modules receive a [batch, frames, features] tensor with
        ``batch_first=True``, so both attend across frames; ``spatial_attention``
        is a second pass over the frame axis rather than attention over
        keypoints within a frame.
    """

    def __init__(self, num_classes, input_dim=99, num_frames=90, num_heads=8, num_layers=6, hidden_dim=256):
        super().__init__()

        # Learned positional embedding, one vector per frame position
        self.pos_embedding = nn.Parameter(torch.randn(1, num_frames, input_dim))

        # First self-attention pass (across frames)
        self.temporal_attention = nn.MultiheadAttention(embed_dim=input_dim, num_heads=num_heads, batch_first=True)

        # Second self-attention pass (see class Notes: it also runs across frames)
        self.spatial_attention = nn.MultiheadAttention(embed_dim=input_dim, num_heads=num_heads, batch_first=True)

        # Position-wise feed-forward network applied after attention
        self.feedforward = nn.Sequential(
            nn.Linear(input_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, input_dim)
        )

        # Final classification layer
        self.fc = nn.Linear(input_dim, num_classes)

    def forward(self, x):
        """Compute class logits.

        Args:
            x: tensor of shape [batch_size, frames, input_dim] with
                ``frames <= num_frames``.

        Returns:
            Tensor of shape [batch_size, num_classes] with unnormalised logits.
        """
        # Add the positional embedding, trimmed to the clip length so that clips
        # shorter than num_frames are accepted as well.
        x = x + self.pos_embedding[:, :x.size(1)]

        # First attention pass
        temporal_out, _ = self.temporal_attention(x, x, x)  # [batch_size, frames, input_dim]

        # Second attention pass
        spatial_out, _ = self.spatial_attention(temporal_out, temporal_out, temporal_out)  # [batch_size, frames, input_dim]

        # Feedforward pass
        out = self.feedforward(spatial_out)

        # Global average pooling across frames gives a fixed-size representation
        out = out.mean(dim=1)

        # Final classification
        return self.fc(out)

In [15]:
# Training function
def train(model, train_loader, criterion, optimizer, device):
    """Run one training epoch and return ``(avg_loss, accuracy)``.

    Args:
        model: network to optimise; it is switched to training mode.
        train_loader: iterable of ``(inputs, labels)`` batches.
        criterion: loss function. It is assumed to return the MEAN loss over the
            batch (the default of ``nn.CrossEntropyLoss``), because each batch
            loss is re-weighted by the batch size below.
        optimizer: optimizer updating ``model``'s parameters.
        device: device the batches are moved to.

    Returns:
        avg_loss: mean loss per sample over the epoch.
        accuracy: fraction of samples whose arg-max prediction equals the label.

    Raises:
        ValueError: if ``train_loader`` yields no samples.
    """
    model.train()  # Set the model to training mode
    loss_sum = 0.0
    correct_preds = 0
    total_preds = 0

    for inputs, labels in train_loader:
        inputs, labels = inputs.to(device), labels.to(device)  # Move data to device

        optimizer.zero_grad()  # Zero gradients

        # Forward pass and loss
        outputs = model(inputs)
        loss = criterion(outputs, labels)

        # Backward pass and optimization
        loss.backward()
        optimizer.step()

        # Weight the batch-mean loss by the batch size so that a smaller final
        # batch does not count as much as a full one in the epoch average.
        batch_size = labels.size(0)
        loss_sum += loss.item() * batch_size
        correct_preds += (outputs.argmax(dim=1) == labels).sum().item()
        total_preds += batch_size

    if total_preds == 0:
        raise ValueError("train_loader yielded no samples")

    avg_loss = loss_sum / total_preds
    accuracy = correct_preds / total_preds
    return avg_loss, accuracy


# Evaluation function
def evaluate(model, val_loader, criterion, device):
    """Evaluate ``model`` on a data loader and return ``(avg_loss, accuracy)``.

    Args:
        model: network to evaluate; it is switched to evaluation mode.
        val_loader: iterable of ``(inputs, labels)`` batches.
        criterion: loss function. It is assumed to return the MEAN loss over the
            batch (the default of ``nn.CrossEntropyLoss``), because each batch
            loss is re-weighted by the batch size below.
        device: device the batches are moved to.

    Returns:
        avg_loss: mean loss per sample over the whole loader.
        accuracy: fraction of samples whose arg-max prediction equals the label.

    Raises:
        ValueError: if ``val_loader`` yields no samples.
    """
    model.eval()  # Set the model to evaluation mode
    loss_sum = 0.0
    correct_preds = 0
    total_preds = 0

    with torch.no_grad():  # No gradient calculation during evaluation
        for inputs, labels in val_loader:
            inputs, labels = inputs.to(device), labels.to(device)  # Move data to device

            outputs = model(inputs)
            loss = criterion(outputs, labels)

            # Weight the batch-mean loss by the batch size so that a smaller
            # final batch does not skew the reported average.
            batch_size = labels.size(0)
            loss_sum += loss.item() * batch_size
            correct_preds += (outputs.argmax(dim=1) == labels).sum().item()
            total_preds += batch_size

    if total_preds == 0:
        raise ValueError("val_loader yielded no samples")

    avg_loss = loss_sum / total_preds
    accuracy = correct_preds / total_preds
    return avg_loss, accuracy


In [None]:
# Fixed problem dimensions
num_classes = 2000   # One class per word of the 2000-word WLASL vocabulary
input_dim = 99       # Features per frame: 33 keypoints x 3 coordinates, flattened
num_frames = 90      # Frames per (padded) clip

# Search space explored by the grid search; every combination is trained once.
# Every num_heads candidate divides input_dim (99), as multi-head attention requires.
# NOTE(review): TimeSformer as defined in this notebook never reads num_layers,
# so the three num_layers values repeat the same architecture.
param_grid = dict(
    learning_rate=[0.001, 0.0005, 0.0001],
    num_heads=[3, 9, 11],
    num_layers=[6, 8, 12],
    hidden_dim=[256, 512, 1024],
    weight_decay=[0.00001, 0.0001, 0.001],
)



In [17]:
def train_and_evaluate_model(learning_rate, num_heads, num_layers, hidden_dim, weight_decay,
                             max_epochs=50, checkpoint_path=None):
    """Train one TimeSformer configuration and report its best validation accuracy.

    Uses the notebook-level ``num_classes``, ``input_dim``, ``num_frames``,
    ``train_loader`` and ``val_loader``. Training runs for at most ``max_epochs``
    epochs with Adam, a ReduceLROnPlateau scheduler and early stopping, both
    driven by the validation loss.

    Args:
        learning_rate: initial Adam learning rate.
        num_heads: attention heads passed to ``TimeSformer``.
        num_layers: passed through to ``TimeSformer``.
        hidden_dim: feed-forward width passed to ``TimeSformer``.
        weight_decay: Adam weight decay.
        max_epochs: upper bound on the number of training epochs.
        checkpoint_path: file that receives this run's best weights. Defaults to
            a run-specific file that is overwritten by every call.

    Returns:
        best_val_acc: highest validation accuracy observed during the run.
        checkpoint_path: path of the checkpoint holding the weights that reached
            ``best_val_acc`` (only written if some epoch scored above 0.0).
        model: the trained model with those best-epoch weights loaded, or the
            last-epoch weights if no epoch scored above 0.0.
    """
    if checkpoint_path is None:
        # Deliberately NOT ".../best_model.pth": that file holds the best model of
        # the whole grid search, and writing every run's checkpoint there would
        # overwrite it with weights from later, worse configurations.
        checkpoint_path = "/content/drive/MyDrive/Colab Notebooks/AAI-521/Final Project/Models/best_model_current_run.pth"

    # Set device and create the model with the current hyperparameters
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = TimeSformer(num_classes=num_classes, input_dim=input_dim, num_frames=num_frames,
                        num_heads=num_heads, num_layers=num_layers, hidden_dim=hidden_dim)
    model = model.to(device)

    # Optimizer and loss function
    optimizer = optim.Adam(model.parameters(), lr=learning_rate, weight_decay=weight_decay)
    criterion = nn.CrossEntropyLoss()

    # Reduce the learning rate 10x after 3 epochs without validation-loss improvement
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, 'min', patience=3, factor=0.1)

    # Stop after 5 epochs without a validation-loss improvement of more than 0.01
    early_stopping = EarlyStopping(patience=5, min_delta=0.01)

    best_val_acc = 0.0
    best_state = None  # snapshot of the weights that achieved best_val_acc

    for _ in range(max_epochs):
        train(model, train_loader, criterion, optimizer, device)
        val_loss, val_acc = evaluate(model, val_loader, criterion, device)

        # Keep the best weights by validation accuracy. state_dict() returns
        # references to the live parameters, so they must be cloned to survive
        # further optimizer steps.
        if val_acc > best_val_acc:
            best_val_acc = val_acc
            best_state = {name: tensor.detach().clone()
                          for name, tensor in model.state_dict().items()}
            torch.save(best_state, checkpoint_path)

        early_stopping(val_loss)
        scheduler.step(val_loss)  # Update learning rate

        if early_stopping.stop_training:
            break

    # Hand back the model in its best-epoch state so that it matches best_val_acc
    # (the live weights are those of the last epoch, which may be worse).
    if best_state is not None:
        model.load_state_dict(best_state)

    return best_val_acc, checkpoint_path, model


In [18]:
# Run Grid Search
best_val_acc = 0
best_hyperparams = {}
best_model_overall_path = "/content/drive/MyDrive/Colab Notebooks/AAI-521/Final Project/Models/best_model.pth"
best_model = None        # Model instance of the globally best run
best_state_dict = None   # In-memory copy of the globally best weights

# Generate all combinations of hyperparameters
grid = ParameterGrid(param_grid)

# Loop through all hyperparameter combinations
for params in grid:
    print(f"Training with parameters: {params}")
    val_acc, model_path, model = train_and_evaluate_model(
        learning_rate=params['learning_rate'],
        num_heads=params['num_heads'],
        num_layers=params['num_layers'],
        hidden_dim=params['hidden_dim'],
        weight_decay=params['weight_decay']
    )

    print(f"Validation Accuracy: {val_acc:.4f}")

    # Update globally best hyperparameters and weights if this combination is better
    if val_acc > best_val_acc:
        best_val_acc = val_acc
        best_hyperparams = params

        # The checkpoint written by the run holds the weights of its best epoch,
        # i.e. the ones that actually achieved val_acc. Read them back into memory
        # right away so that later runs cannot overwrite them.
        best_state_dict = torch.load(model_path, map_location="cpu")
        best_model = model
        best_model.load_state_dict(best_state_dict)

        # Persist immediately so an interrupted search still leaves a usable file
        torch.save(best_state_dict, best_model_overall_path)
        print(f"Globally best model updated with accuracy: {best_val_acc:.4f}")

# Write the winner once more after the search: a later run may have checkpointed
# to the same file in the meantime.
if best_state_dict is not None:
    torch.save(best_state_dict, best_model_overall_path)

print(f"Best Hyperparameters: {best_hyperparams}")
print(f"Best Validation Accuracy: {best_val_acc:.4f}")
print(f"Best Model Path: {best_model_overall_path}")




Training with parameters: {'hidden_dim': 256, 'learning_rate': 0.001, 'num_heads': 3, 'num_layers': 6, 'weight_decay': 1e-05}
Validation Accuracy: 0.0053
Globally best model updated with accuracy: 0.0053
Training with parameters: {'hidden_dim': 256, 'learning_rate': 0.001, 'num_heads': 3, 'num_layers': 6, 'weight_decay': 0.0001}
Validation Accuracy: 0.0036
Training with parameters: {'hidden_dim': 256, 'learning_rate': 0.001, 'num_heads': 3, 'num_layers': 6, 'weight_decay': 0.001}
Validation Accuracy: 0.0013
Training with parameters: {'hidden_dim': 256, 'learning_rate': 0.001, 'num_heads': 3, 'num_layers': 8, 'weight_decay': 1e-05}
Validation Accuracy: 0.0040
Training with parameters: {'hidden_dim': 256, 'learning_rate': 0.001, 'num_heads': 3, 'num_layers': 8, 'weight_decay': 0.0001}
Validation Accuracy: 0.0044
Training with parameters: {'hidden_dim': 256, 'learning_rate': 0.001, 'num_heads': 3, 'num_layers': 8, 'weight_decay': 0.001}
Validation Accuracy: 0.0004
Training with parameters