# Poseformer model

In [None]:
import pandas as pd
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader
import torch.optim as optim
import torch.nn as nn

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
data = pd.read_pickle("/content/drive/MyDrive/Colab Notebooks/AAI-521/Final Project/Models/combined_dataset.pkl")

train_data = data[data['split'] == "train"]
val_data = data[data['split'] == 'val']
test_data = data[data['split'] == 'test']


In [4]:
# Extract padded_keypoints and label_index for each split
def extract_data(split_data):
    keypoints = np.stack(split_data['padded_keypoints'].values)  # Shape: [num_samples, 90, 33, 3]
    labels = split_data['label_index'].values                   # Shape: [num_samples]
    return keypoints, labels

train_keypoints, train_labels = extract_data(train_data)
val_keypoints, val_labels = extract_data(val_data)
test_keypoints, test_labels = extract_data(test_data)

# Convert to PyTorch tensors
train_keypoints = torch.tensor(train_keypoints, dtype=torch.float32)
train_labels = torch.tensor(train_labels, dtype=torch.long)

val_keypoints = torch.tensor(val_keypoints, dtype=torch.float32)
val_labels = torch.tensor(val_labels, dtype=torch.long)

test_keypoints = torch.tensor(test_keypoints, dtype=torch.float32)
test_labels = torch.tensor(test_labels, dtype=torch.long)

print(f"Train data: {train_keypoints.shape}, {train_labels.shape}")
print(f"Val data: {val_keypoints.shape}, {val_labels.shape}")
print(f"Test data: {test_keypoints.shape}, {test_labels.shape}")

Train data: torch.Size([8313, 90, 33, 3]), torch.Size([8313])
Val data: torch.Size([2253, 90, 33, 3]), torch.Size([2253])
Test data: torch.Size([1414, 90, 33, 3]), torch.Size([1414])


In [5]:
class PoseDataset(Dataset):
    def __init__(self, keypoints, labels):
        self.keypoints = keypoints
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        keypoint = self.keypoints[idx]  # Shape: [90, 33, 3]
        label = self.labels[idx]       # Scalar
        return keypoint, label


train_dataset = PoseDataset(train_keypoints, train_labels)
val_dataset = PoseDataset(val_keypoints, val_labels)
test_dataset = PoseDataset(test_keypoints, test_labels)

batch_size = 32
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

In [6]:
for batch_idx, (keypoints, labels) in enumerate(train_loader):
    print(f"Batch {batch_idx + 1}:")
    print(f"  Keypoints shape: {keypoints.shape}")  # [batch_size, 90, 33, 3]
    print(f"  Labels shape: {labels.shape}")        # [batch_size]
    break  # Stop after the first batch for demonstration

Batch 1:
  Keypoints shape: torch.Size([32, 90, 33, 3])
  Labels shape: torch.Size([32])


## Custom Classes for the model

In [None]:
class PoseFormer(nn.Module):
    def __init__(self, num_keypoints=33, num_features=3, num_classes=2000, num_frames=90, embed_dim=128, num_heads=8, num_layers=4, dropout=0.1):
        """
        PoseFormer implementation for video-based pose classification.

        Args:
            num_keypoints (int): Number of keypoints per frame.
            num_features (int): Features per keypoint (e.g., x, y, z).
            num_classes (int): Number of output classes.
            num_frames (int): Fixed length of video sequences.
            embed_dim (int): Embedding dimension for the transformer.
            num_heads (int): Number of attention heads.
            num_layers (int): Number of transformer encoder layers.
            dropout (float): Dropout rate.
        """
        super(PoseFormer, self).__init__()
        self.num_keypoints = num_keypoints
        self.num_features = num_features
        self.num_frames = num_frames
        self.embed_dim = embed_dim

        # Linear layer to embed keypoints into a higher-dimensional space
        self.embedding = nn.Linear(num_keypoints * num_features, embed_dim)

        # Positional encoding
        self.positional_encoding = nn.Parameter(torch.randn(num_frames, embed_dim))

        # Transformer encoder
        encoder_layer = nn.TransformerEncoderLayer(d_model=embed_dim, nhead=num_heads, dropout=dropout)
        self.transformer_encoder = nn.TransformerEncoder(encoder_layer, num_layers=num_layers)

        # Classification head
        self.classifier = nn.Sequential(
            nn.Linear(embed_dim, 512),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(512, num_classes)
        )

    def forward(self, x):
        """
        Forward pass for PoseFormer.

        Args:
            x (torch.Tensor): Input tensor of shape (batch_size, num_frames, num_keypoints, num_features).

        Returns:
            torch.Tensor: Logits of shape (batch_size, num_classes).
        """
        batch_size = x.size(0)

        # Flatten keypoints and features
        x = x.view(batch_size, self.num_frames, -1)  # Shape: (batch_size, num_frames, num_keypoints * num_features)

        # Apply embedding
        x = self.embedding(x)  # Shape: (batch_size, num_frames, embed_dim)

        # Add positional encoding
        x = x + self.positional_encoding.unsqueeze(0)  # Shape: (batch_size, num_frames, embed_dim)

        # Permute for transformer (seq_len, batch_size, embed_dim)
        x = x.permute(1, 0, 2)  # Shape: (num_frames, batch_size, embed_dim)

        # Transformer encoder
        x = self.transformer_encoder(x)  # Shape: (num_frames, batch_size, embed_dim)

        # Take the mean over the temporal dimension
        x = x.mean(dim=0)  # Shape: (batch_size, embed_dim)

        # Classification
        logits = self.classifier(x)  # Shape: (batch_size, num_classes)

        return logits


## Model

In [None]:
# Define model
model = PoseFormer(num_keypoints=33, num_features=3, num_classes=2000, num_frames=90)

# Move to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Loss function
criterion = nn.CrossEntropyLoss()

# Optimizer
optimizer = optim.AdamW(model.parameters(), lr=1e-4, weight_decay=1e-5)

# Learning rate scheduler
scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=10, gamma=0.1)



In [14]:
def train_model(model, train_loader, val_loader, criterion, optimizer, scheduler, num_epochs=20):
    model.train()
    for epoch in range(num_epochs):
        total_loss = 0
        correct = 0
        total = 0

        # Training phase
        for keypoints, labels in train_loader:
            keypoints, labels = keypoints.to(device), labels.to(device)

            # Forward pass
            outputs = model(keypoints)
            loss = criterion(outputs, labels)

            # Backward pass
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            # Metrics
            total_loss += loss.item()
            _, predicted = torch.max(outputs, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()

        # Adjust learning rate
        scheduler.step()

        # Validation phase
        val_loss, val_accuracy = evaluate_model(model, val_loader, criterion)

        print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {total_loss/len(train_loader):.4f}, "
              f"Train Acc: {100*correct/total:.2f}%, Val Loss: {val_loss:.4f}, Val Acc: {val_accuracy:.2f}%")

def evaluate_model(model, loader, criterion):
    model.eval()
    total_loss = 0
    correct = 0
    total = 0

    with torch.no_grad():
        for keypoints, labels in loader:
            keypoints, labels = keypoints.to(device), labels.to(device)

            outputs = model(keypoints)
            loss = criterion(outputs, labels)

            total_loss += loss.item()
            _, predicted = torch.max(outputs, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()

    return total_loss / len(loader), 100 * correct / total


In [15]:
num_epochs = 20
train_model(model, train_loader, val_loader, criterion, optimizer, scheduler, num_epochs)


Epoch [1/20], Loss: 7.6263, Train Acc: 0.07%, Val Loss: 7.6015, Val Acc: 0.04%
Epoch [2/20], Loss: 7.5941, Train Acc: 0.08%, Val Loss: 7.6086, Val Acc: 0.09%
Epoch [3/20], Loss: 7.5801, Train Acc: 0.12%, Val Loss: 7.6182, Val Acc: 0.04%
Epoch [4/20], Loss: 7.5705, Train Acc: 0.11%, Val Loss: 7.6244, Val Acc: 0.09%
Epoch [5/20], Loss: 7.5541, Train Acc: 0.10%, Val Loss: 7.6182, Val Acc: 0.04%
Epoch [6/20], Loss: 7.4558, Train Acc: 0.16%, Val Loss: 7.4598, Val Acc: 0.00%
Epoch [7/20], Loss: 7.2115, Train Acc: 0.18%, Val Loss: 7.3512, Val Acc: 0.18%
Epoch [8/20], Loss: 7.0403, Train Acc: 0.19%, Val Loss: 7.3484, Val Acc: 0.27%
Epoch [9/20], Loss: 6.9099, Train Acc: 0.28%, Val Loss: 7.3757, Val Acc: 0.44%
Epoch [10/20], Loss: 6.8092, Train Acc: 0.26%, Val Loss: 7.4056, Val Acc: 0.27%
Epoch [11/20], Loss: 6.6585, Train Acc: 0.59%, Val Loss: 7.4784, Val Acc: 0.27%
Epoch [12/20], Loss: 6.6398, Train Acc: 0.67%, Val Loss: 7.4999, Val Acc: 0.27%
Epoch [13/20], Loss: 6.6265, Train Acc: 0.72%, Va

In [16]:
test_loss, test_accuracy = evaluate_model(model, test_loader, criterion)
print(f"Test Loss: {test_loss:.4f}, Test Accuracy: {test_accuracy:.2f}%")


Test Loss: 7.6694, Test Accuracy: 0.14%
