In [1]:
!git clone https://github.com/khushboo-agarwal/Action-Recognition.git

Cloning into 'Action-Recognition'...
remote: Enumerating objects: 10700, done.[K
remote: Total 10700 (delta 0), reused 0 (delta 0), pack-reused 10700 (from 1)[K
Receiving objects: 100% (10700/10700), 628.22 MiB | 17.38 MiB/s, done.
Resolving deltas: 100% (20/20), done.
Updating files: 100% (10674/10674), done.


In [2]:
import os
import shutil

base_path = "/content/Action-Recognition/ucf_sports"
selected = ['Diving-Side', 'Golf-Swing-Side', 'Run-Side', 'Lifting']
target_path = "/content/ucf_selected_classes"

os.makedirs(target_path, exist_ok=True)

for cls in selected:
    src = os.path.join(base_path, cls)
    dst = os.path.join(target_path, cls)

    # Check if the source directory exists before attempting to copy
    if os.path.exists(src):
        shutil.copytree(src, dst)
        print(f"Successfully copied {src} to {dst}")
    else:
        print(f"Source directory not found: {src}. Skipping copy.")


Successfully copied /content/Action-Recognition/ucf_sports/Diving-Side to /content/ucf_selected_classes/Diving-Side
Successfully copied /content/Action-Recognition/ucf_sports/Golf-Swing-Side to /content/ucf_selected_classes/Golf-Swing-Side
Successfully copied /content/Action-Recognition/ucf_sports/Run-Side to /content/ucf_selected_classes/Run-Side
Successfully copied /content/Action-Recognition/ucf_sports/Lifting to /content/ucf_selected_classes/Lifting


In [3]:
import os

# Input: Folder containing folders like Diving-Side/001/*.jpg
frames_root = '/content/ucf_selected_classes'

# Output: Where the .npy pose sequences will be saved
pose_output_root = '/content/pose_sequences'
os.makedirs(pose_output_root, exist_ok=True)


In [4]:
pip install ultralytics


Collecting ultralytics
  Downloading ultralytics-8.3.152-py3-none-any.whl.metadata (37 kB)
Collecting ultralytics-thop>=2.0.0 (from ultralytics)
  Downloading ultralytics_thop-2.0.14-py3-none-any.whl.metadata (9.4 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=1.8.0->ultralytics)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=1.8.0->ultralytics)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=1.8.0->ultralytics)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=1.8.0->ultralytics)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch>=1.8.0->ultralytics)
  Downloading n

In [5]:
from ultralytics import YOLO
import os
import numpy as np
from PIL import Image

# Load the pretrained YOLOv8 pose model
model = YOLO('yolov8n-pose.pt')  # 'yolov8n-pose.pt' is the lightweight pose model

frames_root = '/content/ucf_selected_classes'
pose_output_root = '/content/pose_sequences'
os.makedirs(pose_output_root, exist_ok=True)

for action in os.listdir(frames_root):
    action_path = os.path.join(frames_root, action)
    # Check if action_path is a directory
    if not os.path.isdir(action_path):
        print(f"Skipping {action_path} as it is not a directory.")
        continue

    for vid_folder in os.listdir(action_path):
        vid_path = os.path.join(action_path, vid_folder)
        # Check if vid_path is a directory
        if not os.path.isdir(vid_path):
            print(f"Skipping {vid_path} as it is not a directory.")
            continue

        image_files = sorted([f for f in os.listdir(vid_path) if f.endswith('.jpg')])

        all_keypoints = []
        for img_file in image_files:
            img_path = os.path.join(vid_path, img_file)

            # Check if img_path exists and is a file
            if not os.path.isfile(img_path):
                print(f"Skipping {img_path} as it does not exist or is not a file.")
                # Append zeros for this missing frame to maintain sequence length
                # Ensure the shape is (17, 3)
                kpts_with_conf = np.zeros((17, 3))
                all_keypoints.append(kpts_with_conf)
                continue

            try:
                results = model(img_path)

                # Initialize kpts_with_conf to zeros in case no valid keypoints are found
                kpts_with_conf = np.zeros((17, 3))

                # Check if results exist, contain keypoints, and have at least one detected object with xy data
                if results and results[0].keypoints is not None and len(results[0].keypoints.xy) > 0 and results[0].keypoints.xy[0].shape[0] == 17:
                    kpts = results[0].keypoints.xy[0].cpu().numpy()  # shape: (17, 2)

                    # Add a check specifically for the confidence scores being None or empty,
                    # and ensure they have the correct number of points.
                    if results[0].keypoints.conf is not None and len(results[0].keypoints.conf) > 0 and results[0].keypoints.conf[0].shape[0] == 17:
                        confs = results[0].keypoints.conf[0].cpu().numpy()  # shape: (17,)
                        # Ensure confs has the correct shape before concatenation
                        kpts_with_conf = np.concatenate([kpts, confs[:, None]], axis=1)  # (17,3)
                    else:
                        # Confidence scores are None, empty, or incorrect shape, append kpts and zero confidence scores
                        # print(f"Warning: Confidence scores issues for {img_path}. Appending zero confidences.")
                        kpts_with_conf = np.concatenate([kpts, np.zeros_like(kpts[:, :1])], axis=1) # (17,3)
                else:
                    # No person detected, no keypoints found, or xy data shape incorrect (e.g., empty xy array)
                    # kpts_with_conf is already initialized to zeros (17, 3)
                    pass # The zero initialization covers this case

                all_keypoints.append(kpts_with_conf)

            except Exception as e:
                # Handle potential errors during model inference or processing
                print(f"Error processing {img_path}: {e}. Appending zeros.")
                # Ensure the appended array has the expected shape (17, 3)
                kpts_with_conf = np.zeros((17, 3))
                all_keypoints.append(kpts_with_conf)


        # Convert the list of keypoints to a numpy array
        # The list should now only contain arrays of shape (17, 3) or (0, 3) if the list was empty.
        # If the list is not empty, np.array should work.
        if all_keypoints: # Only convert if there are keypoints collected
             # Check the shape of the first element to be safe, though with the fix, it should be (17, 3) or empty list case handled below
             if all_keypoints[0].shape == (17, 3) or all_keypoints[0].shape == (0, 3):
                 all_keypoints_array = np.array(all_keypoints)  # [T x 17 x 3]
             else:
                 # This case should ideally not be reached with the fix, but as a fallback
                 print(f"Warning: Inconsistent shape found in all_keypoints for {vid_folder}. Attempting conversion anyway.")
                 # np.array might still fail here if shapes are truly inconsistent beyond (0,3) or (17,3)
                 all_keypoints_array = np.array(all_keypoints) # Attempt conversion

        else:
            # If no keypoints were processed for this video folder (e.g., no images found or all failed)
            # create an empty array with the expected structure (T=0)
            all_keypoints_array = np.empty((0, 17, 3))


        save_dir = os.path.join(pose_output_root, action)
        os.makedirs(save_dir, exist_ok=True)

        # Save only if there are keypoints to save and they have the correct dimension for the time step (T > 0)
        if all_keypoints_array.shape[0] > 0 and all_keypoints_array.shape[1:] == (17, 3):
            np.save(os.path.join(save_dir, f'{vid_folder}.npy'), all_keypoints_array)
            print(f"Saved pose keypoints for {action}/{vid_folder}, shape: {all_keypoints_array.shape}")
        else:
            # print(f"No valid keypoints processed or shape mismatch for {action}/{vid_folder}. Not saving. Final array shape: {all_keypoints_array.shape}")
            pass # Reduced log noise for empty sequences

Creating new Ultralytics Settings v0.0.6 file ✅ 
View Ultralytics Settings with 'yolo settings' or at '/root/.config/Ultralytics/settings.json'
Update Settings with 'yolo settings key=value', i.e. 'yolo settings runs_dir=path/to/dir'. For help see https://docs.ultralytics.com/quickstart/#ultralytics-settings.
Downloading https://github.com/ultralytics/assets/releases/download/v8.3.0/yolov8n-pose.pt to 'yolov8n-pose.pt'...


100%|██████████| 6.52M/6.52M [00:00<00:00, 177MB/s]



image 1/1 /content/ucf_selected_classes/Golf-Swing-Side/001/RF1-13207_7015.jpg: 448x640 1 person, 45.1ms
Speed: 12.8ms preprocess, 45.1ms inference, 320.1ms postprocess per image at shape (1, 3, 448, 640)

image 1/1 /content/ucf_selected_classes/Golf-Swing-Side/001/RF1-13207_7016.jpg: 448x640 1 person, 7.6ms
Speed: 2.4ms preprocess, 7.6ms inference, 1.7ms postprocess per image at shape (1, 3, 448, 640)

image 1/1 /content/ucf_selected_classes/Golf-Swing-Side/001/RF1-13207_7017.jpg: 448x640 1 person, 7.7ms
Speed: 2.0ms preprocess, 7.7ms inference, 1.6ms postprocess per image at shape (1, 3, 448, 640)

image 1/1 /content/ucf_selected_classes/Golf-Swing-Side/001/RF1-13207_7018.jpg: 448x640 1 person, 6.9ms
Speed: 1.9ms preprocess, 6.9ms inference, 1.6ms postprocess per image at shape (1, 3, 448, 640)

image 1/1 /content/ucf_selected_classes/Golf-Swing-Side/001/RF1-13207_7019.jpg: 448x640 1 person, 7.5ms
Speed: 2.2ms preprocess, 7.5ms inference, 1.6ms postprocess per image at shape (1, 3, 

In [6]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import math

class PositionalEncoding(nn.Module):
    def __init__(self, d_model, max_len=100):
        super().__init__()
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float32).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2) * (-math.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        self.register_buffer('pe', pe.unsqueeze(0))

    def forward(self, x):
        return x + self.pe[:, :x.size(1), :]

class CosineClassifier(nn.Module):
    def __init__(self, dim, num_classes, scale=10):
        super().__init__()
        self.weight = nn.Parameter(torch.Tensor(num_classes, dim))
        nn.init.xavier_uniform_(self.weight)
        self.scale = scale

    def forward(self, x):
        x = F.normalize(x, dim=-1)
        w = F.normalize(self.weight, dim=-1)
        return self.scale * x @ w.T  # cosine similarity * scale

class FewShotTransformer(nn.Module):
    def __init__(self, input_size=51, num_classes=4, dim_model=256, num_heads=8, num_layers=4, dropout=0.2, max_len=100):
        super().__init__()
        self.input_fc = nn.Sequential(
            nn.Linear(input_size, dim_model),
            nn.LayerNorm(dim_model),
            nn.GELU(),
            nn.Dropout(dropout)
        )

        self.cls_token = nn.Parameter(torch.zeros(1, 1, dim_model))
        self.pos_encoder = PositionalEncoding(dim_model, max_len + 1)

        encoder_layer = nn.TransformerEncoderLayer(d_model=dim_model, nhead=num_heads,
                                                   dim_feedforward=dim_model*4, dropout=dropout, batch_first=True)
        self.transformer_encoder = nn.TransformerEncoder(encoder_layer, num_layers=num_layers)

        self.norm = nn.LayerNorm(dim_model)

        # Metric-based classifier
        self.classifier = CosineClassifier(dim=dim_model, num_classes=num_classes)

    def forward(self, x):
        B, T, _ = x.size()

        # Project input
        x = self.input_fc(x)

        # Add CLS token
        cls_tokens = self.cls_token.expand(B, 1, -1)
        x = torch.cat([cls_tokens, x], dim=1)

        # Add positional encoding
        x = self.pos_encoder(x)

        # Transformer
        x = self.transformer_encoder(x)

        # Final CLS token (summary)
        cls_out = self.norm(x[:, 0])  # shape [B, dim_model]

        # Metric-based classification
        out = self.classifier(cls_out)
        return out



In [7]:
import os
import numpy as np
from torch.utils.data import Dataset

class PoseSequenceDataset(Dataset):
    def __init__(self, data_root, class_map, max_len=50):
        self.samples = []
        self.class_map = class_map
        self.max_len = max_len

        for cls_name in os.listdir(data_root):
            cls_path = os.path.join(data_root, cls_name)
            if not os.path.isdir(cls_path):
                continue
            for fname in os.listdir(cls_path):
                if fname.endswith('.npy'):
                    self.samples.append((os.path.join(cls_path, fname), class_map[cls_name]))

    def __len__(self):
        return len(self.samples)

    def __getitem__(self, idx):
        path, label = self.samples[idx]
        data = np.load(path)  # shape: [T x 17 x 3]
        data[:, :, :2] -= data[:, 0:1, :2]  # normalize to joint 0
        # Pad or crop
        if data.shape[0] < self.max_len:
            pad = np.zeros((self.max_len - data.shape[0], 17, 3))
            data = np.concatenate((data, pad), axis=0)
        else:
            data = data[:self.max_len]
        data = data.reshape(self.max_len, -1)  # shape: [T, 51]
        return torch.tensor(data, dtype=torch.float32), label


In [8]:
from torch.utils.data import DataLoader
import torch.nn as nn
import torch.optim as optim

# Map class names to labels
class_map = {
    'Diving-Side': 0,
    'Golf-Swing-Side': 1,
    'Run-Side': 2,
    'Lifting': 3
}

# Load dataset
data_root = "/content/pose_sequences"
dataset = PoseSequenceDataset(data_root, class_map, max_len=50)
train_loader = DataLoader(dataset, batch_size=4, shuffle=True)

# Create model
model = FewShotTransformer(input_size=51, num_classes=4)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Optimizer and Loss
optimizer = optim.Adam(model.parameters(), lr=1e-4)
criterion = nn.CrossEntropyLoss()


In [9]:
for epoch in range(20):
    model.train()
    total_loss = 0
    correct = 0
    for inputs, labels in train_loader:
        inputs, labels = inputs.to(device), labels.to(device)
        outputs = model(inputs)  # [B, num_classes]
        loss = criterion(outputs, labels)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
        _, predicted = outputs.max(1)
        correct += (predicted == labels).sum().item()

    acc = correct / len(dataset)
    print(f"Epoch {epoch+1} | Loss: {total_loss:.4f} | Accuracy: {acc*100:.2f}%")


Epoch 1 | Loss: 7.5052 | Accuracy: 25.00%
Epoch 2 | Loss: 4.6172 | Accuracy: 75.00%
Epoch 3 | Loss: 3.2192 | Accuracy: 85.00%
Epoch 4 | Loss: 1.7980 | Accuracy: 95.00%
Epoch 5 | Loss: 1.1647 | Accuracy: 100.00%
Epoch 6 | Loss: 0.8427 | Accuracy: 100.00%
Epoch 7 | Loss: 0.5141 | Accuracy: 100.00%
Epoch 8 | Loss: 0.3939 | Accuracy: 100.00%
Epoch 9 | Loss: 0.2252 | Accuracy: 100.00%
Epoch 10 | Loss: 0.1605 | Accuracy: 100.00%
Epoch 11 | Loss: 0.1066 | Accuracy: 100.00%
Epoch 12 | Loss: 0.0998 | Accuracy: 100.00%
Epoch 13 | Loss: 0.0862 | Accuracy: 100.00%
Epoch 14 | Loss: 0.0638 | Accuracy: 100.00%
Epoch 15 | Loss: 0.0669 | Accuracy: 100.00%
Epoch 16 | Loss: 0.0500 | Accuracy: 100.00%
Epoch 17 | Loss: 0.0425 | Accuracy: 100.00%
Epoch 18 | Loss: 0.0361 | Accuracy: 100.00%
Epoch 19 | Loss: 0.0351 | Accuracy: 100.00%
Epoch 20 | Loss: 0.0238 | Accuracy: 100.00%


In [10]:
torch.save(model.state_dict(), "/content/pose_action_model.pth")


In [19]:
# Load YOLOv8 pose model
pose_model = YOLO("yolov8n-pose.pt")

# Run inference on the video
# Use stream=True for processing videos frame by frame efficiently
results = pose_model.predict("/content/b5ccfd6d-2238-4d93-aa6c-f0fd970f8ddc.mp4.mp4", save=False, stream=True)

keypoints_seq = []

for r in results:
    # r.keypoints will be None if no person is detected in the frame
    # r.keypoints.data contains the [num_people, num_keypoints, 3] tensor (x, y, conf)
    # r.keypoints.xy contains the [num_people, num_keypoints, 2] tensor (x, y)
    # r.keypoints.xyn contains the [num_people, num_keypoints, 2] tensor (normalized x, y)

    # Check if any keypoints were detected and if there's at least one person's data
    if r.keypoints is not None and r.keypoints.data is not None and len(r.keypoints.data) > 0:
        # Extract keypoints (x, y, conf) for the first detected person
        # Ensure the extracted keypoint array has the correct shape [17, 3]
        if r.keypoints.data[0].shape == (17, 3):
            # Append the keypoints data (x, y, confidence)
            kp = r.keypoints.data[0].cpu().numpy()  # shape: [17, 3]
            keypoints_seq.append(kp)
        else:
             # This case might happen if YOLO detects something but it's not a person with 17 keypoints
             print(f"Warning: Detected keypoints but shape mismatch for a frame. Expected (17, 3), got {r.keypoints.data[0].shape}. Skipping frame.")
             # Optionally, append zeros or handle as a missing frame if needed
             # keypoints_seq.append(np.zeros((17, 3))) # Example: append zeros
    else:
        # If no person is detected in the frame
        # print("No keypoints detected in a frame. Appending zeros.")
        # Append an array of zeros for this frame to maintain sequence length, shape [17, 3]
        keypoints_seq.append(np.zeros((17, 3)))


# Convert the list of keypoints to a numpy array
# This should now have shape [T, 17, 3] if all frames had keypoints or zeros
if keypoints_seq: # Only convert if the list is not empty
    keypoints_seq = np.array(keypoints_seq)  # [T, 17, 3]
    print("Keypoint sequence shape:", keypoints_seq.shape)
else:
    # Handle the case where no keypoints were found in any frame of the video
    print("No keypoints extracted from the video.")
    keypoints_seq = np.empty((0, 17, 3)) # Create an empty array with the expected structure

# Save keypoints for debug if needed
np.save("test_seq.npy", keypoints_seq)


video 1/1 (frame 1/70) /content/b5ccfd6d-2238-4d93-aa6c-f0fd970f8ddc.mp4.mp4: 384x640 1 person, 9.1ms
video 1/1 (frame 2/70) /content/b5ccfd6d-2238-4d93-aa6c-f0fd970f8ddc.mp4.mp4: 384x640 1 person, 9.4ms
video 1/1 (frame 3/70) /content/b5ccfd6d-2238-4d93-aa6c-f0fd970f8ddc.mp4.mp4: 384x640 1 person, 7.6ms
video 1/1 (frame 4/70) /content/b5ccfd6d-2238-4d93-aa6c-f0fd970f8ddc.mp4.mp4: 384x640 1 person, 7.4ms
video 1/1 (frame 5/70) /content/b5ccfd6d-2238-4d93-aa6c-f0fd970f8ddc.mp4.mp4: 384x640 1 person, 7.4ms
video 1/1 (frame 6/70) /content/b5ccfd6d-2238-4d93-aa6c-f0fd970f8ddc.mp4.mp4: 384x640 1 person, 10.5ms
video 1/1 (frame 7/70) /content/b5ccfd6d-2238-4d93-aa6c-f0fd970f8ddc.mp4.mp4: 384x640 1 person, 8.6ms
video 1/1 (frame 8/70) /content/b5ccfd6d-2238-4d93-aa6c-f0fd970f8ddc.mp4.mp4: 384x640 1 person, 8.7ms
video 1/1 (frame 9/70) /content/b5ccfd6d-2238-4d93-aa6c-f0fd970f8ddc.mp4.mp4: 384x640 1 person, 7.5ms
video 1/1 (frame 10/70) /content/b5ccfd6d-2238-4d93-aa6c-f0fd970f8ddc.mp4.mp4: 3

In [20]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import math

class PositionalEncoding(nn.Module):
    def __init__(self, d_model, max_len=100):
        super().__init__()
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float32).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2) * (-math.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        self.register_buffer('pe', pe.unsqueeze(0))

    def forward(self, x):
        return x + self.pe[:, :x.size(1), :]

class CosineClassifier(nn.Module):
    def __init__(self, dim, num_classes, scale=10):
        super().__init__()
        self.weight = nn.Parameter(torch.Tensor(num_classes, dim))
        nn.init.xavier_uniform_(self.weight)
        self.scale = scale

    def forward(self, x):
        x = F.normalize(x, dim=-1)
        w = F.normalize(self.weight, dim=-1)
        return self.scale * x @ w.T  # cosine similarity * scale

class FewShotTransformer(nn.Module):
    def __init__(self, input_size=51, num_classes=4, dim_model=256, num_heads=8, num_layers=4, dropout=0.2, max_len=100):
        super().__init__()
        self.input_fc = nn.Sequential(
            nn.Linear(input_size, dim_model),
            nn.LayerNorm(dim_model),
            nn.GELU(),
            nn.Dropout(dropout)
        )

        self.cls_token = nn.Parameter(torch.zeros(1, 1, dim_model))
        self.pos_encoder = PositionalEncoding(dim_model, max_len + 1)

        encoder_layer = nn.TransformerEncoderLayer(d_model=dim_model, nhead=num_heads,
                                                   dim_feedforward=dim_model*4, dropout=dropout, batch_first=True)
        self.transformer_encoder = nn.TransformerEncoder(encoder_layer, num_layers=num_layers)

        self.norm = nn.LayerNorm(dim_model)

        # Metric-based classifier
        self.classifier = CosineClassifier(dim=dim_model, num_classes=num_classes)

    def forward(self, x):
        B, T, _ = x.size()

        # Project input
        x = self.input_fc(x)

        # Add CLS token
        cls_tokens = self.cls_token.expand(B, 1, -1)
        x = torch.cat([cls_tokens, x], dim=1)

        # Add positional encoding
        x = self.pos_encoder(x)

        # Transformer
        x = self.transformer_encoder(x)

        # Final CLS token (summary)
        cls_out = self.norm(x[:, 0])  # shape [B, dim_model]

        # Metric-based classification
        out = self.classifier(cls_out)
        return out



In [21]:
model = FewShotTransformer(input_size=51, num_classes=4)
model.load_state_dict(torch.load("/content/pose_action_model.pth", map_location='cpu'))
model.eval()


FewShotTransformer(
  (input_fc): Sequential(
    (0): Linear(in_features=51, out_features=256, bias=True)
    (1): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
    (2): GELU(approximate='none')
    (3): Dropout(p=0.2, inplace=False)
  )
  (pos_encoder): PositionalEncoding()
  (transformer_encoder): TransformerEncoder(
    (layers): ModuleList(
      (0-3): 4 x TransformerEncoderLayer(
        (self_attn): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=256, out_features=256, bias=True)
        )
        (linear1): Linear(in_features=256, out_features=1024, bias=True)
        (dropout): Dropout(p=0.2, inplace=False)
        (linear2): Linear(in_features=1024, out_features=256, bias=True)
        (norm1): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
        (norm2): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
        (dropout1): Dropout(p=0.2, inplace=False)
        (dropout2): Dropout(p=0.2, inplace=False)
      )
    )


In [22]:
# Preprocess pose sequence

data = keypoints_seq.copy()
data[:, :, :2] -= data[:, 0:1, :2]  # normalize w.r.t joint 0

# Pad or trim to fixed length
MAX_LEN = 50
if data.shape[0] < MAX_LEN:
    pad = np.zeros((MAX_LEN - data.shape[0], 17, 3))
    data = np.concatenate((data, pad), axis=0)
else:
    data = data[:MAX_LEN]

data = data.reshape(MAX_LEN, -1)  # [T, 51]

# Predict
with torch.no_grad():
    input_tensor = torch.tensor(data, dtype=torch.float32).unsqueeze(0)  # [1, T, 51]
    output = model(input_tensor)
    predicted_class = torch.argmax(output, dim=1).item()

# Map class index to name
class_map = {0: 'Diving-Side', 1: 'Golf-Swing', 2: 'Run-Side', 3: 'Lifting'}
print("🎯 Predicted Action:", class_map[predicted_class])


🎯 Predicted Action: Golf-Swing-Side


In [31]:
# Rerun inference for visualization
from ultralytics import YOLO
import cv2

# Reload YOLO model
pose_model = YOLO("yolov8n-pose.pt")

# Open original video for reading
cap = cv2.VideoCapture("/content/b5ccfd6d-2238-4d93-aa6c-f0fd970f8ddc.mp4.mp4")
width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
fps = cap.get(cv2.CAP_PROP_FPS)
fourcc = cv2.VideoWriter_fourcc(*'XVID')

# Output video writer
out_video = cv2.VideoWriter("/content/final_result.avi", fourcc, fps, (width, height))

# Map class index to name (redefined here for this cell's scope)
class_map = {0: 'Diving-Side', 1: 'Golf-Swing', 2: 'Run-Side', 3: 'Lifting'}

# Get the action label string from the predicted class before the loop
# The variable `predicted_class` is available from the previous cell's execution
if 'predicted_class' in globals() and predicted_class in class_map:
    action_label = class_map[predicted_class]
else:
    # Fallback label if predicted_class is not defined or invalid
    action_label = "Unknown Action"
    print("Warning: Could not determine predicted action label. Using 'Unknown Action'.")


# Read frame by frame and annotate
frame_idx = 0
while cap.isOpened():
    ret, frame = cap.read()
    if not ret:
        break

    # Predict pose
    # Pass the frame directly to the model instead of the file path
    result = pose_model(frame, verbose=False)[0]
    annotated = result.plot()  # draw keypoints

    # Overlay action label (top-right corner)
    label = f"Action: {action_label}" # Now action_label is defined
    text_size = cv2.getTextSize(label, cv2.FONT_HERSHEY_SIMPLEX, 1, 2)[0]
    text_x = width - 620 - text_size[0]
    text_y = 140
    cv2.putText(annotated, label, (text_x, text_y), cv2.FONT_HERSHEY_SIMPLEX, 3, (0, 255, 0), 4)

    out_video.write(annotated)
    frame_idx += 1

cap.release()
out_video.release()

print("✅ Final video saved at: /content/final_result.avi")


✅ Final video saved at: /content/final_result.avi
