<a href="https://colab.research.google.com/github/LeoDinga/DL_Project/blob/main/Project_DL_3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!git clone --filter=blob:none --no-checkout https://github.com/THETIS-dataset/dataset.git
%cd dataset
!git sparse-checkout init --cone
!git sparse-checkout set VIDEO_Skelet3D
!git checkout

fatal: destination path 'dataset' already exists and is not an empty directory.
/content/dataset
Your branch is up to date with 'origin/main'.


Run this only the first time


In [37]:
# !pip uninstall -y mediapipe-silicon
# !pip uninstall -y mediapipe
# !pip uninstall -y protobuf
# !pip install protobuf==3.20.3  # Specific version that often works well with MediaPipe
# !pip install mediapipe




In [4]:
import os
import cv2
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader, random_split
import torchvision.transforms as T
import torchvision.models as models
from PIL import Image
import numpy as np
from sklearn.metrics import accuracy_score
import shutil
import random
import mediapipe as mp

In [5]:
print("CUDA available:", torch.cuda.is_available())
print("Device name:", torch.cuda.get_device_name(0) if torch.cuda.is_available() else "No GPU")

CUDA available: True
Device name: Tesla T4


In [6]:
import os
print(os.listdir('VIDEO_Skelet3D'))

['backhand_slice', 'forehand_volley', 'backhand_volley', 'backhand2hands', 'forehand_slice', 'slice_service', 'smash', 'backhand', 'forehand_openstands', 'flat_service', 'kick_service', 'forehand_flat']


In [7]:
#shows the videos in each folder
data_path = 'VIDEO_Skelet3D'

# Print out the directory structure
for root, dirs, files in os.walk(data_path):
    print(f"Root: {root}")
    print(f"Dirs: {dirs}")
    print(f"Files: {files}")
    print("-" * 40)

Root: VIDEO_Skelet3D
Dirs: ['backhand_slice', 'forehand_volley', 'backhand_volley', 'backhand2hands', 'forehand_slice', 'slice_service', 'smash', 'backhand', 'forehand_openstands', 'flat_service', 'kick_service', 'forehand_flat']
Files: []
----------------------------------------
Root: VIDEO_Skelet3D/backhand_slice
Dirs: []
Files: ['p11_bslice_skelet3D_s2.avi', 'p20_bslice_skelet3D_s1.avi', 'p30_bslice_skelet3D_s3.avi', 'p47_bslice_skelet3D_s3.avi', 'p51_bslice_skelet3D_s2.avi', 'p39_bslice_skelet3D_s3.avi', 'p29_bslice_skelet3D_s2.avi', 'p54_bslice_skelet3D_s2.avi', 'p4_bslice_skelet3D_s3.avi', 'p46_bslice_skelet3D_s2.avi', 'p20_bslice_skelet3D_s2.avi', 'p21_bslice_skelet3D_s2.avi', 'p17_bslice_skelet3D_s3.avi', 'p44_bslice_skelet3D_s3.avi', 'p35_bslice_skelet3D_s3.avi', 'p51_bslice_skelet3D_s3.avi', 'p7_bslice_skelet3D_s3.avi', 'p26_bslice_skelet3D_s2.avi', 'p49_bslice_skelet3D_s3.avi', 'p24_bslice_skelet3D_s3.avi', 'p1_bslice_skelet3D_s3.avi', 'p49_bslice_skelet3D_s2.avi', 'p43_bsli

In [8]:
def convert_video_to_npy(video_path, resize_shape=(224, 224)):
    """
    Converts a video to a numpy array of resized frames.

    Parameters:
    - video_path: The path to the video file.
    - resize_shape: The desired frame size (default is 224x224).

    Returns:
    - frames_array: Numpy array containing all frames.
    """
    cap = cv2.VideoCapture(video_path)
    frames = []

    if not cap.isOpened():
        raise ValueError(f"Error opening video file: {video_path}")

    while True:
        ret, frame = cap.read()
        if not ret:
            break
        # Resize frame to the desired shape
        frame_resized = cv2.resize(frame, resize_shape)
        frames.append(frame_resized)

    cap.release()

    # Convert list of frames into a numpy array
    frames_array = np.array(frames)
    return frames_array

def create_subfolders(pastas_dir, output_dir):
    """
    Creates subfolders in the output directory and processes .avi files.

    Parameters:
    - pastas_dir: The root directory containing class folders.
    - output_dir: The directory where the .npy files will be saved.
    """
    for pastas_filename in os.listdir(pastas_dir):
        origem_path = os.path.join(pastas_dir, pastas_filename)

        # Check if it's a directory (ignore files)
        if os.path.isdir(origem_path):
            nova_pasta_path = os.path.join(output_dir, pastas_filename)
            os.makedirs(nova_pasta_path, exist_ok=True)

            # Process each video file in the subfolder
            videos_dir = os.path.join(pastas_dir, pastas_filename)
            for video_filename in os.listdir(videos_dir):
                if video_filename.endswith(".avi"):  # Process only .avi files
                    video_path = os.path.join(videos_dir, video_filename)
                    video_name = os.path.splitext(video_filename)[0]

                    try:
                        # Convert the video to numpy array
                        frames_array = convert_video_to_npy(video_path)

                        # Save the numpy array to a .npy file
                        output_filename = os.path.join(nova_pasta_path, f"{video_name}.npy")
                        np.save(output_filename, frames_array)
                    except ValueError as e:
                        print(f"Error processing {video_filename}: {e}")
                    except Exception as e:
                        print(f"Unexpected error with {video_filename}: {e}")

# Example usage
pastas_dir = 'VIDEO_Skelet3D'  # Directory with .avi videos
npy_dir = 'npy_videos'  # Directory to save .npy files
os.makedirs(npy_dir, exist_ok=True)

# Process videos and maintain class folders
create_subfolders(pastas_dir, npy_dir)

In [9]:
# Pad/truncate to 120 frames
def pad_or_truncate_keypoints(keypoints, target_length=120):
    """
    Pads or truncates a keypoints array to a fixed number of frames (target_length).
    If padding is needed, uses zeros.
    """
    current_length = keypoints.shape[0]

    if current_length == target_length:
        return keypoints
    elif current_length > target_length:
        return keypoints[:target_length]  # Truncate
    else:
        # Pad with zeros
        padding = np.zeros((target_length - current_length, keypoints.shape[1], keypoints.shape[2]))
        return np.concatenate([keypoints, padding], axis=0)

In [12]:
# In theory this will extract the keypoints AND pad/truncate
from tqdm import tqdm

root_dir = "npy_videos"

mp_pose = mp.solutions.pose
pose = mp_pose.Pose(static_image_mode=False) # This allows MediaPipe to track pose across frames, potentially leading to more stable and accurate keypoint estimations.

all_keypoints = {}

# Top-level loop with a single progress bar
action_folders = sorted([d for d in os.listdir(root_dir) if os.path.isdir(os.path.join(root_dir, d))])
for action_folder in tqdm(action_folders, desc="Processing actions"):
    action_path = os.path.join(root_dir, action_folder)
    all_keypoints[action_folder] = {}

    video_files = sorted([f for f in os.listdir(action_path) if f.endswith(".npy")])
    for video_file in video_files:  # No tqdm here
        video_path = os.path.join(action_path, video_file)
        try:
            sample = np.load(video_path)
            video_keypoints = []

            for frame in sample:
                frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
                results = pose.process(frame_rgb)

                if results.pose_landmarks:
                    keypoints = [[lm.x, lm.y, lm.z] for lm in results.pose_landmarks.landmark]
                else:
                    keypoints = np.zeros((33, 3)).tolist()

                video_keypoints.append(keypoints)

            video_keypoints = np.array(video_keypoints)
            video_keypoints = pad_or_truncate_keypoints(video_keypoints, target_length=120)
            all_keypoints[action_folder][video_file] = video_keypoints





        except Exception as e:
            pass  # Suppress per-video error printing for a clean log

# Final summary
print("\n Summary of extracted keypoints:")
for action, videos in all_keypoints.items():
    print(f"- {action}: {len(videos)} videos processed.")

# Jsut to see the calues for the first one
first_action = list(all_keypoints.keys())[0]
first_video = list(all_keypoints[first_action].keys())[0]
print(f"Keypoints for the first frame of {first_video} in {first_action}:")
print(all_keypoints[first_action][first_video][0])

Processing actions: 100%|██████████| 12/12 [25:53<00:00, 129.48s/it]


 Summary of extracted keypoints:
- backhand: 97 videos processed.
- backhand2hands: 107 videos processed.
- backhand_slice: 100 videos processed.
- backhand_volley: 103 videos processed.
- flat_service: 96 videos processed.
- forehand_flat: 110 videos processed.
- forehand_openstands: 101 videos processed.
- forehand_slice: 97 videos processed.
- forehand_volley: 93 videos processed.
- kick_service: 109 videos processed.
- slice_service: 100 videos processed.
- smash: 104 videos processed.
Keypoints for the first frame of p10_backhand_skelet3D_s2.npy in backhand:
[[0. 0. 0.]
 [0. 0. 0.]
 [0. 0. 0.]
 [0. 0. 0.]
 [0. 0. 0.]
 [0. 0. 0.]
 [0. 0. 0.]
 [0. 0. 0.]
 [0. 0. 0.]
 [0. 0. 0.]
 [0. 0. 0.]
 [0. 0. 0.]
 [0. 0. 0.]
 [0. 0. 0.]
 [0. 0. 0.]
 [0. 0. 0.]
 [0. 0. 0.]
 [0. 0. 0.]
 [0. 0. 0.]
 [0. 0. 0.]
 [0. 0. 0.]
 [0. 0. 0.]
 [0. 0. 0.]
 [0. 0. 0.]
 [0. 0. 0.]
 [0. 0. 0.]
 [0. 0. 0.]
 [0. 0. 0.]
 [0. 0. 0.]
 [0. 0. 0.]
 [0. 0. 0.]
 [0. 0. 0.]
 [0. 0. 0.]]





In [13]:
from collections import defaultdict
from sklearn.model_selection import train_test_split

def extract_player_id(filename):
    # Example: 'player1_video1.npy' → 'player1'
    return filename.split('_')[0]

# Organize videos by player
player_video_map = defaultdict(list)
for action, videos in all_keypoints.items():
    for video_file in videos:
        player_id = extract_player_id(video_file)
        player_video_map[player_id].append((action, video_file))

# Split player IDs
players = list(player_video_map.keys())
random.shuffle(players)

train_end = int(len(players) * 0.7)
val_end = int(len(players) * 0.85)

train_players = players[:train_end]
val_players = players[train_end:val_end]
test_players = players[val_end:]

def collect_by_players(player_ids):
    dataset = []
    for pid in player_ids:
        for action, filename in player_video_map[pid]:
            keypoints = all_keypoints[action][filename]
            dataset.append((keypoints, action))
    return dataset

train_set = collect_by_players(train_players)
val_set = collect_by_players(val_players)
test_set = collect_by_players(test_players)

print(f"Train: {len(train_set)}, Val: {len(val_set)}, Test: {len(test_set)}")


Train: 849, Val: 182, Test: 186


In [14]:
class STGCNDataset(Dataset):
    def __init__(self, data, label_map):
        self.data = data
        self.label_map = label_map

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        keypoints, label = self.data[idx]
        # Shape: [T, V, C] → [C, T, V, 1]
        keypoints = keypoints.transpose((2, 0, 1))  # [C, T, V]
        keypoints = np.expand_dims(keypoints, axis=-1)  # [C, T, V, 1]
        label_idx = self.label_map[label]
        return torch.tensor(keypoints, dtype=torch.float32), label_idx

# Create label map
all_labels = sorted({label for _, label in train_set + val_set + test_set})
label_map = {label: i for i, label in enumerate(all_labels)}

# Create datasets
train_dataset = STGCNDataset(train_set, label_map)
val_dataset = STGCNDataset(val_set, label_map)
test_dataset = STGCNDataset(test_set, label_map)

train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=16)
test_loader = DataLoader(test_dataset, batch_size=16)


In [None]:
!git clone https://github.com/lshiwjx/2s-AGCN-PyTorch.git
!cd 2s-AGCN-PyTorch

In [None]:
!pip install -r requirements.txt

In [None]:
num_class = len(label_map)

In [None]:
# number of classes
num_class = 12

In [None]:
model.fc = nn.Linear(in_features=256, out_features=len(label_map))

In [None]:
import torch.nn.functional as F

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)

for epoch in range(10):
    model.train()
    total_loss = 0
    for x, y in train_loader:
        x, y = x.to(device), y.to(device)
        outputs = model(x)
        loss = F.cross_entropy(outputs, y)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

    print(f"Epoch {epoch+1}, Loss: {total_loss:.4f}")

    # Validation
    model.eval()
    all_preds, all_labels = [], []
    with torch.no_grad():
        for x, y in val_loader:
            x, y = x.to(device), y.to(device)
            preds = model(x).argmax(1)
            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(y.cpu().numpy())

    acc = accuracy_score(all_labels, all_preds)
    print(f"Validation Accuracy: {acc:.4f}")

In [None]:
model.eval()
all_preds, all_labels = [], []

with torch.no_grad():
    for x, y in test_loader:
        x, y = x.to(device), y.to(device)
        preds = model(x).argmax(1)
        all_preds.extend(preds.cpu().numpy())
        all_labels.extend(y.cpu().numpy())

test_acc = accuracy_score(all_labels, all_preds)
print(f"Test Accuracy: {test_acc:.4f}")

# STGCN model

In [None]:
!git clone https://github.com/yysijie/st-gcn.git
!cd st-gcn
!pip install -r requirements.txt

In [None]:
T = 120
C = 3
V = 33
M = 1



In [None]:
# To see if its' working tested it on only one video
# Initialize MediaPipe Pose
mp_pose = mp.solutions.pose
pose = mp_pose.Pose(static_image_mode=True)

# Path to your video (.npy file)
video_path = "npy_videos/backhand/p10_backhand_skelet3D_s2.npy"

try:
    sample = np.load(video_path)
    video_keypoints = []

    for frame in sample:
        frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
        results = pose.process(frame_rgb)

        if results.pose_landmarks:
            keypoints = [[lm.x, lm.y, lm.z] for lm in results.pose_landmarks.landmark]
        else:
            keypoints = np.zeros((33, 3)).tolist()

        video_keypoints.append(keypoints)

    video_keypoints = np.array(video_keypoints)
    video_keypoints = pad_or_truncate_keypoints(video_keypoints, target_length=120)

    print(f"Extracted keypoints shape: {video_keypoints.shape}")  # Should be (120, 33, 3)

except Exception as e:
    print(f"Error processing the video: {e}")
