In [49]:
import os
import shutil
import random
from tqdm import tqdm
import glob

# --- CONFIGURATION ---
SOURCE_DIR = "/kaggle/input/humanml3d/HumanML3D/humanml/"
TARGET_DIR = "/kaggle/working/subset_humanml3d/"
SUBSET_SIZE_PER_TYPE = 2000  # Number of M-prefixed and numeric-only IDs each (Total 2*N)
SEED = 42
MIN_FRAMES, MAX_FRAMES = 60, 200

random.seed(SEED)
os.makedirs(TARGET_DIR, exist_ok=True)

print(f"Source Directory: {SOURCE_DIR}")
print(f"Target Directory: {TARGET_DIR}")

Source Directory: /kaggle/input/humanml3d/HumanML3D/humanml/
Target Directory: /kaggle/working/subset_humanml3d/


In [50]:
!rm -rf kaggle/working/subset_humanml3d

In [None]:
from concurrent.futures import ThreadPoolExecutor

def get_frames(filename):
    path = os.path.join(SOURCE_DIR, 'new_joint_vecs', filename)
    try:
        # np.load with mmap_mode='r' is fast as it only reads the header
        frames = np.load(path, mmap_mode='r').shape[0]
        if MIN_FRAMES <= frames <= MAX_FRAMES:
            return filename[:-4]
    except:
        return None

all_files = [f for f in os.listdir(os.path.join(SOURCE_DIR, 'new_joint_vecs')) if f.endswith('.npy')]

print("Scanning for valid IDs...")
# ThreadPool is usually faster than ProcessPool for I/O tasks on Kaggle
with ThreadPoolExecutor(max_workers=8) as executor:
    results = list(tqdm(executor.map(get_frames, all_files), total=len(all_files)))

valid_ids = [r for r in results if r is not None]

In [52]:
# 1. Load all valid IDs
m_ids = [i for i in valid_ids if i.startswith('M')]
numeric_ids = [i for i in valid_ids if not i.startswith('M')]

print(f"Total IDs in dataset: {len(valid_ids)}")
print(f"  - 'M' prefixed: {len(m_ids)}")
print(f"  - Numeric-only: {len(numeric_ids)}")

# 2. Sample equal numbers
random.seed(SEED)
n = min(SUBSET_SIZE_PER_TYPE, len(m_ids), len(numeric_ids))
selected_m = random.sample(m_ids, n)
selected_numeric = random.sample(numeric_ids, n)

subset_ids = sorted(selected_m + selected_numeric)
print(f"\nSelected subset of {len(subset_ids)} IDs ({n} 'M' IDs and {n} numeric IDs).")
subset_ids[:50]

Total IDs in dataset: 15068
  - 'M' prefixed: 7534
  - Numeric-only: 7534

Selected subset of 4000 IDs (2000 'M' IDs and 2000 numeric IDs).


['000016',
 '000017',
 '000024',
 '000029',
 '000052',
 '000062',
 '000070',
 '000075',
 '000076',
 '000079',
 '000080',
 '000099',
 '000103',
 '000118',
 '000127',
 '000135',
 '000137',
 '000149',
 '000150',
 '000151',
 '000182',
 '000196',
 '000197',
 '000199',
 '000203',
 '000211',
 '000212',
 '000213',
 '000215',
 '000226',
 '000232',
 '000256',
 '000261',
 '000267',
 '000270',
 '000280',
 '000281',
 '000287',
 '000288',
 '000301',
 '000307',
 '000308',
 '000309',
 '000315',
 '000325',
 '000326',
 '000328',
 '000332',
 '000335',
 '000336']

In [None]:
from concurrent.futures import ThreadPoolExecutor

# 3. Detect subdirectories and copy files
subdirs = ["joints", "new_joints", "new_joint_vecs", "texts"] 

for sd in subdirs:
    os.makedirs(os.path.join(TARGET_DIR, sd), exist_ok=True)

def copy_files_for_id(i):
    for sd in subdirs:
        for ext in ['.npy', '.txt']:
            src_file = os.path.join(SOURCE_DIR, sd, f"{i}{ext}")
            if os.path.exists(src_file):
                shutil.copy(src_file, os.path.join(TARGET_DIR, sd, f"{i}{ext}"))

print("\nCopying files for subset IDs in parallel...")
with ThreadPoolExecutor() as executor:
    list(tqdm(executor.map(copy_files_for_id, subset_ids), total=len(subset_ids)))

print("\nFile copying complete.")



Copying files for subset IDs in parallel...


 74%|███████▍  | 2975/4000 [00:09<00:03, 292.81it/s]

In [56]:
# 4. Update metadata files
metadata_files = ["all.txt", "train.txt", "val.txt", "test.txt"]

all_ids = subset_ids.copy()
random.seed(SEED)
random.shuffle(all_ids)

train_idx = int(0.8 * len(all_ids))
val_idx = int(0.9 * len(all_ids))

train_list = sorted(all_ids[:train_idx])    
val_list = sorted(all_ids[train_idx:val_idx])
test_list = sorted(all_ids[val_idx:])

for filename in metadata_files: 
    with open(os.path.join(TARGET_DIR, filename), 'w') as f:
        if filename == "all.txt":
            f.writelines(f"{i}\n" for i in subset_ids)
        elif filename == "train.txt":
            f.writelines(f"{i}\n" for i in train_list)
        elif filename == "val.txt":
            f.writelines(f"{i}\n" for i in val_list)
        elif filename == "test.txt":
            f.writelines(f"{i}\n" for i in test_list) 


In [66]:
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader
from pathlib import Path
from typing import List, Tuple, Optional, Dict, Any
import json
from matplotlib.animation import FuncAnimation
import matplotlib.pyplot as plt
from IPython.display import HTML


# Missing HumanML3D Kinematic Chain (Standard SMPL/HumanML3D)
KINEMATIC_CHAIN = [
    [0, 2, 5, 8, 11],     # Right leg
    [0, 1, 4, 7, 10],     # Left leg
    [0, 3, 6, 9, 12, 15], # Spine
    [9, 14, 17, 19, 21],  # Right arm
    [9, 13, 16, 18, 20]   # Left arm
]

def plot_3d_motion(
    motion: np.ndarray,
    fps: int = 20,
    radius: float = 1.0,
    title: str = "Motion Visualization",
    follow_root: bool = False
) -> FuncAnimation:
    """
    Create a 3D animation of motion joint positions.
    
    Args:
        motion: Joint positions (nframe, 22, 3)
        fps: Frames per second
        radius: Radius of the viewing box around the character
        title: Plot title
        follow_root: Whether the camera should follow the root joint
        
    Returns:
        Matplotlib FuncAnimation object
    """
    fig = plt.figure(figsize=(8, 8))
    ax = fig.add_subplot(111, projection='3d')
    ax.view_init(elev=20, azim=45)
    
    colors = ['#2980b9', '#c0392b', '#27ae60', '#f39c12', '#8e44ad']
    lines = [ax.plot([], [], [], color=c, marker='o', ms=2, lw=2)[0] for c in colors]

    ax.set_xlabel('X (Side)')
    ax.set_ylabel('Z (Forward)')
    ax.set_zlabel('Y (Height)')
    ax.set_title(title)

    # Pre-calculate global bounds for static centering if not following root
    pos_min = motion.min(axis=(0, 1))
    pos_max = motion.max(axis=(0, 1))

    def update(frame):
        root = motion[frame, 0, :]
        
        if follow_root:
            ax.set_xlim3d([root[0] - radius, root[0] + radius])
            ax.set_ylim3d([root[2] - radius, root[2] + radius])
            ax.set_zlim3d([pos_min[1], pos_max[1] + radius*0.5])
        else:
            ax.set_xlim3d([pos_min[0] - radius, pos_max[0] + radius])
            ax.set_ylim3d([pos_min[2] - radius, pos_max[2] + radius])
            ax.set_zlim3d([pos_min[1], pos_max[1] + radius*0.5])
        
        for i, c_indices in enumerate(KINEMATIC_CHAIN):
            joints = motion[frame, c_indices, :]
            # Map Data Y to Plot Z (Vertical)
            lines[i].set_data(joints[:, 0], joints[:, 2])
            lines[i].set_3d_properties(joints[:, 1])
        return lines

    ani = FuncAnimation(fig, update, frames=len(motion), interval=1000/fps, blit=False)
    plt.close()
    return ani
    
def visualize_motion(
    joint_positions: np.ndarray,
    ground_truth: Optional[np.ndarray] = None,
    title: str = "Motion Visualization",
    save_path: Optional[Path] = None,
    fps: int = 20,
    skip_frames: int = 1,
    notebook: bool = True
) -> Any:
    """
    Visualize motion from joint positions.
    
    Args:
        joint_positions: Joint positions (nframe, 22, 3)
        ground_truth: Optional ground truth for comparison
        title: Plot title
        save_path: Optional path to save visualization
        fps: Frames per second
        notebook: Whether to return HTML for notebook display
    """
    fps = fps/skip_frames
    ani = plot_3d_motion(joint_positions[::skip_frames], fps=fps, title=title)
    
    if save_path:
        save_path.parent.mkdir(parents=True, exist_ok=True)
        ani.save(str(save_path), writer='ffmpeg', fps=fps)
        print(f"Saved animation to {save_path}")
    
    if notebook:
        display_html = HTML(ani.to_html5_video())
        return display_html
    
    return ani

# 1. Load data
file_id = subset_ids[0]
data_path = f"/kaggle/working/subset_humanml3d/new_joints/{file_id}.npy"
motion_data = np.load(data_path)
text_path = f"/kaggle/working/subset_humanml3d/texts/{file_id}.txt"
with open(text_path, 'r') as f:
    text = f.read()

# 2. Visualize
ani = visualize_motion(motion_data, title=f"{file_id}.npy", fps=20, skip_frames=2)
ani