## InfoGCN++ Training Pipeline
This notebook mirrors the original InfoGCN++ data processing and optimisation setup for 2D skeleton `.npy` clips, making it easy to fine-tune the SODE backbone on custom datasets.

In [1]:
from pathlib import Path

import pandas as pd
import torch
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader

from act_rec.datasets import SkeletonNpyDataset
from act_rec.model.losses import LabelSmoothingCrossEntropy, masked_recon_loss
from act_rec.model.sode import SODE
from act_rec.training import TrainConfig, evaluate, train_one_epoch

In [2]:
# Paths and label mapping
data_root = Path("../data/")
csv_path = data_root / "skeleton_labels.csv"

df = pd.read_csv(csv_path).dropna()
label_to_idx = {label: idx for idx, label in enumerate(sorted(df["label"].unique()))}
df["label_idx"] = df["label"].map(label_to_idx)
df["skeleton_path"] = df["skeleton_path"].apply(lambda p: str((data_root / p).resolve()))
print(f"Total samples: {len(df)} | classes: {len(label_to_idx)}")

Total samples: 2216 | classes: 14


In [3]:
# Train/validation split
train_df, val_df = train_test_split(
    df,
    test_size=0.2,
    stratify=df["label_idx"],
    random_state=42,
)
print(f"Train: {len(train_df)} | Val: {len(val_df)}")

Train: 1772 | Val: 444


In [4]:
# Optimiser and schedule configuration mirroring InfoGCN++ defaults
train_hparams = {
    "epochs": 80,
    "base_lr": 1e-2,
    "optimizer": "SGD",
    "weight_decay": 1e-4,
    "warmup_epochs": 5,
    "lr_steps": [30, 45, 60],
    "lr_decay": 0.1,
    "grad_clip": 1.0,
    "batch_size": 32,
    "test_batch_size": 64,
    "num_workers": 4,
    "prefetch_factor": 2,
    "pin_memory": bool(torch.cuda.is_available()),
    "p_interval_train": (0.5, 1.0),
    "p_interval_val": (0.95,),
    "random_rotation": True,
    "use_velocity": False,
    "preload": True,
    "preload_to_tensor": True,
    "lambda_cls": 1.0,
    "lambda_recon": 0.1,
    "lambda_feature": 0.1,
    "lambda_kl": 0.0,
    "smoothing": 0.1,
    "checkpoint_path": "sode_best.pt",
}


def adjust_learning_rate(epoch: int, optimizer: torch.optim.Optimizer, cfg: dict) -> float:
    """Warm-up followed by step decay, as in the InfoGCN++ training script."""
    warmup = cfg["warmup_epochs"]
    base_lr = cfg["base_lr"]
    if warmup > 0 and epoch < warmup:
        lr = base_lr * float(epoch + 1) / float(warmup)
    else:
        steps = cfg["lr_steps"]
        decay = cfg["lr_decay"]
        num_decays = sum(epoch >= step for step in steps)
        lr = base_lr * (decay**num_decays)
    for param_group in optimizer.param_groups:
        param_group["lr"] = lr
    return lr


In [5]:
# Dataset and dataloaders mirroring InfoGCN++ feeder logic
window_size = 64
num_workers = train_hparams["num_workers"]
train_dataset = SkeletonNpyDataset(
    train_df["skeleton_path"].tolist(),
    labels=train_df["label_idx"].tolist(),
    window_size=window_size,
    p_interval=train_hparams["p_interval_train"],
    random_rotation=train_hparams["random_rotation"],
    use_velocity=train_hparams["use_velocity"],
    preload=train_hparams["preload"],
    preload_to_tensor=train_hparams["preload_to_tensor"],
    repeat=1,
)
val_dataset = SkeletonNpyDataset(
    val_df["skeleton_path"].tolist(),
    labels=val_df["label_idx"].tolist(),
    window_size=window_size,
    p_interval=train_hparams["p_interval_val"],
    random_rotation=False,
    use_velocity=train_hparams["use_velocity"],
    preload=train_hparams["preload"],
    preload_to_tensor=train_hparams["preload_to_tensor"],
)

loader_kwargs = {
    "num_workers": num_workers,
    "pin_memory": train_hparams["pin_memory"],
}
if num_workers > 0:
    loader_kwargs["persistent_workers"] = True
    loader_kwargs["prefetch_factor"] = train_hparams.get("prefetch_factor", 2)

train_loader = DataLoader(
    train_dataset,
    batch_size=train_hparams["batch_size"],
    shuffle=True,
    drop_last=True,
    **loader_kwargs,
)
val_loader = DataLoader(
    val_dataset,
    batch_size=train_hparams["test_batch_size"],
    shuffle=False,
    drop_last=False,
    **loader_kwargs,
)


In [None]:
# Training loop
num_epochs = train_hparams["epochs"]
best_top1 = 0.0
history = []
ckpt_path = Path(train_hparams["checkpoint_path"])
ckpt_path.parent.mkdir(parents=True, exist_ok=True)

for epoch in range(num_epochs):
    lr = adjust_learning_rate(epoch, optimizer, train_hparams)
    train_metrics = train_one_epoch(model, train_loader, optimizer, config)
    val_metrics = evaluate(model, val_loader, config)
    metrics = {**train_metrics, **val_metrics, "lr": lr}
    history.append(metrics)

    msg = (
        f"Epoch {epoch + 1}/{num_epochs} | lr={lr:.4e} | "
        f"train_tot={train_metrics['train_total_loss']:.4f} "
        f"train_cls={train_metrics['train_cls_loss']:.4f} "
        f"train_recon={train_metrics['train_recon_loss']:.4f} "
        f"train_feat={train_metrics['train_feature_loss']:.4f} | "
        f"val_tot={val_metrics['val_total_loss']:.4f} "
        f"val_cls={val_metrics['val_cls_loss']:.4f} "
        f"val_recon={val_metrics['val_recon_loss']:.4f} "
        f"val_feat={val_metrics['val_feature_loss']:.4f} "
        f"val_top1={val_metrics['val_top1']:.3f} "
        f"val_top5={val_metrics['val_top5']:.3f}"
    )
    print(msg)

    if val_metrics["val_top1"] > best_top1:
        best_top1 = val_metrics["val_top1"]
        torch.save({"model": model.state_dict(), "label_to_idx": label_to_idx}, ckpt_path)
        print(f"  -> New best checkpoint saved (top1={best_top1:.3f}).")
print("Training finished.")


Epoch 1/80 | lr=2.0000e-03 | train_tot=4.4948 train_cls=2.3084 train_recon=2.1632 train_feat=0.0233 | val_tot=4.2209 val_cls=2.2150 val_recon=1.9871 val_feat=0.0188 val_top1=0.279 val_top5=0.777
  -> New best checkpoint saved (top1=0.279).
Epoch 2/80 | lr=4.0000e-03 | train_tot=2.6659 train_cls=1.9090 train_recon=0.7406 train_feat=0.0163 | val_tot=3.8897 val_cls=1.7900 val_recon=2.0400 val_feat=0.0597 val_top1=0.509 val_top5=0.910
  -> New best checkpoint saved (top1=0.509).
Epoch 3/80 | lr=6.0000e-03 | train_tot=2.2678 train_cls=1.6109 train_recon=0.6381 train_feat=0.0188 | val_tot=2.1688 val_cls=1.8337 val_recon=0.3246 val_feat=0.0105 val_top1=0.554 val_top5=0.937
  -> New best checkpoint saved (top1=0.554).
Epoch 4/80 | lr=8.0000e-03 | train_tot=1.9149 train_cls=1.4202 train_recon=0.4834 train_feat=0.0114 | val_tot=1.8180 val_cls=1.3826 val_recon=0.4124 val_feat=0.0230 val_top1=0.709 val_top5=0.971
  -> New best checkpoint saved (top1=0.709).
Epoch 5/80 | lr=1.0000e-02 | train_tot=1

In [6]:
# Model and optimisation setup
device = torch.device("cuda" if torch.cuda.is_available() else ("mps" if torch.backends.mps.is_available() else "cpu"))
model = SODE(
    num_class=len(label_to_idx),
    num_point=17,
    num_person=1,
    graph="act_rec.graph.coco.Graph",
    in_channels=3,
    T=window_size,
    n_step=3,
    num_cls=4,
).to(device)

opt_name = train_hparams["optimizer"].lower()
optimizer = torch.optim.AdamW(
    model.parameters(),
    lr=train_hparams["base_lr"],
    weight_decay=train_hparams["weight_decay"],
)

config = TrainConfig(
    device=device,
    cls_loss=LabelSmoothingCrossEntropy(smoothing=train_hparams["smoothing"]),
    lambda_cls=train_hparams["lambda_cls"],
    lambda_recon=train_hparams["lambda_recon"],
    lambda_feature=train_hparams["lambda_feature"],
    lambda_kl=train_hparams["lambda_kl"],
    n_step=model.n_step,
    recon_loss_fn=masked_recon_loss,
    feature_loss_fn=masked_recon_loss,
)


In [None]:
# Training loop
num_epochs = train_hparams["epochs"]
best_top1 = 0.0
history = []
ckpt_path = Path(train_hparams["checkpoint_path"])
ckpt_path.parent.mkdir(parents=True, exist_ok=True)

for epoch in range(num_epochs):
    lr = adjust_learning_rate(epoch, optimizer, train_hparams)
    train_metrics = train_one_epoch(model, train_loader, optimizer, config)
    val_metrics = evaluate(model, val_loader, config)
    metrics = {**train_metrics, **val_metrics, "lr": lr}
    history.append(metrics)

    msg = (
        f"Epoch {epoch + 1}/{num_epochs} | lr={lr:.4e} | "
        f"train_tot={train_metrics['train_total_loss']:.4f} "
        f"train_cls={train_metrics['train_cls_loss']:.4f} "
        f"train_recon={train_metrics['train_recon_loss']:.4f} "
        f"train_feat={train_metrics['train_feature_loss']:.4f} | "
        f"val_tot={val_metrics['val_total_loss']:.4f} "
        f"val_cls={val_metrics['val_cls_loss']:.4f} "
        f"val_recon={val_metrics['val_recon_loss']:.4f} "
        f"val_feat={val_metrics['val_feature_loss']:.4f} "
        f"val_top1={val_metrics['val_top1']:.3f} "
        f"val_top5={val_metrics['val_top5']:.3f}"
    )
    print(msg)

    if val_metrics["val_top1"] > best_top1:
        best_top1 = val_metrics["val_top1"]
        torch.save({"model": model.state_dict(), "label_to_idx": label_to_idx}, ckpt_path)
        print(f"  -> New best checkpoint saved (top1={best_top1:.3f}).")
print("Training finished.")


Epoch 1/80 | lr=2.0000e-03 | train_tot=4.4948 train_cls=2.3084 train_recon=2.1632 train_feat=0.0233 | val_tot=4.2209 val_cls=2.2150 val_recon=1.9871 val_feat=0.0188 val_top1=0.279 val_top5=0.777
  -> New best checkpoint saved (top1=0.279).
Epoch 2/80 | lr=4.0000e-03 | train_tot=2.6659 train_cls=1.9090 train_recon=0.7406 train_feat=0.0163 | val_tot=3.8897 val_cls=1.7900 val_recon=2.0400 val_feat=0.0597 val_top1=0.509 val_top5=0.910
  -> New best checkpoint saved (top1=0.509).
Epoch 3/80 | lr=6.0000e-03 | train_tot=2.2678 train_cls=1.6109 train_recon=0.6381 train_feat=0.0188 | val_tot=2.1688 val_cls=1.8337 val_recon=0.3246 val_feat=0.0105 val_top1=0.554 val_top5=0.937
  -> New best checkpoint saved (top1=0.554).
Epoch 4/80 | lr=8.0000e-03 | train_tot=1.9149 train_cls=1.4202 train_recon=0.4834 train_feat=0.0114 | val_tot=1.8180 val_cls=1.3826 val_recon=0.4124 val_feat=0.0230 val_top1=0.709 val_top5=0.971
  -> New best checkpoint saved (top1=0.709).
Epoch 5/80 | lr=1.0000e-02 | train_tot=1

libc++abi: terminating due to uncaught exception of type std::__1::system_error: Broken pipe
libc++abi: terminating due to uncaught exception of type std::__1::system_error: Broken pipe
libc++abi: terminating due to uncaught exception of type std::__1::system_error: Broken pipe


KeyboardInterrupt: 

libc++abi: terminating due to uncaught exception of type std::__1::system_error: Broken pipe
libc++abi: terminating due to uncaught exception of type std::__1::system_error: Broken pipe
libc++abi: terminating due to uncaught exception of type std::__1::system_error: Broken pipe
libc++abi: terminating due to uncaught exception of type std::__1::system_error: Broken pipe
libc++abi: terminating due to uncaught exception of type std::__1::system_error: Broken pipe


Traceback (most recent call last):
  File "<frozen runpy>", line 198, in _run_module_as_main
  File "<frozen runpy>", line 88, in _run_code
  File "/Users/nikita/anaconda3/envs/act-rec/lib/python3.11/site-packages/ipykernel_launcher.py", line 18, in <module>
    app.launch_new_instance()
  File "/Users/nikita/anaconda3/envs/act-rec/lib/python3.11/site-packages/traitlets/config/application.py", line 1075, in launch_instance
    app.start()
  File "/Users/nikita/anaconda3/envs/act-rec/lib/python3.11/site-packages/ipykernel/kernelapp.py", line 739, in start
    self.io_loop.start()
  File "/Users/nikita/anaconda3/envs/act-rec/lib/python3.11/site-packages/tornado/platform/asyncio.py", line 211, in start
    self.asyncio_loop.run_forever()
  File "/Users/nikita/anaconda3/envs/act-rec/lib/python3.11/asyncio/base_events.py", line 608, in run_forever
    self._run_once()
  File "/Users/nikita/anaconda3/envs/act-rec/lib/python3.11/asyncio/base_events.py", line 1898, in _run_once
    event_list 

: 

In [None]:
# Training history as a DataFrame for quick inspection
pd.DataFrame(history)