In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import math
from transformers import CLIPModel, CLIPTokenizer
import sys
import os
import random
import numpy as np  

# Seed the Python, NumPy and PyTorch random number generators with one value.
seed = 42
for seed_rng in (random.seed, np.random.seed, torch.manual_seed):
    seed_rng(seed)

# When CUDA is available, also seed the CUDA generators and set the cuDNN
# flags (deterministic on, benchmark off).
if torch.cuda.is_available():
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True

sys.path.append(os.path.abspath('/home/user/dxc/motion/StableMoFusion/'))
from motion_loader import get_dataset_loader  
from tqdm import tqdm
import yaml
from argparse import Namespace
from model import *

In [2]:

# Read the run configuration from config.yaml and expose its top-level keys
# as attributes of `opt`.
with open('config.yaml', 'r') as config_file:
    config = yaml.safe_load(config_file)
opt = Namespace(**config)

# Echo a few settings as a quick sanity check of the loaded configuration.
for setting_name in ("batch_size", "lr", "device"):
    print(getattr(opt, setting_name))

32
0.0001
cuda


## Load dataset

In [3]:
# Build the training and evaluation data loaders from the shared configuration.
# The StableMoFusion directory is already on sys.path from the imports cell
# (it has to be, for `get_dataset_loader` to have been imported), so it is not
# appended again here; re-appending only added a duplicate entry on every re-run.
train_loader = get_dataset_loader(
    opt,
    batch_size=opt.batch_size,
    split='train',
    mode='train',
)

# NOTE(review): this loader reads the 'test' split, and the training cell uses
# it for per-epoch validation and early stopping. Consider a separate
# validation split if the test metrics are to be reported as held-out results.
test_loader = get_dataset_loader(
    opt,
    batch_size=opt.batch_size,
    split='test',
    mode='gt_eval',
)


 Loading train mode HumanML3D dataset ...
11111111111111


  0%|          | 0/6960 [00:00<?, ?it/s]

Completing loading t2m dataset

 Loading gt_eval mode HumanML3D dataset ...
11111111111111


  0%|          | 0/1305 [00:00<?, ?it/s]

Completing loading t2m dataset


## Load CLIP model for stage 1 training

In [5]:

# Load the pretrained CLIP model and its tokenizer, both selected by
# `opt.clip_model_name`.
clip_tokenizer = CLIPTokenizer.from_pretrained(opt.clip_model_name)
clip_model = CLIPModel.from_pretrained(opt.clip_model_name)

# Stage 1: freeze every CLIP parameter whose name contains "text_model".
# Parameters with other names are left untouched.
for param_name, weight in clip_model.named_parameters():
    if "text_model" not in param_name:
        continue
    weight.requires_grad = False

# Motion encoder; input/embedding sizes and the maximum sequence length come
# from the config, the remaining hyper-parameters are fixed here.
motion_encoder = MotionEncoder(
    input_dim=opt.input_dim,
    embed_dim=opt.embed_dim,
    max_seq_length=opt.max_seq_length,
    num_heads=8,
    num_layers=4,
    dim_feedforward=2048,
    dropout=0.2,
)

# Wrap CLIP and the motion encoder in the alignment model, then move it to
# the configured device.
model = ClipMotionAlignModel(
    clip_model=clip_model,
    motion_encoder=motion_encoder,
    temperature=0.07,
)
model = model.to(opt.device)

In [6]:
# Stage-1 optimizer: AdamW over only those model parameters that currently
# have requires_grad set.
optimizer = optim.AdamW(
    (p for p in model.parameters() if p.requires_grad),
    weight_decay=opt.weight_decay,
    lr=opt.lr,
)

# Early-stopping bookkeeping used by the training loop.
max_no_improve = 3  # stop after 3 consecutive validations without improvement
no_improve_count = 0
best_test_loss = float("inf")

## Start training

In [7]:
# Two-stage training loop for the motion/text alignment model.
#   Stage 1 (first `opt.pretrain_epochs` epochs): trains with the optimizer and
#       requires_grad flags set up by the earlier cells.
#   Stage 2 (remaining epochs): additionally unfreezes the last CLIP
#       text-encoder layer and the final layer norm, and switches to a new
#       optimizer that uses `opt.lr_finetune`.
# After every epoch the model is evaluated on `test_loader`, a checkpoint is
# written, and training stops early once `max_no_improve` consecutive epochs
# fail to produce a new best test loss.

# Early-stopping state is (re)initialised in this cell so that re-running it
# does not inherit the best loss or patience counter left by a previous run.
best_test_loss = float("inf")
no_improve_count = 0
best_epoch = None        # 1-based epoch with the lowest test loss so far
best_model_path = None   # checkpoint file written at that epoch

for epoch in range(opt.num_epochs):

    # Switch to stage 2 exactly once, at the first epoch after the
    # `opt.pretrain_epochs` stage-1 epochs.
    if epoch == opt.pretrain_epochs:
        for param in clip_model.text_model.encoder.layers[-1].parameters():
            param.requires_grad = True
        for param in clip_model.text_model.final_layer_norm.parameters():
            param.requires_grad = True

        # Build a new optimizer over everything that is trainable now; all of
        # those parameters use the fine-tuning learning rate from here on.
        optimizer = optim.AdamW(
            filter(lambda p: p.requires_grad, model.parameters()),
            lr=opt.lr_finetune,
            weight_decay=opt.weight_decay
        )
        print("Stage 2: Fine-tuning CLIP text encoder's last layer (and final_layer_norm) with lower lr.")

    model.train()
    total_loss = 0.0
    count = 0
    pbar = tqdm(train_loader, desc=f"Epoch {epoch+1}/{opt.num_epochs}")

    for step, batch_data in enumerate(pbar):
        caption, motion, m_length = batch_data

        # Lower-case the captions and tokenize them as one padded batch.
        caption = [c.lower() for c in caption]
        text_enc = clip_tokenizer(
            caption,
            padding=True,
            truncation=True,
            max_length=opt.max_length,
            return_tensors="pt"
        )
        input_ids = text_enc["input_ids"].to(opt.device)
        attention_mask = text_enc["attention_mask"].to(opt.device)

        # The loader may yield motions either as a list of per-sample arrays or
        # as one batched tensor; bring both to a float32 tensor on the device.
        if isinstance(motion, list):
            motion = torch.stack([torch.tensor(m, dtype=torch.float32) for m in motion], dim=0)
        else:
            motion = motion.float()
        motion = motion.to(opt.device)
        m_length = m_length.to(opt.device)

        # Forward pass and loss between the motion and text embeddings
        # (loss definition lives in `clip_contrastive_loss`, imported elsewhere).
        motion_emb, text_emb = model(motion, m_length, input_ids, attention_mask)
        loss = clip_contrastive_loss(motion_emb, text_emb, model.logit_scale)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # Read the scalar once and reuse it for the running sum and the bar.
        loss_value = loss.item()
        total_loss += loss_value
        count += 1
        pbar.set_postfix({"loss": f"{loss_value:.4f}"})

    # max(count, 1) guards against an empty loader.
    avg_loss = total_loss / max(count, 1)
    print(f"Epoch [{epoch+1}/{opt.num_epochs}] - Train Average Loss: {avg_loss:.4f}")

    # Evaluate, then checkpoint this epoch's weights.
    print(f"[Validate at epoch {epoch+1}] ...")
    test_loss = evaluate_model(model, test_loader, clip_tokenizer, opt, desc=f"Epoch_{epoch+1}_Test")
    model_path = f"clip_motion_align_epoch_CMP_{epoch+1}.pt"
    torch.save(model.state_dict(), model_path)
    print(f"Model saved: {model_path}")

    # Early stopping on the test loss; remember which checkpoint was best so
    # it does not have to be recovered from the log afterwards.
    if test_loss < best_test_loss:
        best_test_loss = test_loss
        best_epoch = epoch + 1
        best_model_path = model_path
        no_improve_count = 0
    else:
        no_improve_count += 1
        if no_improve_count >= max_no_improve:
            print("Early stopping triggered!")
            break

if best_model_path is not None:
    print(f"Best checkpoint: {best_model_path} (epoch {best_epoch}, test loss {best_test_loss:.4f})")
print("Training completed!")

Epoch 1/300:   0%|          | 0/177 [00:00<?, ?it/s]

Epoch 1/300: 100%|██████████| 177/177 [00:19<00:00,  9.27it/s, loss=2.3628]


Epoch [1/300] - Train Average Loss: 2.6968
[Validate at epoch 1] ...


Epoch_1_Test: 100%|██████████| 32/32 [00:02<00:00, 11.27it/s]


Epoch_1_Test Average Contrastive Loss: 2.1729
Epoch_1_Test M->T Retrieval (per 32 samples): R@1=0.313, R@2=0.496, R@3=0.598
Epoch_1_Test T->M Retrieval (per 32 samples): R@1=0.348, R@2=0.512, R@3=0.627
Model saved: clip_motion_align_epoch_CMP_1.pt


Epoch 2/300: 100%|██████████| 177/177 [00:18<00:00,  9.65it/s, loss=1.8785]


Epoch [2/300] - Train Average Loss: 1.9546
[Validate at epoch 2] ...


Epoch_2_Test: 100%|██████████| 32/32 [00:02<00:00, 15.82it/s]


Epoch_2_Test Average Contrastive Loss: 1.7686
Epoch_2_Test M->T Retrieval (per 32 samples): R@1=0.439, R@2=0.601, R@3=0.712
Epoch_2_Test T->M Retrieval (per 32 samples): R@1=0.454, R@2=0.646, R@3=0.738
Model saved: clip_motion_align_epoch_CMP_2.pt


Epoch 3/300: 100%|██████████| 177/177 [00:18<00:00,  9.80it/s, loss=1.7158]


Epoch [3/300] - Train Average Loss: 1.5356
[Validate at epoch 3] ...


Epoch_3_Test: 100%|██████████| 32/32 [00:02<00:00, 15.97it/s]


Epoch_3_Test Average Contrastive Loss: 1.5407
Epoch_3_Test M->T Retrieval (per 32 samples): R@1=0.498, R@2=0.688, R@3=0.782
Epoch_3_Test T->M Retrieval (per 32 samples): R@1=0.524, R@2=0.708, R@3=0.792
Model saved: clip_motion_align_epoch_CMP_3.pt


Epoch 4/300: 100%|██████████| 177/177 [00:18<00:00,  9.67it/s, loss=1.2073]


Epoch [4/300] - Train Average Loss: 1.2690
[Validate at epoch 4] ...


Epoch_4_Test: 100%|██████████| 32/32 [00:02<00:00, 15.59it/s]


Epoch_4_Test Average Contrastive Loss: 1.3562
Epoch_4_Test M->T Retrieval (per 32 samples): R@1=0.532, R@2=0.731, R@3=0.832
Epoch_4_Test T->M Retrieval (per 32 samples): R@1=0.568, R@2=0.751, R@3=0.844
Model saved: clip_motion_align_epoch_CMP_4.pt


Epoch 5/300: 100%|██████████| 177/177 [00:17<00:00,  9.94it/s, loss=0.8690]


Epoch [5/300] - Train Average Loss: 1.0643
[Validate at epoch 5] ...


Epoch_5_Test: 100%|██████████| 32/32 [00:02<00:00, 15.86it/s]


Epoch_5_Test Average Contrastive Loss: 1.2733
Epoch_5_Test M->T Retrieval (per 32 samples): R@1=0.590, R@2=0.761, R@3=0.838
Epoch_5_Test T->M Retrieval (per 32 samples): R@1=0.623, R@2=0.775, R@3=0.850
Model saved: clip_motion_align_epoch_CMP_5.pt
Stage 2: Fine-tuning CLIP text encoder's last layer (and final_layer_norm) with lower lr.


Epoch 6/300: 100%|██████████| 177/177 [00:19<00:00,  9.17it/s, loss=0.7835]


Epoch [6/300] - Train Average Loss: 0.7942
[Validate at epoch 6] ...


Epoch_6_Test: 100%|██████████| 32/32 [00:02<00:00, 15.90it/s]


Epoch_6_Test Average Contrastive Loss: 1.0701
Epoch_6_Test M->T Retrieval (per 32 samples): R@1=0.628, R@2=0.813, R@3=0.882
Epoch_6_Test T->M Retrieval (per 32 samples): R@1=0.676, R@2=0.833, R@3=0.899
Model saved: clip_motion_align_epoch_CMP_6.pt


Epoch 7/300: 100%|██████████| 177/177 [00:19<00:00,  9.21it/s, loss=0.6465]


Epoch [7/300] - Train Average Loss: 0.6765
[Validate at epoch 7] ...


Epoch_7_Test: 100%|██████████| 32/32 [00:02<00:00, 15.92it/s]


Epoch_7_Test Average Contrastive Loss: 1.0203
Epoch_7_Test M->T Retrieval (per 32 samples): R@1=0.652, R@2=0.819, R@3=0.888
Epoch_7_Test T->M Retrieval (per 32 samples): R@1=0.685, R@2=0.842, R@3=0.907
Model saved: clip_motion_align_epoch_CMP_7.pt


Epoch 8/300: 100%|██████████| 177/177 [00:19<00:00,  9.18it/s, loss=0.6388]


Epoch [8/300] - Train Average Loss: 0.6397
[Validate at epoch 8] ...


Epoch_8_Test: 100%|██████████| 32/32 [00:02<00:00, 15.68it/s]


Epoch_8_Test Average Contrastive Loss: 0.9695
Epoch_8_Test M->T Retrieval (per 32 samples): R@1=0.691, R@2=0.841, R@3=0.904
Epoch_8_Test T->M Retrieval (per 32 samples): R@1=0.689, R@2=0.846, R@3=0.912
Model saved: clip_motion_align_epoch_CMP_8.pt


Epoch 9/300: 100%|██████████| 177/177 [00:19<00:00,  9.18it/s, loss=0.6346]


Epoch [9/300] - Train Average Loss: 0.5894
[Validate at epoch 9] ...


Epoch_9_Test: 100%|██████████| 32/32 [00:02<00:00, 15.74it/s]


Epoch_9_Test Average Contrastive Loss: 0.9692
Epoch_9_Test M->T Retrieval (per 32 samples): R@1=0.662, R@2=0.843, R@3=0.910
Epoch_9_Test T->M Retrieval (per 32 samples): R@1=0.695, R@2=0.842, R@3=0.898
Model saved: clip_motion_align_epoch_CMP_9.pt


Epoch 10/300: 100%|██████████| 177/177 [00:18<00:00,  9.81it/s, loss=0.4007]


Epoch [10/300] - Train Average Loss: 0.5522
[Validate at epoch 10] ...


Epoch_10_Test: 100%|██████████| 32/32 [00:02<00:00, 15.64it/s]


Epoch_10_Test Average Contrastive Loss: 0.8891
Epoch_10_Test M->T Retrieval (per 32 samples): R@1=0.704, R@2=0.856, R@3=0.918
Epoch_10_Test T->M Retrieval (per 32 samples): R@1=0.724, R@2=0.858, R@3=0.919
Model saved: clip_motion_align_epoch_CMP_10.pt


Epoch 11/300: 100%|██████████| 177/177 [00:19<00:00,  9.21it/s, loss=0.3056]


Epoch [11/300] - Train Average Loss: 0.5231
[Validate at epoch 11] ...


Epoch_11_Test: 100%|██████████| 32/32 [00:02<00:00, 15.90it/s]


Epoch_11_Test Average Contrastive Loss: 0.9121
Epoch_11_Test M->T Retrieval (per 32 samples): R@1=0.703, R@2=0.857, R@3=0.916
Epoch_11_Test T->M Retrieval (per 32 samples): R@1=0.711, R@2=0.864, R@3=0.916
Model saved: clip_motion_align_epoch_CMP_11.pt


Epoch 12/300: 100%|██████████| 177/177 [00:18<00:00,  9.80it/s, loss=0.4009]


Epoch [12/300] - Train Average Loss: 0.4851
[Validate at epoch 12] ...


Epoch_12_Test: 100%|██████████| 32/32 [00:02<00:00, 15.94it/s]


Epoch_12_Test Average Contrastive Loss: 0.8626
Epoch_12_Test M->T Retrieval (per 32 samples): R@1=0.697, R@2=0.869, R@3=0.926
Epoch_12_Test T->M Retrieval (per 32 samples): R@1=0.728, R@2=0.879, R@3=0.927
Model saved: clip_motion_align_epoch_CMP_12.pt


Epoch 13/300: 100%|██████████| 177/177 [00:19<00:00,  9.24it/s, loss=0.6941]


Epoch [13/300] - Train Average Loss: 0.4652
[Validate at epoch 13] ...


Epoch_13_Test: 100%|██████████| 32/32 [00:01<00:00, 16.08it/s]


Epoch_13_Test Average Contrastive Loss: 0.8359
Epoch_13_Test M->T Retrieval (per 32 samples): R@1=0.719, R@2=0.878, R@3=0.926
Epoch_13_Test T->M Retrieval (per 32 samples): R@1=0.719, R@2=0.865, R@3=0.918
Model saved: clip_motion_align_epoch_CMP_13.pt


Epoch 14/300: 100%|██████████| 177/177 [00:18<00:00,  9.33it/s, loss=0.2669]


Epoch [14/300] - Train Average Loss: 0.4356
[Validate at epoch 14] ...


Epoch_14_Test: 100%|██████████| 32/32 [00:02<00:00, 15.76it/s]


Epoch_14_Test Average Contrastive Loss: 0.8717
Epoch_14_Test M->T Retrieval (per 32 samples): R@1=0.723, R@2=0.863, R@3=0.911
Epoch_14_Test T->M Retrieval (per 32 samples): R@1=0.729, R@2=0.869, R@3=0.907
Model saved: clip_motion_align_epoch_CMP_14.pt


Epoch 15/300: 100%|██████████| 177/177 [00:19<00:00,  9.20it/s, loss=0.6780]


Epoch [15/300] - Train Average Loss: 0.4108
[Validate at epoch 15] ...


Epoch_15_Test: 100%|██████████| 32/32 [00:01<00:00, 16.05it/s]


Epoch_15_Test Average Contrastive Loss: 0.8355
Epoch_15_Test M->T Retrieval (per 32 samples): R@1=0.709, R@2=0.876, R@3=0.929
Epoch_15_Test T->M Retrieval (per 32 samples): R@1=0.734, R@2=0.880, R@3=0.936
Model saved: clip_motion_align_epoch_CMP_15.pt


Epoch 16/300: 100%|██████████| 177/177 [00:18<00:00,  9.58it/s, loss=0.4411]


Epoch [16/300] - Train Average Loss: 0.3929
[Validate at epoch 16] ...


Epoch_16_Test: 100%|██████████| 32/32 [00:02<00:00, 15.81it/s]


Epoch_16_Test Average Contrastive Loss: 0.8117
Epoch_16_Test M->T Retrieval (per 32 samples): R@1=0.729, R@2=0.876, R@3=0.923
Epoch_16_Test T->M Retrieval (per 32 samples): R@1=0.740, R@2=0.886, R@3=0.936
Model saved: clip_motion_align_epoch_CMP_16.pt


Epoch 17/300: 100%|██████████| 177/177 [00:19<00:00,  9.12it/s, loss=0.3780]


Epoch [17/300] - Train Average Loss: 0.3773
[Validate at epoch 17] ...


Epoch_17_Test: 100%|██████████| 32/32 [00:02<00:00, 15.77it/s]


Epoch_17_Test Average Contrastive Loss: 0.8131
Epoch_17_Test M->T Retrieval (per 32 samples): R@1=0.726, R@2=0.878, R@3=0.932
Epoch_17_Test T->M Retrieval (per 32 samples): R@1=0.739, R@2=0.873, R@3=0.930
Model saved: clip_motion_align_epoch_CMP_17.pt


Epoch 18/300: 100%|██████████| 177/177 [00:18<00:00,  9.74it/s, loss=0.2685]


Epoch [18/300] - Train Average Loss: 0.3547
[Validate at epoch 18] ...


Epoch_18_Test: 100%|██████████| 32/32 [00:01<00:00, 16.02it/s]


Epoch_18_Test Average Contrastive Loss: 0.7642
Epoch_18_Test M->T Retrieval (per 32 samples): R@1=0.739, R@2=0.883, R@3=0.937
Epoch_18_Test T->M Retrieval (per 32 samples): R@1=0.758, R@2=0.899, R@3=0.945
Model saved: clip_motion_align_epoch_CMP_18.pt


Epoch 19/300: 100%|██████████| 177/177 [00:18<00:00,  9.33it/s, loss=0.3609]


Epoch [19/300] - Train Average Loss: 0.3513
[Validate at epoch 19] ...


Epoch_19_Test: 100%|██████████| 32/32 [00:02<00:00, 15.65it/s]


Epoch_19_Test Average Contrastive Loss: 0.7954
Epoch_19_Test M->T Retrieval (per 32 samples): R@1=0.733, R@2=0.881, R@3=0.938
Epoch_19_Test T->M Retrieval (per 32 samples): R@1=0.744, R@2=0.888, R@3=0.928
Model saved: clip_motion_align_epoch_CMP_19.pt


Epoch 20/300: 100%|██████████| 177/177 [00:18<00:00,  9.41it/s, loss=0.3833]


Epoch [20/300] - Train Average Loss: 0.3320
[Validate at epoch 20] ...


Epoch_20_Test: 100%|██████████| 32/32 [00:02<00:00, 15.55it/s]


Epoch_20_Test Average Contrastive Loss: 0.7986
Epoch_20_Test M->T Retrieval (per 32 samples): R@1=0.750, R@2=0.873, R@3=0.933
Epoch_20_Test T->M Retrieval (per 32 samples): R@1=0.730, R@2=0.874, R@3=0.935
Model saved: clip_motion_align_epoch_CMP_20.pt


Epoch 21/300: 100%|██████████| 177/177 [00:19<00:00,  9.19it/s, loss=0.3873]


Epoch [21/300] - Train Average Loss: 0.3101
[Validate at epoch 21] ...


Epoch_21_Test: 100%|██████████| 32/32 [00:02<00:00, 15.95it/s]


Epoch_21_Test Average Contrastive Loss: 0.7408
Epoch_21_Test M->T Retrieval (per 32 samples): R@1=0.755, R@2=0.881, R@3=0.943
Epoch_21_Test T->M Retrieval (per 32 samples): R@1=0.756, R@2=0.891, R@3=0.937
Model saved: clip_motion_align_epoch_CMP_21.pt


Epoch 22/300: 100%|██████████| 177/177 [00:19<00:00,  9.12it/s, loss=0.3300]


Epoch [22/300] - Train Average Loss: 0.3137
[Validate at epoch 22] ...


Epoch_22_Test: 100%|██████████| 32/32 [00:02<00:00, 15.75it/s]


Epoch_22_Test Average Contrastive Loss: 0.7306
Epoch_22_Test M->T Retrieval (per 32 samples): R@1=0.761, R@2=0.903, R@3=0.938
Epoch_22_Test T->M Retrieval (per 32 samples): R@1=0.762, R@2=0.902, R@3=0.945
Model saved: clip_motion_align_epoch_CMP_22.pt


Epoch 23/300: 100%|██████████| 177/177 [00:19<00:00,  9.26it/s, loss=0.2926]


Epoch [23/300] - Train Average Loss: 0.3019
[Validate at epoch 23] ...


Epoch_23_Test: 100%|██████████| 32/32 [00:02<00:00, 15.59it/s]


Epoch_23_Test Average Contrastive Loss: 0.7331
Epoch_23_Test M->T Retrieval (per 32 samples): R@1=0.748, R@2=0.897, R@3=0.940
Epoch_23_Test T->M Retrieval (per 32 samples): R@1=0.763, R@2=0.899, R@3=0.952
Model saved: clip_motion_align_epoch_CMP_23.pt


Epoch 24/300: 100%|██████████| 177/177 [00:19<00:00,  9.28it/s, loss=0.2154]


Epoch [24/300] - Train Average Loss: 0.2828
[Validate at epoch 24] ...


Epoch_24_Test: 100%|██████████| 32/32 [00:02<00:00, 15.75it/s]


Epoch_24_Test Average Contrastive Loss: 0.7293
Epoch_24_Test M->T Retrieval (per 32 samples): R@1=0.752, R@2=0.898, R@3=0.941
Epoch_24_Test T->M Retrieval (per 32 samples): R@1=0.771, R@2=0.896, R@3=0.941
Model saved: clip_motion_align_epoch_CMP_24.pt


Epoch 25/300: 100%|██████████| 177/177 [00:19<00:00,  9.16it/s, loss=0.3115]


Epoch [25/300] - Train Average Loss: 0.2804
[Validate at epoch 25] ...


Epoch_25_Test: 100%|██████████| 32/32 [00:02<00:00, 15.74it/s]


Epoch_25_Test Average Contrastive Loss: 0.7092
Epoch_25_Test M->T Retrieval (per 32 samples): R@1=0.761, R@2=0.903, R@3=0.945
Epoch_25_Test T->M Retrieval (per 32 samples): R@1=0.774, R@2=0.896, R@3=0.942
Model saved: clip_motion_align_epoch_CMP_25.pt


Epoch 26/300: 100%|██████████| 177/177 [00:19<00:00,  9.26it/s, loss=0.2715]


Epoch [26/300] - Train Average Loss: 0.2709
[Validate at epoch 26] ...


Epoch_26_Test: 100%|██████████| 32/32 [00:02<00:00, 15.77it/s]


Epoch_26_Test Average Contrastive Loss: 0.7229
Epoch_26_Test M->T Retrieval (per 32 samples): R@1=0.756, R@2=0.888, R@3=0.936
Epoch_26_Test T->M Retrieval (per 32 samples): R@1=0.771, R@2=0.902, R@3=0.940
Model saved: clip_motion_align_epoch_CMP_26.pt


Epoch 27/300: 100%|██████████| 177/177 [00:18<00:00,  9.61it/s, loss=0.1447]


Epoch [27/300] - Train Average Loss: 0.2574
[Validate at epoch 27] ...


Epoch_27_Test: 100%|██████████| 32/32 [00:01<00:00, 17.78it/s]


Epoch_27_Test Average Contrastive Loss: 0.7471
Epoch_27_Test M->T Retrieval (per 32 samples): R@1=0.764, R@2=0.882, R@3=0.934
Epoch_27_Test T->M Retrieval (per 32 samples): R@1=0.755, R@2=0.891, R@3=0.939
Model saved: clip_motion_align_epoch_CMP_27.pt


Epoch 28/300: 100%|██████████| 177/177 [00:19<00:00,  9.10it/s, loss=0.2730]


Epoch [28/300] - Train Average Loss: 0.2365
[Validate at epoch 28] ...


Epoch_28_Test: 100%|██████████| 32/32 [00:01<00:00, 16.11it/s]


Epoch_28_Test Average Contrastive Loss: 0.6925
Epoch_28_Test M->T Retrieval (per 32 samples): R@1=0.767, R@2=0.896, R@3=0.942
Epoch_28_Test T->M Retrieval (per 32 samples): R@1=0.785, R@2=0.896, R@3=0.935
Model saved: clip_motion_align_epoch_CMP_28.pt


Epoch 29/300: 100%|██████████| 177/177 [00:19<00:00,  9.25it/s, loss=0.1926]


Epoch [29/300] - Train Average Loss: 0.2313
[Validate at epoch 29] ...


Epoch_29_Test: 100%|██████████| 32/32 [00:02<00:00, 15.66it/s]


Epoch_29_Test Average Contrastive Loss: 0.6724
Epoch_29_Test M->T Retrieval (per 32 samples): R@1=0.772, R@2=0.909, R@3=0.944
Epoch_29_Test T->M Retrieval (per 32 samples): R@1=0.782, R@2=0.910, R@3=0.944
Model saved: clip_motion_align_epoch_CMP_29.pt


Epoch 30/300: 100%|██████████| 177/177 [00:18<00:00,  9.35it/s, loss=0.2306]


Epoch [30/300] - Train Average Loss: 0.2363
[Validate at epoch 30] ...


Epoch_30_Test: 100%|██████████| 32/32 [00:02<00:00, 15.85it/s]


Epoch_30_Test Average Contrastive Loss: 0.6933
Epoch_30_Test M->T Retrieval (per 32 samples): R@1=0.780, R@2=0.911, R@3=0.945
Epoch_30_Test T->M Retrieval (per 32 samples): R@1=0.776, R@2=0.905, R@3=0.941
Model saved: clip_motion_align_epoch_CMP_30.pt


Epoch 31/300: 100%|██████████| 177/177 [00:18<00:00,  9.32it/s, loss=0.3736]


Epoch [31/300] - Train Average Loss: 0.2347
[Validate at epoch 31] ...


Epoch_31_Test: 100%|██████████| 32/32 [00:02<00:00, 15.89it/s]


Epoch_31_Test Average Contrastive Loss: 0.6764
Epoch_31_Test M->T Retrieval (per 32 samples): R@1=0.776, R@2=0.910, R@3=0.945
Epoch_31_Test T->M Retrieval (per 32 samples): R@1=0.785, R@2=0.904, R@3=0.941
Model saved: clip_motion_align_epoch_CMP_31.pt


Epoch 32/300: 100%|██████████| 177/177 [00:19<00:00,  9.23it/s, loss=0.2284]


Epoch [32/300] - Train Average Loss: 0.2247
[Validate at epoch 32] ...


Epoch_32_Test: 100%|██████████| 32/32 [00:02<00:00, 15.75it/s]


Epoch_32_Test Average Contrastive Loss: 0.6729
Epoch_32_Test M->T Retrieval (per 32 samples): R@1=0.775, R@2=0.903, R@3=0.948
Epoch_32_Test T->M Retrieval (per 32 samples): R@1=0.792, R@2=0.905, R@3=0.945
Model saved: clip_motion_align_epoch_CMP_32.pt
Early stopping triggered!
Training completed!
