# ResNet

In [9]:
# used for data loading
from utils.data_loader import DataLoader

# device configuration
from utils.device import try_gpu

# resnet18 frame feature extractor
from models.resnet18 import FrameFeatureExtractor

# video recognition model
from models.lstm import VideoRecognitionModel

# tensor library
import torch

In [10]:
# When running on AI Studio, let the project's Paths helper modify its root
# directory configuration.
# NOTE(review): the check below is true on every POSIX system (Linux, macOS),
# not only on AI Studio — confirm that is the intended trigger.
import os

running_on_posix = os.name == 'posix'
if running_on_posix:
    from config.paths import Paths

    Paths.modify_root_dir()

In [11]:
# Select which part of the notebook is exercised by the cells below:
#   1 -> test model forward pass
#   2 -> inspect multi-modal data loader
phase = 2

# Device handle produced by the project's try_gpu() helper.
# presumably a GPU when one is available — verify against utils.device
device = try_gpu()

In [12]:
# Project data loader shared by the phase-1 and phase-2 cells.
data_loader = DataLoader(num_samples=162)

if phase == 1:
    # Single-modality iterator over the 'train' split, requesting the 'depth'
    # modality with frame extension 'png'.
    train_iter = data_loader.load_dataiter('train', modality='depth', frame_ext='png')

    # Look at the first batch only: the loop body always ends in `break`.
    for batch in train_iter:
        if not isinstance(batch, dict):
            # Unknown container: report its type, then describe each element.
            print('batch type:', type(batch))
            try:
                for idx, item in enumerate(batch):
                    if not hasattr(item, 'shape'):
                        print(f'elem[{idx}] type:', type(item))
                    else:
                        print(f'elem[{idx}] shape:', item.shape)
            except Exception as err:
                # Best-effort inspection: report the problem instead of aborting the cell.
                print('Error:', err)
        else:
            # NOTE(review): these keys are singular ('label', 'length') while the
            # multi-modal cell reads 'labels'/'lengths' — confirm against the collate fn.
            print('frames shape:', batch['frames'].shape)
            print('labels:', batch['label'])
            print('lengths:', batch['length'])
        break

In [13]:
if phase == 1:
    # Smoke-test the frame feature extractor's forward pass on random input.
    # B, T, H, W are also read by the depth test in the following cell.
    B, T, H, W = 2, 8, 224, 224

    # 1) Test RGB (C=3)
    model_rgb = FrameFeatureExtractor(modality='rgb', pretrained=False)
    model_rgb = model_rgb.to(device)

    rgb_input_shape = (B, T, 3, H, W)
    x_rgb = torch.randn(rgb_input_shape, device=device)

    out_rgb = model_rgb(x_rgb)
    print('RGB out shape:', out_rgb.shape)  # expected (B, T, 512)

In [14]:
if phase == 1:
    # 2) Test single-channel depth/infrared input (C=1).
    # B, T, H, W come from the RGB test cell above (only set when phase == 1).
    model_depth = FrameFeatureExtractor(modality='depth', pretrained=False)
    model_depth = model_depth.to(device)

    depth_input_shape = (B, T, 1, H, W)
    x_depth = torch.randn(depth_input_shape, device=device)

    out_depth = model_depth(x_depth)
    print('Depth out shape:', out_depth.shape)  # expected (B, T, 512)

In [15]:
if phase == 2:
    # Inspect the three modalities together: pull one batch from the
    # multi-modal training iterator and print what it contains.
    train_iter = data_loader.load_multi_modal_dataiter(set='train')
    batch = next(iter(train_iter))

    # Per-modality tensors; laid out as (B, T, C, H, W) per the original notes.
    shape_reports = (
        ('rgb shape:', 'rgb'),
        ('depth shape:', 'depth'),
        ('infrared shape:', 'infrared'),
    )
    for caption, key in shape_reports:
        print(caption, batch[key].shape)

    # Per-sample metadata is printed as-is.
    value_reports = (
        ('lengths:', 'lengths'),
        ('labels:', 'labels'),
    )
    for caption, key in value_reports:
        print(caption, batch[key])

rgb shape: torch.Size([4, 128, 3, 224, 224])
depth shape: torch.Size([4, 128, 1, 224, 224])
infrared shape: torch.Size([4, 128, 1, 224, 224])
lengths: tensor([128, 128, 128, 128])
labels: tensor([10, 10,  0, 17])


In [17]:
if phase == 2:
    # Move the three modality tensors and the labels onto the selected device.
    rgb = batch['rgb'].to(device)
    depth = batch['depth'].to(device)
    infrared = batch['infrared'].to(device)
    # Lengths are deliberately left where they are: pack_padded_sequence needs
    # them on the CPU (the collate function already returns a CPU tensor).
    lengths = batch['lengths']
    labels = batch['labels'].to(device)

    # Test the model forward pass for multi-modal inputs.
    # NOTE(review): num_frames=8 here, while the batch printed by the previous
    # cell carries 128 frames per clip — confirm this is intended.
    model = VideoRecognitionModel(num_classes=20, num_frames=8).to(device)

    # This is a shape check only, so run it without autograd: otherwise every
    # intermediate activation of the whole batch is retained for a backward
    # pass that never happens.
    with torch.no_grad():
        # If your forward supports lengths, pass lengths=lengths.
        logits = model(rgb, depth, infrared, lengths=lengths)
    print('logits shape:', logits.shape)  # expected (B, num_classes)

logits shape: torch.Size([4, 20])
