In [1]:
# Video Pipeline Test: Video → Frame Sampling → Temporal Embeddings

# Test the complete video pipeline:
# 1. Load videos with CelebDFVisualDataset
# 2. Extract 8 uniformly-sampled frames with face detection (224x224)
# 3. Generate temporal embeddings with VisualEncoder (Xception + LSTM + Attention)
# 4. Validate output shapes and dimensions

In [2]:
# Step 1: Setup and imports
import sys
import torch
from torch.utils.data import DataLoader, Subset
from pathlib import Path

# Add src to path
src_path = Path(r"D:\florida_coursework\third_sem\multimedia_expert_systems\multimedia_prototype\mutlimedia_mvp\src")
sys.path.insert(0, str(src_path))

from loaders.visual_loader import CelebDFVisualDataset
from encoders.visual_encoder import VisualEncoder

# Check device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

# Data path
data_root = Path(r"D:\florida_coursework\third_sem\multimedia_expert_systems\multimedia_prototype\data\celeb_df")
print(f"Data directory: {data_root}")
print(f"Exists: {data_root.exists()}")

Using device: cuda
Data directory: D:\florida_coursework\third_sem\multimedia_expert_systems\multimedia_prototype\data\celeb_df
Exists: True


In [3]:
# Step 2: Load full dataset and create small test subset
print("Loading video dataset...")
frames_per_video = 8  # Number of frames to extract per video

dataset = CelebDFVisualDataset(
    root_dir=data_root,
    frames_per_video=frames_per_video
)


# Create small test subset (5 videos - face detection is slow)
# test_indices = list(range(5))
# dataset = Subset(dataset, test_indices)

# print(f"\n✓ Created test subset: {len(dataset)} videos")


print(f"✓ Total videos in dataset: {len(dataset)}")
print(f"  Sample video paths: {dataset.video_paths[:3]}")
print(f"  Sample labels: {dataset.labels[:3]} (0=real, 1=fake)")
print(f"  Frames per video: {frames_per_video}")

Loading video dataset...
✓ Total videos in dataset: 6529
  Sample video paths: ['D:\\florida_coursework\\third_sem\\multimedia_expert_systems\\multimedia_prototype\\data\\celeb_df\\Celeb-real\\id0_0000.mp4', 'D:\\florida_coursework\\third_sem\\multimedia_expert_systems\\multimedia_prototype\\data\\celeb_df\\Celeb-real\\id0_0001.mp4', 'D:\\florida_coursework\\third_sem\\multimedia_expert_systems\\multimedia_prototype\\data\\celeb_df\\Celeb-real\\id0_0002.mp4']
  Sample labels: [0, 0, 0] (0=real, 1=fake)
  Frames per video: 8


In [4]:
# Step 3: Test single sample - frame extraction and preprocessing
print("Testing single sample frame extraction...")
print("⏳ This may take 10-30 seconds (face detection on 8 frames)...\n")

sample_frames, sample_label = dataset[0]

print(f"✓ Extracted video frames: {sample_frames.shape}")
print(f"  Expected shape: ({frames_per_video}, 3, 224, 224) - 8 RGB frames")
print(f"  Data range: [{sample_frames.min():.3f}, {sample_frames.max():.3f}]")
print(f"  Label: {sample_label.item()} ({'real' if sample_label.item() == 0 else 'fake'})")

# Check if faces were detected (non-zero tensors)
non_zero_frames = (sample_frames.sum(dim=(1,2,3)) > 0).sum().item()
print(f"  Frames with detected faces: {non_zero_frames}/{frames_per_video}")

if non_zero_frames == 0:
    print("⚠ Warning: No faces detected in this video (all zeros)")
elif non_zero_frames < frames_per_video:
    print(f"⚠ Warning: Only {non_zero_frames} frames had detected faces (padded with zeros)")
else:
    print("✓ All frames successfully extracted with faces!")

Testing single sample frame extraction...
⏳ This may take 10-30 seconds (face detection on 8 frames)...

✓ Extracted video frames: torch.Size([8, 3, 224, 224])
  Expected shape: (8, 3, 224, 224) - 8 RGB frames
  Data range: [0.000, 0.937]
  Label: 0 (real)
  Frames with detected faces: 8/8
✓ All frames successfully extracted with faces!


In [5]:
# Step 4: Create DataLoader for batch processing
print("Creating DataLoader...")
test_loader = DataLoader(
    dataset,
    batch_size=2,  # Small batch due to video memory requirements
    shuffle=False,
    num_workers=0  # Use 0 for Windows to avoid multiprocessing issues
)

print(f"✓ Created DataLoader with {len(test_loader)} batches")

# Test one batch (this will take time due to face detection)
print("\n⏳ Loading one batch (may take 20-60 seconds)...")
batch_frames, batch_labels = next(iter(test_loader))

print(f"\nBatch content:")
print(f"  - Frames: {batch_frames.shape}")
print(f"  - Labels: {batch_labels.shape}, values: {batch_labels.tolist()}")
print(f"  - Memory usage: ~{batch_frames.element_size() * batch_frames.nelement() / 1024**2:.2f} MB")

Creating DataLoader...
✓ Created DataLoader with 3265 batches

⏳ Loading one batch (may take 20-60 seconds)...

Batch content:
  - Frames: torch.Size([2, 8, 3, 224, 224])
  - Labels: torch.Size([2]), values: [0, 0]
  - Memory usage: ~9.19 MB


In [6]:
# Step 5: Initialize VisualEncoder
print("Loading VisualEncoder (Xception + LSTM + Attention)...")

# Configure to match audio/image encoders
lstm_hidden = 256  # LSTM hidden size
embed_dim = lstm_hidden * 2  # 512 (bidirectional LSTM output)

encoder = VisualEncoder(
    backbone_name="xception",
    pretrained=True,
    lstm_hidden=lstm_hidden,
    lstm_layers=1,
    bidirectional=True,
    dropout=0.3,
    frame_chunk_size=32
).to(device)

encoder.eval()  # Set to evaluation mode

print(f"✓ VisualEncoder loaded on {device}")
print(f"  Backbone: Xception (pretrained on ImageNet)")
print(f"  Frame feature dim: {encoder.frame_feat_dim}")
print(f"  LSTM hidden: {lstm_hidden} (bidirectional)")
print(f"  Output dimension: {encoder.get_out_dim()}")
print(f"  Total parameters: {sum(p.numel() for p in encoder.parameters()):,}")

Loading VisualEncoder (Xception + LSTM + Attention)...


  model = create_fn(


✓ VisualEncoder loaded on cuda
  Backbone: Xception (pretrained on ImageNet)
  Frame feature dim: 2048
  LSTM hidden: 256 (bidirectional)
  Output dimension: 512
  Total parameters: 25,661,225


In [7]:
# Step 6: Generate temporal embeddings for all test batches
print("Generating video embeddings...")
print("⏳ This will take a few minutes (frame extraction + encoding)...\n")

all_embeddings = []
all_labels = []

with torch.no_grad():
    for i, (frames, labels) in enumerate(test_loader):
        print(f"  Processing batch {i+1}/{len(test_loader)}...")
        
        # Move batch to device
        frames = frames.to(device)  # (B, T, C, H, W)
        
        # Generate temporal embeddings
        embeddings = encoder(frames)
        
        # Store results
        all_embeddings.append(embeddings.cpu())
        all_labels.append(labels)
        
        print(f"    Input: {frames.shape} → Embeddings: {embeddings.shape}")

# Concatenate all batches
all_embeddings = torch.cat(all_embeddings, dim=0)
all_labels = torch.cat(all_labels, dim=0)

print(f"\n✓ Final Results:")
print(f"  Total embeddings: {all_embeddings.shape}")
print(f"  Total labels: {all_labels.shape}")
print(f"  Labels distribution: real={(all_labels==0).sum().item()}, fake={(all_labels==1).sum().item()}")
print(f"\n🎉 Video embeddings ready for multimodal fusion!")

Generating video embeddings...
⏳ This will take a few minutes (frame extraction + encoding)...

  Processing batch 1/3265...
    Input: torch.Size([2, 8, 3, 224, 224]) → Embeddings: torch.Size([2, 512])
  Processing batch 2/3265...
    Input: torch.Size([2, 8, 3, 224, 224]) → Embeddings: torch.Size([2, 512])
  Processing batch 3/3265...
    Input: torch.Size([2, 8, 3, 224, 224]) → Embeddings: torch.Size([2, 512])
  Processing batch 4/3265...
    Input: torch.Size([2, 8, 3, 224, 224]) → Embeddings: torch.Size([2, 512])
  Processing batch 5/3265...
    Input: torch.Size([2, 8, 3, 224, 224]) → Embeddings: torch.Size([2, 512])
  Processing batch 6/3265...
    Input: torch.Size([2, 8, 3, 224, 224]) → Embeddings: torch.Size([2, 512])
  Processing batch 7/3265...
    Input: torch.Size([2, 8, 3, 224, 224]) → Embeddings: torch.Size([2, 512])
  Processing batch 8/3265...
    Input: torch.Size([2, 8, 3, 224, 224]) → Embeddings: torch.Size([2, 512])
  Processing batch 9/3265...
    Input: torch.Si

In [8]:
save_path = "embeddings/video_embeddings.pt"
torch.save({
    "embeddings": all_embeddings,   # (N, D)
    "labels": all_labels            # (N,)
}, save_path)

print("Saved:", save_path)

Saved: embeddings/video_embeddings.pt


In [10]:
data = torch.load("embeddings/video_embeddings.pt")
video_embeddings = data["embeddings"]    # torch.Tensor
video_labels = data["labels"]            # torch.Tensor
video_embeddings, video_labels

(tensor([[-0.0239,  0.0959, -0.0075,  ..., -0.1034,  0.1031,  0.1394],
         [ 0.0005,  0.1355, -0.0237,  ..., -0.1124, -0.0067,  0.0664],
         [ 0.0802,  0.1241, -0.0864,  ...,  0.0914,  0.0502,  0.0687],
         ...,
         [ 0.0071,  0.0323,  0.0616,  ...,  0.0188,  0.0628,  0.1270],
         [-0.0028,  0.2159,  0.1033,  ...,  0.0776, -0.0029,  0.1234],
         [-0.0195,  0.1740,  0.0761,  ..., -0.0966,  0.0230,  0.0768]]),
 tensor([0, 0, 0,  ..., 1, 1, 1]))