In [1]:
# Image Pipeline Test: Video → Face Extraction → Embeddings

# Test the complete image pipeline:
# 1. Load videos with CelebDFImageDataset
# 2. Extract middle frame faces (224x224)
# 3. Generate embeddings with ImageEncoder (ResNet50)
# 4. Validate output shapes and dimensions

In [2]:
# Step 1: Setup and imports
import sys
import torch
from torch.utils.data import DataLoader, Subset
from pathlib import Path

# Add src to path
sys.path.insert(0, str(Path.cwd()))

from loaders.image_loader import CelebDFImageDataset
from encoders.image_encoder import ImageEncoder

# Check device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

# Data path
data_root = Path(r"D:\florida_coursework\third_sem\multimedia_expert_systems\multimedia_prototype\data\celeb_df")
print(f"Data directory: {data_root}")
print(f"Exists: {data_root.exists()}")

Using device: cuda
Data directory: D:\florida_coursework\third_sem\multimedia_expert_systems\multimedia_prototype\data\celeb_df
Exists: True


In [3]:
# Step 2: Load full dataset and create small test subset
print("Loading dataset...")
dataset = CelebDFImageDataset(root_dir=data_root)

print(f"✓ Total videos in dataset: {len(dataset)}")
print(f"  Sample video paths: {dataset.video_paths[:3]}")
print(f"  Sample labels: {dataset.labels[:3]} (0=real, 1=fake)")

# # Create small test subset (10 videos)
# test_indices = list(range(500))
# dataset = Subset(full_dataset, test_indices)

# print(f"\n✓ Created test subset: {len(dataset)} videos")

Loading dataset...
✓ Total videos in dataset: 6529
  Sample video paths: ['D:\\florida_coursework\\third_sem\\multimedia_expert_systems\\multimedia_prototype\\data\\celeb_df\\Celeb-real\\id0_0000.mp4', 'D:\\florida_coursework\\third_sem\\multimedia_expert_systems\\multimedia_prototype\\data\\celeb_df\\Celeb-real\\id0_0001.mp4', 'D:\\florida_coursework\\third_sem\\multimedia_expert_systems\\multimedia_prototype\\data\\celeb_df\\Celeb-real\\id0_0002.mp4']
  Sample labels: [0, 0, 0] (0=real, 1=fake)


In [4]:
# Step 3: Test single sample - face extraction and preprocessing
print("Testing single sample face extraction...")
sample_img, sample_label = dataset[0]

print(f"✓ Extracted face tensor: {sample_img.shape}")
print(f"  Expected shape: (3, 224, 224) - RGB image")
print(f"  Data range: [{sample_img.min():.3f}, {sample_img.max():.3f}]")
print(f"  Label: {sample_label.item()} ({'real' if sample_label.item() == 0 else 'fake'})")

# Check if face was detected (non-zero tensor)
if sample_img.sum() == 0:
    print("⚠ Warning: Face not detected in this video (zeros returned)")
else:
    print("✓ Face successfully extracted and preprocessed!")

Testing single sample face extraction...
✓ Extracted face tensor: torch.Size([3, 224, 224])
  Expected shape: (3, 224, 224) - RGB image
  Data range: [0.000, 0.925]
  Label: 0 (real)
✓ Face successfully extracted and preprocessed!


In [5]:
# Step 4: Create DataLoader for batch processing
print("Creating DataLoader...")
test_loader = DataLoader(
    dataset,
    batch_size=4,
    shuffle=False,
    num_workers=0  # Use 0 for Windows to avoid multiprocessing issues
)

print(f"✓ Created DataLoader with {len(test_loader)} batches")

# Test one batch
batch_imgs, batch_labels = next(iter(test_loader))
print(f"\nBatch content:")
print(f"  - Images: {batch_imgs.shape}")
print(f"  - Labels: {batch_labels.shape}, values: {batch_labels.tolist()}")

Creating DataLoader...
✓ Created DataLoader with 1633 batches

Batch content:
  - Images: torch.Size([4, 3, 224, 224])
  - Labels: torch.Size([4]), values: [0, 0, 0, 0]


In [6]:
# Step 5: Initialize ImageEncoder
print("Loading ImageEncoder (ResNet50)...")
embed_dim = 512  # Match audio encoder output for multimodal fusion

encoder = ImageEncoder(embed_dim=embed_dim).to(device)
encoder.eval()  # Set to evaluation mode

print(f"✓ ImageEncoder loaded on {device}")
print(f"  Backbone: ResNet50 (pretrained on ImageNet)")
print(f"  Output dimension: {embed_dim}")
print(f"  Total parameters: {sum(p.numel() for p in encoder.parameters()):,}")

Loading ImageEncoder (ResNet50)...
✓ ImageEncoder loaded on cuda
  Backbone: ResNet50 (pretrained on ImageNet)
  Output dimension: 512
  Total parameters: 24,558,144


In [7]:
# Step 6: Generate embeddings for all test batches
print("Generating image embeddings...")
all_embeddings = []
all_labels = []

with torch.no_grad():
    for i, (imgs, labels) in enumerate(test_loader):
        # Move batch to device
        imgs = imgs.to(device)
        
        # Generate embeddings
        embeddings = encoder(imgs)
        
        # Store results
        all_embeddings.append(embeddings.cpu())
        all_labels.append(labels)
        
        print(f"  Batch {i+1}/{len(test_loader)}: "
              f"images {imgs.shape} → embeddings {embeddings.shape}")

# Concatenate all batches
all_embeddings = torch.cat(all_embeddings, dim=0)
all_labels = torch.cat(all_labels, dim=0)

print(f"\n✓ Final Results:")
print(f"  Total embeddings: {all_embeddings.shape}")
print(f"  Total labels: {all_labels.shape}")
print(f"  Labels distribution: real={(all_labels==0).sum().item()}, fake={(all_labels==1).sum().item()}")
print(f"\n🎉 Image embeddings ready for multimodal fusion!")

Generating image embeddings...
  Batch 1/1633: images torch.Size([4, 3, 224, 224]) → embeddings torch.Size([4, 512])
  Batch 2/1633: images torch.Size([4, 3, 224, 224]) → embeddings torch.Size([4, 512])
  Batch 3/1633: images torch.Size([4, 3, 224, 224]) → embeddings torch.Size([4, 512])
  Batch 4/1633: images torch.Size([4, 3, 224, 224]) → embeddings torch.Size([4, 512])
  Batch 5/1633: images torch.Size([4, 3, 224, 224]) → embeddings torch.Size([4, 512])
  Batch 6/1633: images torch.Size([4, 3, 224, 224]) → embeddings torch.Size([4, 512])
  Batch 7/1633: images torch.Size([4, 3, 224, 224]) → embeddings torch.Size([4, 512])
  Batch 8/1633: images torch.Size([4, 3, 224, 224]) → embeddings torch.Size([4, 512])
  Batch 9/1633: images torch.Size([4, 3, 224, 224]) → embeddings torch.Size([4, 512])
  Batch 10/1633: images torch.Size([4, 3, 224, 224]) → embeddings torch.Size([4, 512])
  Batch 11/1633: images torch.Size([4, 3, 224, 224]) → embeddings torch.Size([4, 512])
  Batch 12/1633: imag

In [8]:
save_path = "embeddings/image_embeddings.pt"
torch.save({
    "embeddings": all_embeddings,   # (N, D)
    "labels": all_labels            # (N,)
}, save_path)

print("Saved:", save_path)


Saved: embeddings/image_embeddings.pt


In [9]:
data = torch.load("embeddings/image_embeddings.pt")
image_embeddings = data["embeddings"]    # torch.Tensor
image_labels = data["labels"]            # torch.Tensor
image_embeddings, image_labels

(tensor([[ 1.2190,  0.3257, -0.9558,  ...,  0.1115, -0.5676, -0.0800],
         [ 0.5036,  0.9678, -0.3484,  ...,  0.1990, -0.5880, -0.0583],
         [ 1.2167,  1.2103, -1.1854,  ..., -0.5021, -0.5036, -0.0161],
         ...,
         [ 1.4533,  0.5663, -1.7211,  ...,  0.2717, -0.2635,  0.8494],
         [ 1.3611,  1.0721, -1.4963,  ..., -0.2073, -0.7521,  0.3623],
         [ 1.9750,  0.9022, -1.9226,  ..., -0.2053, -0.7498,  0.1047]]),
 tensor([0, 0, 0,  ..., 1, 1, 1]))