In [2]:
# # Fix torchcodec FFmpeg DLL loading issue
# import os
# import sys

# # Add FFmpeg bin directory to PATH so torchcodec can find the DLLs
# ffmpeg_path = r"C:\ProgramData\chocolatey\lib\ffmpeg\tools\ffmpeg\bin"
# if os.path.exists(ffmpeg_path):
#     os.environ['PATH'] = ffmpeg_path + os.pathsep + os.environ.get('PATH', '')
#     os.add_dll_directory(ffmpeg_path)
#     print(f"✓ Added FFmpeg DLLs from: {ffmpeg_path}")
# else:
#     print(f"⚠ FFmpeg bin directory not found at: {ffmpeg_path}")
#     print("Please check your FFmpeg installation location")

In [8]:
from loaders.audio_loader import AudioLoader

# 1. Load data: point the loader at the for-norm audio directory and let it
#    scan and register the train / validation / test splits.
loader = AudioLoader(
    data_dir="../../data/audio/for-norm/for-norm",
)
dataset = loader.load()

Scanning train split...
  Found 26927 fake files
  Found 26941 real files
  Registered 53868 examples
Scanning validation split...
  Found 5398 fake files
  Found 5400 real files
  Registered 10798 examples
Scanning test split...
  Found 2370 fake files
  Found 2264 real files
  Registered 4634 examples


In [9]:
# Display the loaded DatasetDict: one Dataset per split (train, validation,
# test), each exposing the 'audio_path' and 'label' features.
dataset

DatasetDict({
    train: Dataset({
        features: ['audio_path', 'label'],
        num_rows: 53868
    })
    validation: Dataset({
        features: ['audio_path', 'label'],
        num_rows: 10798
    })
    test: Dataset({
        features: ['audio_path', 'label'],
        num_rows: 4634
    })
})

In [10]:
from preprocessing.audio_preprocessor import AudioPreprocessor

# 2. Preprocess: apply the preprocessor to the whole training split in
#    batched mode.
# NOTE(review): the recorded run of this cell printed numpy warning fragments
# from a mean/variance normalisation step — possibly some clips are empty;
# confirm before training on the result.
preprocessor = AudioPreprocessor(max_duration=5.0)
encoded = dataset["train"].map(
    preprocessor,
    batched=True,
)


Map:   0%|          | 0/53868 [00:00<?, ? examples/s]

  normed_input_values = [(x - x.mean()) / np.sqrt(x.var() + 1e-7) for x in input_values]
  ret = ret.dtype.type(ret / rcount)
  normed_input_values = [(x - x.mean()) / np.sqrt(x.var() + 1e-7) for x in input_values]
  arrmean = um.true_divide(arrmean, div, out=arrmean,
  ret = ret.dtype.type(ret / rcount)


In [6]:
# Inspect the first encoded training example ('audio_path', 'label',
# 'input_values').
# NOTE(review): this renders the entire 'input_values' float list; slicing it
# before display would keep the cell output readable.
encoded[0]

{'audio_path': '..\\..\\data\\audio\\for-norm\\for-norm\\training\\fake\\file1000.mp3.wav_16k.wav_norm.wav_mono.wav_silence.wav',
 'label': 'fake',
 'input_values': [0.49110135436058044,
  0.5426614880561829,
  0.5924965739250183,
  0.6185641288757324,
  0.6360064148902893,
  0.6427149772644043,
  0.6329396367073059,
  0.6304478645324707,
  0.6450150609016418,
  0.6476984620094299,
  0.6285311579704285,
  0.6126222610473633,
  0.6005468368530273,
  0.5702624320983887,
  0.5298193693161011,
  0.4999183416366577,
  0.47308406233787537,
  0.44509977102279663,
  0.43014925718307495,
  0.4023566246032715,
  0.32127878069877625,
  0.23579247295856476,
  0.23157565295696259,
  0.2643517851829529,
  0.21336668729782104,
  0.09874605387449265,
  0.05504452809691429,
  0.08475389331579208,
  0.05561954900622368,
  -0.051525842398405075,
  -0.10404432564973831,
  -0.058234408497810364,
  -0.037150342017412186,
  -0.1251283884048462,
  -0.23284882307052612,
  -0.24415753781795502,
  -0.18818892538

In [7]:
# Smoke test: work on the first 10 encoded examples before using the full set.
subset_indices = range(10)
small_encoded = encoded.select(subset_indices)
first_sample = small_encoded[0]

print(f"Small test dataset: {len(small_encoded)} samples")
print(f"Columns: {small_encoded.column_names}")
print(f"\nFirst sample keys: {first_sample.keys()}")
# The "shape" reported here is the length of the input_values list.
print(f"Input values shape: {len(first_sample['input_values'])}")

Small test dataset: 10 samples
Columns: ['audio_path', 'label', 'input_values']

First sample keys: dict_keys(['audio_path', 'label', 'input_values'])
Input values shape: 19752


In [8]:
# Test DataLoader with small subset
from torch.utils.data import DataLoader

# Batches of 4 over the 10-sample subset, collated by the preprocessor's own
# collate function.
# NOTE(review): the recorded run of this cell stopped with
# "ValueError: too many dimensions 'str'" while fetching the first batch —
# presumably the collate function received string labels; verify against the
# preprocessor module.
test_loader = DataLoader(small_encoded, batch_size=4, collate_fn=preprocessor.collate_fn)

print("\n--- Testing DataLoader ---")
print(f"Number of batches: {len(test_loader)}")

# Pull a single batch and report what the collate function produced
batch_iterator = iter(test_loader)
batch = next(batch_iterator)
print(f"\nFirst batch keys: {batch.keys()}")
print(f"Input values shape: {batch['input_values'].shape}")
print(f"Attention mask shape: {batch['attention_mask'].shape}")
print(f"Labels shape: {batch['labels'].shape}")
print(f"Labels: {batch['labels']}")


--- Testing DataLoader ---
Number of batches: 3


ValueError: too many dimensions 'str'

In [None]:
# Walk through every batch of the test loader and report its tensor shapes
print("\n--- Iterating through all batches ---")
for batch_number, batch in enumerate(test_loader, start=1):
    summary = (
        f"Batch {batch_number}: input_values={batch['input_values'].shape}, "
        f"attention_mask={batch['attention_mask'].shape}, "
        f"labels={batch['labels'].shape}"
    )
    print(summary)

## Test with Full Training DataLoader

Once the test passes, create the full training DataLoader:

## End-to-End Test: Small Dataset → Embeddings

Test the complete pipeline with 20 samples to verify everything works:

In [1]:
from loaders.audio_loader import AudioLoader

# 1. Load data: build the loader for the for-norm audio directory, then load
#    its splits into `dataset`.
loader = AudioLoader(
    data_dir="../../data/audio/for-norm/for-norm",
)
dataset = loader.load()

Scanning train split...
  Found 26927 fake files
  Found 26941 real files
  Registered 53868 examples
Scanning validation split...
  Found 5398 fake files
  Found 5400 real files
  Registered 10798 examples
Scanning test split...
  Found 2370 fake files
  Found 2264 real files
  Registered 4634 examples


In [2]:
# Step 1: Create small test dataset (20 samples)
# NOTE(review): in the recorded output the leading rows of the train split are
# all 'fake', so this subset exercises the pipeline but is not class-balanced.
test_size = 20
small_dataset = dataset["train"].select(range(test_size))

print(f"Test dataset size: {len(small_dataset)} samples")
# Show first 10 labels
print(f"Labels: {small_dataset['label'][:10]}")

Test dataset size: 20 samples
Labels: ['fake', 'fake', 'fake', 'fake', 'fake', 'fake', 'fake', 'fake', 'fake', 'fake']


In [3]:
from preprocessing.audio_preprocessor import AudioPreprocessor

# 2. Preprocess: configure the preprocessor that later cells use both as the
#    map function and as the source of the DataLoader collate function.
preprocessor = AudioPreprocessor(
    max_duration=5.0,
)




In [4]:
# Step 2: Preprocess the small dataset
# Runs the preprocessor over the 20-sample subset in batched mode (10 examples
# per map batch) and reports the resulting columns.
print("Preprocessing...")
small_encoded = small_dataset.map(preprocessor, batched=True, batch_size=10)
# The status glyph was stored mis-encoded (UTF-8 bytes read as cp1252);
# restored to the intended check mark.
print(f"✓ Preprocessed {len(small_encoded)} samples")
print(f"Columns: {small_encoded.column_names}")
print(f"First sample input_values length: {len(small_encoded[0]['input_values'])}")

Preprocessing...


Map:   0%|          | 0/20 [00:00<?, ? examples/s]

✓ Preprocessed 20 samples
Columns: ['audio_path', 'label', 'input_values']
First sample input_values length: 19752


In [5]:
# Step 3: Create DataLoader
from torch.utils.data import DataLoader

# Batches of 8 in dataset order, collated by the preprocessor's collate
# function.
test_dataloader = DataLoader(
    small_encoded,
    batch_size=8,
    collate_fn=preprocessor.collate_fn,
    shuffle=False,
)

# The status glyph was stored mis-encoded (UTF-8 bytes read as cp1252);
# restored to the intended check mark.
print(f"✓ Created DataLoader with {len(test_dataloader)} batches")
# Report the loader's actual batch size rather than repeating the literal, so
# the message cannot drift from the configuration above.
print(f"Batch size: {test_dataloader.batch_size}")

# Test one batch
batch = next(iter(test_dataloader))
print("\nBatch content:")
print(f"  - input_values: {batch['input_values'].shape}")
print(f"  - attention_mask: {batch['attention_mask'].shape}")
print(f"  - labels: {batch['labels'].shape}, values: {batch['labels']}")

✓ Created DataLoader with 3 batches
Batch size: 8

Batch content:
  - input_values: torch.Size([8, 34902])
  - attention_mask: torch.Size([8, 34902])
  - labels: torch.Size([8]), values: tensor([0, 0, 0, 0, 0, 0, 0, 0])


In [6]:
# Step 4: Load AudioEncoder and generate embeddings
import torch
from models.audio_encoder import AudioEncoder

# Check device: use the GPU when CUDA is available, otherwise fall back to CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Create encoder from the wav2vec2-base checkpoint with a 512-dim projection
# and move it to the selected device
encoder = AudioEncoder(
    model_checkpoint="facebook/wav2vec2-base",
    projection_dim=512,
    freeze_feature_extractor=True,
).to(device)

# The status glyph was stored mis-encoded (UTF-8 bytes read as cp1252);
# restored to the intended check mark.
print(f"✓ AudioEncoder created with output dim: {encoder.out_dim}")

Using device: cuda




✓ AudioEncoder created with output dim: 512


In [7]:
# Step 5: Generate embeddings for all test batches
# Runs the encoder over every batch of test_dataloader without gradient
# tracking, collects the per-batch embeddings and labels, and concatenates
# them into single tensors.
encoder.eval()  # Set to evaluation mode

all_embeddings = []
all_labels = []

print("Generating embeddings...")
with torch.no_grad():
    for i, batch in enumerate(test_dataloader):
        # Move model inputs to the encoder's device; labels are kept as
        # returned by the DataLoader
        input_values = batch['input_values'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels']

        # Generate embeddings
        embeddings = encoder(input_values, attention_mask)

        # Store results (embeddings are moved back to CPU before collecting)
        all_embeddings.append(embeddings.cpu())
        all_labels.append(labels)

        # The arrow was stored mis-encoded (UTF-8 bytes read as cp1252);
        # restored to the intended character.
        print(f"  Batch {i+1}/{len(test_dataloader)}: "
              f"input {input_values.shape} → embeddings {embeddings.shape}")

# Concatenate all batches along the batch dimension
all_embeddings = torch.cat(all_embeddings, dim=0)
all_labels = torch.cat(all_labels, dim=0)

# The check mark and emoji below were likewise mis-encoded and are restored.
print(f"\n✓ Final Results:")
print(f"  Total embeddings: {all_embeddings.shape}")
print(f"  Total labels: {all_labels.shape}")
# The summary counts label id 0 as fake and label id 1 as real
print(f"  Labels distribution: fake={(all_labels==0).sum().item()}, real={(all_labels==1).sum().item()}")
print(f"\nEmbeddings ready for multimodal fusion! 🎉")

Generating embeddings...
  Batch 1/3: input torch.Size([8, 34902]) → embeddings torch.Size([8, 512])
  Batch 2/3: input torch.Size([8, 43651]) → embeddings torch.Size([8, 512])
  Batch 3/3: input torch.Size([4, 36526]) → embeddings torch.Size([4, 512])

✓ Final Results:
  Total embeddings: torch.Size([20, 512])
  Total labels: torch.Size([20])
  Labels distribution: fake=20, real=0

Embeddings ready for multimodal fusion! 🎉


In [None]:

# # 3. Create DataLoader
# train_loader = DataLoader(
#     encoded,
#     batch_size=4,
#     collate_fn=preprocessor.collate_fn
# )

# # 4. Encode to embeddings
# encoder = AudioEncoder(projection_dim=512).cuda()
# batch = next(iter(train_loader))
# embeddings = encoder(batch['input_values'], batch['attention_mask'])
# # Output: (B, 512) ready for fusion!