In [2]:
import sys
import torch
from torch.utils.data import DataLoader, Subset
from dotenv import load_dotenv
import os
from pathlib import Path

# Pull DATA_ROOT / SRC_ROOT (and anything else) from a local .env file, if present.
load_dotenv()

# Check device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

# Load paths from environment variables
data_root = Path(os.getenv("DATA_ROOT", "./data"))  # default fallback to ./data
# NOTE: despite the variable name, this sub-path points at the audio dataset.
image_data_path = "audio/for-norm/for-norm/"
data_root = data_root / image_data_path

print(f"Data directory: {data_root}")
print(f"Exists: {data_root.exists()}")

# Add src to path.
# SRC_ROOT lets other machines point at their own checkout; when it is unset
# the original absolute path is used, so existing setups behave as before.
src_path = Path(os.getenv(
    "SRC_ROOT",
    r"D:\florida_coursework\third_sem\multimedia_expert_systems\multimedia_prototype\mutlimedia_mvp\src",
))
# Keep the entry at the front of sys.path without stacking a duplicate each
# time this cell is re-run.
if str(src_path) in sys.path:
    sys.path.remove(str(src_path))
sys.path.insert(0, str(src_path))

# Project-local modules; resolvable only after src_path is on sys.path.
from loaders.visual_loader import CelebDFVisualDataset
from encoders.visual_encoder import VisualEncoder

Using device: cuda
Data directory: D:\florida_coursework\third_sem\multimedia_expert_systems\multimedia_prototype\data\audio\for-norm\for-norm
Exists: True


In [4]:
import librosa
import gc
import ast
import numpy as np
import pandas as pd
from pathlib import Path
# from tqdm import tqdm
import torchaudio  # torchaudio==2.9.0  torch==2.9.0 torchcodec==0.8
import sys
import datasets  # pip install datasets==3.6.0
from datasets import load_dataset
import evaluate
from transformers import AutoFeatureExtractor
from transformers import AutoModelForAudioClassification, TrainingArguments, Trainer

# Build the dataset with the "audiofolder" loader from the files under data_root.
full_dataset = load_dataset(path="audiofolder", data_dir=data_root)

Resolving data files:   0%|          | 0/53868 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/10798 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/4634 [00:00<?, ?it/s]

Downloading data:   0%|          | 0/53868 [00:00<?, ?files/s]

Computing checksums:  69%|######8   | 36998/53868 [00:05<00:02, 7398.20it/s]

Downloading data:   0%|          | 0/10798 [00:00<?, ?files/s]

Downloading data:   0%|          | 0/4634 [00:00<?, ?files/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

In [5]:
# Display the loaded DatasetDict (splits, feature names and row counts).
full_dataset

DatasetDict({
    train: Dataset({
        features: ['audio', 'label'],
        num_rows: 53868
    })
    validation: Dataset({
        features: ['audio', 'label'],
        num_rows: 10798
    })
    test: Dataset({
        features: ['audio', 'label'],
        num_rows: 4634
    })
})

In [6]:
from transformers import Wav2Vec2FeatureExtractor, Wav2Vec2Model

# Checkpoint shared by the feature extractor and the encoder.
model_name = "facebook/wav2vec2-base"

# Load the feature extractor first, then the encoder, from that same checkpoint.
feature_extractor, encoder = (
    component.from_pretrained(model_name)
    for component in (Wav2Vec2FeatureExtractor, Wav2Vec2Model)
)



In [7]:
max_duration = 5.0  # seconds; every clip is truncated or padded to exactly this length


def preprocess_function(examples):
    """Turn a batch of decoded audio into fixed-length feature-extractor inputs.

    Args:
        examples: Batch dict as passed by ``Dataset.map(batched=True)``.
            ``examples["audio"]`` is a list of decoded audio dicts (each with
            ``"array"`` and ``"sampling_rate"``) and ``examples["label"]`` is
            the matching list of labels.

    Returns:
        The feature extractor's output (requested with an attention mask, as
        PyTorch tensors), with the labels added under ``"labels"``.

    Raises:
        ValueError: If any clip's sampling rate differs from
            ``feature_extractor.sampling_rate``.
    """
    expected_rate = feature_extractor.sampling_rate

    # The extractor is told the rate it already expects, so its own mismatch
    # check can never fire. Validate the real rates here instead of silently
    # producing features from audio at the wrong rate.
    unexpected_rates = {clip["sampling_rate"] for clip in examples["audio"]} - {expected_rate}
    if unexpected_rates:
        raise ValueError(
            f"Audio sampled at {sorted(unexpected_rates)} Hz, but the feature extractor "
            f"expects {expected_rate} Hz. Resample first, e.g. "
            "dataset.cast_column('audio', datasets.Audio(sampling_rate=...))."
        )

    audio_arrays = [clip["array"] for clip in examples["audio"]]

    inputs = feature_extractor(
        audio_arrays,
        sampling_rate=expected_rate,
        # Fixed window length in samples: longer clips are cut, shorter ones padded.
        max_length=int(expected_rate * max_duration),
        truncation=True,
        padding="max_length",
        return_attention_mask=True,
        return_tensors="pt"
    )

    inputs["labels"] = examples["label"]
    return inputs



In [None]:
# from datasets import DatasetDict
# test_only = DatasetDict({
#     "test": full_dataset["test"]
# })


In [None]:
# Run the feature extractor over every split of the loaded DatasetDict.
# (`dataset` was never defined in this notebook; `full_dataset` is the object
# that holds the "train" and "test" splits the loaders index below.)
# The raw "audio" and "label" columns are dropped; preprocess_function
# re-emits the labels under "labels".
processed_dataset = full_dataset.map(
    preprocess_function,
    batched=True,
    remove_columns=["audio","label"],
)

In [None]:
from torch.utils.data import DataLoader
from transformers import default_data_collator

# One loader per split, both batching 8 examples through default_data_collator.
# Only the training split is shuffled.
train_loader, test_loader = (
    DataLoader(
        processed_dataset[split_name],
        batch_size=8,
        shuffle=(split_name == "train"),
        collate_fn=default_data_collator,
    )
    for split_name in ("train", "test")
)

In [None]:
import torch
import torch.nn as nn

class AudioEmbeddingModel(nn.Module):
    """Wrap a speech encoder and pool its frame outputs into one vector per clip.

    Args:
        encoder: Module called as ``encoder(input_values=..., attention_mask=...)``
            whose result exposes ``last_hidden_state`` of shape
            ``(batch, frames, hidden)``.
    """

    def __init__(self, encoder):
        super().__init__()
        self.encoder = encoder
        # Freeze the convolutional front-end when the encoder offers a hook for
        # it. `freeze_feature_encoder` is the current name; the older
        # `freeze_feature_extractor` is kept as a fallback.
        for freeze_name in ("freeze_feature_encoder", "freeze_feature_extractor"):
            if hasattr(self.encoder, freeze_name):
                getattr(self.encoder, freeze_name)()
                break

    def forward(self, input_values, attention_mask=None):
        """Return a ``(batch, hidden)`` embedding for each input clip.

        Args:
            input_values: Raw waveform batch, ``(batch, samples)``.
            attention_mask: Optional sample-level mask, ``(batch, samples)``,
                non-zero on real audio and zero on padding.

        Returns:
            Mean of the encoder's frame outputs. When a mask is given and the
            encoder can translate it to frame resolution, only valid frames
            are averaged; otherwise all frames are averaged.
        """
        outputs = self.encoder(
            input_values=input_values,
            attention_mask=attention_mask
        )

        hidden_states = outputs.last_hidden_state  # (B, T_hidden, H)

        # The encoder returns a hidden vector for every frame, padded ones
        # included, so a plain mean would dilute clips shorter than the window.
        # Fall back to it only when no frame-level mask can be built.
        to_frame_mask = getattr(self.encoder, "_get_feature_vector_attention_mask", None)
        if attention_mask is None or to_frame_mask is None:
            return hidden_states.mean(dim=1)  # (B, H)

        # Downsample the sample-level mask to one entry per encoder frame.
        frame_mask = to_frame_mask(hidden_states.shape[1], attention_mask)
        frame_mask = frame_mask.unsqueeze(-1).to(hidden_states.dtype)  # (B, T_hidden, 1)

        summed = (hidden_states * frame_mask).sum(dim=1)  # (B, H)
        # clamp guards against division by zero for a fully padded row.
        valid_frames = frame_mask.sum(dim=1).clamp(min=1.0)  # (B, 1)
        return summed / valid_frames


In [None]:
# Use the GPU when torch can see one, otherwise stay on the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Wrap the pretrained encoder, move it to the chosen device and switch it to
# inference mode. `.eval()` returns the module, so the cell displays it.
embedding_model = AudioEmbeddingModel(encoder).to(device)
embedding_model.eval()


In [None]:
# Per-batch results, concatenated in a later cell.
all_audio_embs, all_labels = [], []

embedding_model.eval()
with torch.no_grad():  # inference only, no autograd graph needed
    for batch in train_loader:
        # Inputs go to the model's device; labels are never moved.
        emb = embedding_model(
            batch["input_values"].to(device),
            batch["attention_mask"].to(device),
        )
        # Bring embeddings back to the CPU before storing them.
        all_audio_embs.append(emb.cpu())
        all_labels.append(batch["labels"])


In [None]:
# Collapse the per-batch lists into single tensors.
# The names are rebound from list to tensor, so the isinstance guard makes a
# second run of this cell a no-op instead of an error inside torch.cat.
if isinstance(all_audio_embs, list):
    all_audio_embs = torch.cat(all_audio_embs, dim=0)   # (N, H)
if isinstance(all_labels, list):
    all_labels = torch.cat(all_labels, dim=0)           # (N,)

In [None]:
save_path = "embeddings/audio_embeddings.pt"

# torch.save does not create missing directories, so make sure the target
# folder exists before writing.
Path(save_path).parent.mkdir(parents=True, exist_ok=True)

# Embeddings and labels are stored together in one file.
torch.save({
    "embeddings": all_audio_embs,   # (N, D)
    "labels": all_labels            # (N,)
}, save_path)

print("Saved:", save_path)