# Hierarchical Semantic Voice Analysis

In [6]:
import torch
from torch.utils.data import Dataset, DataLoader
import torchaudio
import pandas as pd
from transformers import Wav2Vec2Processor
import json

# Load manifest and label2id
manifest = pd.read_csv("./preprocessed_dataset/manifest.csv")
with open("./preprocessed_dataset/label2id.json") as f:
    label2id = json.load(f)

# Load Wav2Vec2 processor
processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-base")




In [7]:
class AudioDataset(Dataset):
    def __init__(self, manifest_df, label2id, processor):
        self.df = manifest_df.reset_index(drop=True)
        self.label2id = label2id
        self.processor = processor

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        row = self.df.iloc[idx]
        path = row["path"]

        # Load audio (already preprocessed to 16k)
        waveform, sr = torchaudio.load(path)
        waveform = waveform.squeeze(0)  # [T]

        # Process for Wav2Vec2 (padding happens later)
        inputs = self.processor(
            waveform.numpy(),
            sampling_rate=sr,
            return_tensors="pt",
            padding="longest"
        )

        gender_id = self.label2id["gender"][row["gender"]]
        emotion_id = self.label2id["emotion"][row["emotion"]]
        intensity_id = self.label2id["intensity"][row["intensity"]]

        return {
            "input_values": inputs.input_values.squeeze(0),
            # "attention_mask": inputs.attention_mask.squeeze(0),
            "gender": torch.tensor(gender_id, dtype=torch.long),
            "emotion": torch.tensor(emotion_id, dtype=torch.long),
            "intensity": torch.tensor(intensity_id, dtype=torch.long),
        }


In [8]:
def collate_fn(batch):
    input_values = [b["input_values"] for b in batch]

    padded = processor.pad(
        {"input_values": input_values},
        padding=True,
        return_tensors="pt",
        return_attention_mask=True  # <-- ensures attention_mask exists
    )

    gender = torch.stack([b["gender"] for b in batch])
    emotion = torch.stack([b["emotion"] for b in batch])
    intensity = torch.stack([b["intensity"] for b in batch])

    return {
        "input_values": padded.input_values,        # [B, T]
        "attention_mask": padded.attention_mask,    # [B, T]
        "gender": gender,
        "emotion": emotion,
        "intensity": intensity
    }


In [9]:
dataset = AudioDataset(manifest, label2id, processor)

dataloader = DataLoader(
    dataset,
    batch_size=4,       # you can adjust
    shuffle=True,
    collate_fn=collate_fn
)

# quick test
batch = next(iter(dataloader))
print(batch["input_values"].shape)
print("gender:", batch["gender"])
print("emotion:", batch["emotion"])
print("intensity:", batch["intensity"])


KeyError: 'gender'

In [None]:
import torch
import torch.nn as nn
from transformers import Wav2Vec2Model, Wav2Vec2Processor

class HierarchicalAudioClassifier(nn.Module):
    def __init__(self, 
                 wav2vec_model_name="facebook/wav2vec2-base", 
                 hidden_dim=768, 
                 num_speech_types=2, 
                 num_emotions=8, 
                 num_intensities=2):
        """
        Hierarchical audio classification model using Wav2Vec2 as a feature extractor.
        The model performs:
            1. Speech Type classification (speech/song)
            2. Emotion classification
            3. Emotion intensity classification
        """
        super().__init__()
        
        # --- Feature Extractor ---
        self.encoder = Wav2Vec2Model.from_pretrained(wav2vec_model_name)
        self.hidden_dim = hidden_dim
        
        # --- Level 1: Speech Type Classifier ---
        self.speech_type_head = nn.Sequential(
            nn.Linear(hidden_dim, hidden_dim // 2),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(hidden_dim // 2, num_speech_types)
        )
        
        # --- Level 2: Emotion Classifier ---
        # We can have separate emotion heads for speech and song if desired
        self.emotion_heads = nn.ModuleDict({
            "speech": nn.Sequential(
                nn.Linear(hidden_dim, hidden_dim // 2),
                nn.ReLU(),
                nn.Dropout(0.3),
                nn.Linear(hidden_dim // 2, num_emotions)
            ),
            "song": nn.Sequential(
                nn.Linear(hidden_dim, hidden_dim // 2),
                nn.ReLU(),
                nn.Dropout(0.3),
                nn.Linear(hidden_dim // 2, num_emotions)
            )
        })
        
        # --- Level 3: Intensity Classifier ---
        # Similarly, emotion-specific or global intensity heads can be defined
        self.intensity_heads = nn.ModuleDict({
            emotion: nn.Sequential(
                nn.Linear(hidden_dim, hidden_dim // 2),
                nn.ReLU(),
                nn.Dropout(0.3),
                nn.Linear(hidden_dim // 2, num_intensities)
            ) for emotion in [
                "neutral", "calm", "happy", "sad", 
                "angry", "fearful", "disgust", "surprised"
            ]
        })
        
        # --- Global Pooling Layer ---
        self.pooling = nn.AdaptiveAvgPool1d(1)

    def forward_features(self, audio_inputs, attention_mask=None):
        """Extract and pool features from Wav2Vec2."""
        outputs = self.encoder(audio_inputs, attention_mask=attention_mask)
        hidden_states = outputs.last_hidden_state  # [B, T, H]
        pooled = hidden_states.mean(dim=1)  # [B, H]
        return pooled
    
    def forward(self, audio_inputs, attention_mask=None):
        """
        Forward pass through all levels.
        This function only returns raw logits for each level.
        In practice, you’ll likely call them sequentially (level by level).
        """
        pooled_features = self.forward_features(audio_inputs, attention_mask)
        
        # Level 1: Speech type logits
        speech_logits = self.speech_type_head(pooled_features)
        
        # Placeholder outputs for emotion/intensity (computed conditionally in practice)
        emotion_logits = None
        intensity_logits = None
        
        return {
            "features": pooled_features,
            "speech_logits": speech_logits,
            "emotion_logits": emotion_logits,
            "intensity_logits": intensity_logits
        }

    def forward_emotion(self, features, speech_type_label: str):
        """Forward through the appropriate emotion head."""
        return self.emotion_heads[speech_type_label](features)
    
    def forward_intensity(self, features, emotion_label: str):
        """Forward through the appropriate intensity head."""
        return self.intensity_heads[emotion_label](features)

model = HierarchicalAudioClassifier()

model

HierarchicalAudioClassifier(
  (encoder): Wav2Vec2Model(
    (feature_extractor): Wav2Vec2FeatureEncoder(
      (conv_layers): ModuleList(
        (0): Wav2Vec2GroupNormConvLayer(
          (conv): Conv1d(1, 512, kernel_size=(10,), stride=(5,), bias=False)
          (activation): GELUActivation()
          (layer_norm): GroupNorm(512, 512, eps=1e-05, affine=True)
        )
        (1-4): 4 x Wav2Vec2NoLayerNormConvLayer(
          (conv): Conv1d(512, 512, kernel_size=(3,), stride=(2,), bias=False)
          (activation): GELUActivation()
        )
        (5-6): 2 x Wav2Vec2NoLayerNormConvLayer(
          (conv): Conv1d(512, 512, kernel_size=(2,), stride=(2,), bias=False)
          (activation): GELUActivation()
        )
      )
    )
    (feature_projection): Wav2Vec2FeatureProjection(
      (layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
      (projection): Linear(in_features=512, out_features=768, bias=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
 

In [None]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = HierarchicalAudioClassifier().to(device)


In [None]:
batch = next(iter(dataloader))
inputs = batch["input_values"].to(device)
mask = batch["attention_mask"].to(device)

out = model(inputs, attention_mask=mask)
print(out["speech_logits"].shape)


torch.Size([4, 2])
