# Hierarchical Semantic Voice Analysis

In [None]:
import torch
import torch.nn as nn
from transformers import Wav2Vec2Model, Wav2Vec2Processor

class HierarchicalAudioClassifier(nn.Module):
    """Hierarchical audio classifier built on a pretrained Wav2Vec2 encoder.

    The encoder output is pooled over time into one vector per clip, which
    feeds three levels of classification heads:

        1. Speech type (one shared head).
        2. Emotion (one head per speech type: ``"speech"`` and ``"song"``).
        3. Emotion intensity (one head per label in ``EMOTION_LABELS``).

    ``forward`` only evaluates level 1; levels 2 and 3 are evaluated on demand
    through ``forward_emotion`` and ``forward_intensity`` using the pooled
    features that ``forward`` returns.
    """

    # Keys of ``intensity_heads``; one intensity head is created per label.
    EMOTION_LABELS = (
        "neutral", "calm", "happy", "sad",
        "angry", "fearful", "disgust", "surprised",
    )

    def __init__(self,
                 wav2vec_model_name="facebook/wav2vec2-base",
                 hidden_dim=768,
                 num_speech_types=2,
                 num_emotions=8,
                 num_intensities=2):
        """
        Hierarchical audio classification model using Wav2Vec2 as a feature extractor.
        The model performs:
            1. Speech Type classification (speech/song)
            2. Emotion classification
            3. Emotion intensity classification

        Args:
            wav2vec_model_name: Checkpoint name or path passed to
                ``Wav2Vec2Model.from_pretrained``.
            hidden_dim: Input width of every head. Must equal the encoder's
                hidden size, because pooled encoder features are fed to the
                heads without any projection.
            num_speech_types: Output size of the speech-type head.
            num_emotions: Output size of each emotion head.
            num_intensities: Output size of each intensity head.

        Raises:
            ValueError: If ``hidden_dim`` differs from the hidden size
                reported by the loaded encoder's config.
        """
        super().__init__()

        # --- Feature Extractor ---
        self.encoder = Wav2Vec2Model.from_pretrained(wav2vec_model_name)

        # Fail early with a clear message instead of a shape error deep inside
        # the first Linear layer when a non-matching checkpoint is used.
        encoder_dim = getattr(
            getattr(self.encoder, "config", None), "hidden_size", None
        )
        if encoder_dim is not None and encoder_dim != hidden_dim:
            raise ValueError(
                f"hidden_dim={hidden_dim} does not match the encoder hidden "
                f"size {encoder_dim} of '{wav2vec_model_name}'"
            )
        self.hidden_dim = hidden_dim

        # --- Level 1: Speech Type Classifier ---
        self.speech_type_head = self._make_head(hidden_dim, num_speech_types)

        # --- Level 2: Emotion Classifier ---
        # Separate emotion heads for speech and song.
        self.emotion_heads = nn.ModuleDict({
            "speech": self._make_head(hidden_dim, num_emotions),
            "song": self._make_head(hidden_dim, num_emotions),
        })

        # --- Level 3: Intensity Classifier ---
        # One intensity head per emotion label.
        self.intensity_heads = nn.ModuleDict({
            emotion: self._make_head(hidden_dim, num_intensities)
            for emotion in self.EMOTION_LABELS
        })

        # --- Global Pooling Layer ---
        # torch.nn has no ``AdaptiveMeanPool1d``; the average-pooling layer is
        # ``AdaptiveAvgPool1d``. The attribute is kept for compatibility, but
        # ``forward_features`` pools with a (masked) mean directly.
        self.pooling = nn.AdaptiveAvgPool1d(1)

    @staticmethod
    def _make_head(in_dim, out_dim, dropout=0.3):
        """Build one Linear -> ReLU -> Dropout -> Linear classification head."""
        return nn.Sequential(
            nn.Linear(in_dim, in_dim // 2),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(in_dim // 2, out_dim),
        )

    @staticmethod
    def _masked_mean(hidden_states, frame_mask=None):
        """Average ``hidden_states`` over dim 1, ignoring masked-out frames.

        Args:
            hidden_states: Tensor indexed as [batch, time, hidden].
            frame_mask: Optional [batch, time] tensor; non-zero / True marks
                frames to include. ``None`` means a plain mean over time.

        Returns:
            Tensor of shape [batch, hidden].
        """
        if frame_mask is None:
            return hidden_states.mean(dim=1)
        weights = frame_mask.unsqueeze(-1).to(hidden_states.dtype)  # [B, T, 1]
        summed = (hidden_states * weights).sum(dim=1)
        # Clamp so a fully masked row yields zeros rather than NaN.
        counts = weights.sum(dim=1).clamp(min=1.0)
        return summed / counts

    def forward_features(self, audio_inputs, attention_mask=None):
        """Extract and pool features from Wav2Vec2.

        Args:
            audio_inputs: Input passed to the encoder as its first argument.
            attention_mask: Optional sample-level mask forwarded to the
                encoder. When given, padded frames are also excluded from the
                temporal mean.

        Returns:
            Pooled features of shape [B, H].
        """
        outputs = self.encoder(audio_inputs, attention_mask=attention_mask)
        hidden_states = outputs.last_hidden_state  # [B, T, H]

        frame_mask = None
        if attention_mask is not None:
            # The mask is per audio sample while hidden states are per encoder
            # frame, so it must be downsampled first. This is the helper that
            # transformers' own Wav2Vec2 sequence classifier uses; if it is
            # unavailable we fall back to the plain mean.
            to_frame_mask = getattr(
                self.encoder, "_get_feature_vector_attention_mask", None
            )
            if to_frame_mask is not None:
                frame_mask = to_frame_mask(hidden_states.shape[1], attention_mask)

        return self._masked_mean(hidden_states, frame_mask)  # [B, H]

    def forward(self, audio_inputs, attention_mask=None):
        """
        Forward pass through all levels.
        This function only returns raw logits for each level.
        In practice, you'll likely call them sequentially (level by level).

        Returns:
            Dict with keys ``"features"`` (pooled encoder features),
            ``"speech_logits"`` (level-1 logits), and ``"emotion_logits"`` /
            ``"intensity_logits"``, which are always ``None`` here; obtain
            them with ``forward_emotion`` / ``forward_intensity``.
        """
        pooled_features = self.forward_features(audio_inputs, attention_mask)

        # Level 1: Speech type logits
        speech_logits = self.speech_type_head(pooled_features)

        # Emotion/intensity are computed conditionally by the caller, so they
        # are returned as placeholders.
        return {
            "features": pooled_features,
            "speech_logits": speech_logits,
            "emotion_logits": None,
            "intensity_logits": None,
        }

    def forward_emotion(self, features, speech_type_label: str):
        """Forward through the appropriate emotion head.

        Raises:
            KeyError: If ``speech_type_label`` is not a key of
                ``emotion_heads``.
        """
        if speech_type_label not in self.emotion_heads:
            raise KeyError(
                f"Unknown speech type {speech_type_label!r}; "
                f"expected one of {list(self.emotion_heads.keys())}"
            )
        return self.emotion_heads[speech_type_label](features)

    def forward_intensity(self, features, emotion_label: str):
        """Forward through the appropriate intensity head.

        Raises:
            KeyError: If ``emotion_label`` is not a key of
                ``intensity_heads``.
        """
        if emotion_label not in self.intensity_heads:
            raise KeyError(
                f"Unknown emotion {emotion_label!r}; "
                f"expected one of {list(self.intensity_heads.keys())}"
            )
        return self.intensity_heads[emotion_label](features)


: 

In [None]:
# Instantiate the classifier with its default arguments; the constructor loads
# the pretrained Wav2Vec2 encoder via `from_pretrained`.
model = HierarchicalAudioClassifier()

# Bare expression left as the cell's last statement (in a notebook this
# presumably displays the module's repr -- confirm in the target environment).
model