In [11]:
torchaudio.set_audio_backend("sox_io")

  torchaudio.set_audio_backend("sox_io")


In [16]:
import numpy as np
print(np.__version__)


1.24.4


In [21]:
import torch
import torch.nn as nn
from transformers import Wav2Vec2Model, Wav2Vec2Processor
import librosa
import numpy

# Define the custom model
class Wav2Vec2WithAttention(nn.Module):
    def __init__(self, pretrained_model_name, num_classes):
        super(Wav2Vec2WithAttention, self).__init__()
        # Load Wav2Vec2 base model
        self.wav2vec2 = Wav2Vec2Model.from_pretrained(pretrained_model_name)
        # Attention mechanism
        self.attention = nn.MultiheadAttention(embed_dim=self.wav2vec2.config.hidden_size, num_heads=4, batch_first=True)
        # Fully connected layer for classification
        self.fc = nn.Linear(self.wav2vec2.config.hidden_size, num_classes)
    
    def forward(self, input_values, attention_mask=None):
        # Get hidden states from Wav2Vec2
        hidden_states = self.wav2vec2(input_values, attention_mask=attention_mask).last_hidden_state
        # Apply self-attention
        attention_output, _ = self.attention(hidden_states, hidden_states, hidden_states)
        # Mean pooling
        pooled_output = torch.mean(attention_output, dim=1)
        # Classify
        logits = self.fc(pooled_output)
        return logits

# Load pre-trained Wav2Vec2 processor and custom model
PRETRAINED_MODEL_NAME = "facebook/wav2vec2-base"
NUM_CLASSES = 8  # Number of emotion classes
processor = Wav2Vec2Processor.from_pretrained(PRETRAINED_MODEL_NAME)
model = Wav2Vec2WithAttention(PRETRAINED_MODEL_NAME, NUM_CLASSES)

# Move model to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Preprocess audio function
def preprocess_audio(file_path):
    # Load the audio file using librosa
    waveform, sample_rate = librosa.load(file_path, sr=16000)
    return torch.tensor(waveform, dtype=torch.float32)

# Predict emotion function
def predict_emotion_with_attention(audio_path):
    # Preprocess audio
    input_audio = preprocess_audio(audio_path)
    
    # Tokenize audio and prepare inputs
    inputs = processor(input_audio.numpy(), sampling_rate=16000, return_tensors="pt", padding=True)
    
    # Debugging: print shapes of input_values and attention_mask
    print(f"input_values shape: {inputs.input_values.shape}")
    print(f"attention_mask shape: {inputs.attention_mask.shape}")

    input_values = inputs.input_values.to(device)
    attention_mask = inputs.attention_mask.to(device)
    
    # Predict emotion
    model.eval()
    with torch.no_grad():
        logits = model(input_values, attention_mask=attention_mask)
        predicted_class = torch.argmax(logits, dim=-1).item()
    
    # Emotion labels (can customize based on dataset)
    emotion_labels = {0: "neutral", 1: "happy", 2: "sad", 3: "angry", 4: "fearful", 5: "disgust", 6: "surprised", 7: "calm"}
    return emotion_labels.get(predicted_class, "unknown")

# Example usage
audio_file = "/home/common/ai_workspace/user_space/hooda_workspace/emotion_recognition/wave2vec2/audio_file/03-01-05-01-01-01-05.wav"  # Replace with your audio file path
emotion = predict_emotion_with_attention(audio_file)
print(f"Predicted Emotion: {emotion}")


RuntimeError: Numpy is not available

In [10]:
import torch
import torch.nn as nn
from transformers import Wav2Vec2Model, Wav2Vec2Processor
import torchaudio

import soundfile as sf

def preprocess_audio(file_path):
    # Load the audio file using soundfile
    waveform, sample_rate = sf.read(file_path)
    if sample_rate != 16000:
        resample = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=16000)
        waveform = resample(torch.tensor(waveform).unsqueeze(0))
    return waveform.squeeze().numpy()


# Define the custom model
class Wav2Vec2WithAttention(nn.Module):
    def __init__(self, pretrained_model_name, num_classes):
        super(Wav2Vec2WithAttention, self).__init__()
        # Load Wav2Vec2 base model
        self.wav2vec2 = Wav2Vec2Model.from_pretrained(pretrained_model_name)
        # Attention mechanism
        self.attention = nn.MultiheadAttention(embed_dim=self.wav2vec2.config.hidden_size, num_heads=4, batch_first=True)
        # Fully connected layer for classification
        self.fc = nn.Linear(self.wav2vec2.config.hidden_size, num_classes)
    
    def forward(self, input_values, attention_mask=None):
        # Get hidden states from Wav2Vec2
        hidden_states = self.wav2vec2(input_values, attention_mask=attention_mask).last_hidden_state
        # Apply self-attention
        attention_output, _ = self.attention(hidden_states, hidden_states, hidden_states)
        # Mean pooling
        pooled_output = torch.mean(attention_output, dim=1)
        # Classify
        logits = self.fc(pooled_output)
        return logits

# Load pre-trained Wav2Vec2 processor and custom model
PRETRAINED_MODEL_NAME = "facebook/wav2vec2-base"
NUM_CLASSES = 8  # Number of emotion classes
processor = Wav2Vec2Processor.from_pretrained(PRETRAINED_MODEL_NAME)
model = Wav2Vec2WithAttention(PRETRAINED_MODEL_NAME, NUM_CLASSES)

# Move model to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Preprocess audio function
def preprocess_audio(file_path):
    # Load the audio file and resample to 16kHz
    waveform, sample_rate = torchaudio.load(file_path)
    if sample_rate != 16000:
        resample = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=16000)
        waveform = resample(waveform)
    return waveform.squeeze().numpy()

# Predict emotion function
def predict_emotion_with_attention(audio_path):
    # Preprocess audio
    input_audio = preprocess_audio(audio_path)
    
    # Tokenize audio and prepare inputs
    inputs = processor(input_audio, sampling_rate=16000, return_tensors="pt", padding=True)
    input_values = inputs.input_values.to(device)
    attention_mask = inputs.attention_mask.to(device)
    
    # Predict emotion
    model.eval()
    with torch.no_grad():
        logits = model(input_values, attention_mask=attention_mask)
        predicted_class = torch.argmax(logits, dim=-1).item()
    
    # Emotion labels (can customize based on dataset)
    emotion_labels = {0: "neutral", 1: "happy", 2: "sad", 3: "angry", 4: "fearful", 5: "disgust", 6: "surprised", 7: "calm"}
    return emotion_labels.get(predicted_class, "unknown")

# Example usage
audio_file = "/home/common/ai_workspace/user_space/hooda_workspace/emotion_recognition/wave2vec2/audio_file/03-01-05-01-01-01-05.wav"  # Replace with your audio file path
# preprocess_audio(audio_file)
emotion = predict_emotion_with_attention(preprocess_audio(audio_file))
print(f"Predicted Emotion: {emotion}")


RuntimeError: Couldn't find appropriate backend to handle uri /home/common/ai_workspace/user_space/hooda_workspace/emotion_recognition/wave2vec2/audio_file/03-01-05-01-01-01-05.wav and format None.