In [1]:
# Install required packages
!pip install transformers torch torchaudio librosa datasets
!pip install soundfile

Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-curand-cu12==10.3.5.147 (from torch)
  Downloading nvidia_curand_cu12-10.3.5

In [2]:
# Import necessary libraries
import torch
import torchaudio
import librosa
import numpy as np
from transformers import Wav2Vec2ForSequenceClassification, Wav2Vec2FeatureExtractor
from transformers import pipeline
import warnings
warnings.filterwarnings('ignore')


In [3]:
# Check device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

Using device: cpu


In [17]:
def display_emotion_result_v2(result):
    """Display emotion detection results in detailed format"""
    print("=" * 60)
    print("🎵 AUDIO EMOTION DETECTION RESULTS (Wav2Vec2)")
    print("=" * 60)

    # Basic file info
    print(f"📁 Audio File: {result['audio_file']}")

    # Main emotion display
    print(f"\n🎯 Primary Emotion: {result['emoji']} {result['emotion'].capitalize()}")




In [5]:
def detect_emotion_rf(audio_path):
    """
    Detect emotion using r-f Wav2Vec2 model (7 emotions)
    """
    # Load audio
    speech, sampling_rate = librosa.load(audio_path, sr=16000)

    # Process audio
    inputs = feature_extractor_rf(speech, sampling_rate=16000, return_tensors="pt", padding=True)

    with torch.no_grad():
        outputs = model_rf(**inputs)
        predictions = torch.nn.functional.softmax(outputs.logits, dim=-1)

    # r-f model emotions (7 classes)
    emotion_labels = ['angry', 'disgust', 'fear', 'happy', 'neutral', 'sad', 'surprise']
    emotion_emojis = ['😠', '🤢', '😨', '😊', '😐', '😢', '😲']

    predicted_idx = predictions.argmax().item()

    return {
        'emotion': emotion_labels[predicted_idx],
        'emoji': emotion_emojis[predicted_idx],
        'confidence': predictions.max().item(),
        'audio_file': audio_path,
        'all_probabilities': {emotion_labels[i]: predictions[0][i].item()
                            for i in range(len(emotion_labels))}
    }

In [6]:
model_name_rf = "r-f/wav2vec-english-speech-emotion-recognition"
model_rf = Wav2Vec2ForSequenceClassification.from_pretrained(model_name_rf)
feature_extractor_rf = Wav2Vec2FeatureExtractor.from_pretrained(model_name_rf)

print("r-f Wav2Vec2 emotion model loaded successfully!")

config.json:   0%|          | 0.00/2.73k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.27G [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.27G [00:00<?, ?B/s]

Some weights of Wav2Vec2ForSequenceClassification were not initialized from the model checkpoint at r-f/wav2vec-english-speech-emotion-recognition and are newly initialized: ['classifier.bias', 'classifier.weight', 'projector.bias', 'projector.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


preprocessor_config.json:   0%|          | 0.00/262 [00:00<?, ?B/s]

r-f Wav2Vec2 emotion model loaded successfully!


A few audio files are provided in the Custom_Examples folder. You can replace the path and test them

In [26]:
result_rf_1 = detect_emotion_rf("man_laugh.wav")
display_emotion_result_v2(result_rf_1)

🎵 AUDIO EMOTION DETECTION RESULTS (Wav2Vec2)
📁 Audio File: man_laugh.wav

🎯 Primary Emotion: 😊 Happy


In [27]:
result_rf_1 = detect_emotion_rf("angry_grunt.wav")
display_emotion_result_v2(result_rf_1)

🎵 AUDIO EMOTION DETECTION RESULTS (Wav2Vec2)
📁 Audio File: angry_grunt.wav

🎯 Primary Emotion: 😠 Angry


In [28]:
result_rf_1 = detect_emotion_rf("Hello.wav")
display_emotion_result_v2(result_rf_1)

🎵 AUDIO EMOTION DETECTION RESULTS (Wav2Vec2)
📁 Audio File: Hello.wav

🎯 Primary Emotion: 😐 Neutral


In [29]:
result_rf_1 = detect_emotion_rf("whoah.wav")
display_emotion_result_v2(result_rf_1)

🎵 AUDIO EMOTION DETECTION RESULTS (Wav2Vec2)
📁 Audio File: whoah.wav

🎯 Primary Emotion: 😊 Happy
