In [55]:
import os
import torch
import torchaudio
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor, pipeline, AutoModelForImageClassification, AutoFeatureExtractor, AutoModelForAudioClassification
import accelerate
import pandas as pd

In [56]:
device = "cuda:0" if torch.cuda.is_available() else "cpu"
torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32

print("Device:", device)
print("dtype:", torch_dtype)

Device: cuda:0
dtype: torch.float16


In [57]:
import librosa
import matplotlib.pyplot as plt
import numpy as np

def get_audio_features(y, sr, output_path="spectrogram.png"):
    mel_spectrogram = librosa.feature.melspectrogram(y=y, sr=sr)
    mel_spec_db = librosa.power_to_db(mel_spectrogram, ref=np.max)

    plt.figure(figsize=(10, 4))
    librosa.display.specshow(mel_spec_db, x_axis='time', y_axis='mel', sr=sr)
    plt.colorbar(format='%+2.0f dB')
    plt.title('Mel-frequency spectrogram')
    plt.savefig(output_path)
    plt.close()
    return output_path

In [58]:
def setup_huggingface_model():
    model_name = "ehcalabres/wav2vec2-lg-xlsr-en-speech-emotion-recognition"
    feature_extractor = AutoFeatureExtractor.from_pretrained(model_name)
    model = AutoModelForAudioClassification.from_pretrained(model_name)
    model.to(device)
    return feature_extractor, model

In [59]:
def recognize_emotion_huggingface(audio_path, feature_extractor, model):
    waveform, sample_rate = librosa.load(audio_path, sr=16000)
    

    inputs = feature_extractor(waveform, sampling_rate=sample_rate, return_tensors="pt")
    inputs = {key: val.to(device) for key, val in inputs.items()}
    
    with torch.no_grad():
        logits = model(**inputs).logits
        predicted_class_id = logits.argmax(-1).item()
    
    emotion = model.config.id2label[predicted_class_id]

    scores = torch.nn.functional.softmax(logits, dim=1)[0].tolist()
    emotion_scores = {model.config.id2label[i]: scores[i] for i in range(len(scores))}
    
    return emotion, emotion_scores

In [64]:
def analyze_audio_emotion(audio_path, use_huggingface=True, use_resnet=False, 
                         resnet_model_path=None, save_spectrogram=True):
    try:
        results = {"audio_path": audio_path}
        
        # HuggingFace model approach
        if use_huggingface:
            feature_extractor, hf_model = setup_huggingface_model()
            emotion, scores = recognize_emotion_huggingface(audio_path, feature_extractor, hf_model)
            results["huggingface_emotion"] = emotion
            results["huggingface_scores"] = scores
        
        return results
    except Exception as e:
        print(f"Error in analyze_audio_emotion: {e}")
        # Return an empty dictionary instead of None
        return {}

In [71]:
audio_file = "Crema_data\\1091_WSI_HAP_XX.wav"

results = analyze_audio_emotion(
    audio_file, 
    use_huggingface=True,
    use_resnet=False,
    save_spectrogram=True
)

# Print results
print(f"Audio file: {audio_file}")
if 'huggingface_emotion' in results:
    print(f"Detected emotion (HuggingFace): {results['huggingface_emotion']}")
    print("Emotion confidence scores:")
    for emotion, score in results['huggingface_scores'].items():
        print(f"  {emotion}: {score:.4f}")

if 'spectrogram_path' in results:
    print(f"Spectrogram saved to: {results['spectrogram_path']}")

Some weights of the model checkpoint at ehcalabres/wav2vec2-lg-xlsr-en-speech-emotion-recognition were not used when initializing Wav2Vec2ForSequenceClassification: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.output.bias', 'classifier.output.weight']
- This IS expected if you are initializing Wav2Vec2ForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing Wav2Vec2ForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of Wav2Vec2ForSequenceClassification were not initialized from the model checkpoint at ehcalabres/wav2vec2-lg-xlsr-en-speech-emotion-recognition and are newly initialized: ['classifier.bias', 'classifier.weight', '

Audio file: Crema_data\1091_WSI_HAP_XX.wav
Detected emotion (HuggingFace): happy
Emotion confidence scores:
  angry: 0.1296
  calm: 0.1228
  disgust: 0.1248
  fearful: 0.1094
  happy: 0.1360
  neutral: 0.1283
  sad: 0.1304
  surprised: 0.1188
