In [None]:
import torch
import librosa
import numpy as np
from transformers import (
    Wav2Vec2ForSequenceClassification,
    Wav2Vec2FeatureExtractor
)

# ---------------------------
# Load model & feature extractor
# ---------------------------
# Checkpoint identifier shared by the classifier and its feature extractor.
MODEL_NAME = "superb/wav2vec2-base-superb-er"

# ``eval()`` switches the module to evaluation (inference) mode and returns
# the module itself, so it can be chained directly onto the load.
model = Wav2Vec2ForSequenceClassification.from_pretrained(MODEL_NAME).eval()
feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(MODEL_NAME)

# ---------------------------
# Load and preprocess audio
# ---------------------------
def load_audio(file_path, target_sr=16000):
    """Load an audio file as a mono waveform at ``target_sr``.

    Args:
        file_path: Location of the audio file; forwarded unchanged to
            ``librosa.load``.
        target_sr: Sampling rate passed to ``librosa.load`` as ``sr``.
            Defaults to 16000.

    Returns:
        The waveform (first element of the ``librosa.load`` result).
    """
    # ``librosa.load`` also returns the sampling rate; the caller already
    # chose it through ``target_sr``, so it is discarded here.
    speech, _ = librosa.load(file_path, sr=target_sr, mono=True)
    return speech

# ---------------------------
# Predict emotion
# ---------------------------
def predict_emotion(audio_path):
    """Classify the emotion of a single audio file.

    Relies on the module-level ``model`` and ``feature_extractor``.

    Args:
        audio_path: Location of the audio file; forwarded to ``load_audio``.

    Returns:
        A ``(label, confidence, emotions)`` tuple where ``label`` is the
        class with the highest softmax probability, ``confidence`` is that
        probability as a float, and ``emotions`` maps each class label from
        ``model.config.id2label`` to its probability.
    """
    # Read the rate from the feature extractor once and use it for both
    # loading and feature extraction, so the two can never drift apart.
    sampling_rate = feature_extractor.sampling_rate
    speech = load_audio(audio_path, target_sr=sampling_rate)

    inputs = feature_extractor(
        speech,
        sampling_rate=sampling_rate,
        return_tensors="pt",
        padding=True,
    )

    with torch.no_grad():
        logits = model(**inputs).logits
        probs = torch.softmax(logits, dim=-1)

    # A single waveform was passed in, so row 0 is the only distribution.
    predicted_id = torch.argmax(probs, dim=-1).item()

    # Convert the row to plain Python floats once instead of calling
    # ``.item()`` per class.
    scores = probs[0].tolist()
    id2label = model.config.id2label

    # Full distribution (recommended)
    emotions = {id2label[i]: score for i, score in enumerate(scores)}

    return id2label[predicted_id], scores[predicted_id], emotions

# ---------------------------
# Run
# ---------------------------
# Classify "sample_3.wav" and print the top label, its confidence, and the
# full per-label score table.
emotion, confidence, all_emotions = predict_emotion("sample_3.wav")

print(
    f"Detected emotion: {emotion}",
    f"Confidence: {confidence:.2f}",
    "All emotion scores:",
    sep="\n",
)
for k, v in all_emotions.items():
    print(f"  {k}: {v:.3f}")
