In [2]:
import librosa
import numpy as np
from tensorflow.keras.models import load_model
from sklearn.preprocessing import StandardScaler
import warnings
import os
warnings.filterwarnings('ignore')

class EmotionRecognizer:
    """Predict an emotion label for an audio clip with a saved Keras model.

    Each clip is turned into a (time_steps, 15) feature matrix made of the
    zero-crossing rate, the RMS energy and 13 MFCCs per frame, which is then
    standardized and passed to the model.

    Args:
        model_path: Path of the saved model handed to ``load_model``.
        scaler: Optional, already fitted scaler exposing ``transform``.
            When given it is used for every file. When ``None`` each file is
            standardized on its own statistics, so a prediction never depends
            on which files were processed before it.
            NOTE(review): ideally this is the scaler fitted on the training
            data - confirm how features were normalized during training.
        required_time_steps: Number of frames fed to the model.
    """

    def __init__(self, model_path='model.h5', scaler=None, required_time_steps=352):
        self.model_path = model_path
        self.model = None  # loaded lazily by load_model() / predict_emotion()
        self.scaler = scaler
        self.required_time_steps = required_time_steps
        # Maps the index of a model output to its label.
        self.emotion_map = {
            0: 'neutral',
            1: 'happy',
            2: 'sad',
            3: 'angry',
            4: 'fear',
            5: 'disgust'
        }

    def load_model(self):
        """Load the trained model from ``self.model_path``.

        Returns:
            True when the model was loaded, False when loading failed (the
            error is printed rather than raised).
        """
        try:
            self.model = load_model(self.model_path)
            print("Model loaded successfully")
            return True
        except Exception as e:
            print(f"Error loading model: {str(e)}")
            return False

    def _fit_time_steps(self, features):
        """Repeat or centre-crop a (n_features, n_frames) matrix to the required frame count."""
        n_frames = features.shape[1]
        target = self.required_time_steps

        if n_frames == 0:
            # Guard the division below; an empty clip cannot be repeated.
            raise ValueError("No audio frames could be extracted")

        if n_frames < target:
            # Use repetition instead of zero padding
            repeat_times = target // n_frames + 1
            return np.tile(features, (1, repeat_times))[:, :target]

        if n_frames > target:
            # Take center portion
            start_idx = (n_frames - target) // 2
            return features[:, start_idx:start_idx + target]

        return features

    def _normalize(self, features):
        """Standardize a (time_steps, n_features) matrix."""
        if self.scaler is not None:
            return self.scaler.transform(features)
        # No fitted scaler available: standardize this file on its own
        # statistics. The temporary scaler is deliberately NOT stored, otherwise
        # later files would be scaled with the first file's mean and variance.
        return StandardScaler().fit_transform(features)

    def extract_features(self, audio_path):
        """Extract and normalize the 15 audio features for one file.

        Loads at most 3 seconds of audio at 22050 Hz, computes ZCR, RMS and
        13 MFCCs, fits them to ``required_time_steps`` frames and standardizes
        them.

        Args:
            audio_path: Path of the audio file to read.

        Returns:
            Array of shape (1, required_time_steps, 15).

        Raises:
            Exception: Any error raised while loading or processing the audio
                is printed and re-raised.
        """
        try:
            # Load audio file
            y, sr = librosa.load(audio_path, sr=22050, duration=3)

            # Set consistent parameters
            frame_length = 2048
            hop_length = 512

            # Extract only the original features
            zcr = librosa.feature.zero_crossing_rate(y, frame_length=frame_length, hop_length=hop_length)
            rms = librosa.feature.rms(y=y, frame_length=frame_length, hop_length=hop_length)
            mfccs = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=13, hop_length=hop_length)

            # Print shapes for debugging
            print("\nFeature shapes before processing:")
            print(f"ZCR shape: {zcr.shape}")
            print(f"RMS shape: {rms.shape}")
            print(f"MFCCs shape: {mfccs.shape}")

            # Trim all features to the shortest frame count so they can be stacked
            min_time_steps = min(zcr.shape[1], rms.shape[1], mfccs.shape[1])

            # Stack features (15 features total: 1 ZCR + 1 RMS + 13 MFCCs)
            features = np.vstack([
                zcr[:, :min_time_steps],
                rms[:, :min_time_steps],
                mfccs[:, :min_time_steps],
            ])

            features = self._fit_time_steps(features)

            # Transpose to (time_steps, features) before scaling per feature
            features_normalized = self._normalize(features.T)

            # Expand dimensions for model input
            features_final = np.expand_dims(features_normalized, axis=0)

            print(f"\nFinal feature shape: {features_final.shape}")
            return features_final

        except Exception as e:
            print(f"Error in feature extraction: {str(e)}")
            raise

    def predict_emotion(self, audio_path):
        """Predict the emotion of one audio file.

        Args:
            audio_path: Path of the audio file to classify.

        Returns:
            Tuple ``(emotion, confidence, probabilities)`` where ``emotion`` is
            a label from ``emotion_map``, ``confidence`` is its probability as
            a Python float and ``probabilities`` maps every label to a float.
            ``(None, None, None)`` is returned when the model cannot be loaded
            or any step fails (the error is printed).
        """
        try:
            # Load model if not already loaded
            if self.model is None and not self.load_model():
                return None, None, None

            # Extract features
            X_input = self.extract_features(audio_path)

            # Get predictions for the single item in the batch
            probabilities = self.model.predict(X_input, verbose=0)[0]

            # Fail with a readable message instead of a bare KeyError
            unmapped = [i for i in range(len(probabilities)) if i not in self.emotion_map]
            if unmapped:
                raise ValueError(
                    f"Model produced {len(probabilities)} outputs but emotion_map "
                    f"has no label for class indices {unmapped}"
                )

            # Get probabilities for all emotions
            emotion_probabilities = {
                self.emotion_map[i]: float(prob)
                for i, prob in enumerate(probabilities)
            }

            # Get predicted emotion and confidence (plain Python types)
            predicted_class = int(np.argmax(probabilities))
            predicted_emotion = self.emotion_map[predicted_class]
            confidence = float(probabilities[predicted_class])

            return predicted_emotion, confidence, emotion_probabilities

        except Exception as e:
            print(f"Error in prediction: {str(e)}")
            return None, None, None

    def process_directory(self, directory_path):
        """Predict the emotion of every .wav/.mp3 file in a directory.

        Files are visited in sorted name order and the extension check ignores
        case. Files whose prediction fails are left out of the result.

        Args:
            directory_path: Directory to scan (not recursive).

        Returns:
            List of dicts with the keys 'file', 'emotion', 'confidence' and
            'probabilities'.
        """
        results = []

        # os.listdir returns names in arbitrary order; sort for stable output
        for file in sorted(os.listdir(directory_path)):
            if not file.lower().endswith(('.wav', '.mp3')):
                continue

            audio_path = os.path.join(directory_path, file)
            if not os.path.isfile(audio_path):
                # e.g. a directory that happens to be named "x.wav"
                continue

            print(f"\nProcessing: {file}")
            emotion, confidence, probs = self.predict_emotion(audio_path)

            if emotion is not None:
                results.append({
                    'file': file,
                    'emotion': emotion,
                    'confidence': confidence,
                    'probabilities': probs
                })

        return results

In [4]:
def main(audio_path="/content/call_142_0.wav", model_path='model.h5'):
    """Classify a single audio file and print the result.

    Args:
        audio_path: Audio file to classify. The default is the path used in
            the original notebook run.
        model_path: Saved model passed to ``EmotionRecognizer``.

    Prints the predicted emotion, its confidence and the probability of every
    emotion, or a failure message when no prediction could be made.
    """
    # Initialize the emotion recognizer
    recognizer = EmotionRecognizer(model_path)

    try:
        emotion, confidence, probabilities = recognizer.predict_emotion(audio_path)

        # predict_emotion signals failure with (None, None, None)
        if emotion is None:
            print("Failed to predict emotion.")
            return

        print("\nPrediction Results:")
        print(f"Predicted Emotion: {emotion}")
        print(f"Confidence: {confidence:.2%}")
        print("\nProbabilities for all emotions:")
        for emotion_name, prob in probabilities.items():
            print(f"{emotion_name}: {prob:.2%}")

    except Exception as e:
        print(f"Error in main: {str(e)}")

# Run the demo when this code is executed directly (as a script, or as a
# notebook cell, where __name__ is "__main__"); importing it does not run it.
if __name__ == "__main__":
    main()



Model loaded successfully

Feature shapes before processing:
ZCR shape: (1, 130)
RMS shape: (1, 130)
MFCCs shape: (13, 130)

Final feature shape: (1, 352, 15)

Prediction Results:
Predicted Emotion: sad
Confidence: 73.09%

Probabilities for all emotions:
neutral: 11.03%
happy: 6.33%
sad: 73.09%
angry: 1.32%
fear: 3.76%
disgust: 4.48%
