In [1]:
import numpy as np
import joblib
import tensorflow as tf
import librosa
from moviepy.editor import VideoFileClip
from sklearn.preprocessing import LabelEncoder

# Emotion labels that predict_emotion tallies; every label listed here gets an
# entry in its result, starting from a zero count.
observed_emotions = [
    'neutral',
    'calm',
    'happy',
    'sad',
    'fearful',
    'surprised',
]

def extract_feature(data, sr, mfcc=True, chroma=True, mel=True):
    """Build a 1-D feature vector from an audio signal.

    Each enabled feature is computed with librosa, averaged over its second
    (frame) axis, and appended in the fixed order MFCC, chroma, mel.

    Args:
        data: Audio samples passed to librosa as ``y``.
        sr: Sampling rate of ``data``.
        mfcc: Include the mean of 40 MFCC coefficients.
        chroma: Include the mean chromagram computed from the STFT magnitude.
        mel: Include the mean mel spectrogram.

    Returns:
        A 1-D ``numpy`` array with the selected features concatenated; an
        empty array when every feature is disabled.
    """
    # The STFT magnitude is only consumed by the chroma feature.
    magnitude = np.abs(librosa.stft(data)) if chroma else None

    # Seed with an empty array so the result is an ndarray even when no
    # feature is selected.
    pieces = [np.array([])]

    if mfcc:
        coefficients = librosa.feature.mfcc(y=data, sr=sr, n_mfcc=40)
        pieces.append(np.mean(coefficients.T, axis=0))

    if chroma:
        chromagram = librosa.feature.chroma_stft(S=magnitude, sr=sr)
        pieces.append(np.mean(chromagram.T, axis=0))

    if mel:
        mel_spectrogram = librosa.feature.melspectrogram(y=data, sr=sr)
        pieces.append(np.mean(mel_spectrogram.T, axis=0))

    return np.hstack(pieces)

def extract_audio_from_video(video_path, audio_path):
    """Write the audio track of a video file to ``audio_path``.

    Args:
        video_path: Path of the video file opened with ``VideoFileClip``.
        audio_path: Destination path handed to ``write_audiofile``.

    Raises:
        ValueError: If the opened clip exposes no audio track.
    """
    video = VideoFileClip(video_path)
    try:
        audio = video.audio
        if audio is None:
            # Fail with a clear message instead of an AttributeError on None.
            raise ValueError(f"No audio track found in video: {video_path!r}")
        audio.write_audiofile(audio_path)
    finally:
        # Always release the clip, even when writing the audio fails.
        video.close()

def predict_emotion(file_path):
    """Estimate the share of each emotion across 5-second segments of audio.

    The audio is cut into consecutive 5-second segments (the last one may be
    shorter). Each segment is turned into a feature vector by
    ``extract_feature``, classified by the Keras model loaded from
    ``emotion_recognition_model_combined.h5``, and the predicted class index
    is mapped back to a label with the encoder loaded from
    ``label_encoder_combined.pkl``.

    Args:
        file_path: Path of the audio file, passed to ``librosa.load``.

    Returns:
        A dict mapping emotion label to the fraction (0.0-1.0) of segments
        predicted as that emotion. Every label in ``observed_emotions`` is
        present; all fractions are 0.0 when the audio contains no samples.
    """
    # Load the model without its training configuration, then compile it
    # with a fresh optimizer.
    model = tf.keras.models.load_model('emotion_recognition_model_combined.h5', compile=False)
    print("Model loaded successfully")

    optimizer = tf.keras.optimizers.RMSprop(learning_rate=0.001)  # Adjust learning rate as necessary
    model.compile(optimizer=optimizer, loss='sparse_categorical_crossentropy', metrics=['accuracy'])

    # NOTE(review): joblib.load unpickles the file; only load encoders from a
    # trusted source.
    label_encoder = joblib.load('label_encoder_combined.pkl')

    # Load the audio and work out how many samples make up one segment.
    data, sr = librosa.load(file_path)
    segment_length = 5  # Segment length in seconds
    segment_samples = segment_length * sr
    emotion_counts = {emotion: 0 for emotion in observed_emotions}

    # Stepping by the segment size yields every full segment followed by the
    # shorter trailing remainder (if any); empty audio yields no segments.
    for start in range(0, len(data), segment_samples):
        segment = data[start:start + segment_samples]
        feature = extract_feature(segment, sr)

        # Add a leading batch axis and a trailing axis: (1, n_features, 1).
        feature_processed = np.expand_dims(feature, axis=0)
        feature_processed = np.expand_dims(feature_processed, axis=2)

        y_pred = np.argmax(model.predict(feature_processed), axis=1)
        predicted_emotion = label_encoder.inverse_transform(y_pred)[0]

        # The encoder may know labels outside observed_emotions (its classes
        # are not visible here); count those too instead of raising KeyError.
        emotion_counts[predicted_emotion] = emotion_counts.get(predicted_emotion, 0) + 1

    total_segments = sum(emotion_counts.values())
    if total_segments == 0:
        # No samples at all: avoid dividing by zero and report empty shares.
        return {emotion: 0.0 for emotion in emotion_counts}

    return {emotion: count / total_segments for emotion, count in emotion_counts.items()}

def process_video(video_path):
    """Extract a video's audio track and return its emotion shares.

    The audio is written to ``extracted_audio.wav`` in the current working
    directory and then analysed with ``predict_emotion``.

    Args:
        video_path: Path of the video file to analyse.

    Returns:
        The value returned by ``predict_emotion`` for the extracted audio.
    """
    extracted_wav = 'extracted_audio.wav'
    extract_audio_from_video(video_path, extracted_wav)
    return predict_emotion(extracted_wav)

# Example usage
# Raw string: the Windows path contains backslashes ("\S", "\_") that would
# otherwise be parsed as (invalid) escape sequences.
video_path = r'E:\SER\_uNup91ZYw0.002.mp4'  # Replace with your video file path
emotion_percentages = process_video(video_path)
print("Emotion Percentages:", emotion_percentages)


MoviePy - Writing audio in extracted_audio.wav


                                                                    

MoviePy - Done.




Model loaded successfully
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 262ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 22ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 22ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 20ms/step
Emotion Percentages: ({'neutral': 0.0, 'calm': 0.25, 'happy': 0.25, 'sad': 0.0, 'fearful': 0.25, 'surprised': 0.25}, 4)
