In [None]:
# IMPORTANT: RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES,
# THEN FEEL FREE TO DELETE THIS CELL.
# NOTE: THIS NOTEBOOK ENVIRONMENT DIFFERS FROM KAGGLE'S PYTHON
# ENVIRONMENT SO THERE MAY BE MISSING LIBRARIES USED BY YOUR
# NOTEBOOK.
import kagglehub
# Download each Kaggle dataset by its "<owner>/<dataset>" handle and keep the
# value returned by kagglehub.dataset_download() in a dedicated variable.
# NOTE(review): in the cells visible in this notebook only RAVDESS is read
# afterwards (and it is looked up again by handle); confirm whether the TESS,
# CREMA-D and SAVEE downloads are still needed.
uwrfkaggler_ravdess_emotional_speech_audio_path = kagglehub.dataset_download('uwrfkaggler/ravdess-emotional-speech-audio')
ejlok1_toronto_emotional_speech_set_tess_path = kagglehub.dataset_download('ejlok1/toronto-emotional-speech-set-tess')
ejlok1_cremad_path = kagglehub.dataset_download('ejlok1/cremad')
ejlok1_surrey_audiovisual_expressed_emotion_savee_path = kagglehub.dataset_download('ejlok1/surrey-audiovisual-expressed-emotion-savee')

# Printed only after all four downloads above have returned.
print('Data source import complete.')


Downloading from https://www.kaggle.com/api/v1/datasets/download/uwrfkaggler/ravdess-emotional-speech-audio?dataset_version_number=1...


100%|██████████| 429M/429M [00:15<00:00, 28.9MB/s]

Extracting files...





Downloading from https://www.kaggle.com/api/v1/datasets/download/ejlok1/toronto-emotional-speech-set-tess?dataset_version_number=1...


100%|██████████| 428M/428M [00:14<00:00, 30.4MB/s]

Extracting files...





Downloading from https://www.kaggle.com/api/v1/datasets/download/ejlok1/cremad?dataset_version_number=1...


100%|██████████| 451M/451M [00:18<00:00, 25.5MB/s]

Extracting files...





Downloading from https://www.kaggle.com/api/v1/datasets/download/ejlok1/surrey-audiovisual-expressed-emotion-savee?dataset_version_number=1...


100%|██████████| 107M/107M [00:05<00:00, 19.1MB/s]

Extracting files...





Data source import complete.


In [None]:
# Import libraries
import librosa
import librosa.display
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
from matplotlib.pyplot import specgram
import pandas as pd
import glob
from sklearn.metrics import confusion_matrix
import IPython.display as ipd  # To play sound in the notebook
import os
import sys
import warnings
# Silence warnings for the rest of the session.
# When no -W option was passed to the interpreter (sys.warnoptions is empty),
# every warning category is ignored.
# NOTE(review): this also hides deprecation/runtime warnings coming from the
# audio and ML libraries used below; consider narrowing it.
if not sys.warnoptions:
    warnings.simplefilter("ignore")
# DeprecationWarning is ignored unconditionally, even when -W options were given.
warnings.filterwarnings("ignore", category=DeprecationWarning)

In [None]:
import kagglehub
import os
import librosa
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
from tqdm import tqdm

# Step 1: Download the RAVDESS dataset
# dataset_download() returns the local directory that holds the dataset files.
path = kagglehub.dataset_download("uwrfkaggler/ravdess-emotional-speech-audio")
print("Path to dataset files:", path)

# Sub-folder that is handed to load_data() below; load_data() treats every
# entry inside it as one actor directory.
RAV = os.path.join(path, "audio_speech_actors_01-24")

# Step 2: Define feature extraction function
def extract_features(file_path, n_mfcc=40, duration=2.5, offset=0.5):
    """
    Summarise an audio file as a single fixed-length MFCC vector.

    Args:
        file_path: Path of the audio file to read.
        n_mfcc: Number of MFCC coefficients to compute.
        duration: Length in seconds of the excerpt that is analysed.
        offset: Number of seconds skipped at the start of the file.

    Returns:
        1-D array with ``n_mfcc`` values: each coefficient averaged over all
        analysis frames of the excerpt.

    Raises:
        ValueError: If the excerpt contains no samples (for example the file
            is shorter than ``offset``).
    """
    y, sr = librosa.load(file_path, duration=duration, offset=offset)  # Load the excerpt
    if y.size == 0:
        # Fail with a readable message instead of an error from deep inside librosa.
        raise ValueError(
            f"No audio samples in {file_path!r} after skipping {offset}s; cannot extract features"
        )
    mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=n_mfcc)  # Shape: (n_mfcc, frames)
    # Average over time (frames). This is a summary, not a scaling step.
    mfcc_mean = np.mean(mfcc.T, axis=0)
    return mfcc_mean

# Step 3: Prepare data
def load_data(dataset_path):
    """
    Load dataset, extract features, and prepare labels.

    Every sub-directory of ``dataset_path`` is treated as one actor folder and
    every ``.wav`` file inside it becomes one sample. The label comes from the
    third dash-separated field of the file name, mapped through ``emotions``.

    Args:
        dataset_path: Directory that contains the actor folders.

    Returns:
        Tuple ``(X, y)`` of NumPy arrays: the feature vectors returned by
        ``extract_features`` and the matching emotion names.

    Raises:
        KeyError / IndexError: If a ``.wav`` file name does not carry a known
            emotion code in its third field.
    """
    emotions = {
        "01": "neutral",
        "02": "calm",
        "03": "happy",
        "04": "sad",
        "05": "angry",
        "06": "fearful",
        "07": "disgust",
        "08": "surprised",
    }
    X, y = [], []
    # os.listdir() order is arbitrary; sorting keeps the sample order, and so
    # any seeded train/test split made from it, identical across runs and machines.
    for actor in sorted(os.listdir(dataset_path)):  # Iterate over actor folders
        actor_folder = os.path.join(dataset_path, actor)
        if not os.path.isdir(actor_folder):
            # Stray files next to the actor folders (readme, .DS_Store, ...) are not data.
            continue
        for file in tqdm(sorted(os.listdir(actor_folder)), desc=f"Processing actor {actor}"):
            if file.endswith(".wav"):
                # Parse the label first so a malformed name fails before the
                # comparatively expensive feature extraction runs.
                emotion_label = emotions[file.split("-")[2]]  # Extract emotion from filename
                file_path = os.path.join(actor_folder, file)
                X.append(extract_features(file_path))
                y.append(emotion_label)
    return np.array(X), np.array(y)

# Build the feature matrix X and the label array y from the RAVDESS folder.
print("Loading data...")
X, y = load_data(RAV)

# Step 4: Split data into training and testing sets
# 80/20 random split with a fixed seed.
# NOTE(review): the split is made over individual clips without `stratify`
# or grouping, so clips of the same actor can land in both train and test;
# the reported scores are therefore not speaker-independent.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
print(f"Training samples: {len(X_train)}, Testing samples: {len(X_test)}")

# Step 5: Train a model (Random Forest Classifier)
print("Training the model...")
clf = RandomForestClassifier(n_estimators=100, random_state=42)
clf.fit(X_train, y_train)

# Step 6: Evaluate the model
# Labels are the emotion names themselves, so the report rows are named.
y_pred = clf.predict(X_test)
print("Classification Report:\n", classification_report(y_test, y_pred))
print("Accuracy:", accuracy_score(y_test, y_pred))


Path to dataset files: /root/.cache/kagglehub/datasets/uwrfkaggler/ravdess-emotional-speech-audio/versions/1
Loading data...


Processing actor Actor_04: 100%|██████████| 60/60 [00:00<00:00, 63.42it/s]
Processing actor Actor_14: 100%|██████████| 60/60 [00:00<00:00, 60.15it/s]
Processing actor Actor_01: 100%|██████████| 60/60 [00:00<00:00, 61.36it/s]
Processing actor Actor_11: 100%|██████████| 60/60 [00:01<00:00, 33.42it/s]
Processing actor Actor_06: 100%|██████████| 60/60 [00:01<00:00, 31.59it/s]
Processing actor Actor_13: 100%|██████████| 60/60 [00:01<00:00, 32.84it/s]
Processing actor Actor_17: 100%|██████████| 60/60 [00:00<00:00, 60.15it/s]
Processing actor Actor_16: 100%|██████████| 60/60 [00:00<00:00, 60.47it/s]
Processing actor Actor_15: 100%|██████████| 60/60 [00:00<00:00, 61.64it/s]
Processing actor Actor_08: 100%|██████████| 60/60 [00:00<00:00, 62.00it/s]
Processing actor Actor_12: 100%|██████████| 60/60 [00:00<00:00, 62.60it/s]
Processing actor Actor_03: 100%|██████████| 60/60 [00:00<00:00, 63.80it/s]
Processing actor Actor_09: 100%|██████████| 60/60 [00:00<00:00, 61.79it/s]
Processing actor Actor_20

Training samples: 1152, Testing samples: 288
Training the model...
Classification Report:
               precision    recall  f1-score   support

       angry       0.79      0.62      0.70        50
        calm       0.60      0.97      0.74        37
     disgust       0.56      0.56      0.56        32
     fearful       0.51      0.63      0.57        30
       happy       0.53      0.35      0.42        46
     neutral       0.75      0.45      0.56        20
         sad       0.59      0.47      0.52        36
   surprised       0.55      0.73      0.63        37

    accuracy                           0.60       288
   macro avg       0.61      0.60      0.59       288
weighted avg       0.61      0.60      0.59       288

Accuracy: 0.6006944444444444


In [None]:
import os
import librosa
import numpy as np
from sklearn.model_selection import train_test_split
from keras.models import Sequential
from keras.layers import Dense, Conv2D, Flatten, Dropout, MaxPooling2D
from keras.utils import to_categorical
from keras.optimizers import Adam
from tqdm import tqdm

# Step 1: Dataset Path
# Resolve the dataset directory through kagglehub instead of hard-coding one
# machine's cache location (/root/.cache/.../versions/1), which breaks on any
# other environment or dataset version.
import kagglehub

path = os.path.join(
    kagglehub.dataset_download("uwrfkaggler/ravdess-emotional-speech-audio"),
    "audio_speech_actors_01-24",
)

# Step 2: Feature Extraction
def extract_features(file_path):
    """
    Describe an audio file with one flat vector of time-averaged features.

    A 2.5 s excerpt starting 0.5 s into the file is loaded. MFCCs (40
    coefficients), a mel spectrogram and a chromagram are computed from it,
    each is averaged along axis 1, and the three averages are concatenated in
    that order.
    """
    signal, rate = librosa.load(file_path, duration=2.5, offset=0.5)
    spectral_maps = (
        librosa.feature.mfcc(y=signal, sr=rate, n_mfcc=40),
        librosa.feature.melspectrogram(y=signal, sr=rate),
        librosa.feature.chroma_stft(y=signal, sr=rate),
    )
    # One mean per row of every map, joined into a single 1-D vector.
    return np.concatenate([np.mean(spectral_map, axis=1) for spectral_map in spectral_maps])

# Step 3: Data Preparation
def load_data(dataset_path):
    """
    Walk the actor folders below ``dataset_path`` and build the training arrays.

    Each ``.wav`` file contributes the vector returned by ``extract_features``
    and the emotion name looked up from the third dash-separated field of its
    file name.

    Returns:
        ``(features, labels)`` as two NumPy arrays in traversal order.
    """
    code_to_emotion = {
        "01": "neutral",
        "02": "calm",
        "03": "happy",
        "04": "sad",
        "05": "angry",
        "06": "fearful",
        "07": "disgust",
        "08": "surprised",
    }
    feature_rows = []
    label_rows = []
    for actor_name in os.listdir(dataset_path):
        actor_dir = os.path.join(dataset_path, actor_name)
        progress = tqdm(os.listdir(actor_dir), desc=f"Processing actor {actor_name}")
        for file_name in progress:
            if not file_name.endswith(".wav"):
                continue
            feature_rows.append(extract_features(os.path.join(actor_dir, file_name)))
            label_rows.append(code_to_emotion[file_name.split("-")[2]])
    return np.array(feature_rows), np.array(label_rows)

# Extract features for every clip below `path`; X holds the feature vectors
# and y the emotion names (see load_data above).
print("Loading data...")
X, y = load_data(path)


Loading data...


Processing actor Actor_04: 100%|██████████| 60/60 [00:02<00:00, 25.24it/s]
Processing actor Actor_14: 100%|██████████| 60/60 [00:03<00:00, 17.86it/s]
Processing actor Actor_01: 100%|██████████| 60/60 [00:04<00:00, 14.77it/s]
Processing actor Actor_11: 100%|██████████| 60/60 [00:02<00:00, 24.04it/s]
Processing actor Actor_06: 100%|██████████| 60/60 [00:04<00:00, 12.26it/s]
Processing actor Actor_13: 100%|██████████| 60/60 [00:03<00:00, 19.49it/s]
Processing actor Actor_17: 100%|██████████| 60/60 [00:04<00:00, 13.43it/s]
Processing actor Actor_16: 100%|██████████| 60/60 [00:02<00:00, 25.61it/s]
Processing actor Actor_15: 100%|██████████| 60/60 [00:02<00:00, 24.55it/s]
Processing actor Actor_08: 100%|██████████| 60/60 [00:02<00:00, 25.40it/s]
Processing actor Actor_12: 100%|██████████| 60/60 [00:02<00:00, 25.15it/s]
Processing actor Actor_03: 100%|██████████| 60/60 [00:04<00:00, 12.53it/s]
Processing actor Actor_09: 100%|██████████| 60/60 [00:02<00:00, 23.00it/s]
Processing actor Actor_20

In [None]:
# Encode labels
import os
import librosa
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from keras.models import Sequential
from keras.layers import Conv1D, MaxPooling1D, Dropout, Flatten, Dense
from keras.utils import to_categorical
from keras.optimizers import Adam
from tqdm import tqdm

# Import necessary libraries
from sklearn.preprocessing import LabelEncoder
from keras.utils import to_categorical
import numpy as np

from sklearn.metrics import classification_report

# Convert multi-column labels to 1D if necessary.
# NOTE: `y` is overwritten below with its one-hot form, so on a second run of
# this cell this branch turns it back into integer ids and the encoder is then
# fitted on those ids: `encoder.classes_` no longer holds the emotion names.
# Re-run the data-loading cell first to restore the string labels.
if y.ndim > 1:
    y = np.argmax(y, axis=1)  # Flatten one-hot encoded labels to integer labels

# Apply label encoding (class ids follow the sorted order of the labels)
encoder = LabelEncoder()
y = encoder.fit_transform(y)

# Convert to one-hot encoding
y = to_categorical(y)

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Reshape for CNN input (from (samples, features) to (samples, features, 1))
X_train = X_train.reshape(X_train.shape[0], X_train.shape[1], 1)
X_test = X_test.reshape(X_test.shape[0], X_test.shape[1], 1)

# Step 4: CNN Model
# Two Conv1D/MaxPooling/Dropout stages over the feature axis, followed by a
# dense layer and a softmax with one unit per one-hot label column.
model = Sequential([
    Conv1D(64, 3, activation='relu', input_shape=(X_train.shape[1], 1)),
    MaxPooling1D(pool_size=2),
    Dropout(0.3),
    Conv1D(128, 3, activation='relu'),
    MaxPooling1D(pool_size=2),
    Dropout(0.3),
    Flatten(),
    Dense(256, activation='relu'),
    Dropout(0.3),
    Dense(y_train.shape[1], activation='softmax')
])

model.compile(optimizer=Adam(learning_rate=0.001), loss='categorical_crossentropy', metrics=['accuracy'])
model.summary()

# Step 5: Train the Model
# NOTE(review): the test split doubles as validation data here, so the
# per-epoch val_* numbers are computed on the same samples as the final score.
history = model.fit(X_train, y_train, epochs=30, batch_size=32, validation_data=(X_test, y_test))

# Step 6: Evaluate model
loss, accuracy = model.evaluate(X_test, y_test)
print(f"Test Accuracy: {accuracy:.2f}")

# Classification Report
y_pred = model.predict(X_test)
y_pred_classes = np.argmax(y_pred, axis=1)
y_test_classes = np.argmax(y_test, axis=1)

# Printed once, with the class names instead of bare integer ids. `labels`
# pins the row order to the encoder's ids even if a class is absent from the
# test split.
print(
    "Classification Report:\n",
    classification_report(
        y_test_classes,
        y_pred_classes,
        labels=np.arange(len(encoder.classes_)),
        target_names=[str(name) for name in encoder.classes_],
    ),
)

# Save the trained model as an .h5 file
model.save('emotion_model.h5')
print("Model saved as emotion_model.h5")

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/30
[1m36/36[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 49ms/step - accuracy: 0.1693 - loss: 4.5027 - val_accuracy: 0.1632 - val_loss: 1.9674
Epoch 2/30
[1m36/36[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 42ms/step - accuracy: 0.2253 - loss: 1.9901 - val_accuracy: 0.3021 - val_loss: 1.8610
Epoch 3/30
[1m36/36[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 73ms/step - accuracy: 0.3051 - loss: 1.8304 - val_accuracy: 0.3403 - val_loss: 1.7419
Epoch 4/30
[1m36/36[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 54ms/step - accuracy: 0.3214 - loss: 1.7962 - val_accuracy: 0.3438 - val_loss: 1.7040
Epoch 5/30
[1m36/36[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 42ms/step - accuracy: 0.3259 - loss: 1.7904 - val_accuracy: 0.3750 - val_loss: 1.6781
Epoch 6/30
[1m36/36[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 44ms/step - accuracy: 0.3858 - loss: 1.6717 - val_accuracy: 0.3993 - val_loss: 1.5706
Epoch 7/30
[1m36/36[0m [32m━━━━



Classification Report:
               precision    recall  f1-score   support

           0       0.95      0.70      0.80        50
           1       0.67      0.81      0.73        37
           2       0.73      0.59      0.66        32
           3       0.62      0.70      0.66        30
           4       0.64      0.30      0.41        46
           5       0.56      0.50      0.53        20
           6       0.44      0.61      0.51        36
           7       0.59      0.89      0.71        37

    accuracy                           0.64       288
   macro avg       0.65      0.64      0.63       288
weighted avg       0.67      0.64      0.63       288

Classification Report:
               precision    recall  f1-score   support

           0       0.95      0.70      0.80        50
           1       0.67      0.81      0.73        37
           2       0.73      0.59      0.66        32
           3       0.62      0.70      0.66        30
           4       0.64      0

In [None]:
# Print all unique moods from the dataset
# `encoder` is the LabelEncoder fitted in an earlier cell; its `classes_`
# attribute lists the distinct labels it was fitted on. Position i in that
# list is the class id i used by the model output.
unique_moods = encoder.classes_  # Retrieve the classes from the LabelEncoder
print("All Moods in the Dataset:")
for mood in unique_moods:
    print(mood)


All Moods in the Dataset:
angry
calm
disgust
fearful
happy
neutral
sad
surprised


In [None]:
!pip install pydub



In [None]:
import os
import librosa
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from keras.models import Sequential
from keras.layers import Conv1D, MaxPooling1D, Dropout, Flatten, Dense
from keras.utils import to_categorical
from keras.optimizers import Adam
from tqdm import tqdm
import soundfile as sf  # For saving audio (optional)
from sklearn.metrics import classification_report

# --- Feature Extraction ---
def extract_features(file_path):
    """
    Turn one audio file into a single 1-D feature vector.

    Reads 2.5 s of audio starting at 0.5 s, then concatenates the per-row
    means (axis 1) of the MFCC matrix (40 coefficients), the mel spectrogram
    and the chromagram, in that order.
    """
    waveform, sample_rate = librosa.load(file_path, duration=2.5, offset=0.5)
    mfcc_mean = np.mean(librosa.feature.mfcc(y=waveform, sr=sample_rate, n_mfcc=40), axis=1)
    mel_mean = np.mean(librosa.feature.melspectrogram(y=waveform, sr=sample_rate), axis=1)
    chroma_mean = np.mean(librosa.feature.chroma_stft(y=waveform, sr=sample_rate), axis=1)
    return np.concatenate([mfcc_mean, mel_mean, chroma_mean])

# --- Data Loading and Preprocessing ---
def load_and_preprocess_data(dataset_path):
    """
    Build the feature matrix, one-hot labels and label encoder for a dataset.

    Every sub-directory of ``dataset_path`` is treated as an actor folder and
    every ``.wav`` file inside it as one sample. The emotion is taken from the
    third dash-separated field of the file name.

    Args:
        dataset_path: Directory that contains the actor folders.

    Returns:
        Tuple ``(X, y, encoder)``: feature array, one-hot encoded labels, and
        the fitted ``LabelEncoder`` that maps class ids back to emotion names.

    Raises:
        ValueError: If no ``.wav`` file is found below ``dataset_path``.
        KeyError / IndexError: If a ``.wav`` file name carries no known
            emotion code in its third field.
    """
    emotions = {
        "01": "neutral",
        "02": "calm",
        "03": "happy",
        "04": "sad",
        "05": "angry",
        "06": "fearful",
        "07": "disgust",
        "08": "surprised",
    }
    X, y = [], []
    # Sorted traversal: os.listdir() order is arbitrary, and the seeded
    # train/test split downstream is only reproducible if the sample order is.
    for actor in sorted(os.listdir(dataset_path)):
        actor_folder = os.path.join(dataset_path, actor)
        if not os.path.isdir(actor_folder):
            # Ignore stray files that sit next to the actor folders.
            continue
        for file in tqdm(sorted(os.listdir(actor_folder)), desc=f"Processing actor {actor}"):
            if file.endswith(".wav"):
                # Resolve the label before the costly feature extraction so a
                # malformed file name fails fast.
                emotion_label = emotions[file.split("-")[2]]
                file_path = os.path.join(actor_folder, file)
                X.append(extract_features(file_path))
                y.append(emotion_label)

    if not X:
        # Otherwise the one-hot encoding below fails on an empty array with an
        # unrelated-looking message.
        raise ValueError(f"No .wav files found under {dataset_path!r}")

    X = np.array(X)
    y = np.array(y)

    encoder = LabelEncoder()
    y = encoder.fit_transform(y)
    y = to_categorical(y)  # One-hot encoding

    return X, y, encoder  # Return features, labels, and encoder

# Load data
# Resolve the RAVDESS folder through kagglehub rather than hard-coding one
# machine's cache path (/root/.cache/.../versions/1), which breaks on any other
# environment or dataset version.
import kagglehub

ravdess_path = os.path.join(
    kagglehub.dataset_download("uwrfkaggler/ravdess-emotional-speech-audio"),
    "audio_speech_actors_01-24",
)
X, y, encoder = load_and_preprocess_data(ravdess_path)

# Train-test split
# NOTE(review): random split over clips, so the same actor can appear in both
# the training and the test split.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Add a trailing channel axis: (samples, features) -> (samples, features, 1)
X_train = X_train.reshape(X_train.shape[0], X_train.shape[1], 1)
X_test = X_test.reshape(X_test.shape[0], X_test.shape[1], 1)

# --- Model Architecture ---
model = Sequential([
    Conv1D(64, 3, activation='relu', input_shape=(X_train.shape[1], 1)),
    MaxPooling1D(pool_size=2),
    Dropout(0.3),
    Conv1D(128, 3, activation='relu'),
    MaxPooling1D(pool_size=2),
    Dropout(0.3),
    Flatten(),
    Dense(256, activation='relu'),
    Dropout(0.3),
    Dense(y_train.shape[1], activation='softmax')  # Output layer for multi-class classification
])

model.compile(optimizer=Adam(learning_rate=0.001), loss='categorical_crossentropy', metrics=['accuracy'])

# Train the model
# NOTE(review): the test split is also passed as validation data.
history = model.fit(X_train, y_train, epochs=30, batch_size=32, validation_data=(X_test, y_test))

# --- Model Evaluation ---
loss, accuracy = model.evaluate(X_test, y_test)
print(f"Test Accuracy: {accuracy:.2f}")

# --- Prediction Function ---
def predict_emotion(audio_data, model, encoder, sample_rate=22050):
    """
    Predicts the emotion from input audio data (NumPy array).

    The training features come from ``librosa.load`` called without ``sr``,
    which loads audio at librosa's default rate of 22050 Hz. Input recorded at
    another rate is therefore resampled to 22050 Hz first; otherwise the
    spectral features would describe different frequency bands than the ones
    the model was trained on.

    Args:
        audio_data (np.ndarray): The audio samples. A 2-D array (for example
            ``(frames, channels)``) is averaged down to mono.
        model: The trained Keras model.
        encoder: The trained LabelEncoder.
        sample_rate (int): The sample rate of the audio data. Defaults to 22050 Hz.

    Returns:
        str or None: The predicted emotion label or None if an error occurs.
    """
    training_sample_rate = 22050
    try:
        signal = np.asarray(audio_data, dtype=np.float32)
        if signal.ndim > 1:
            # The channel axis is taken to be the shorter one, which covers
            # both (frames, channels) and (channels, frames) layouts.
            signal = signal.mean(axis=int(np.argmin(signal.shape)))

        if sample_rate != training_sample_rate:
            signal = librosa.resample(signal, orig_sr=sample_rate, target_sr=training_sample_rate)

        # Feature extraction, reshaped to (1, features, 1) for the model
        features = extract_features_from_array(signal, training_sample_rate).reshape(1, -1, 1)

        # Predict emotion
        probabilities = model.predict(features)[0]
        predicted_class = np.argmax(probabilities)
        predicted_emotion = encoder.inverse_transform([predicted_class])[0]
        return predicted_emotion
    except Exception as e:
        # Best-effort helper: report the problem and signal failure with None.
        print(f"Error predicting emotion: {e}")
        return None

# Feature extraction for NumPy array input
def extract_features_from_array(y, sr):
    """
    Compute the feature vector for audio that is already in memory.

    Concatenates the axis-1 means of the MFCC matrix (40 coefficients), the
    mel spectrogram and the chromagram of ``y`` at sample rate ``sr``.
    """
    extractors = (
        lambda: librosa.feature.mfcc(y=y, sr=sr, n_mfcc=40),
        lambda: librosa.feature.melspectrogram(y=y, sr=sr),
        lambda: librosa.feature.chroma_stft(y=y, sr=sr),
    )
    averaged = []
    for compute in extractors:
        averaged.append(np.mean(compute(), axis=1))
    return np.concatenate(averaged)

# --- Example Usage with Audio File ---
new_audio_file = "/content/Man Upset _ Raging Screams Sound Effect ( 64kbps ) - @nrbots.mp3"  # Replace with your new audio file (wav or mp3)

if os.path.isfile(new_audio_file):
    # Dedicated names: `y` holds the one-hot training labels, so loading the
    # clip into `y` would silently replace them for every later cell.
    new_audio, new_sr = librosa.load(new_audio_file, sr=None)  # Load audio with original sample rate

    # Predict emotion
    predicted_emotion = predict_emotion(new_audio, model, encoder, new_sr)
    if predicted_emotion:
        print(f"Predicted Emotion for the audio: {predicted_emotion}")
else:
    # Keep "Run All" working when the demo clip has not been uploaded.
    print(f"Example audio file not found, skipping demo: {new_audio_file}")

# --- Example Usage with Recorded Audio (Optional) ---
# If you want to record audio:
# import sounddevice as sd
# duration = 2.5  # seconds
# rec_sr = 22050
# recording = sd.rec(int(rec_sr * duration), samplerate=rec_sr, channels=1)
# sd.wait()  # Wait until recording is finished

# predicted_emotion = predict_emotion(recording.ravel(), model, encoder, rec_sr)
# if predicted_emotion:
#     print(f"Predicted Emotion for recorded audio: {predicted_emotion}")


Processing actor Actor_04: 100%|██████████| 60/60 [00:02<00:00, 24.50it/s]
Processing actor Actor_14: 100%|██████████| 60/60 [00:02<00:00, 24.68it/s]
Processing actor Actor_01: 100%|██████████| 60/60 [00:02<00:00, 24.34it/s]
Processing actor Actor_11: 100%|██████████| 60/60 [00:02<00:00, 25.60it/s]
Processing actor Actor_06: 100%|██████████| 60/60 [00:03<00:00, 15.65it/s]
Processing actor Actor_13: 100%|██████████| 60/60 [00:03<00:00, 17.57it/s]
Processing actor Actor_17: 100%|██████████| 60/60 [00:02<00:00, 25.93it/s]
Processing actor Actor_16: 100%|██████████| 60/60 [00:02<00:00, 25.22it/s]
Processing actor Actor_15: 100%|██████████| 60/60 [00:02<00:00, 24.23it/s]
Processing actor Actor_08: 100%|██████████| 60/60 [00:03<00:00, 18.62it/s]
Processing actor Actor_12: 100%|██████████| 60/60 [00:04<00:00, 13.59it/s]
Processing actor Actor_03: 100%|██████████| 60/60 [00:02<00:00, 25.75it/s]
Processing actor Actor_09: 100%|██████████| 60/60 [00:02<00:00, 25.50it/s]
Processing actor Actor_20

Epoch 1/30
[1m36/36[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 47ms/step - accuracy: 0.1811 - loss: 3.9127 - val_accuracy: 0.2882 - val_loss: 1.8967
Epoch 2/30
[1m36/36[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 70ms/step - accuracy: 0.2786 - loss: 1.8862 - val_accuracy: 0.3333 - val_loss: 1.7708
Epoch 3/30
[1m36/36[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 52ms/step - accuracy: 0.2988 - loss: 1.7955 - val_accuracy: 0.3194 - val_loss: 1.7202
Epoch 4/30
[1m36/36[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 41ms/step - accuracy: 0.3209 - loss: 1.7307 - val_accuracy: 0.3854 - val_loss: 1.6889
Epoch 5/30
[1m36/36[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 42ms/step - accuracy: 0.3631 - loss: 1.6878 - val_accuracy: 0.3993 - val_loss: 1.6519
Epoch 6/30
[1m36/36[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 41ms/step - accuracy: 0.3584 - loss: 1.6572 - val_accuracy: 0.4097 - val_loss: 1.5773
Epoch 7/30
[1m36/36[0m [32m━━━━

In [None]:
!pip install gradio



In [None]:
# prompt: write a Gradio app that takes audio input, uses model.h5, and provides the output

import gradio as gr
import librosa
import numpy as np
from tensorflow import keras
from sklearn.preprocessing import LabelEncoder
from pydub import AudioSegment

# Load the trained model and encoder
# 'emotion_model.h5' is the file written by model.save() in an earlier cell.
model = keras.models.load_model('emotion_model.h5')
# Load data (replace with your actual data loading)
# NOTE(review): hard-coded kagglehub cache location; it only exists on a
# machine where the dataset was downloaded to this exact path.
ravdess_path = "/root/.cache/kagglehub/datasets/uwrfkaggler/ravdess-emotional-speech-audio/versions/1/audio_speech_actors_01-24/"
# load_and_preprocess_data is not defined in this cell; it must already exist
# in the kernel from an earlier cell. The call re-extracts features for the
# whole dataset and rebinds X and y as well as encoder.
X, y, encoder = load_and_preprocess_data(ravdess_path) # Assuming you have this function defined

# Feature extraction function (same as before)
def extract_features_from_array(y, sr):
    """
    Build the model's input features from raw samples ``y`` at rate ``sr``.

    The result is one 1-D vector: the axis-1 mean of the MFCCs (40
    coefficients), then of the mel spectrogram, then of the chromagram.
    """
    per_feature_means = [
        np.mean(feature_map, axis=1)
        for feature_map in (
            librosa.feature.mfcc(y=y, sr=sr, n_mfcc=40),
            librosa.feature.melspectrogram(y=y, sr=sr),
            librosa.feature.chroma_stft(y=y, sr=sr),
        )
    ]
    return np.concatenate(per_feature_means)

# Prediction function (modified to handle Gradio input)


Processing actor Actor_04: 100%|██████████| 60/60 [00:02<00:00, 24.25it/s]
Processing actor Actor_14: 100%|██████████| 60/60 [00:02<00:00, 25.53it/s]
Processing actor Actor_01: 100%|██████████| 60/60 [00:02<00:00, 24.87it/s]
Processing actor Actor_11: 100%|██████████| 60/60 [00:02<00:00, 25.09it/s]
Processing actor Actor_06: 100%|██████████| 60/60 [00:04<00:00, 12.78it/s]
Processing actor Actor_13: 100%|██████████| 60/60 [00:02<00:00, 22.80it/s]
Processing actor Actor_17: 100%|██████████| 60/60 [00:02<00:00, 25.59it/s]
Processing actor Actor_16: 100%|██████████| 60/60 [00:02<00:00, 24.28it/s]
Processing actor Actor_15: 100%|██████████| 60/60 [00:02<00:00, 25.20it/s]
Processing actor Actor_08: 100%|██████████| 60/60 [00:04<00:00, 14.93it/s]
Processing actor Actor_12: 100%|██████████| 60/60 [00:03<00:00, 19.27it/s]
Processing actor Actor_03: 100%|██████████| 60/60 [00:02<00:00, 24.26it/s]
Processing actor Actor_09: 100%|██████████| 60/60 [00:02<00:00, 24.83it/s]
Processing actor Actor_20

In [None]:
def predict_emotion(audio_filepath): #changed from audio_file to audio_filepath
    """
    Gradio callback: predict the emotion of an uploaded or recorded audio clip.

    Args:
        audio_filepath: Path of the audio file handed over by the UI, or None
            when the form is submitted without audio.

    Returns:
        str: The predicted emotion label, or a message starting with
        "Error: " when the clip cannot be processed.
    """
    try:
        if audio_filepath is None:
            # Submitted without a clip: answer with a readable message rather
            # than the TypeError text produced by os.path.exists(None).
            return "Error: No audio was provided."

        # Use os.path.exists to check if the path is a valid local file
        if os.path.exists(audio_filepath):
            audio = AudioSegment.from_file(audio_filepath)  # No need for .name
        else:
            #If not a file it should be a bytesIO object (Colab or similar env)
            audio = AudioSegment.from_file_using_temporary_files(audio_filepath)

        # Mono is required: get_array_of_samples() interleaves the channels of
        # a stereo clip, which would otherwise be analysed as one long signal.
        audio = audio.set_frame_rate(22050).set_channels(1)
        audio_data = np.array(audio.get_array_of_samples(), dtype=np.float32)

        peak = np.max(np.abs(audio_data)) if audio_data.size else 0.0
        if peak == 0:
            # Empty or all-zero audio would divide by zero and feed NaNs to the model.
            return "Error: The audio is empty or silent."
        audio_data = audio_data / peak  # Normalize to [-1, 1]

        sr = audio.frame_rate

        features = extract_features_from_array(audio_data, sr).reshape(1, -1, 1)
        probabilities = model.predict(features)[0]
        predicted_class = np.argmax(probabilities)
        predicted_emotion = encoder.inverse_transform([predicted_class])[0]
        return predicted_emotion
    except Exception as e:
        return f"Error: {str(e)}" # Convert exception to string for display




# Gradio UI: one audio input whose value reaches predict_emotion as a file
# path (type="filepath"), and a plain-text output showing the returned string
# (the predicted label or an "Error: ..." message).
iface = gr.Interface(
    fn=predict_emotion,
    inputs=gr.Audio(type="filepath"),
    outputs="text",
    title="Emotion Recognition from Audio",
    description="Upload an audio file to predict the emotion."
)


# Start the app.
iface.launch()

Running Gradio in a Colab notebook requires sharing enabled. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://8fa985b9820605d311.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)




In [None]:
import gradio as gr
import librosa
import numpy as np
from tensorflow import keras
from sklearn.preprocessing import LabelEncoder
from pydub import AudioSegment
import os

# Load the trained model and encoder
# 'emotion_model.h5' is the file written by model.save() in an earlier cell.
model = keras.models.load_model('emotion_model.h5')

# Define emotion labels (ensure these match your trained model's labels)
emotions = ["angry", "calm", "disgust", "fearful", "happy", "neutral", "sad", "surprised"]
encoder = LabelEncoder()
# LabelEncoder assigns ids by the sorted order of the labels it is fitted on,
# so the order of this list is irrelevant; what must match training is the
# set of label strings.
encoder.fit(emotions)

# Background colour of the result box for each predicted emotion
# (looked up by predict_emotion; unknown labels fall back to white there).
emotion_colors = {
    "angry": "#FF0000",        # Red (symbolizing anger)
    "calm": "#00CED1",         # Dark Turquoise (symbolizing tranquility)
    "disgust": "#556B2F",      # Dark Olive Green (symbolizing unpleasantness)
    "fearful": "#FFFF00",      # Yellow (symbolizing caution or fear)
    "happy": "#FFD700",        # Gold (symbolizing joy)
    "neutral": "#F5F5F5",      # Light Gray (symbolizing neutrality)
    "sad": "#1E90FF",          # Dodger Blue (symbolizing sadness)
    "surprised": "#FF69B4",    # Hot Pink (symbolizing excitement or shock)
}

# Feature extraction function
def extract_features_from_array(y, sr):
    """
    Compute the model's feature vector from in-memory audio.

    Args:
        y: Audio samples.
        sr: Sample rate of ``y``.

    Returns:
        1-D array: the axis-1 means of the MFCCs (40 coefficients), the mel
        spectrogram and the chromagram, concatenated in that order.

    Raises:
        ValueError: If any extraction step fails. The original exception is
            attached as ``__cause__`` so the real traceback is not lost.
    """
    try:
        mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=40)
        mel = librosa.feature.melspectrogram(y=y, sr=sr)
        chroma = librosa.feature.chroma_stft(y=y, sr=sr)
        features = np.concatenate([np.mean(mfcc, axis=1), np.mean(mel, axis=1), np.mean(chroma, axis=1)])
        return features
    except Exception as e:
        # Explicit chaining marks the ValueError as a translation of `e`
        # rather than a second failure that happened while handling it.
        raise ValueError(f"Feature extraction error: {str(e)}") from e

# Prediction function
def predict_emotion(audio_filepath):
    """
    Gradio callback: predict the emotion of an audio clip and render it as HTML.

    Args:
        audio_filepath: Path of the audio file handed over by the UI, or None
            when the form is submitted without audio.

    Returns:
        str: An HTML ``<div>`` with the capitalised emotion on its colour from
        ``emotion_colors``, or a red ``<div>`` containing "Error: ..." when
        the clip cannot be processed.
    """
    import html  # local import: only needed to escape error text below

    try:
        if audio_filepath is None:
            # Submitted without a clip: show a readable message rather than
            # the TypeError text produced by os.path.exists(None).
            raise ValueError("No audio was provided.")

        if os.path.exists(audio_filepath):
            audio = AudioSegment.from_file(audio_filepath)
        else:
            audio = AudioSegment.from_file_using_temporary_files(audio_filepath)

        # Resample and downmix. Mono is required because
        # get_array_of_samples() interleaves the channels of a stereo clip.
        audio = audio.set_frame_rate(22050).set_channels(1)
        audio_data = np.array(audio.get_array_of_samples(), dtype=np.float32)

        peak = np.max(np.abs(audio_data)) if audio_data.size else 0.0
        if peak == 0:
            # Empty or all-zero audio would divide by zero and feed NaNs to the model.
            raise ValueError("The audio is empty or silent.")
        audio_data = audio_data / peak  # Normalize to [-1, 1]
        sr = audio.frame_rate

        # Extract features and reshape for model
        features = extract_features_from_array(audio_data, sr).reshape(1, -1, 1)

        # Make prediction
        probabilities = model.predict(features)[0]
        predicted_class = np.argmax(probabilities)
        predicted_emotion = encoder.inverse_transform([predicted_class])[0]

        # Get color for emotion
        emotion_color = emotion_colors.get(predicted_emotion, "#FFFFFF")  # Default to white
        return f"<div style='width:100%; height:50px; background-color:{emotion_color}; text-align:center; color:black; line-height:50px; border-radius:10px;'>{predicted_emotion.capitalize()}</div>"
    except Exception as e:
        # Exception text can contain file names or other user-controlled
        # characters; escape it before embedding it in the HTML output.
        return f"<div style='color:red;'>Error: {html.escape(str(e))}</div>"

# Gradio interface
# Gradio UI: the audio input reaches predict_emotion as a file path
# (type="filepath"); the HTML string it returns is rendered by gr.HTML().
iface = gr.Interface(
    fn=predict_emotion,
    inputs=gr.Audio(type="filepath"),
    outputs=gr.HTML(),  # HTML output to show styled color box
    title="Emotion Recognition from Audio",
    description="Upload an audio file to predict the emotion, displayed with a corresponding color."
)

# Start the app.
iface.launch()




Running Gradio in a Colab notebook requires sharing enabled. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://a1771cff0d1a803986.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


