In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import librosa
import soundfile
import os, glob, pickle
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import tensorflow as tf
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Conv2D, Conv1D, Concatenate, Dense, Multiply, Add, Reshape, Flatten, Dropout
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [3]:
# Function to extract MFCC features
def extract_mfcc(audio_file, n_mfcc=26, hop_length=512, pad_length=300):
    y, sr = librosa.load(audio_file, sr=None)  # Load audio file
    mfccs = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=n_mfcc, hop_length=hop_length)  # Extract MFCC features
    # Pad or truncate to a fixed length
    if mfccs.shape[1] < pad_length:
        mfccs = np.pad(mfccs, ((0, 0), (0, pad_length - mfccs.shape[1])), mode='constant')
    else:
        mfccs = mfccs[:, :pad_length]
    return mfccs

# Path to the RAVDESS dataset directory
ravdess_dir = "/content/drive/MyDrive/dataset/"

# Lists to hold the MFCCs and labels
mfccs_data = []
labels = []

# Mapping for emotions
emotion_map = {
    '01': 'neutral',
    '02': 'calm',
    '03': 'happy',
    '04': 'sad',
    '05': 'angry',
    '06': 'fearful',
    '07': 'disgust',
    '08': 'surprised'
}

# Emotions to include
included_emotions = ['neutral', 'angry', 'sad', 'happy']

# Iterate through each audio file in the dataset
for actor_dir in os.listdir(ravdess_dir):
    actor_subdir = os.path.join(ravdess_dir, actor_dir)
    if os.path.isdir(actor_subdir):
        for filename in os.listdir(actor_subdir):
            if filename.endswith('.wav'):
                emotion_code = filename.split('-')[2]
                emotion = emotion_map[emotion_code]
                if emotion in included_emotions:
                    audio_file = os.path.join(actor_subdir, filename)
                    mfccs = extract_mfcc(audio_file)
                    mfccs_data.append(mfccs)
                    labels.append(included_emotions.index(emotion))  # Using index as label
                else:
                    continue

# Convert lists to numpy arrays
mfccs_data = np.array(mfccs_data)
labels = np.array(labels)

# Print shape of the MFCCs and labels arrays
print("MFCCs shape:", mfccs_data.shape)
print("Labels shape:", labels.shape)

MFCCs shape: (672, 26, 300)
Labels shape: (672,)


In [9]:
# Split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(mfccs_data, labels, test_size=0.4, random_state=0)
# X_train=StandardScaler().fit_transform(X_train)

In [10]:
print("X_train shape:", X_train.shape)
print("X_test shape:", X_test.shape)
print("y_train shape:", y_train.shape)
print("y_test shape:", y_test.shape)

X_train shape: (403, 26, 300)
X_test shape: (269, 26, 300)
y_train shape: (403,)
y_test shape: (269,)


In [11]:
# Assuming y_train and y_test are already defined
unique_train, counts_train = np.unique(y_train, return_counts=True)
print("Unique values in y_train:", unique_train)
print("Counts in y_train:", counts_train)

unique_test, counts_test = np.unique(y_test, return_counts=True)
print("Unique values in y_test:", unique_test)
print("Counts in y_test:", counts_test)

Unique values in y_train: [0 1 2 3]
Counts in y_train: [ 60 121 109 113]
Unique values in y_test: [0 1 2 3]
Counts in y_test: [36 71 83 79]


In [12]:
import tensorflow as tf
from tensorflow.keras.layers import Input, GlobalAveragePooling2D
from tensorflow.keras.models import Model

def mua_module(input_tensor):
    # Weighting the Time Dimension
    x_time = tf.transpose(input_tensor, perm=[0, 2, 3, 1])
    x_time_gap = tf.reduce_mean(x_time, axis=[1, 2], keepdims=True)
    x_time = Dense(4, activation='relu')(x_time_gap)
    x_time = Dense(1, activation='relu')(x_time)
    x_time = Reshape((1, 1, -1))(x_time)

    # Weighting the Frequency Dimension
    x_frequency = tf.transpose(input_tensor, perm=[0, 2, 3, 1])
    x_frequency_gap = tf.reduce_mean(x_frequency, axis=[1, 2], keepdims=True)
    x_frequency = Dense(2, activation='relu')(x_frequency_gap)
    x_frequency = Dense(1, activation='relu')(x_frequency)
    x_frequency = Reshape((1, 1, -1))(x_frequency)

    # Weighting the Channel Dimension
    x_channel_gap = tf.reduce_mean(input_tensor, axis=[1, 2], keepdims=True)
    q = Conv2D(filters=input_tensor.shape[3], kernel_size=(3, 3), padding='same')(x_channel_gap)
    k = Conv2D(filters=input_tensor.shape[3], kernel_size=(3, 3), padding='same')(x_channel_gap)
    v = Conv2D(filters=input_tensor.shape[3], kernel_size=(3, 3), padding='same')(x_channel_gap)
    h = tf.nn.softmax(q * k) * v

    # Concatenate h along the channel axis
    h_concat = Concatenate(axis=-1)([h, h, h])

    w = Dense(input_tensor.shape[3], activation='tanh')(h_concat)
    w = Reshape((1, 1, -1))(w)

    # Apply attention weights to the input tensor
    x_out = Multiply()([input_tensor, x_time])
    x_out = Add()([x_out, Multiply()([input_tensor, x_frequency])])
    x_out = Add()([x_out, Multiply()([input_tensor, w])])
    return x_out
def create_mlf_extractor_with_mua(input_shape, num_classes):
    input_layer = Input(shape=input_shape)
    conv_4x4 = Conv2D(filters=64, kernel_size=(4, 4), activation='relu', padding='same')(input_layer)
    conv_4x4 = Conv2D(filters=32, kernel_size=(4, 4), activation='relu', padding='same')(conv_4x4)
    conv_4x4 = Dropout(0.3)(conv_4x4)
    conv_2x8 = Conv2D(filters=64, kernel_size=(2, 8), activation='relu', padding='same')(input_layer)
    conv_2x8 = Conv2D(filters=32, kernel_size=(2, 8), activation='relu', padding='same')(conv_2x8)
    conv_2x8 = Dropout(0.3)(conv_2x8)
    conv_10x2 = Conv2D(filters=64, kernel_size=(10, 2), activation='relu', padding='same')(input_layer)
    conv_10x2 = Conv2D(filters=32, kernel_size=(10, 2), activation='relu', padding='same')(conv_10x2)
    conv_10x2= Dropout(0.3)(conv_10x2)
    merged_layer = Concatenate()([conv_4x4, conv_2x8, conv_10x2])
    mua_out = mua_module(merged_layer)
    GlobalAvg_layer = GlobalAveragePooling2D()(mua_out)
    dense_layer_1 = Dense(1024, activation='relu')(GlobalAvg_layer)
    dense_layer_2 = Dense(1024, activation='relu')(dense_layer_1)
    output_layer = Dense(num_classes, activation='softmax')(dense_layer_2)
    model = Model(inputs=input_layer, outputs=output_layer)
    return model

In [8]:
def create_compile_model(input_shape, num_classes):
    model = create_mlf_extractor_with_mua(input_shape, num_classes)
    optimizer = tf.keras.optimizers.Adam(learning_rate=0.0001)  # Adjust the learning rate
    model.compile(optimizer='Adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
    return model

# Create and compile the model
num_classes = len(np.unique(y_train))
input_shape = X_train[0].shape + (1,)
model = create_compile_model(input_shape, num_classes)

ValueError: A KerasTensor cannot be used as input to a TensorFlow function. A KerasTensor is a symbolic placeholder for a shape and dtype, used when constructing Keras Functional models or Keras Functions. You can only use it as input to a Keras layer or a Keras operation (from the namespaces `keras.layers` and `keras.operations`). You are likely doing something like:

```
x = Input(...)
...
tf_fn(x)  # Invalid.
```

What you should do instead is wrap `tf_fn` in a layer:

```
class MyLayer(Layer):
    def call(self, x):
        return tf_fn(x)

x = MyLayer()(x)
```


In [None]:
model.summary()

In [None]:
y_train_reshaped = tf.keras.utils.to_categorical(y_train, num_classes)
y_test_reshaped = tf.keras.utils.to_categorical(y_test, num_classes)

In [None]:
y_train = np.expand_dims(y_train, axis=1)
y_test = np.expand_dims(y_test, axis=1)

In [None]:
# Define input shape and number of classes
input_shape = (26, 300, 1)  # Assuming input shape for MFCC features
num_classes = 4  # Number of output classes

# Create model
model = create_mlf_extractor_with_mua(input_shape, num_classes)

# Compile the model
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])


# Reshape the input data to include the channel dimension
X_train_reshaped = X_train[..., np.newaxis]
X_test_reshaped = X_test[..., np.newaxis]

# # Reshape the labels to match the logits shape
# y_train_reshaped = tf.keras.utils.to_categorical(y_train, num_classes)
# y_test_reshaped = tf.keras.utils.to_categorical(y_test, num_classes)

# Define early stopping
#early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)

# Train the model using the reshaped input data
history = model.fit(X_train_reshaped, y_train, epochs=100, batch_size=32, validation_data=(X_test_reshaped,y_test))

# Evaluate the model on the test set
test_loss, test_accuracy = model.evaluate(X_test_reshaped, y_test, batch_size=32)
print("Test Loss:", test_loss)
print("Test Accuracy:", test_accuracy)

In [None]:
from keras.models import load_model

save_directory = "/content/drive/MyDrive/dataset/"
model.save(save_directory + 'model.h5')

# Optionally, you can load the model later using:
# loaded_model = load_model(save_directory + 'model.h5')

In [None]:
from sklearn.metrics import confusion_matrix

# Obtain predictions on test data
y_pred = model.predict(X_test_reshaped)
y_pred_classes = np.argmax(y_pred, axis=1)

# Compute confusion matrix
conf_matrix = confusion_matrix(y_test, y_pred_classes)

# Define labels for the confusion matrix
emotion_labels = ['neutral', 'angry', 'sad', 'happy']

# Plot confusion matrix
plt.figure(figsize=(8, 6))
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues', xticklabels=emotion_labels, yticklabels=emotion_labels)
plt.xlabel('Predicted')
plt.ylabel('True')
plt.title('Confusion Matrix')
plt.show()

In [None]:
#Predict for the test set
y_pred=model.predict(X_test_reshaped)

In [None]:
import pandas as pd

# Mapping of emotion labels to their respective indices
emotion_labels = ['neutral', 'angry', 'sad', 'happy']

# Convert predicted probabilities to emotion labels
predicted_emotions = [emotion_labels[np.argmax(prediction)] for prediction in y_pred]

# Convert integer labels of y_test to emotion labels
actual_emotions = [emotion_labels[label] for label in y_test]

# Create a DataFrame to compare actual and predicted emotions
df = pd.DataFrame({'Actual': actual_emotions, 'Predicted': predicted_emotions})

# Print the first 20 rows of the DataFrame
print(df.head(20))