In [1]:
pip install librosa








In [1]:
import os
import librosa
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras import layers, models, callbacks
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.layers import Dropout

def extract_mfcc(wav_file_name):
    y, sr = librosa.load(wav_file_name)
    mfcc = np.mean(librosa.feature.mfcc(y=y, sr=sr, n_mfcc=45).T, axis=0)
    return mfcc

def extract_stft(wav_file_name):
    y, sr = librosa.load(wav_file_name)
    stft = np.abs(librosa.stft(y))
    stft_mean = np.mean(stft, axis=1)  # Averaging along the time axis
    return stft_mean

def extract_features(wav_file_name):
    mfcc = extract_mfcc(wav_file_name)
    stft = extract_stft(wav_file_name)
    return np.concatenate((mfcc, stft))

ravdess_speech_labels = []
ravdess_speech_data = []

for dirname, _, filenames in os.walk(r'C:\Users\Aprajit Sharma\Desktop\Sentiment Analysis\ravdess'):
    for filename in filenames:
        if filename.endswith(".wav"):
            ravdess_speech_labels.append(int(filename[7:8])-1)
            wav_file_name = os.path.join(dirname, filename)
            ravdess_speech_data.append(extract_features(wav_file_name))

ravdess_speech_data_array = np.asarray(ravdess_speech_data)
ravdess_speech_label_array = np.array(ravdess_speech_labels)

print("Feature data shape:", ravdess_speech_data_array.shape)
print("Labels shape:", ravdess_speech_label_array.shape)

# Prepare the data for training
labels_categorical = to_categorical(ravdess_speech_label_array, num_classes=8)

x_train, x_test, y_train, y_test = train_test_split(
    np.expand_dims(ravdess_speech_data_array, axis=-1), 
    labels_categorical, 
    test_size=0.30, 
    random_state=9
)

print("x_train shape:", x_train.shape)
print("y_train shape:", y_train.shape)
print("x_test shape:", x_test.shape)
print("y_test shape:", y_test.shape)

# Define and compile the model
early_stopping = callbacks.EarlyStopping(
    monitor='loss',  
    patience=15,          
    restore_best_weights=True  
)

def model_cnn():
    input_shape = (ravdess_speech_data_array.shape[1], 1)
    model = models.Sequential([
        layers.Conv1D(64, kernel_size=3, activation='relu', input_shape=input_shape),
        layers.MaxPooling1D(pool_size=2),
        layers.Dropout(0.2),
        layers.Conv1D(32, kernel_size=3, activation='relu'),
        layers.MaxPooling1D(pool_size=2),
        layers.Dropout(0.5),
        layers.Flatten(),
        layers.Dense(64, activation='relu'),
        layers.Dense(8, activation='softmax')  
    ])
    
    model.compile(optimizer='nadam',
                  loss='categorical_crossentropy', 
                  metrics=['accuracy'])
    
    return model

model_A = model_cnn()

# Train the model
history = model_A.fit(x_train, y_train,
                      validation_split=0.1,
                      epochs=100,
                      shuffle=True,
                      callbacks=[early_stopping])

# Emotion labels
emotions = {
    0: 'neutral',
    1: 'calm',
    2: 'happy',
    3: 'sad',
    4: 'angry',
    5: 'fearful',
    6: 'disgust',
    7: 'surprised'
}

# Prediction function
def predict(wav_filepath):
    test_point = extract_features(wav_filepath)
    test_point = np.reshape(test_point, newshape=(1, test_point.shape[0], 1))
    predictions = model_A.predict(test_point)
    print(emotions[np.argmax(predictions[0])])

# # Test the predict function
# predict('C:/Users/Aprajit Sharma/Desktop/Sentiment Analysis/ravdess/Actor_01/03-01-01-01-01-01-01.wav')

# Evaluate the model
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score

predictions = model_A.predict(x_test)
y_pred = np.argmax(predictions, axis=1)
y_true = np.argmax(y_test, axis=1)

precision = precision_score(y_true, y_pred, average='weighted')
recall = recall_score(y_true, y_pred, average='weighted')
f1 = f1_score(y_true, y_pred, average='weighted')
accuracy = accuracy_score(y_true, y_pred)

print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)


Feature data shape: (2880, 1070)
Labels shape: (2880,)
x_train shape: (2016, 1070, 1)
y_train shape: (2016, 8)
x_test shape: (864, 1070, 1)
y_test shape: (864, 8)


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/100
[1m57/57[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 45ms/step - accuracy: 0.1943 - loss: 2.2999 - val_accuracy: 0.2921 - val_loss: 1.7361
Epoch 2/100
[1m57/57[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 34ms/step - accuracy: 0.3657 - loss: 1.6723 - val_accuracy: 0.3366 - val_loss: 1.5964
Epoch 3/100
[1m57/57[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 47ms/step - accuracy: 0.4052 - loss: 1.5178 - val_accuracy: 0.3960 - val_loss: 1.5488
Epoch 4/100
[1m57/57[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 48ms/step - accuracy: 0.4525 - loss: 1.4448 - val_accuracy: 0.3911 - val_loss: 1.5159
Epoch 5/100
[1m57/57[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 44ms/step - accuracy: 0.4917 - loss: 1.3579 - val_accuracy: 0.4653 - val_loss: 1.4105
Epoch 6/100
[1m57/57[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 42ms/step - accuracy: 0.4958 - loss: 1.3413 - val_accuracy: 0.4505 - val_loss: 1.4025
Epoch 7/100
[1m57/57[0m [

[1m57/57[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 34ms/step - accuracy: 0.9133 - loss: 0.2694 - val_accuracy: 0.8119 - val_loss: 0.5734
Epoch 52/100
[1m57/57[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 35ms/step - accuracy: 0.9272 - loss: 0.2234 - val_accuracy: 0.8317 - val_loss: 0.6290
Epoch 53/100
[1m57/57[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 35ms/step - accuracy: 0.9079 - loss: 0.2674 - val_accuracy: 0.8168 - val_loss: 0.5616
Epoch 54/100
[1m57/57[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 33ms/step - accuracy: 0.9248 - loss: 0.2322 - val_accuracy: 0.8416 - val_loss: 0.5946
Epoch 55/100
[1m57/57[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 32ms/step - accuracy: 0.9026 - loss: 0.2814 - val_accuracy: 0.8317 - val_loss: 0.6399
Epoch 56/100
[1m57/57[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 37ms/step - accuracy: 0.9020 - loss: 0.2824 - val_accuracy: 0.8515 - val_loss: 0.5304
Epoch 57/100
[1m57/57[0m [32m━━━

[1m27/27[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step
Accuracy: 0.8553240740740741
Precision: 0.8628777802259316
Recall: 0.8553240740740741
F1 Score: 0.8569297666750535


In [2]:
predict('C:/Users/Aprajit Sharma/Desktop/Sentiment Analysis/ravdess/Actor_01/03-01-01-01-01-01-01.wav')

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 125ms/step
neutral


In [3]:
model_A.save('audio_sentiment_model.h5')


