In [1]:
# pip install pydub noisereduce json-tricks

In [2]:
import numpy as np
import os
from json_tricks import dump, load

from pydub import AudioSegment, effects
import librosa
import noisereduce as nr

import tensorflow as tf
import keras
import sklearn



In [3]:
# 'emotions' list fix for classification purposes:
#     Classification values start from 0, Thus an 'n = n-1' operation has been executed for both RAVDESS and TESS databases:
def emotionfix(e_num):
    if e_num == "01":   return 0 # neutral
    elif e_num == "02": return 1 # calm
    elif e_num == "03": return 2 # happy
    elif e_num == "04": return 3 # sad
    elif e_num == "05": return 4 # angry
    elif e_num == "06": return 5 # fear
    elif e_num == "07": return 6 # disgust
    else:               return 7 # suprised

# Maximum samples count for padding purposes.

sample_lengths = []
folder_path = 'Audio_Speech_Actors_01-24'

for subdir, dirs, files in os.walk(folder_path):
  for file in files: 
    x, sr = librosa.load(path = os.path.join(subdir,file), sr = None)
    xt, index = librosa.effects.trim(x, top_db=30)
     
    sample_lengths.append(len(xt))

print('Maximum sample length:', np.max(sample_lengths))       

In [4]:
# Maximum samples count for padding purposes.

sample_lengths = []
folder_path = 'RAVDESS/'

for subdir, dirs, files in os.walk(folder_path):
  for file in files: 
    x, sr = librosa.load(path = os.path.join(subdir,file), sr = None)
    xt, index = librosa.effects.trim(x, top_db=30)
     
    sample_lengths.append(len(xt))

print('Maximum sample length:', np.max(sample_lengths))                   

Maximum sample length: 204288


In [20]:
import time
tic = time.perf_counter()

# Initialize data lists
rms = []
zcr = []
mfcc = []
emotions = []

# Initialize variables
total_length = 173056 # desired frame length for all of the audio samples.
frame_length = 2048
hop_length = 512

folder_path = 'RAVDESS/' 

for subdir, dirs, files in os.walk(folder_path):
  for file in files: 

    # Fetch the sample rate.
      _, sr = librosa.load(path = os.path.join(subdir,file), sr = None) # sr (the sample rate) is used for librosa's MFCCs. '_' is irrelevant.
    # Load the audio file.
      rawsound = AudioSegment.from_file(os.path.join(subdir,file)) 
    # Normalize the audio to +5.0 dBFS.
      normalizedsound = effects.normalize(rawsound, headroom = 0) 
    # Transform the normalized audio to np.array of samples.
      normal_x = np.array(normalizedsound.get_array_of_samples(), dtype = 'float32')
    # Trim silence from the beginning and the end.
      xt, index = librosa.effects.trim(normal_x, top_db=30)
      #print(file,"\t", len(xt), "\t", rawsound.dBFS, "\t", normalizedsound.dBFS) #--QA purposes if needed-- 
    # Pad for duration equalization.
      padded_x = np.pad(xt, max(0, (total_length - len(xt))), 'constant')
    # Noise reduction.
      final_x = nr.reduce_noise(padded_x, sr=sr) #updated 03/03/22
       
   # Features extraction 
      f1 = librosa.feature.rms(y=final_x, frame_length=frame_length, hop_length=hop_length)# Energy - Root Mean Square   
      f2 = librosa.feature.zero_crossing_rate(y=final_x , frame_length=frame_length, hop_length=hop_length, center=True) # ZCR      
      f3 = librosa.feature.mfcc(y=final_x, sr=sr, n_mfcc=13, hop_length = hop_length) # MFCC
      
   # Emotion extraction from the different databases
      name = file[6:8]                      

   # Filling the data lists  
      rms.append(f1)
      zcr.append(f2)
      mfcc.append(f3)
      emotions.append(emotionfix(name)) 

toc = time.perf_counter()
print(f"Running time: {(toc - tic)/60:0.4f} minutes")

In [19]:
# Adjusting features shape to the 3D format: (batch, timesteps, feature)

# making all in equal dimensions

f_rms = np.asarray(rms).astype('float32')
f_rms = np.swapaxes(f_rms,1,2)
f_zcr = np.asarray(zcr).astype('float32')
f_zcr = np.swapaxes(f_zcr,1,2)
f_mfccs = np.asarray(mfcc).astype('float32')
f_mfccs = np.swapaxes(f_mfccs,1,2)

print('ZCR shape:',f_zcr.shape)
print('RMS shape:',f_rms.shape)
print('MFCCs shape:',f_mfccs.shape)

MemoryError: Unable to allocate 951. MiB for an array with shape (1440, 1, 173056) and data type float32

In [13]:
# Adjusting features shape to the 3D format: (batch, timesteps, feature)

f_rms = np.asarray(rms).astype('float32')
f_rms = np.swapaxes(f_rms,1,2)
f_zcr = np.asarray(zcr).astype('float32')
f_zcr = np.swapaxes(f_zcr,1,2)
f_mfccs = np.asarray(mfcc).astype('float32')
f_mfccs = np.swapaxes(f_mfccs,1,2)

print('ZCR shape:',f_zcr.shape)
print('RMS shape:',f_rms.shape)
print('MFCCs shape:',f_mfccs.shape)

AxisError: axis1: axis 1 is out of bounds for array of dimension 1

In [None]:
# Concatenating all features to 'X' variable.
X = np.concatenate((f_zcr, f_rms, f_mfccs), axis=2)

# Preparing 'Y' as a 2D shaped variable.
Y = np.asarray(emotions).astype('int8')
Y = np.expand_dims(Y, axis=1)

: 

In [None]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

: 

# Using RBF Kernel SVM to classify the data

In [None]:
from sklearn.svm import SVC
model = SVC(kernel='rbf', C=1, gamma='scale', random_state=42)

: 

In [None]:
model.fit(X_train.reshape(X_train.shape[0], -1), y_train.ravel())

: 

In [None]:
# Make predictions
y_pred = model.predict(X_test.reshape(X_test.shape[0], -1))

# Evaluate performance
from sklearn.metrics import classification_report, accuracy_score
print("Classification Report:")
print(classification_report(y_test, y_pred))
print("Accuracy:", accuracy_score(y_test, y_pred)*100, "%")

: 

In [None]:
from keras.models import Sequential
from keras.layers import Dense, LSTM, Dropout, Flatten

# Define the model architecture
model = Sequential()
model.add(LSTM(128, input_shape=(X_train.shape[1], X_train.shape[2])))
model.add(Dropout(0.5))
model.add(Dense(64, activation='relu'))
model.add(Dense(1, activation='sigmoid'))

# Compile the model
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Print the model summary
model.summary()

# Train the model
history = model.fit(X_train, y_train, epochs=10, batch_size=32, validation_data=(X_test, y_test))

# Evaluate the model
loss, accuracy = model.evaluate(X_test, y_test)
print("Test Loss:", loss)
print("Test Accuracy:", accuracy)


: 

# Using the RandomForestClassifier to classify the data

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
# Standardize features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train.reshape(X_train.shape[0], -1))
X_test_scaled = scaler.transform(X_test.reshape(X_test.shape[0], -1))

# Initialize and train the Random Forest classifier
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)
rf_classifier.fit(X_train_scaled, y_train)

# Make predictions
y_pred = rf_classifier.predict(X_test_scaled)

# Evaluate the model
print("Classification Report:")
print(classification_report(y_test, y_pred))
print("Accuracy:", accuracy_score(y_test, y_pred)*100, "%")

: 

# Using the XGBoost Classifier to classify the data

In [None]:
from xgboost import XGBClassifier

# Standardize features
scaler = StandardScaler()

X_train_scaled = scaler.fit_transform(X_train.reshape(X_train.shape[0], -1))
X_test_scaled = scaler.transform(X_test.reshape(X_test.shape[0], -1))

# Initialize and train the XGBoost classifier
xgb_classifier = XGBClassifier(random_state=42)
xgb_classifier.fit(X_train_scaled, y_train)

# Make predictions
y_pred = xgb_classifier.predict(X_test_scaled)

: 

In [None]:
# Evaluate the model
print("Classification Report:")
print(classification_report(y_test, y_pred))
print("Accuracy:", accuracy_score(y_test, y_pred)*100, "%")

: 

# Using LSTM to classify the data

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout, BatchNormalization
from tensorflow.keras.regularizers import l2
from tensorflow.keras.optimizers import Adam
from sklearn.model_selection import train_test_split

# Split the data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

# Define LSTM model
model = Sequential([
    LSTM(units=128, input_shape=(X_train.shape[1], X_train.shape[2]), return_sequences=True, kernel_regularizer=l2(0.01)),
    Dropout(0.5),
    BatchNormalization(),
    LSTM(units=64),
    Dense(units=num_classes, activation='softmax')
])

# Compile the model
optimizer = Adam(learning_rate=0.001)
model.compile(optimizer=optimizer, loss='categorical_crossentropy', metrics=['accuracy'])

# Train the model
history = model.fit(X_train, to_categorical(y_train), epochs=20, batch_size=32, validation_split=0.2)

# Evaluate the model
loss, accuracy = model.evaluate(X_test, to_categorical(y_test))
print(f'Test Loss: {loss}, Test Accuracy: {accuracy}')

: 

# Using CNN to classify the data

In [None]:
import tensorflow as tf
from tensorflow.keras import layers, models

# Define the CNN model
def create_cnn_model(input_shape):
    model = models.Sequential()

    # Convolutional layers
    model.add(layers.Conv1D(32, 3, activation='relu', input_shape=input_shape))
    model.add(layers.MaxPooling1D(2))
    model.add(layers.Conv1D(64, 3, activation='relu'))
    model.add(layers.MaxPooling1D(2))
    model.add(layers.Conv1D(128, 3, activation='relu'))
    model.add(layers.MaxPooling1D(2))
    model.add(layers.Conv1D(128, 3, activation='relu'))
    model.add(layers.MaxPooling1D(2))

    # Flatten layer
    model.add(layers.Flatten())

    # Dense layers
    model.add(layers.Dense(128, activation='relu'))
    model.add(layers.Dense(64, activation='relu'))
    model.add(layers.Dense(8, activation='softmax'))  # Assuming 8 emotions

    return model

# Input shape should match the shape of the concatenated features
input_shape = X.shape[1:]

# Create the CNN model
cnn_model = create_cnn_model(input_shape)

# Compile the model
cnn_model.compile(optimizer='adam',
                  loss='sparse_categorical_crossentropy',
                  metrics=['accuracy'])

# Train the model
history = cnn_model.fit(X, Y, epochs=20, batch_size=32, validation_split=0.2)

: 

In [None]:
# Evaluate the model
loss, accuracy = cnn_model.evaluate(X, Y)
print(f'Test Loss: {loss}, Test Accuracy: {accuracy*100}%')

: 

In [None]:
# Confusion matrix for the CNN model
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

# Make predictions
y_pred = np.argmax(cnn_model.predict(X), axis=1)

# Compute confusion matrix
cm = confusion_matrix(Y, y_pred)

# Plot confusion matrix
labels = ['neutral', 'calm', 'happy', 'sad', 'angry', 'fear', 'disgust', 'surprised']
plt.figure(figsize=(10, 8))
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=labels)
disp.plot(cmap='Blues', xticks_rotation='vertical')
plt.title('Confusion Matrix')
plt.xlabel('Predicted Label')
plt.ylabel('True Label')
plt.show()


: 

In [None]:
# Save the CNN model
cnn_model.save("emotion_cnn_model.h5")

: 

In [None]:
# Load the CNN model
import tensorflow as tf
loaded_model = tf.keras.models.load_model("emotion_cnn_model.h5")

# # Evaluate the loaded model
# loss, accuracy = loaded_model.evaluate(X, Y)
# print("Accuracy on the test set:", accuracy*100, "%")

: 

In [None]:
# Record audio
import sounddevice as sd
import soundfile as sf

# Record audio
duration = 3  # seconds
sample_rate = 22050  # sample rate
channels = 1  # number of audio channels
filename = 'output.wav'

print("Recording...")
audio = sd.rec(int(duration * sample_rate), samplerate=sample_rate, channels=channels, dtype='float32')
sd.wait()
sf.write(filename, audio, sample_rate)

: 

In [None]:
# Load and preprocess the audio file
import numpy as np
import librosa
import noisereduce as nr

# Load the audio file
x, sr = librosa.load('Audio_Speech_Actors_01-24\Actor_10\\03-01-01-01-01-01-10.wav', sr=None)

# Trim silence
xt, index = librosa.effects.trim(x, top_db=30)

# Pad the audio file
total_length = 173056
padded_x = np.pad(xt, (0, max(total_length-len(xt), 0)), 'constant')

# listen to the audio
import IPython.display as ipd
ipd.Audio(padded_x, rate=sr)

: 

In [None]:
# Noise reduction
final_x = nr.reduce_noise(padded_x, sr=sr)

#listen to the audio
ipd.Audio(final_x, rate=sr)

: 

In [None]:
# Extract features
import librosa

# Extract RMS feature
rms = librosa.feature.rms(y=final_x, frame_length=2048, hop_length=512)

# Extract ZCR feature
zcr = librosa.feature.zero_crossing_rate(final_x, frame_length=2048, hop_length=512, center=True)

# Extract MFCCs
mfcc = librosa.feature.mfcc(y=final_x, sr=sr, n_mfcc=13, hop_length=512)

# Ensure all arrays have the same shape
rms = np.resize(rms, (1, 1440))
zcr = np.resize(zcr, (1, 1440))
mfcc = np.resize(mfcc, (13, 1440))

# Concatenate the features
X = np.concatenate((zcr, rms, mfcc), axis=0)

: 

In [None]:
# load the CNN model
import tensorflow as tf
loaded_model = tf.keras.models.load_model("emotion_cnn_model.h5")

# Transpose the input data to match the expected input shape of the model
X_transposed = np.transpose(X, (1, 0))

# Make predictions
y_pred = np.argmax(loaded_model.predict(np.expand_dims(X_transposed, axis=0)), axis=1)

# Map the predicted label to the corresponding emotion
emotions = ['neutral', 'calm', 'happy', 'sad', 'angry', 'fear', 'disgust', 'surprised']
emotion = emotions[y_pred[0]]
print("Predicted emotion:", emotion)

: 

In [None]:
# predict emotion from in real-time

# define a function to predict the emotion from a given audio file

def predict_emotion(filename):
    # Load the audio file
    x, sr = librosa.load(filename, sr=None)

    # Trim silence
    xt, index = librosa.effects.trim(x, top_db=30)

    # Pad the audio file
    total_length = 173056
    padded_x = np.pad(xt, (0, max(total_length-len(xt), 0)), 'constant')

    # Noise reduction
    final_x = nr.reduce_noise(padded_x, sr=sr)

    # Extract features
    rms = librosa.feature.rms(y=final_x, frame_length=2048, hop_length=512)
    zcr = librosa.feature.zero_crossing_rate(final_x, frame_length=2048, hop_length=512, center=True)
    mfcc = librosa.feature.mfcc(y=final_x, sr=sr, n_mfcc=13, hop_length=512)

    # Ensure all arrays have the same shape
    rms = np.resize(rms, (1, 1440))
    zcr = np.resize(zcr, (1, 1440))
    mfcc = np.resize(mfcc, (13, 1440))

    # Concatenate the features
    X = np.concatenate((zcr, rms, mfcc), axis=0)

    # Transpose the input data to match the expected input shape of the model
    X_transposed = np.transpose(X, (1, 0))

    # Make predictions
    y_pred = np.argmax(loaded_model.predict(np.expand_dims(X_transposed, axis=0)), axis=0)

    # Map the predicted label to the corresponding emotion
    emotions = ['neutral', 'calm', 'happy', 'sad', 'angry', 'fear', 'disgust', 'surprised']
    emotion = emotions[y_pred[0]]
    return emotion

# Record audio
import sounddevice as sd
import soundfile as sf

# Record audio
duration = 3  # seconds
sample_rate = 22050  # sample rate
channels = 1  # number of audio channels
filename = 'recording.wav'

print("Recording...")
audio = sd.rec(int(duration * sample_rate), samplerate=sample_rate, channels=channels, dtype='float32')
sd.wait()
sf.write(filename, audio, sample_rate)

# Predict the emotion
emotion = predict_emotion(filename)
print("Predicted emotion:", emotion)

: 

In [None]:
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
import speech_recognition as sr
reco=sr.Recognizer()
with sr.Microphone() as source:
    reco.adjust_for_ambient_noise(source,duration=1)
    print('Waiting for your message...')
    recordedaudio=reco.listen(source)
    print('Done recording..')

try:
    print('Printing the message..')
    text=reco.recognize_google(recordedaudio,language='en-US')
    print('Your message:{}'.format(text))
except Exception as ex:
    print(ex)

#Sentiment analysis

Statement=[str(text)]
analyser=SentimentIntensityAnalyzer()
for i in Statement:
    v=analyser.polarity_scores(i)
    print(v)

Waiting for your message...
Done recording..
Printing the message..
Your message:this is too scary
{'neg': 0.516, 'neu': 0.484, 'pos': 0.0, 'compound': -0.4939}
