# Import Dependencies

In [1]:
import librosa
import soundfile
import os, glob, pickle
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import confusion_matrix, accuracy_score
import speech_recognition as sr
from nltk.sentiment.vader import SentimentIntensityAnalyzer as SIA

# Part 1 - Speech Emotion Recognition

## Build Functions to Handle Feature Extraction and Load Audio Data

In [6]:
# Extract features from sound file - mfcc, chroma, mel
def extract_features(file, mfcc, chroma, mel):
    with soundfile.SoundFile(file) as sound_file:
        X = sound_file.read(dtype="float32")
        sample_rate=sound_file.samplerate
        if chroma:
            stft=np.abs(librosa.stft(X))
        result=np.array([])
        if mfcc:
            mfccs=np.mean(librosa.feature.mfcc(y=X, sr=sample_rate, n_mfcc=40).T, axis=0)
            result=np.hstack((result, mfccs))
        if chroma:
            chroma=np.mean(librosa.feature.chroma_stft(S=stft, sr=sample_rate).T,axis=0)
            result=np.hstack((result, chroma))
        if mel:
            mel=np.mean(librosa.feature.melspectrogram(X, sr=sample_rate).T,axis=0)
            result=np.hstack((result, mel))
    return result

In [7]:
# Create function to load data and extract features for each sound file
def load_data(directory):
    x = []
    y = []
    for file in glob.glob(f"{directory}/Actor_*/*.wav"):
        filename=os.path.basename(file)
        emotion=emotions[filename.split("-")[2]]
        if emotion not in observed_emotions:
            continue
        feature=extract_features(file, mfcc=True, chroma=True, mel=True)
        x.append(feature)
        y.append(emotion)
    return x, y

## Load Audio Data

In [8]:
# Get directory of sound files
dirloc = os.path.join(os.getcwd(), "speech-emotion-recognition-ravdess-data")

In [9]:
# Provide RAVDESS emotions for encoding
emotions={
  '01':'neutral',
  '02':'calm',
  '03':'happy',
  '04':'sad',
  '05':'angry',
  '06':'fearful',
  '07':'disgust',
  '08':'surprised'
}

# Observed emotions
observed_emotions=['calm', 'happy', 'fearful', 'disgust']

In [10]:
# Load audio data and extract features
x, y = load_data(dirloc)

In [11]:
# Transform x to array
X = np.array(x)

## Split the audio dataset into the Training and Test set

In [17]:
# Split audio dataset into training and test data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

In [18]:
# Review number of samples in each dataset
print(X_train.shape[0], X_test.shape[0])

614 154


In [19]:
# Review number of features
print(f'Features extracted: {X_train.shape[1]}')

Features extracted: 180


## Train MLPClassifier Model with Training Data

In [20]:
# Initialize MLPClassifier instance from Sci-Kit Learn - multi-layer perceptron classifier utilizes internal neural network for classification
classifier = MLPClassifier(alpha=0.01, batch_size=256, epsilon=1e-08, hidden_layer_sizes=(300,), learning_rate='adaptive', max_iter=500)

In [21]:
# Fit MLPClassifier model with training data
classifier.fit(X_train, y_train)

MLPClassifier(alpha=0.01, batch_size=256, hidden_layer_sizes=(300,),
              learning_rate='adaptive', max_iter=500)

## Predict Audio Emotion Based on Test Data

In [24]:
# Predict values based on test set
y_pred = classifier.predict(X_test)

In [25]:
# Create confusion matrix and calculate accuracy score
cm = confusion_matrix(y_test, y_pred)
print(cm)
accuracy_score(y_test, y_pred)

[[34  2  1  1]
 [ 6 26  3  4]
 [ 2  5 21  4]
 [ 2  6  3 34]]


0.7467532467532467

## Predict New Result with New Clip

In [26]:
pred_dirloc = os.path.join(os.getcwd(), 'Prediction_Data')

In [67]:
x_pred_file = pred_dirloc + '/John F. Kennedy - The Declaration of Independence.wav'

In [71]:
y, sr = librosa.load(x_pred_file)

In [72]:
y_mono = librosa.to_mono(y)

In [76]:
stft=np.abs(librosa.stft(y_mono))
result=np.array([])

In [77]:
mfccs=np.mean(librosa.feature.mfcc(y=y_mono, sr=sr, n_mfcc=40).T, axis=0)
result=np.hstack((result, mfccs))

In [78]:
chroma=np.mean(librosa.feature.chroma_stft(S=stft, sr=sr).T,axis=0)
result=np.hstack((result, chroma))

In [79]:
mel=np.mean(librosa.feature.melspectrogram(y_mono, sr=sample_rate).T,axis=0)
result=np.hstack((result, mel))

In [81]:
x_pred_new = result

In [83]:
y_pred_new = classifier.predict(x_pred_new.reshape(1, -1))

In [84]:
print(y_pred_new)

['fearful']


# Outcome Review

After providing a clip of JFK presenting the Declaration of Independence, the model provided a result of "Fearful." This means that the model recognized several measures within the speakers voice which indicated an underlying tone of fear. This is very, very interesting, and we will explore this further in part 2 when we analyze the text sentiment.

# Part 2 - Sentiment Analysis

In [None]:
# Create function to extract text from each sound file and detect sentiment
def get_text(directory):
    xt = []
    yt = []
    for file in glob.glob(f"{directory}/Actor_*/*.wav"):
        try:
            text = extract_text(file)
            sia = SIA()
            pol_score = sia.polarity_scores(text)
            sentiment = 0
            if pol_score["compound"] > 0.2:
                sentiment = 1
            elif pol_score["compound"] < -0.2:
                sentiment = -1
            else:
                sentiment = 0
            xt.append(text)
            yt.append(sentiment)
        except:
            print("Error")
    return xt, yt

In [3]:
# initialize speech recognizer
r = sr.Recognizer()

# Extract text from sound file
def extract_text(file):
    with sr.AudioFile(file) as source:
        # load audio
        audio_data = r.record(source)
        # convert speech to text
        text = r.recognize_google(audio_data)
    return text

In [None]:
# Load text data and analyze sentiment
xt, yt = get_text(dirloc)

In [None]:
#for subdir in os.listdir(dirloc):
    #subdir_folder = os.path.join(dirloc,subdir)
   # if os.path.isdir(subdir_folder):
       # for file in os.listdir(subdir_folder):
           # filename = os.path.join(subdir_folder, file)
            #extract_features(filename)