In [1]:
import os
import numpy as np
import librosa
import librosa.display

from tqdm.notebook import tqdm

In [2]:
def get_waveforms(file):
    wave,_= librosa.load(file,duration=3,offset=0.5,sr=sample_rate)
    waveform, _ = librosa.effects.trim(wave,top_db=60) # tops at 60 decibles

    # make sure waveform vectors are homogenous by defining explicitly
    waveform_homo = np.zeros((int(sample_rate*3,)))
    waveform_homo[:len(waveform)] = waveform
    
    # return a single file's waveform    
    return waveform_homo

In [3]:
emotions_dict = {
    0:'surprised',
    1:'neutral',
    2:'calm',
    3:'happy',
    4:'sad',
    5:'angry',
    6:'fearful',
    7:'disgust'
}
# Additional attributes from RAVDESS to play with
emotion_intensities = {
    1: 'normal',
    2: 'strong'
}

# RAVDESS native sample rate is 48k
sample_rate = 48000

In [4]:
RAV = "ravdess-emotional-speech-audio/"

dir_list = os.listdir(RAV)
dir_list.sort()

# features and labels
emotions = []
# raw waveforms to augment later
waveforms = []
# extra labels
intensities, genders = [],[]

for i in tqdm(dir_list, total=len(dir_list)):
    if i == ".DS_Store":
        continue

    if not i.startswith("Actor_"):
        continue

    fname = os.listdir(RAV+i)
    for f in fname:
        # Modality (01 = full-AV, 02 = video-only, 03 = audio-only). We only have 03
        # Vocal channel (01 = speech, 02 = song). We only have 01
        # Emotion (01 = neutral, 02 = calm, 03 = happy, 04 = sad, 05 = angry, 06 = fearful, 07 = disgust, 08 = surprised).
        # Emotional intensity (01 = normal, 02 = strong). NOTE: There is no strong intensity for the 'neutral' emotion.
        # Statement (01 = "Kids are talking by the door", 02 = "Dogs are sitting by the door").
        # Repetition (01 = 1st repetition, 02 = 2nd repetition).
        # Actor (01 to 24. Odd numbered actors are male, even numbered actors are female).
        emotion_onehot = np.zeros(8)
        part = f.split('.')[0].split('-')
        emotion = int(part[2])
        
        #  move surprise to 0 for cleaner behaviour with PyTorch/0-indexing
        if emotion == 8: emotion = 0 # surprise is now at 0 index; other emotion indeces unchanged

        emotion_onehot[emotion] = 1
        
        intensity = int(part[3])
        
        gender = int(part[6])
        if gender%2==0:
            gender = 'female'
        else:
            gender = 'male'
        
        # get waveform from the sample
        waveform = get_waveforms(RAV + i +'/'+ f)
        
        # store waveforms and labels
        waveforms.append(waveform)
        # emotions.append(emotion_onehot)
        emotions.append(emotion) # no need for the onehot encoded
        intensities.append(intensity) # store intensity in case we wish to predict

  0%|          | 0/26 [00:00<?, ?it/s]

In [5]:
len(waveforms)

1440

In [6]:
waveforms[0].shape

(144000,)

^ 144000 = sample_rate * duration in seconds = 48000 * 3

In [7]:
emotions[0]

0

In [8]:
intensities[0]

2

In [9]:
from sklearn.model_selection import train_test_split

In [10]:
X_train, X_valid_test, y_train, y_valid_test = train_test_split(waveforms, emotions, test_size=0.2, stratify=emotions)

In [11]:
from torch.utils.data import Dataset, DataLoader

In [12]:
class WaveformDataset(Dataset):
    def __init__(self, data, labels):
        self.data = data
        self.labels = labels

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        return self.data[index].astype(np.float32), self.labels[index]

In [13]:
train_ds = WaveformDataset(X_train, y_train)
train_dataloader = DataLoader(train_ds, batch_size=32, num_workers=0, shuffle=True)

In [14]:
for x in train_dataloader:
    print(x)
    break

[tensor([[ 0.0000e+00,  0.0000e+00,  0.0000e+00,  ..., -3.0518e-05,
         -3.0518e-05, -3.0518e-05],
        [ 0.0000e+00,  0.0000e+00,  0.0000e+00,  ...,  0.0000e+00,
          0.0000e+00,  0.0000e+00],
        [ 0.0000e+00,  0.0000e+00,  0.0000e+00,  ...,  0.0000e+00,
          0.0000e+00,  0.0000e+00],
        ...,
        [-3.0518e-05, -3.0518e-05,  0.0000e+00,  ...,  0.0000e+00,
          0.0000e+00,  0.0000e+00],
        [ 0.0000e+00,  0.0000e+00,  0.0000e+00,  ...,  0.0000e+00,
          0.0000e+00,  0.0000e+00],
        [ 6.1035e-05,  9.1553e-05,  9.1553e-05,  ...,  0.0000e+00,
          0.0000e+00,  0.0000e+00]]), tensor([2, 3, 3, 6, 0, 4, 5, 2, 7, 1, 5, 0, 3, 5, 4, 6, 7, 2, 5, 7, 7, 5, 3, 4,
        0, 4, 0, 7, 4, 7, 4, 4])]


In [15]:
valid_ds = WaveformDataset(X_valid_test, y_valid_test)
valid_dataloader = DataLoader(valid_ds, batch_size=32, num_workers=0, shuffle=False)

In [16]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

In [24]:
class SimpleLinearModel(nn.Module):
    def __init__(self, input_size, output_size):
        super(SimpleLinearModel, self).__init__()

        self.lstm = nn.LSTM(input_size, 128, num_layers=1, dropout=0.1, bidirectional=True, batch_first=True)

        self.linear1 = nn.Linear(128 * 2, 256) # for bidirectional
        self.linear2 = nn.Linear(256, 128)
        self.linear3 = nn.Linear(128, output_size)

    
    def forward(self, waveform):
        output, _ = self.lstm(waveform)
        output = F.relu(self.linear1(output))
        output = F.relu(self.linear2(output))
        output = self.linear3(output)

        return output

In [25]:
device = torch.device("mps")

In [26]:
import gc

torch.mps.empty_cache()
gc.collect()

1743

In [27]:
linear_model = SimpleLinearModel(waveforms[0].shape[0], len(emotions_dict)).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(linear_model.parameters(), lr=0.001)



In [28]:
EPOCHS = 20

In [29]:
def fit(model, criterion, optimizer, epochs=EPOCHS):
    train_accuracies = []
    val_accuracies = []
    train_losses = []
    val_losses = []

    for epoch in range(epochs):  # Loop over the dataset multiple times
        running_loss = 0.0
        total = 0
        correct = 0
        model.train()
        for inputs, labels in train_dataloader:
            inputs = inputs.to(device)
            labels = labels.to(device)
    
            optimizer.zero_grad()
    
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
    
            running_loss += loss.item() 

            _, predicted = torch.max(outputs.data, axis=1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()
    
        # get train loss and accuracy
        train_loss = running_loss / len(train_dataloader.dataset)
        train_accuracy = correct / total
        train_losses.append(train_loss)
        train_accuracies.append(train_accuracy)
    
        # get test loss and accuracy
        val_loss = 0.0
        val_correct = 0
        val_total = 0
        with torch.no_grad():
            model.eval()
            for data in valid_dataloader:
                inputs, labels = data
                inputs = inputs.to(device)
                labels = labels.to(device)

                outputs = model(inputs)
                loss = criterion(outputs, labels)
                val_loss += loss.item()
                _, predicted = torch.max(outputs.data, axis=1)
                val_total += labels.size(0)
                val_correct += (predicted == labels).sum().item()

        val_loss /= len(valid_dataloader)
        val_accuracy = val_correct / val_total
        val_losses.append(val_loss)
        val_accuracies.append(val_accuracy)

        print(f"Epoch {epoch}: train_loss: {train_loss:.4f}; train_accuracy: {train_accuracy:.4f}; val_loss: {val_loss:.4f}; val_accuracy: {val_accuracy:.4f}")

    return {
        "train_loss": train_losses,
        "train_accuracy": train_accuracies,
        "val_loss": val_losses,
        "val_accuracy": val_accuracies
    }

In [30]:
train_result = fit(linear_model, criterion, optimizer)

Epoch 0: train_loss: 0.0647; train_accuracy: 0.1484; val_loss: 2.0611; val_accuracy: 0.1944
Epoch 1: train_loss: 0.0421; train_accuracy: 0.5677; val_loss: 2.7844; val_accuracy: 0.1840
Epoch 2: train_loss: 0.0200; train_accuracy: 0.7752; val_loss: 3.4378; val_accuracy: 0.1875
Epoch 3: train_loss: 0.0097; train_accuracy: 0.8993; val_loss: 4.3955; val_accuracy: 0.1979
Epoch 4: train_loss: 0.0050; train_accuracy: 0.9470; val_loss: 5.0413; val_accuracy: 0.1736
Epoch 5: train_loss: 0.0026; train_accuracy: 0.9731; val_loss: 5.6630; val_accuracy: 0.1771
Epoch 6: train_loss: 0.0019; train_accuracy: 0.9809; val_loss: 6.2531; val_accuracy: 0.2014
Epoch 7: train_loss: 0.0015; train_accuracy: 0.9878; val_loss: 6.7257; val_accuracy: 0.1840
Epoch 8: train_loss: 0.0009; train_accuracy: 0.9896; val_loss: 6.9954; val_accuracy: 0.1632
Epoch 9: train_loss: 0.0008; train_accuracy: 0.9922; val_loss: 7.3670; val_accuracy: 0.1840
Epoch 10: train_loss: 0.0004; train_accuracy: 0.9957; val_loss: 7.8857; val_accu

In [31]:
from IPython.display import Audio

In [69]:
sr = 22050

y_sweep = librosa.chirp(fmin=librosa.note_to_hz('C#4'),
                        fmax=librosa.note_to_hz('Eb5'),
                        sr=sr,
                        duration=1)

display(Audio(data=y_sweep, rate=sr))
8

8

In [47]:
for x in train_dataloader:
    print(x[0].shape)
    break

torch.Size([32, 144000])


In [60]:
def predict(model, filename):
    f = filename.split("/")[-1]
    part = f.split('.')[0].split('-')
    emotion = int(part[2])

    emotion_label = emotions_dict[emotion]
    this_audio, sr = librosa.load(filename)

    waveform = get_waveforms(filename)

    with torch.no_grad():
        model.eval()
        waveform_tensor = torch.tensor(np.array([waveform]), dtype=torch.float).to(device)
        output = model(waveform_tensor)
        _, prediction = torch.max(output, axis=1)
        prediction_label = emotions_dict[prediction.cpu().numpy()[0]]

    # show
    display(Audio(data=this_audio, rate=sr))
    print(f"Label     : {emotion_label}")
    print(f"Prediction: {prediction_label}")

In [65]:
predict(linear_model, "ravdess-emotional-speech-audio/Actor_02/03-01-01-01-02-01-02.wav")

Label     : neutral
Prediction: neutral
