In [1]:
import pandas as pd
import os
import numpy as np
import librosa
from sklearn.preprocessing import OneHotEncoder

In [2]:
# Emotion code (third '_'-separated field of the file name) -> label string.
EMOTION_LABELS = {
    'ANG': 'angry.wav',
    'DIS': 'disgust.wav',
    'FEA': 'fear.wav',
    'HAP': 'happy.wav',
    'NEU': 'neutral.wav',
    'SAD': 'sad.wav',
}

paths = []
labels = []
# os.listdir returns entries in arbitrary order; sort so the row order (and
# everything derived from it) is the same on every run.
for filename in sorted(os.listdir('./AudioWAV')):
    fields = filename.split('.')[0].split('_')
    # Skip entries that do not carry a known emotion code. Appending the path
    # without a matching label would leave `paths` and `labels` with different
    # lengths and make the DataFrame construction below fail.
    if len(fields) < 3 or fields[2] not in EMOTION_LABELS:
        continue
    paths.append('./AudioWAV/' + filename)
    labels.append(EMOTION_LABELS[fields[2]])

df_cremad = pd.DataFrame({'speech':paths,'label':labels})
df_cremad.sample(5)

Unnamed: 0,speech,label
5688,./AudioWAV/1070_IWW_HAP_XX.wav,happy.wav
4211,./AudioWAV/1052_IWW_FEA_XX.wav,fear.wav
3285,./AudioWAV/1041_IOM_FEA_XX.wav,fear.wav
6714,./AudioWAV/1083_IEO_DIS_LO.wav,disgust.wav
5743,./AudioWAV/1071_IOM_ANG_XX.wav,angry.wav


In [3]:
def MFCC(filename):
    """Return the MFCC features of an audio file averaged over time.

    The file is read with librosa starting 0.5 s into the recording and for
    at most 3 s. 40 MFCCs are computed and the mean over the first axis of
    the transposed coefficient matrix is returned, giving one value per
    coefficient.
    """
    signal, sample_rate = librosa.load(filename, offset=0.5, duration=3)
    coefficients = librosa.feature.mfcc(y=signal, sr=sample_rate, n_mfcc=40)
    return coefficients.T.mean(axis=0)

# Extract one averaged MFCC vector per audio file listed in the dataframe.
mfcc_cremad = df_cremad['speech'].apply(MFCC)

In [4]:
# Stack the per-file MFCC vectors into a single 2-D array, then append a
# trailing singleton axis.
X = np.expand_dims(np.array(list(mfcc_cremad)), axis=-1)
X.shape

(7442, 40, 1)

In [5]:
# One-hot encode the label column and densify the (sparse) encoder output.
ohe = OneHotEncoder()
ohe.fit(df_cremad[['label']])
y = ohe.transform(df_cremad[['label']]).toarray()

In [6]:
# Sanity check: features and one-hot targets must have the same number of rows.
(X.shape, y.shape)

((7442, 40, 1), (7442, 6))

In [7]:
# Distinct label strings present in the dataset.
df_cremad.loc[:, 'label'].unique()

array(['angry.wav', 'disgust.wav', 'fear.wav', 'happy.wav', 'neutral.wav',
       'sad.wav'], dtype=object)

In [8]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import numpy as np

# Model definitions

class SimpleDNN(nn.Module):
    """Fully connected classifier with two hidden layers.

    The input is flattened, then passed through Linear(input_size, 128) and
    Linear(128, 64), each followed by ReLU and dropout (p=0.5), and finally
    Linear(64, num_classes). The output is the raw (un-normalized) scores.

    Args:
        input_size: number of features per sample after flattening.
        num_classes: number of output scores per sample.
    """

    def __init__(self, input_size, num_classes):
        super().__init__()
        self.flatten = nn.Flatten()
        self.fc1 = nn.Linear(in_features=input_size, out_features=128)
        self.relu = nn.ReLU()
        self.dropout1 = nn.Dropout(p=0.5)
        self.fc2 = nn.Linear(in_features=128, out_features=64)
        self.dropout2 = nn.Dropout(p=0.5)
        self.fc3 = nn.Linear(in_features=64, out_features=num_classes)

    def forward(self, x):
        """Return the class scores for a batch `x`."""
        hidden = self.dropout1(self.relu(self.fc1(self.flatten(x))))
        hidden = self.dropout2(self.relu(self.fc2(hidden)))
        return self.fc3(hidden)

class LSTMModel(nn.Module):
    """Stacked LSTM followed by a linear classifier on the last time step.

    Args:
        input_size: number of features per time step.
        hidden_size: size of the LSTM hidden state.
        num_layers: number of stacked LSTM layers.
        num_classes: number of output scores per sample.

    The LSTM is created with ``batch_first=True``, so `forward` expects input
    laid out as (batch, sequence, feature).
    """

    def __init__(self, input_size, hidden_size, num_layers, num_classes):
        super(LSTMModel, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size, num_classes)

    def forward(self, x):
        """Return the class scores computed from the last time step's output."""
        # Build the zero initial states on the input's own device/dtype rather
        # than on a module-level `device` global: the model then works wherever
        # it (and its input) lives and does not need that global to exist.
        state_shape = (self.num_layers, x.size(0), self.hidden_size)
        h0 = torch.zeros(state_shape, device=x.device, dtype=x.dtype)
        c0 = torch.zeros(state_shape, device=x.device, dtype=x.dtype)

        out, _ = self.lstm(x, (h0, c0))
        # Keep only the output of the final time step for classification.
        out = self.fc(out[:, -1, :])
        return out

class LSTMAttention(nn.Module):
    """Stacked LSTM whose outputs are pooled with learned attention weights.

    Args:
        input_size: number of features per time step.
        hidden_size: size of the LSTM hidden state.
        num_layers: number of stacked LSTM layers.
        num_classes: number of output scores per sample.

    The LSTM is created with ``batch_first=True``, so `forward` expects input
    laid out as (batch, sequence, feature).
    """

    def __init__(self, input_size, hidden_size, num_layers, num_classes):
        super(LSTMAttention, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True)
        self.attention = nn.Linear(hidden_size, 1)
        self.fc = nn.Linear(hidden_size, num_classes)

    def forward(self, x):
        """Return class scores from the attention-weighted sum of LSTM outputs."""
        # Build the zero initial states on the input's own device/dtype rather
        # than on a module-level `device` global: the model then works wherever
        # it (and its input) lives and does not need that global to exist.
        state_shape = (self.num_layers, x.size(0), self.hidden_size)
        h0 = torch.zeros(state_shape, device=x.device, dtype=x.dtype)
        c0 = torch.zeros(state_shape, device=x.device, dtype=x.dtype)

        out, _ = self.lstm(x, (h0, c0))
        # One score per time step, normalized over the sequence axis (dim=1).
        attn_weights = torch.softmax(self.attention(out), dim=1)
        # Weighted sum over time collapses (batch, sequence, hidden) to (batch, hidden).
        out = torch.sum(attn_weights * out, dim=1)
        out = self.fc(out)
        return out

class CNNModel(nn.Module):
    """Two 1-D convolutions followed by a two-layer fully connected head.

    Args:
        input_channels: number of channels of the Conv1d input.
        num_classes: number of output scores per sample.

    NOTE: `fc_input_size` is fixed to 32 * 1 * 1 and `forward` reshapes with
    ``view(-1, fc_input_size)``, so the head assumes the convolution output
    has length 1; a longer output would be folded into the batch axis.
    """

    def __init__(self, input_channels, num_classes):
        super().__init__()
        self.conv1 = nn.Conv1d(input_channels, 64, kernel_size=3, padding=1)
        self.conv2 = nn.Conv1d(64, 32, kernel_size=3, padding=1)
        self.fc_input_size = 32 * 1 * 1
        self.fc1 = nn.Linear(self.fc_input_size, 128)
        self.fc2 = nn.Linear(128, num_classes)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(0.5)

    def forward(self, x):
        """Return the class scores for a batch `x`."""
        features = self.conv2(self.relu(self.conv1(x)))
        features = self.dropout(self.relu(features))
        flat = features.view(-1, self.fc_input_size)
        return self.fc2(self.relu(self.fc1(flat)))


class CNNAttention(nn.Module):
    """Two 1-D convolutions pooled over positions with learned attention.

    Args:
        input_channels: number of channels of the Conv1d input.
        num_classes: number of output scores per sample.

    `forward` takes a Conv1d-style input (batch, channels, length) and returns
    (batch, num_classes) raw scores.
    """

    def __init__(self, input_channels, num_classes):
        super(CNNAttention, self).__init__()
        self.conv1 = nn.Conv1d(input_channels, 64, kernel_size=3, padding=1)
        self.conv2 = nn.Conv1d(64, 32, kernel_size=3, padding=1)
        self.attention = nn.Linear(32, 1)
        self.fc = nn.Linear(32, num_classes)
        self.relu = nn.ReLU()
        self.softmax = nn.Softmax(dim=1)

    def forward(self, x):
        """Return class scores from the attention-weighted sum over positions."""
        x = self.relu(self.conv1(x))
        x = self.relu(self.conv2(x))  # (batch, 32, length)
        # Score every position from its 32 channel values: (batch, length, 1),
        # normalized over the length axis (dim=1).
        attn_weights = self.softmax(self.attention(x.permute(0, 2, 1)))
        # Move the weights to (batch, 1, length) so they line up with the
        # length axis of `x` and broadcast across its 32 channels. Leaving them
        # as (batch, length, 1) only broadcasts correctly when length == 1.
        attn_weights = attn_weights.permute(0, 2, 1)
        x = torch.sum(attn_weights * x, dim=2)  # (batch, 32)
        x = self.fc(x)
        return x
    
# Building and training the models

input_size = X.shape[1:]
num_classes = y.shape[1]
hidden_size = 64
num_layers = 2
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Seed used for every train/test split below. Each split draws from a fresh
# generator seeded with this value, so the split is reproducible across runs
# and, since all datasets have the same length, every model family is trained
# and evaluated on the same samples.
SPLIT_SEED = 42

# Convert the data to PyTorch tensors
X_tensor = torch.tensor(X, dtype=torch.float32).to(device)
y_tensor = torch.tensor(y, dtype=torch.float32).to(device)

# Batch size
batch_size = 32

# Build the PyTorch datasets (80% train / 20% test)
dataset = torch.utils.data.TensorDataset(X_tensor, y_tensor)
train_size = int(0.8 * len(dataset))
test_size = len(dataset) - train_size
train_dataset, test_dataset = torch.utils.data.random_split(
    dataset, [train_size, test_size],
    generator=torch.Generator().manual_seed(SPLIT_SEED))

train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

# Tensor for the LSTM models: swap the last two axes so that, with
# batch_first=True, axis 1 is the sequence axis and axis 2 the feature axis.
X_tensorLSTM = X_tensor.permute(0, 2, 1)

# Build the PyTorch datasets for the LSTM models
datasetLSTM = torch.utils.data.TensorDataset(X_tensorLSTM, y_tensor)
train_sizeLSTM = int(0.8 * len(datasetLSTM))
test_sizeLSTM = len(datasetLSTM) - train_sizeLSTM
train_datasetLSTM, test_datasetLSTM = torch.utils.data.random_split(
    datasetLSTM, [train_sizeLSTM, test_sizeLSTM],
    generator=torch.Generator().manual_seed(SPLIT_SEED))

# DataLoaders for the LSTM models
train_loaderLSTM = DataLoader(train_datasetLSTM, batch_size=batch_size, shuffle=True)
test_loaderLSTM = DataLoader(test_datasetLSTM, batch_size=batch_size, shuffle=False)

# Convert the data to PyTorch tensors for the CNN models
X_tensorCNN = torch.tensor(X, dtype=torch.float32).to(device)

# Build the PyTorch datasets for the CNN models
datasetCNN = torch.utils.data.TensorDataset(X_tensorCNN, y_tensor)
train_sizeCNN = int(0.8 * len(datasetCNN))
test_sizeCNN = len(datasetCNN) - train_sizeCNN
train_datasetCNN, test_datasetCNN = torch.utils.data.random_split(
    datasetCNN, [train_sizeCNN, test_sizeCNN],
    generator=torch.Generator().manual_seed(SPLIT_SEED))

# DataLoaders for the CNN models
train_loaderCNN = DataLoader(train_datasetCNN, batch_size=batch_size, shuffle=True)
test_loaderCNN = DataLoader(test_datasetCNN, batch_size=batch_size, shuffle=False)

# Training function
def train(model, train_loader, criterion, optimizer, num_epochs=1000):
    """Train `model` and print the loss and accuracy after every epoch.

    Args:
        model: the network to optimize; it is put in training mode.
        train_loader: iterable of (inputs, labels) batches, where `labels`
            are one-hot rows (the class index is taken as the arg-max of
            each row).
        criterion: loss called as ``criterion(outputs, class_indices)``.
        optimizer: optimizer already bound to the model's parameters.
        num_epochs: number of passes over `train_loader`.

    Raises:
        ValueError: if `train_loader` yields no batches.

    The reported loss is the mean of the per-batch losses; the reported
    accuracy is measured on the training batches while the model is in
    training mode.
    """
    if len(train_loader) == 0:
        raise ValueError('train_loader is empty; nothing to train on')

    # Move each batch to wherever the model lives instead of relying on a
    # module-level `device` global being defined and matching the model.
    first_param = next(model.parameters(), None)
    model_device = None if first_param is None else first_param.device

    model.train()
    for epoch in range(num_epochs):
        running_loss = 0.0
        correct = 0
        total = 0
        for inputs, labels in train_loader:
            if model_device is not None:
                inputs, labels = inputs.to(model_device), labels.to(model_device)
            # One-hot rows -> class indices, computed once per batch.
            targets = torch.max(labels, 1)[1]

            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, targets)
            loss.backward()
            optimizer.step()

            running_loss += loss.item()
            _, predicted = torch.max(outputs, 1)
            total += labels.size(0)
            correct += (predicted == targets).sum().item()

        epoch_loss = running_loss / len(train_loader)
        accuracy = correct / total
        print(f'Epoch [{epoch + 1}/{num_epochs}], Loss: {epoch_loss:.4f}, Accuracy: {accuracy:.4f}')

In [9]:
# Standard Deep Neural Network
criterion = nn.CrossEntropyLoss()
sdnn_model = SimpleDNN(input_size=input_size[0], num_classes=num_classes)
sdnn_model = sdnn_model.to(device)
optimizer = optim.Adam(params=sdnn_model.parameters(), lr=1e-5)
train(model=sdnn_model, train_loader=train_loader, criterion=criterion, optimizer=optimizer)

Epoch [1/1000], Loss: 13.7853, Accuracy: 0.1680
Epoch [2/1000], Loss: 11.7800, Accuracy: 0.1695
Epoch [3/1000], Loss: 10.3541, Accuracy: 0.1776
Epoch [4/1000], Loss: 9.4800, Accuracy: 0.1675
Epoch [5/1000], Loss: 8.9561, Accuracy: 0.1601
Epoch [6/1000], Loss: 8.0599, Accuracy: 0.1688
Epoch [7/1000], Loss: 7.3352, Accuracy: 0.1692
Epoch [8/1000], Loss: 6.5692, Accuracy: 0.1727
Epoch [9/1000], Loss: 6.0965, Accuracy: 0.1698
Epoch [10/1000], Loss: 5.7681, Accuracy: 0.1742
Epoch [11/1000], Loss: 5.1347, Accuracy: 0.1718
Epoch [12/1000], Loss: 4.8409, Accuracy: 0.1700
Epoch [13/1000], Loss: 4.4778, Accuracy: 0.1771
Epoch [14/1000], Loss: 4.1627, Accuracy: 0.1688
Epoch [15/1000], Loss: 3.8088, Accuracy: 0.1821
Epoch [16/1000], Loss: 3.6196, Accuracy: 0.1692
Epoch [17/1000], Loss: 3.4238, Accuracy: 0.1661
Epoch [18/1000], Loss: 3.2007, Accuracy: 0.1727
Epoch [19/1000], Loss: 3.0396, Accuracy: 0.1707
Epoch [20/1000], Loss: 2.8408, Accuracy: 0.1724
Epoch [21/1000], Loss: 2.7311, Accuracy: 0.173

In [10]:
# LSTM
criterion = nn.CrossEntropyLoss()
lstm_model = LSTMModel(
    input_size=input_size[0],
    hidden_size=hidden_size,
    num_layers=num_layers,
    num_classes=num_classes,
)
lstm_model = lstm_model.to(device)
optimizer = optim.Adam(params=lstm_model.parameters(), lr=1e-5)
train(model=lstm_model, train_loader=train_loaderLSTM, criterion=criterion, optimizer=optimizer)

Epoch [1/1000], Loss: 1.7929, Accuracy: 0.1693
Epoch [2/1000], Loss: 1.7911, Accuracy: 0.1693
Epoch [3/1000], Loss: 1.7901, Accuracy: 0.1693
Epoch [4/1000], Loss: 1.7902, Accuracy: 0.1693
Epoch [5/1000], Loss: 1.7895, Accuracy: 0.1700
Epoch [6/1000], Loss: 1.7889, Accuracy: 0.1720
Epoch [7/1000], Loss: 1.7876, Accuracy: 0.1727
Epoch [8/1000], Loss: 1.7866, Accuracy: 0.1755
Epoch [9/1000], Loss: 1.7860, Accuracy: 0.1772
Epoch [10/1000], Loss: 1.7856, Accuracy: 0.1826
Epoch [11/1000], Loss: 1.7838, Accuracy: 0.1918
Epoch [12/1000], Loss: 1.7832, Accuracy: 0.2081
Epoch [13/1000], Loss: 1.7817, Accuracy: 0.2093
Epoch [14/1000], Loss: 1.7804, Accuracy: 0.2122
Epoch [15/1000], Loss: 1.7793, Accuracy: 0.2143
Epoch [16/1000], Loss: 1.7777, Accuracy: 0.2190
Epoch [17/1000], Loss: 1.7765, Accuracy: 0.2202
Epoch [18/1000], Loss: 1.7745, Accuracy: 0.2207
Epoch [19/1000], Loss: 1.7734, Accuracy: 0.2214
Epoch [20/1000], Loss: 1.7716, Accuracy: 0.2248
Epoch [21/1000], Loss: 1.7690, Accuracy: 0.2493
E

In [11]:
# LSTM with Attention
criterion = nn.CrossEntropyLoss()
lstm_atn_model = LSTMAttention(
    input_size=input_size[0],
    hidden_size=hidden_size,
    num_layers=num_layers,
    num_classes=num_classes,
)
lstm_atn_model = lstm_atn_model.to(device)
optimizer = optim.Adam(params=lstm_atn_model.parameters(), lr=1e-5)
train(model=lstm_atn_model, train_loader=train_loaderLSTM, criterion=criterion, optimizer=optimizer)

Epoch [1/1000], Loss: 1.7954, Accuracy: 0.1503
Epoch [2/1000], Loss: 1.7949, Accuracy: 0.1503
Epoch [3/1000], Loss: 1.7940, Accuracy: 0.1503
Epoch [4/1000], Loss: 1.7934, Accuracy: 0.1503
Epoch [5/1000], Loss: 1.7932, Accuracy: 0.1503
Epoch [6/1000], Loss: 1.7925, Accuracy: 0.1502
Epoch [7/1000], Loss: 1.7924, Accuracy: 0.1500
Epoch [8/1000], Loss: 1.7913, Accuracy: 0.1594
Epoch [9/1000], Loss: 1.7901, Accuracy: 0.1725
Epoch [10/1000], Loss: 1.7892, Accuracy: 0.1806
Epoch [11/1000], Loss: 1.7877, Accuracy: 0.1897
Epoch [12/1000], Loss: 1.7859, Accuracy: 0.1992
Epoch [13/1000], Loss: 1.7842, Accuracy: 0.2083
Epoch [14/1000], Loss: 1.7822, Accuracy: 0.2044
Epoch [15/1000], Loss: 1.7804, Accuracy: 0.2206
Epoch [16/1000], Loss: 1.7785, Accuracy: 0.2288
Epoch [17/1000], Loss: 1.7766, Accuracy: 0.2384
Epoch [18/1000], Loss: 1.7744, Accuracy: 0.2417
Epoch [19/1000], Loss: 1.7724, Accuracy: 0.2466
Epoch [20/1000], Loss: 1.7702, Accuracy: 0.2503
Epoch [21/1000], Loss: 1.7672, Accuracy: 0.2488
E

In [12]:
# Convolutional Neural Network
criterion = nn.CrossEntropyLoss()
cnn_model = CNNModel(input_channels=input_size[0], num_classes=num_classes)
cnn_model = cnn_model.to(device)
optimizer = optim.Adam(params=cnn_model.parameters(), lr=1e-5)
train(model=cnn_model, train_loader=train_loaderCNN, criterion=criterion, optimizer=optimizer)

Epoch [1/1000], Loss: 2.2141, Accuracy: 0.1747
Epoch [2/1000], Loss: 1.9610, Accuracy: 0.1807
Epoch [3/1000], Loss: 1.9023, Accuracy: 0.1740
Epoch [4/1000], Loss: 1.8657, Accuracy: 0.1801
Epoch [5/1000], Loss: 1.8423, Accuracy: 0.1695
Epoch [6/1000], Loss: 1.8225, Accuracy: 0.1828
Epoch [7/1000], Loss: 1.8209, Accuracy: 0.1791
Epoch [8/1000], Loss: 1.8120, Accuracy: 0.1861
Epoch [9/1000], Loss: 1.7965, Accuracy: 0.1972
Epoch [10/1000], Loss: 1.7930, Accuracy: 0.1925
Epoch [11/1000], Loss: 1.7832, Accuracy: 0.1977
Epoch [12/1000], Loss: 1.7857, Accuracy: 0.1970
Epoch [13/1000], Loss: 1.7840, Accuracy: 0.2002
Epoch [14/1000], Loss: 1.7674, Accuracy: 0.2229
Epoch [15/1000], Loss: 1.7716, Accuracy: 0.2177
Epoch [16/1000], Loss: 1.7630, Accuracy: 0.2283
Epoch [17/1000], Loss: 1.7586, Accuracy: 0.2271
Epoch [18/1000], Loss: 1.7570, Accuracy: 0.2402
Epoch [19/1000], Loss: 1.7494, Accuracy: 0.2384
Epoch [20/1000], Loss: 1.7412, Accuracy: 0.2481
Epoch [21/1000], Loss: 1.7355, Accuracy: 0.2552
E

In [13]:
# Convolutional Neural Network with Attention
cnn_atn_model = CNNAttention(input_size[0],num_classes).to(device)
# Create the loss in this cell, as the other training cells do, so the cell
# does not depend on `criterion` having been left behind by an earlier cell.
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(cnn_atn_model.parameters(), lr=0.00001)
train(cnn_atn_model, train_loaderCNN, criterion, optimizer)

Epoch [1/1000], Loss: 1.8132, Accuracy: 0.2417
Epoch [2/1000], Loss: 1.7595, Accuracy: 0.2471
Epoch [3/1000], Loss: 1.7471, Accuracy: 0.2249
Epoch [4/1000], Loss: 1.7402, Accuracy: 0.2464
Epoch [5/1000], Loss: 1.7335, Accuracy: 0.2500
Epoch [6/1000], Loss: 1.7270, Accuracy: 0.2718
Epoch [7/1000], Loss: 1.7242, Accuracy: 0.2741
Epoch [8/1000], Loss: 1.7158, Accuracy: 0.2859
Epoch [9/1000], Loss: 1.7121, Accuracy: 0.2800
Epoch [10/1000], Loss: 1.7069, Accuracy: 0.2963
Epoch [11/1000], Loss: 1.7018, Accuracy: 0.2973
Epoch [12/1000], Loss: 1.6962, Accuracy: 0.2970
Epoch [13/1000], Loss: 1.6909, Accuracy: 0.3093
Epoch [14/1000], Loss: 1.6879, Accuracy: 0.3123
Epoch [15/1000], Loss: 1.6807, Accuracy: 0.3079
Epoch [16/1000], Loss: 1.6736, Accuracy: 0.3136
Epoch [17/1000], Loss: 1.6669, Accuracy: 0.3215
Epoch [18/1000], Loss: 1.6642, Accuracy: 0.3182
Epoch [19/1000], Loss: 1.6555, Accuracy: 0.3269
Epoch [20/1000], Loss: 1.6511, Accuracy: 0.3264
Epoch [21/1000], Loss: 1.6455, Accuracy: 0.3297
E

In [14]:
# Evaluation function
def test(model, test_loader, criterion):
    model.eval()
    running_loss = 0.0
    correct = 0
    total = 0
    with torch.no_grad():
        for inputs, labels in test_loader:
            inputs, labels = inputs.to(device), labels.to(device)
            outputs = model(inputs)
            loss = criterion(outputs, torch.max(labels, 1)[1])
            
            running_loss += loss.item()
            _, predicted = torch.max(outputs, 1)
            total += labels.size(0)
            correct += (predicted == torch.max(labels, 1)[1]).sum().item()

    test_loss = running_loss / len(test_loader)
    test_accuracy = correct / total
    print(f'Test Loss: {test_loss:.4f}, Test Accuracy: {test_accuracy:.4f}')

In [15]:
# Evaluate the fully connected model on its held-out split.
print("Testing SimpleDNN:")
test(model=sdnn_model, test_loader=test_loader, criterion=criterion)

Testing SimpleDNN:
Test Loss: 1.4620, Test Accuracy: 0.3902


In [16]:
# Evaluate the LSTM model on the LSTM-layout held-out split.
print("\nTesting LSTMModel:")
test(model=lstm_model, test_loader=test_loaderLSTM, criterion=criterion)



Testing LSTMModel:
Test Loss: 1.3791, Test Accuracy: 0.4433


In [17]:
# Evaluate the attention LSTM model on the LSTM-layout held-out split.
print("\nTesting LSTMAttention:")
test(model=lstm_atn_model, test_loader=test_loaderLSTM, criterion=criterion)


Testing LSTMAttention:
Test Loss: 1.3838, Test Accuracy: 0.4473


In [18]:
# Evaluate the CNN model on the CNN held-out split.
print("\nTesting CNNModel:")
test(model=cnn_model, test_loader=test_loaderCNN, criterion=criterion)



Testing CNNModel:
Test Loss: 1.3934, Test Accuracy: 0.4372


In [19]:
# Evaluate the attention CNN model on the CNN held-out split.
print("\nTesting CNNAttention:")
test(model=cnn_atn_model, test_loader=test_loaderCNN, criterion=criterion)


Testing CNNAttention:
Test Loss: 1.3705, Test Accuracy: 0.4412
