In [1]:
import torch
from torch import nn
from torch.nn import functional as Fn
from torch import optim
from torchvision.transforms import transforms
from torch.utils.data import DataLoader, TensorDataset, random_split
import librosa
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import time


In [None]:
class CNNArchitecture(nn.Module):

    def __init__(self, classes):
        super(CNNArchitecture, self).__init__()
        self.conv1 = nn.Conv2d(in_channels=1, out_channels=32, kernel_size=3, stride=2, padding=1)
        self.conv2 = nn.Conv2d(in_channels=32, out_channels=64, kernel_size=3, stride=2, padding=1)
        self.conv3 = nn.Conv2d(in_channels=64, out_channels=128, kernel_size=3, stride=2, padding=1)
        self.conv4 = nn.Conv2d(in_channels=128, out_channels=256, kernel_size=3, stride=2, padding=1)

        self.Pool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)

        self.fc1 = nn.Linear(256 , 256)  # Adjusted for the output size after pooling
        # self.fc2 = nn.Linear(1024, 512)
        # self.fc3 = nn.Linear(512, 256)
        self.fc4 = nn.Linear(256, 128)
        # self.fc5 = nn.Linear(128, 64)
        self.fc6 = nn.Linear(128, classes)

    def forward(self, x):

        # Convolutional layers with ReLU activations and pooling
        x = Fn.relu(self.conv1(x))
        x = self.Pool(x)
        x = Fn.relu(self.conv2(x))
        x = self.Pool(x)
        x = Fn.relu(self.conv3(x))
        x = self.Pool(x)
        x = Fn.relu(self.conv4(x))
        x = self.Pool(x)

        x = x.view(x.size(0), -1)  # Flatten the output to (batch_size, num_features)
        # print(x.shape)

        # Fully connected layers with ReLU activations
        x = Fn.relu(self.fc1(x))
        x = Fn.relu(self.fc2(x))
        x = Fn.relu(self.fc3(x))
        x = Fn.relu(self.fc4(x))
        x = Fn.relu(self.fc5(x))

        # Final output layer (no ReLU)
        x = self.fc6(x)
        # print(x.shape)
        # out = torch.argmax(x, dim=1).float()
        # print(out)

        return x



In [29]:
import data_loader #import getDatapoints

trainDataLoader, testDataLoader = data_loader.getDatapoints()




Completed collecting all the data points X: (23932, 128, 173) and Y: (23932,)
 Mean of DataPoints: [[[-0.2380136]]] and STD of DataPoints: [[[0.07312169]]]


In [36]:
device = torch.device("cpu")
epochs = 10
model = CNNArchitecture(classes=6)

model.to(device)
optimizer = optim.Adam(model.parameters(), lr = 0.001)
for param in model.parameters():
    param.requires_grad = True 
lossFn = nn.CrossEntropyLoss()


for each_epoch in range(epochs):
    epoch_loss = 0
    correct_predictions = 0 
    total_samples = 0

    for batch_id, (trainX, trainY) in enumerate(trainDataLoader):
        
        start_time = time.time()

        # Add channel dimension (for Conv2d)
        trainX = trainX.unsqueeze(1)  # Adds a channel dimension at position 1
        trainX = trainX.to(device)
        trainY = trainY.long()
        trainY = trainY.to(device)
        
        # Forward pass
        pred = model(trainX)
        
        # print("Predictions dtype:", pred.dtype)  # This should be torch.float32
        # trainY.dtype = torch.long()
        # print("Targets dtype:", trainY.dtype) 
        # Compute the loss
        lossval = lossFn(pred, trainY)
        
        optimizer.zero_grad()
        
        # Check if the loss tensor requires gradients
        # print("Check: ", lossval.requires_grad)  # This should print True now

        # Backward pass
        lossval.backward()

        # Update model parameters
        optimizer.step()
        
        epoch_loss += lossval.item()  # Add batch loss to epoch loss
        
        end_time = time.time()
        batch_time = end_time - start_time  # Time taken for the batch
        # print(f"Batch {batch_id + 1}, Time per batch: {batch_time:.4f} seconds")

        # break  # For debugging, you can remove this to train on the full dataset
        with torch.no_grad():  # No gradient computation for accuracy
            predictions = torch.argmax(pred, dim=1)  # Get predicted class labels
            correct_predictions += (predictions == trainY).sum().item()  # Count correct predictions
            total_samples += trainY.size(0)  # Update total number of samples
    average_loss = epoch_loss / len(trainDataLoader)  # Average loss
    accuracy = correct_predictions / total_samples * 100  # Accuracy as percentage

    # Display metrics for the epoch
    print(f"Epoch {each_epoch + 1}/{epochs}, Loss: {average_loss:.4f}, Accuracy: {accuracy:.4f}%")

    if(accuracy >= 99):
        print(f"Stopping the Training as Accuracy has reached to Maximum")
        break
    





torch.save(model.state_dict(), f = "/Users/ishananand/Desktop/ser/Speech-Emotion-Recognition/model_weight/CNNModel.pth")
print(f"Model saved to Model Path")




Epoch 1/10, Loss: 0.1053, Accuracy: 95.0367%
Epoch 2/10, Loss: 0.4029, Accuracy: 91.8609%
Epoch 3/10, Loss: 0.0000, Accuracy: 100.0000%
Stopping the Training as Accuracy has reached to Maximum
Model saved to Model Path


Training without Normalization

In [3]:
import data_loader #import getDatapoints

trainDataLoader, testDataLoader = data_loader.getDatapoints()



device = torch.device("cpu")
epochs = 10
model = CNNArchitecture(classes=6)

model.to(device)
optimizer = optim.Adam(model.parameters(), lr = 0.001)
for param in model.parameters():
    param.requires_grad = True 
lossFn = nn.CrossEntropyLoss()


for each_epoch in range(epochs):
    epoch_loss = 0
    correct_predictions = 0 
    total_samples = 0

    for batch_id, (trainX, trainY) in enumerate(trainDataLoader):
        
        start_time = time.time()

        # Add channel dimension (for Conv2d)
        trainX = trainX.unsqueeze(1)  # Adds a channel dimension at position 1
        trainX = trainX.to(device)
        trainY = trainY.long()
        trainY = trainY.to(device)
        
        # Forward pass
        pred = model(trainX)
        
        # print("Predictions dtype:", pred.dtype)  # This should be torch.float32
        # trainY.dtype = torch.long()
        # print("Targets dtype:", trainY.dtype) 
        # Compute the loss
        lossval = lossFn(pred, trainY)
        
        optimizer.zero_grad()
        
        # Check if the loss tensor requires gradients
        # print("Check: ", lossval.requires_grad)  # This should print True now

        # Backward pass
        lossval.backward()

        # Update model parameters
        optimizer.step()
        
        epoch_loss += lossval.item()  # Add batch loss to epoch loss
        
        end_time = time.time()
        batch_time = end_time - start_time  # Time taken for the batch
        # print(f"Batch {batch_id + 1}, Time per batch: {batch_time:.4f} seconds")

        # break  # For debugging, you can remove this to train on the full dataset
        with torch.no_grad():  # No gradient computation for accuracy
            predictions = torch.argmax(pred, dim=1)  # Get predicted class labels
            correct_predictions += (predictions == trainY).sum().item()  # Count correct predictions
            total_samples += trainY.size(0)  # Update total number of samples
    average_loss = epoch_loss / len(trainDataLoader)  # Average loss
    accuracy = correct_predictions / total_samples * 100  # Accuracy as percentage

    # Display metrics for the epoch
    print(f"Epoch {each_epoch + 1}/{epochs}, Loss: {average_loss:.4f}, Accuracy: {accuracy:.4f}%")

    if(accuracy >= 99):
        print(f"Stopping the Training as Accuracy has reached to Maximum")
        break
    





torch.save(model.state_dict(), f = "/Users/ishananand/Desktop/ser/Speech-Emotion-Recognition/model_weight/CNNModel_NoNorm.pth")
print(f"Model saved to Model Path")




Completed collecting all the data points X: (23932, 128, 173) and Y: (23932,)
21538 2394
Completed collecting all the data points X: (23932, 128, 173) and Y: (23932,)
Epoch 1/10, Loss: 0.2776, Accuracy: 87.3572%
Epoch 2/10, Loss: 0.0000, Accuracy: 100.0000%
Stopping the Training as Accuracy has reached to Maximum
Model saved to Model Path


In [27]:
class CNNArchitecture2(nn.Module):

    def __init__(self, classes):
        super(CNNArchitecture2, self).__init__()
        self.conv1 = nn.Conv2d(in_channels=1, out_channels=32, kernel_size=3, stride=2, padding=1)
        self.conv2 = nn.Conv2d(in_channels=32, out_channels=64, kernel_size=3, stride=2, padding=1)
        self.conv3 = nn.Conv2d(in_channels=64, out_channels=128, kernel_size=3, stride=2, padding=1)
        self.conv4 = nn.Conv2d(in_channels=128, out_channels=256, kernel_size=3, stride=2, padding=1)

        self.Pool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)

        self.fc1 = nn.Linear(256 , 256)  # Adjusted for the output size after pooling
        # self.fc2 = nn.Linear(1024, 512)
        # self.fc3 = nn.Linear(512, 256)
        self.fc4 = nn.Linear(256, 128)
        # self.fc5 = nn.Linear(128, 64)
        self.fc6 = nn.Linear(128, classes)

    def forward(self, x):

        # Convolutional layers with ReLU activations and pooling
        x = Fn.relu(self.conv1(x))
        x = self.Pool(x)
        x = Fn.relu(self.conv2(x))
        x = self.Pool(x)
        x = Fn.relu(self.conv3(x))
        x = self.Pool(x)
        x = Fn.relu(self.conv4(x))
        x = self.Pool(x)

        x = x.view(x.size(0), -1)  # Flatten the output to (batch_size, num_features)
        # print(x.shape)

        # Fully connected layers with ReLU activations
        x = Fn.relu(self.fc1(x))
        # x = Fn.relu(self.fc2(x))
        # x = Fn.relu(self.fc3(x))
        x = Fn.relu(self.fc4(x))
        # x = Fn.relu(self.fc5(x))

        # Final output layer (no ReLU)
        x = self.fc6(x)
        # print(x.shape)
        # out = torch.argmax(x, dim=1).float()
        # print(out)

        return x



In [28]:
import data_loader #import getDatapoints

# trainDataLoader, testDataLoader = data_loader.getDatapoints()



device = torch.device("cpu")
epochs = 10
model = CNNArchitecture2(classes=6)

model.to(device)
optimizer = optim.Adam(model.parameters(), lr = 0.001)
for param in model.parameters():
    param.requires_grad = True 
lossFn = nn.CrossEntropyLoss()


for each_epoch in range(epochs):
    epoch_loss = 0
    correct_predictions = 0 
    total_samples = 0

    for batch_id, (trainX, trainY) in enumerate(trainDataLoader):
        
        start_time = time.time()

        # Add channel dimension (for Conv2d)
        trainX = trainX.unsqueeze(1)  # Adds a channel dimension at position 1
        trainX = trainX.to(device)
        trainY = trainY.long()
        trainY = trainY.to(device)
        
        # Forward pass
        pred = model(trainX)
        
        # print("Predictions dtype:", pred.dtype)  # This should be torch.float32
        # trainY.dtype = torch.long()
        # print("Targets dtype:", trainY.dtype) 
        # Compute the loss
        lossval = lossFn(pred, trainY)
        
        optimizer.zero_grad()
        
        # Check if the loss tensor requires gradients
        # print("Check: ", lossval.requires_grad)  # This should print True now

        # Backward pass
        lossval.backward()

        # Update model parameters
        optimizer.step()
        
        epoch_loss += lossval.item()  # Add batch loss to epoch loss
        
        end_time = time.time()
        batch_time = end_time - start_time  # Time taken for the batch
        # print(f"Batch {batch_id + 1}, Time per batch: {batch_time:.4f} seconds")

        # break  # For debugging, you can remove this to train on the full dataset
        with torch.no_grad():  # No gradient computation for accuracy
            predictions = torch.argmax(pred, dim=1)  # Get predicted class labels
            correct_predictions += (predictions == trainY).sum().item()  # Count correct predictions
            total_samples += trainY.size(0)  # Update total number of samples
    average_loss = epoch_loss / len(trainDataLoader)  # Average loss
    accuracy = correct_predictions / total_samples * 100  # Accuracy as percentage

    # Display metrics for the epoch
    print(f"Epoch {each_epoch + 1}/{epochs}, Loss: {average_loss:.4f}, Accuracy: {accuracy:.4f}%")

    if(accuracy >= 99):
        print(f"Stopping the Training as Accuracy has reached to Maximum")
        break
    





torch.save(model.state_dict(), f = "/Users/ishananand/Desktop/ser/Speech-Emotion-Recognition/model_weight/CNNModelBN.pth")
print(f"Model saved to Model Path")




Epoch 1/10, Loss: 0.1258, Accuracy: 95.2920%
Epoch 2/10, Loss: 0.0000, Accuracy: 100.0000%
Stopping the Training as Accuracy has reached to Maximum
Model saved to Model Path


In [30]:
test_model = CNNArchitecture2(classes=6)
# Load the saved model weights
# test_model.load_state_dict(torch.load('/Users/ishananand/Desktop/ser/Speech-Emotion-Recognition/model_weight/CNNModel_NoNorm.pth'))
test_model.load_state_dict(torch.load('/Users/ishananand/Desktop/ser/Speech-Emotion-Recognition/model_weight/CNNModelBN.pth'))

# Move the model to the appropriate device (CPU or GPU)
# 5. Set the model to evaluation mode (important for inference)

# the model to evaluation mode (important for inference)
test_model.eval()

def testAccuracy(model, loader):
    num_correct = 0
    num_samples = 0
    model.eval()

    with torch.no_grad():
        for x, y in loader:

            x = x.unsqueeze(1)  # Adds a channel dimension at position 1
            y = y.long()
            x = x.to(device)
            y = y.to(device)
            
            scores = model(x)

            predictions = torch.argmax(scores, dim=1)
            # print(predictions, "-----", y)
            num_correct += (predictions == y).sum()
            num_samples += predictions.size(0)

    model.train()
    accuracy = num_correct / num_samples * 100  # Accuracy as percentage
    return accuracy


testAccuracy = testAccuracy(test_model, testDataLoader)
print(f"Test Accuracy is  {testAccuracy}")


Test Accuracy is  100.0


In [31]:
from data_loader import getMelVector
test_audio = getMelVector("/Users/ishananand/Desktop/ser/testAudios/angry.wav", 4)
X, Y = [], []
X.append(test_audio)
Y.append(1)
X = np.array(X)
Y = np.array(Y)
Y = Y.astype(np.int64)
# X = (X - (-1 * 0.2380136)) / 0.07312169  # This normalization will use the same mean and std for all images
X_tensor = torch.tensor(X)
Y_tensor = torch.tensor(Y, dtype=torch.long)  # Use torch.float32 for regression, torch.long for classification

dataset = TensorDataset(X_tensor, Y_tensor)
customLoader = DataLoader(dataset, batch_size=1, shuffle=True)
# customLoader.dataset
with torch.no_grad():
    for x, y in customLoader:

        x = x.unsqueeze(1)  # Adds a channel dimension at position 1
        y = y.long()
        x = x.to(device)
        y = y.to(device)
        
        scores = test_model(x)
        print(scores)

        predictions = torch.argmax(scores, dim=1)
        print(predictions, "-----", y)

    test_model.train()

tensor([[ -1.3048,  10.3338,   4.9817,  12.7780, -19.9045,  -6.4331]])
tensor([3]) ----- tensor([1])


In [32]:
from data_loader import getMelVector
import os
test_model.load_state_dict(torch.load('/Users/ishananand/Desktop/ser/Speech-Emotion-Recognition/model_weight/CNNModelBN.pth'))
# test_model.load_state_dict(torch.load('/Users/ishananand/Desktop/ser/Speech-Emotion-Recognition/model_weight/CNNModel.pth'))

allAudios = os.listdir("/Users/ishananand/Desktop/ser/testAudios")
rootpath = "/Users/ishananand/Desktop/ser/testAudios"
emotion_class = {
    0: "happy",
    1: "angry",
    2: "fear",
    3: "sad",
    4: "disgust",
    5: "neutral"
}
X, Y = [], []
for each_audio in allAudios:
    print(rootpath + "/" +each_audio)

    test_audio = getMelVector(rootpath + "/" +each_audio, 5)
    # print(test_audio)
    
    X.append(test_audio)
    if("angry" in each_audio):
        Y.append(1)
    elif("disgust" in each_audio):
        Y.append(4)
    elif("happy" in each_audio):
        Y.append(0)
    elif("sad" in each_audio):
        Y.append(3)
    elif("neutral" in each_audio):
        Y.append(5)
    elif("fear" in each_audio):
        Y.append(2)

X = np.array(X)
Y = np.array(Y)
Y = Y.astype(np.int64)
# X = (X - (-1 * 0.2380136)) / 0.07312169  # This normalization will use the same mean and std for all images
X_tensor = torch.tensor(X)
Y_tensor = torch.tensor(Y, dtype=torch.long)  # Use torch.float32 for regression, torch.long for classification

dataset = TensorDataset(X_tensor, Y_tensor)
customLoader = DataLoader(dataset, batch_size=1, shuffle=False)
# customLoader.dataset
with torch.no_grad():
    for x, y in customLoader:

        x = x.unsqueeze(1)  # Adds a channel dimension at position 1
        y = y.long()
        x = x.to(device)
        y = y.to(device)
        # print(y)
        scores = test_model(x)
        print(scores)
        y = int(y.to(torch.int32))
        predictions = torch.argmax(scores, dim=1)
        tensor_int = int(predictions.to(torch.int32))
        # print(customLoader.)
        print(predictions, "-----", y)
        # print(f" The true Value is {emotion_class[y]} and predicted class is {emotion_class[tensor_int]}")
        # print(scores)

        
        # print(predictions)

    test_model.train()

/Users/ishananand/Desktop/ser/testAudios/happy.wav
/Users/ishananand/Desktop/ser/testAudios/sad.wav
/Users/ishananand/Desktop/ser/testAudios/sad_G.wav
/Users/ishananand/Desktop/ser/testAudios/fear.wav
/Users/ishananand/Desktop/ser/testAudios/angry.wav
/Users/ishananand/Desktop/ser/testAudios/disgust.wav
/Users/ishananand/Desktop/ser/testAudios/happy1.wav
/Users/ishananand/Desktop/ser/testAudios/neutral1.wav
tensor([[  2.3857,  10.6884,   1.6839,   9.5668, -17.5970,  -4.3711]])
tensor([1]) ----- 0
tensor([[  2.8608,   9.2331,   0.2235,  11.0609, -19.5865,  -3.7930]])
tensor([3]) ----- 3
tensor([[  2.0851,   7.0425,  -3.9147,  11.7141, -15.8335,   1.4583]])
tensor([3]) ----- 3
tensor([[ -0.2709,   9.5504,   4.4252,   9.8490, -16.2871,  -5.3582]])
tensor([3]) ----- 2
tensor([[  1.6054,  10.3512,   4.0008,   7.0067, -13.6818,  -4.9789]])
tensor([1]) ----- 1
tensor([[-0.7818, 11.3698,  6.9910,  4.9887, -9.1297, -5.5539]])
tensor([1]) ----- 4
tensor([[ 0.8191,  6.1445,  4.4944,  2.3692, -2.9