In [5]:

#creating a model

import torch
import torch.nn as nn

# TimeDistributed layer for applying a module to each time step in a sequence
class TimeDistributed(nn.Module):
    def __init__(self, module):
        super(TimeDistributed, self).__init__()
        self.module = module

    def forward(self, x):
        if len(x.size()) <= 2:
            return self.module(x)

        if len(x.size()) == 3:
            x_reshape = x.contiguous().view(-1, x.size(2))
        elif len(x.size()) == 4:
            x_reshape = x.contiguous().view(-1, x.size(2), x.size(3))
        else:
            x_reshape = x.contiguous().view(-1, x.size(2), x.size(3), x.size(4))

        # Apply the module to the reshaped tensor
        y = self.module(x_reshape)


        if len(x.size()) == 3:
            y = y.contiguous().view(x.size(0), -1, y.size(1))
        elif len(x.size()) == 4:
            y = y.contiguous().view(x.size(0), -1, y.size(1), y.size(2))
        else:
            y = y.contiguous().view(x.size(0), -1, y.size(1), y.size(2), y.size(3))
        return y



In [6]:
import torch
import torch.nn as nn

class HybridModel(nn.Module):
    def __init__(self, num_emotions):
        """
        Initialize the HybridModel with convolutional, LSTM, and attention layers.

        Parameters:
        - num_emotions: Number of emotion classes for classification.
        """
        super().__init__()

        # Convolutional Block
        self.conv2Dblock = nn.Sequential(
            # 1. Convolutional Block
            TimeDistributed(nn.Conv2d(in_channels=1,
                                   out_channels=16,
                                   kernel_size=3,
                                   stride=1,
                                   padding=1)),
            TimeDistributed(nn.BatchNorm2d(16)),
            TimeDistributed(nn.ReLU()),
            TimeDistributed(nn.MaxPool2d(kernel_size=2, stride=2)),
            TimeDistributed(nn.Dropout(p=0.3)),
            # 2. Convolutional Block
            TimeDistributed(nn.Conv2d(in_channels=16,
                                   out_channels=32,
                                   kernel_size=3,
                                   stride=1,
                                   padding=1)),
            TimeDistributed(nn.BatchNorm2d(32)),
            TimeDistributed(nn.ReLU()),
            TimeDistributed(nn.MaxPool2d(kernel_size=4, stride=4)),
            TimeDistributed(nn.Dropout(p=0.3)),
            # 3. Convolutional Block
            TimeDistributed(nn.Conv2d(in_channels=32,
                                   out_channels=64,
                                   kernel_size=3,
                                   stride=1,
                                   padding=1)),
            TimeDistributed(nn.BatchNorm2d(64)),
            TimeDistributed(nn.ReLU()),
            TimeDistributed(nn.MaxPool2d(kernel_size=4, stride=4)),
            TimeDistributed(nn.Dropout(p=0.3))
        )

        # LSTM Block
        hidden_size = 64
        self.lstm = nn.LSTM(input_size=1024, hidden_size=hidden_size, bidirectional=True, batch_first=True)
        self.dropout_lstm = nn.Dropout(p=0.4)
        self.attention_linear = nn.Linear(2 * hidden_size, 1)  # 2 * hidden_size for bidirectional LSTM

        # Linear layer for output classification
        self.out_linear = nn.Linear(2 * hidden_size, num_emotions)

    def forward(self, x):
        conv_embedding = self.conv2Dblock(x)

        conv_embedding = torch.flatten(conv_embedding, start_dim=2)

        # Apply LSTM layers
        lstm_embedding, (h, c) = self.lstm(conv_embedding)
        lstm_embedding = self.dropout_lstm(lstm_embedding)

        # Compute attention weights
        batch_size, T, _ = lstm_embedding.shape
        attention_weights = [None] * T
        for t in range(T):
            embedding = lstm_embedding[:, t, :]
            attention_weights[t] = self.attention_linear(embedding)

        # Normalize attention weights
        attention_weights_norm = nn.functional.softmax(torch.stack(attention_weights, -1), dim=-1)

        # Apply attention to LSTM outputs
        attention = torch.bmm(attention_weights_norm, lstm_embedding)  # (Bx1xT)*(B,T,hidden_size*2) = (B,1,2*hidden_size)
        attention = torch.squeeze(attention, 1)

        # Compute output logits and softmax probabilities
        output_logits = self.out_linear(attention)
        output_softmax = nn.functional.softmax(output_logits, dim=1)

        return output_logits, output_softmax, attention_weights_norm


def loss_fnc(predictions, targets):

    loss_function = nn.CrossEntropyLoss()

    return loss_function(input=predictions, target=targets)



In [13]:
def getMELspectrogram(audio, sample_rate):
    mel_spec=librosa.feature.melspectrogram(y=audio,sr=sample_rate,n_fft=1024,win_length=512,window='hamming',hop_length=256,n_mels=128,fmax=sample_rate/2)
    # Convert power spectrogram to decibel scale
    mel_spec_db = librosa.power_to_db(mel_spec, ref=np.max)
    return mel_spec_db

def splitIntoChunks(mel_spec, win_size, stride):
    t = mel_spec.shape[1]

    num_of_chunks = int(t / stride)

    chunks = []

    for i in range(num_of_chunks):
        chunk = mel_spec[:, i * stride:i * stride + win_size]
        if chunk.shape[1] == win_size:
            chunks.append(chunk)

    return np.stack(chunks, axis=0)


In [16]:
!pip install pydub

Collecting pydub
  Downloading pydub-0.25.1-py2.py3-none-any.whl.metadata (1.4 kB)
Downloading pydub-0.25.1-py2.py3-none-any.whl (32 kB)
Installing collected packages: pydub
Successfully installed pydub-0.25.1


In [17]:
from pydub import AudioSegment
import os

def convert_to_wav(input_file, output_file, sample_rate=48000, bitrate="768k"):
    # Load audio using pydub
    audio = AudioSegment.from_file(input_file)

    # Set sample rate and export as WAV
    audio = audio.set_frame_rate(sample_rate)

    # Export as wav file
    audio.export(output_file, format="wav", bitrate=bitrate)

# Example usage
input_file = '/content/abcde.mp3'
output_file = '/content/abcde.wav'  # Provide the filename here
convert_to_wav(input_file, output_file, sample_rate=48000, bitrate="768k")


In [18]:
import librosa
import numpy as np
import torch
from IPython.display import Audio, display
import os
import joblib



EMOTIONS = {
    1: 'neutral',
    2: 'calm',
    3: 'happy',
    4: 'sad',
    5: 'angry',
    6: 'fear',
    7: 'disgust',
    0: 'surprise'  # Note: 'surprise' is mapped to 0 instead of the original 8
}

# Load your trained model
LOAD_PATH = os.path.join(os.getcwd(), 'models')
model = HybridModel(len(EMOTIONS))

# # Load model weights and move to the appropriate device
# model.load_state_dict(torch.load(os.path.join(LOAD_PATH, '/content/speech_sentiment_asr.pt')))
# device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# model.to(device)  # Move the model to the GPU or keep it on CPU
# print('Model is loaded from {}'.format(os.path.join(LOAD_PATH, 'speech_sentiment_asr.pt')))

# Load model weights and move to the appropriate device
model.load_state_dict(torch.load(os.path.join(LOAD_PATH, '/content/speech_sentiment_asr.pt'), map_location=torch.device('cpu'))) # Added map_location argument to load the model onto the CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)  # Move the model to the GPU or keep it on CPU
print('Model is loaded from {}'.format(os.path.join(LOAD_PATH, 'speech_sentiment_asr.pt')))



SAMPLE_RATE = 48000
DURATION = 3
NUM_MEL_BINS = 128

# Load your fitted scaler
scaler = joblib.load('/content/scaler.pkl')

def process_audio(audio_file_path):

    audio, sample_rate = librosa.load(audio_file_path, sr=SAMPLE_RATE)


    target_length = SAMPLE_RATE * DURATION
    if len(audio) > target_length:
        audio = audio[:target_length]
    else:
        audio = np.pad(audio, (0, target_length - len(audio)), 'constant')  # Pad if too short


    mel_spectrogram = getMELspectrogram(audio, SAMPLE_RATE)

    # Print mel spectrogram shape
    print(f"Mel Spectrogram Shape: {mel_spectrogram.shape}")


    chunks = splitIntoChunks(mel_spectrogram, win_size=128, stride=64)

    #
    print(f"Chunks Shape Before Scaling: {chunks.shape}")


    num_chunks = chunks.shape[0]
    print(f"Number of Chunks: {num_chunks}")

    # Check if the number of chunks is equal to the sequence length expected by the model
    if num_chunks < 7:
        padding = np.zeros((7 - num_chunks, 128, 128))
        chunks = np.concatenate((chunks, padding), axis=0)
    elif num_chunks > 7:
        chunks = chunks[:7]


    chunks = chunks[np.newaxis, :]
    chunks = np.expand_dims(chunks, axis=1)

    chunks_reshaped = chunks.reshape(1, 7, 1, 128, 128)

    print(f"Chunks Shape After Reshaping: {chunks_reshaped.shape}")

    chunks_scaled = scaler.transform(chunks_reshaped.reshape(1, -1))

    chunks_scaled = chunks_scaled.reshape(1, 7, 1, 128, 128)
    print(f"Chunks Shape After Scaling: {chunks_scaled.shape}")

    chunks_tensor = torch.tensor(chunks_scaled, device=device).float()

    # Make predictions with the model
    with torch.no_grad():
        model.eval()
        _, output_softmax, _ = model(chunks_tensor)
        predictions = torch.argmax(output_softmax, dim=1)
        predicted_emotion = EMOTIONS[predictions.item()]

    # Display the audio
    display(Audio(audio_file_path))

    # Print the predicted emotion
    print(f"Predicted Emotion: {predicted_emotion}")

    return predicted_emotion


file_path = input("Enter the path to your .wav file: ")

process_audio(file_path)


  model.load_state_dict(torch.load(os.path.join(LOAD_PATH, '/content/speech_sentiment_asr.pt'), map_location=torch.device('cpu'))) # Added map_location argument to load the model onto the CPU
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


Model is loaded from /content/models/speech_sentiment_asr.pt
Enter the path to your .wav file: /content/abcde.wav
Mel Spectrogram Shape: (128, 563)
Chunks Shape Before Scaling: (7, 128, 128)
Number of Chunks: 7
Chunks Shape After Reshaping: (1, 7, 1, 128, 128)
Chunks Shape After Scaling: (1, 7, 1, 128, 128)


Predicted Emotion: happy


'happy'