In [None]:
# Importing Libraries
import librosa  
import numpy as np  
import os
import random
from librosa.effects import time_stretch, pitch_shift
from sklearn.preprocessing import StandardScaler 
from sklearn.model_selection import train_test_split

In [None]:
# Utility Functions
def extract_features(data, sample_rate=22050):
    """Summarise an audio signal as a single standardised feature column.

    Nine librosa descriptors are computed from the signal (zero-crossing rate,
    chroma STFT, MFCC, RMS, mel spectrogram, spectral contrast, tonnetz,
    spectral roll-off and polynomial features). Each descriptor matrix is
    averaged along its last axis, the averages are concatenated in that order,
    and the resulting vector is standardised.

    Parameters
    ----------
    data : array-like
        Audio samples, passed as ``y`` to every librosa extractor.
    sample_rate : int, default 22050
        Sampling rate, passed as ``sr`` to the extractors that accept it.

    Returns
    -------
    numpy.ndarray
        Array with one column holding the standardised feature values.
        The scaler is fitted on this one vector only, so the statistics are
        taken across the features of this single sample.
    """

    def _frame_mean(feature_matrix):
        # One averaged value per descriptor row
        return np.mean(feature_matrix.T, axis=0)

    extract = librosa.feature
    averaged = [
        _frame_mean(extract.zero_crossing_rate(y=data)),
        _frame_mean(extract.chroma_stft(y=data, sr=sample_rate)),
        _frame_mean(extract.mfcc(y=data, sr=sample_rate)),
        _frame_mean(extract.rms(y=data)),
        _frame_mean(extract.melspectrogram(y=data, sr=sample_rate)),
        _frame_mean(extract.spectral_contrast(y=data, sr=sample_rate)),
        _frame_mean(extract.tonnetz(y=data, sr=sample_rate)),
        _frame_mean(extract.spectral_rolloff(y=data, sr=sample_rate)),
        _frame_mean(extract.poly_features(y=data, sr=sample_rate)),
    ]

    # Concatenate into one vector, then shape it as a single column for the scaler
    feature_column = np.hstack(averaged).reshape(-1, 1)

    return StandardScaler().fit_transform(feature_column)

In [None]:
# Integer label for each emotion class name; labels follow the listed order, starting at 0
class_map = {
    emotion: index
    for index, emotion in enumerate(
        ['Angry', 'Disgusted', 'Fearful', 'Happy', 'Neutral', 'Sad', 'Surprised']
    )
}

# Defining the noise transformation function
def noise(data, noise_factor=1.0):
    """Return a copy of ``data`` with Gaussian noise added to every sample.

    The noise amplitude is a random fraction (up to 2.5%) of the largest value
    in ``data``, further multiplied by ``noise_factor``. The input array itself
    is not modified.
    """
    # Random amplitude, proportional to the maximum value of the signal
    amplitude = 0.025 * np.random.uniform() * np.amax(data)
    # One Gaussian draw per sample along the first axis
    gaussian = np.random.normal(size=data.shape[0])
    return data + noise_factor * amplitude * gaussian


# Making the generator function
def data_generator(files, batch_size=32, augment=True):
    """Endlessly yield ``(features, labels)`` batches built from audio files.

    Every pass reshuffles the files, loads each one at 22050 Hz, optionally
    applies random augmentation (time stretch, pitch shift, additive noise),
    extracts features with ``extract_features`` and looks up the label from
    the name of the directory that contains the file.

    Parameters
    ----------
    files : sequence of str
        Paths to audio files. The sequence is copied, so the caller's list is
        never reordered.
    batch_size : int, default 32
        Number of samples per yielded batch. The last batch of each pass may
        be smaller when ``len(files)`` is not a multiple of ``batch_size``.
    augment : bool, default True
        When False the audio is used as loaded, with no random transformation
        (useful for validation or test data).

    Yields
    ------
    tuple of numpy.ndarray
        ``(batch_features, batch_labels)``.

    Raises
    ------
    ValueError
        If ``files`` is empty (otherwise the generator would spin forever
        without ever yielding).
    KeyError
        If a file's parent directory name is not a key of ``class_map``.
    """
    # Private copy: random.shuffle works in place and must not touch the caller's list
    files = list(files)
    if not files:
        raise ValueError("data_generator received an empty list of files")

    while True:
        # Shuffling the list of files at the start of every pass
        random.shuffle(files)

        batch_data = []
        batch_labels = []
        for file in files:
            # Loading the audio file
            data, sr = librosa.load(file, sr=22050, res_type='kaiser_fast')

            if augment:
                # Applying transformations with random intensities.
                # sr / n_steps are passed by keyword: recent librosa releases
                # no longer accept them positionally.
                data = time_stretch(data, rate=random.uniform(0.5, 1.5))
                data = pitch_shift(data, sr=sr, n_steps=random.randint(-5, 5))
                data = noise(data, noise_factor=random.uniform(0, 1.5))

            # extract_features names its rate parameter `sample_rate`, not `sr`
            features = extract_features(data, sample_rate=sr)
            batch_data.append(features)

            # Get the label from the name of the directory holding the file
            label = os.path.basename(os.path.dirname(file))
            batch_labels.append(class_map[label])

            # Yield full batches
            if len(batch_data) == batch_size:
                yield np.array(batch_data), np.array(batch_labels)
                batch_data = []
                batch_labels = []

        # Yield the leftover partial batch of this pass, if any
        if batch_data:
            yield np.array(batch_data), np.array(batch_labels)

In [None]:
audio_dir = os.path.join(os.getcwd(), 'filtered_dataset')

# Get a list of all .wav files (extension matched case-insensitively) under audio_dir
all_files = [
    os.path.join(subdir, file)
    for subdir, dirs, files in os.walk(audio_dir)
    for file in files
    if file.lower().endswith(".wav")
]

# os.walk returns entries in a filesystem-dependent order; sorting makes the
# list, and therefore any seeded split of it, identical on every machine
all_files.sort()


In [None]:
# Hold out 30% of the discovered files for testing; the fixed seed keeps the partition repeatable
train_files, test_files = train_test_split(
    all_files,
    test_size=0.3,
    random_state=48,
)

# Wrap each partition in its own batch generator
train_generator, test_generator = map(data_generator, (train_files, test_files))


In [None]:
from tensorflow.keras.models import Sequential 
from tensorflow.keras.layers import Input, Conv1D, MaxPooling1D, Flatten, Dense, Dropout, BatchNormalization
from tensorflow.keras.callbacks import EarlyStopping

# Read the per-sample feature shape from a throwaway one-file generator:
# this processes a single file instead of a whole batch and leaves
# train_generator untouched until training starts
input_shape = next(data_generator(train_files[:1], batch_size=1))[0].shape[1:]

def _conv_stage(filters):
    """Build one convolutional stage: Conv1D -> BatchNorm -> MaxPool -> Dropout."""
    return [
        Conv1D(filters, kernel_size=3, activation='relu'),
        BatchNormalization(),
        MaxPooling1D(pool_size=2),
        Dropout(0.2),
    ]


# Three convolutional stages of growing width, then a dense classifier head
# ending in a 7-way softmax
model = Sequential([
    Input(shape=input_shape),

    *_conv_stage(64),
    *_conv_stage(128),
    *_conv_stage(256),

    Flatten(),

    Dense(128, activation='relu'),
    BatchNormalization(),
    Dropout(0.5),
    Dense(7, activation='softmax'),
])

In [None]:
# Compiling the model (integer class labels -> sparse categorical cross-entropy)
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Defining the early stopping callback.
# restore_best_weights=True rolls the model back to the epoch with the lowest
# val_loss; without it the model keeps the weights from `patience` epochs
# after the best one.
early_stopping = EarlyStopping(
    monitor='val_loss',
    patience=14,
    verbose=1,
    restore_best_weights=True,
)

In [None]:

# Train the model
# The generators never stop, so Keras has to be told how many batches make up
# one pass over each file list. Each pass yields ceil(n_files / batch_size)
# batches (the last one may be partial).
batch_size = 32  # must equal the batch_size the generators were created with (data_generator default)
steps_per_epoch = int(np.ceil(len(train_files) / batch_size))
validation_steps = int(np.ceil(len(test_files) / batch_size))

# batch_size is not passed to fit(): the generators already produce batches
history = model.fit(
    train_generator,
    validation_data=test_generator,
    steps_per_epoch=steps_per_epoch,
    validation_steps=validation_steps,
    epochs=100,
    callbacks=[early_stopping],
)

In [None]:
# Evaluating the model
# test_generator is endless; `steps` bounds the evaluation to one pass over the test files
evaluation_steps = int(np.ceil(len(test_files) / 32))  # 32 = batch size test_generator was created with
model.evaluate(test_generator, steps=evaluation_steps)

In [None]:
# Visualizing the training history
import matplotlib.pyplot as plt

# One figure, two side-by-side panels: loss on the left, accuracy on the right
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(6, 4))

# Each panel shows the training curve and the validation curve of one metric
for axis, metric, title in ((ax1, 'loss', 'Loss'), (ax2, 'accuracy', 'Accuracy')):
    axis.plot(history.history[metric], label=f'Training {title}')
    axis.plot(history.history[f'val_{metric}'], label=f'Validation {title}')
    axis.set_xlabel('Epochs')
    axis.set_ylabel(title)
    axis.set_title(f'Training and Validation {title}')
    axis.legend()

# Keep labels and titles from overlapping, then render
plt.tight_layout()
plt.show()

In [None]:
# Saving the model
# Create the target directory first so saving does not fail on a fresh checkout
os.makedirs('saved_model', exist_ok=True)
model.save('saved_model/model.keras')

In [None]:
# Making the detection function
def detect(audio_path):
    """Predict the emotion label for a single audio file.

    The file is loaded with the same settings the training generator uses
    (22050 Hz, 'kaiser_fast' resampling), turned into features with
    ``extract_features`` and passed through ``model``.

    Parameters
    ----------
    audio_path : str
        Path to the audio file to classify.

    Returns
    -------
    str
        The ``class_map`` key whose integer label has the highest predicted
        probability.
    """
    # extract_features works on audio samples, so the file has to be decoded first
    data, sr = librosa.load(audio_path, sr=22050, res_type='kaiser_fast')
    features = extract_features(data, sample_rate=sr)
    features = np.expand_dims(features, axis=0)  # Adding a batch dimension

    pred = model.predict(features)
    pred_index = int(np.argmax(pred))

    # Reversing class_map to fetch the label from the predicted index
    index_to_label = {index: name for name, index in class_map.items()}
    return index_to_label[pred_index]