In [1]:
import os
import librosa
import numpy as np
import pandas as pd

# The emotion is stored in the file name as a zero-padded, two-digit code;
# this table turns each code into a human-readable label.
EMOTION_MAPPING = {
    f"{code:02d}": label
    for code, label in enumerate(
        (
            'neutral',
            'calm',
            'happy',
            'sad',
            'angry',
            'fearful',
            'disgust',
            'surprised',
        ),
        start=1,
    )
}

def _mean_and_std(frames):
    """Return the per-row means followed by the per-row stds of a (rows, frames) matrix."""
    return np.concatenate([np.mean(frames, axis=1), np.std(frames, axis=1)])


def _summary_features(audio, sr, n_mfcc):
    """Reduce one recording to a fixed-length vector of frame-level statistics."""
    return np.concatenate([
        _mean_and_std(librosa.feature.mfcc(y=audio, sr=sr, n_mfcc=n_mfcc)),
        _mean_and_std(librosa.feature.spectral_centroid(y=audio, sr=sr)),
        _mean_and_std(librosa.feature.spectral_bandwidth(y=audio, sr=sr)),
        _mean_and_std(librosa.feature.zero_crossing_rate(y=audio)),
        _mean_and_std(librosa.feature.chroma_stft(y=audio, sr=sr)),
    ])


def load_ravdess_data(data_path, n_mfcc=40):
    """
    Load the RAVDESS dataset, extracting summary audio features and labels.

    Every ``.wav`` file found under ``data_path`` whose third dash-separated
    file-name field is a known emotion code is loaded with ``sr=None`` and
    reduced to one vector holding the mean and standard deviation, over time
    frames, of several frame-level features.

    Feature layout (110 entries with the default ``n_mfcc=40``):
        [0, n_mfcc)             MFCC means
        [n_mfcc, 2 * n_mfcc)    MFCC standard deviations
        next 2                  spectral centroid mean, std dev
        next 2                  spectral bandwidth mean, std dev
        next 2                  zero-crossing rate mean, std dev
        next 12                 chroma bin means
        last 12                 chroma bin standard deviations

    Args:
        data_path (str): Path to the RAVDESS dataset folder.
        n_mfcc (int): Number of MFCC coefficients to extract.

    Returns:
        X (numpy.ndarray): One feature vector per recording.
        y (numpy.ndarray): Emotion label of each recording, aligned with X.
    """
    X = []
    y = []

    # Walk through the dataset folder
    for root, dirs, files in os.walk(data_path):
        # Sorting (in place for the directories) makes the traversal order,
        # and therefore the row order of X and any seeded split built on it,
        # independent of the filesystem.
        dirs.sort()
        for file in sorted(files):
            if not file.endswith(".wav"):
                continue

            # The emotion code is the third dash-separated field; names that
            # do not follow that pattern are skipped instead of raising.
            parts = file.split('-')
            if len(parts) < 3:
                continue

            emotion = EMOTION_MAPPING.get(parts[2])
            if not emotion:
                continue

            audio, sr = librosa.load(os.path.join(root, file), sr=None)
            X.append(_summary_features(audio, sr, n_mfcc))
            y.append(emotion)

    return np.array(X), np.array(y)

# Set the path to your RAVDESS dataset. The RAVDESS_DATA_DIR environment
# variable, when set, overrides the default location so the notebook can run
# on a machine other than the one it was written on.
data_path = os.environ.get(
    'RAVDESS_DATA_DIR', '/Users/harshith/noCloud/mlsp_project/RAVDESS')

# Load data
X, y = load_ravdess_data(data_path)

# os.walk yields nothing for a missing folder, so fail here with a clear
# message rather than later with an unrelated shape error.
if len(X) == 0:
    raise FileNotFoundError(f"No RAVDESS .wav files found under {data_path!r}")

# Display dataset information
print(f"Shape of X (features): {X.shape}")
print(f"Shape of y (labels): {y.shape}")


Shape of X (features): (1440, 110)
Shape of y (labels): (1440,)


In [2]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the recordings for testing, keeping the class proportions
# of y in both partitions; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.8,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

In [4]:
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score


def classify_with_svm(X_train, y_train, X_test, y_test):
    """
    Fit an RBF-kernel SVM on the training split and report test performance.

    Parameters:
    - X_train: array-like, feature vectors used for fitting
    - y_train: array-like, emotion labels matching X_train
    - X_test: array-like, feature vectors held out for evaluation
    - y_test: array-like, emotion labels matching X_test

    Returns:
    - None, prints classification report and accuracy
    """
    # Build the classifier and fit it in one step (fit returns the estimator)
    classifier = SVC(kernel='rbf', C=10.0, max_iter=100000).fit(X_train, y_train)

    # Label the held-out samples
    predictions = classifier.predict(X_test)

    # Report per-class metrics followed by overall accuracy
    print("Classification Report:")
    print(classification_report(y_test, predictions))
    accuracy = accuracy_score(y_test, predictions)
    print(f"Accuracy: {accuracy}")

from sklearn.preprocessing import StandardScaler
# Estimate the scaling statistics on the training split only, then apply the
# same transformation to both splits before classifying.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

classify_with_svm(X_train, y_train, X_test, y_test)

Classification Report:
              precision    recall  f1-score   support

       angry       0.83      0.89      0.86        38
        calm       0.83      0.89      0.86        38
     disgust       0.81      0.58      0.68        38
     fearful       0.80      0.82      0.81        39
       happy       0.78      0.74      0.76        39
     neutral       0.58      0.58      0.58        19
         sad       0.77      0.87      0.81        38
   surprised       0.82      0.85      0.84        39

    accuracy                           0.79       288
   macro avg       0.78      0.78      0.78       288
weighted avg       0.79      0.79      0.79       288

Accuracy: 0.7916666666666666


In [11]:
# Disabled experiment: the KNN baseline below is wrapped in a string literal,
# so none of it executes; the cell only evaluates to that string.
# NOTE(review): the first inner comment says "SVM classifier" although the
# code builds a KNeighborsClassifier.
'''
from sklearn.neighbors import KNeighborsClassifier

# Initialize and train the SVM classifier
knn = KNeighborsClassifier(n_neighbors=3)
knn.fit(X_train, y_train)

# Make predictions on the test set
y_pred = knn.predict(X_test)

# Print classification report and accuracy
print("Classification Report:")
print(classification_report(y_test, y_pred))
print(f"Accuracy: {accuracy_score(y_test, y_pred)}")
'''

'\nfrom sklearn.neighbors import KNeighborsClassifier\n\n# Initialize and train the SVM classifier\nknn = KNeighborsClassifier(n_neighbors=3)\nknn.fit(X_train, y_train)\n\n# Make predictions on the test set\ny_pred = knn.predict(X_test)\n\n# Print classification report and accuracy\nprint("Classification Report:")\nprint(classification_report(y_test, y_pred))\nprint(f"Accuracy: {accuracy_score(y_test, y_pred)}")\n'

In [None]:
# Disabled experiment: a per-class Gaussian-mixture classifier, wrapped in a
# string literal so nothing below runs. As written, it fits one GMM per label
# and assigns each sample to the label whose model scores it highest.
# NOTE(review): 64 full-covariance components per class may be more than the
# per-class sample count can support — confirm before re-enabling.
'''
import os
import numpy as np
from sklearn.mixture import GaussianMixture
from sklearn.metrics import classification_report, accuracy_score

def train_gmm_classifiers(features, labels):
    """
    Train a GMM classifier for each emotion class.

    Parameters:
    - features: array-like, MFCC features
    - labels: array-like, corresponding emotion labels

    Returns:
    - gmm_models: dict, a dictionary of trained GMM models for each class
    """
    unique_labels = np.unique(labels)
    gmm_models = {}
    i = 0
    for label in unique_labels:
        # Select features corresponding to the current label
        class_features = features[labels == label]
        if i==0:
            print(class_features.shape)
            print(class_features[0].shape)
            i+=1

        # Train a GMM model for the current class
        gmm = GaussianMixture(n_components=64, covariance_type='full',max_iter=10000, random_state=42)
        gmm.fit(class_features)
        gmm_models[label] = gmm

    print("GMM models trained for each class.")
    return gmm_models

def classify_with_gmm(gmm_models, features, labels):
    """
    Classify features using the trained GMM models and evaluate performance.

    Parameters:
    - gmm_models: dict, a dictionary of trained GMM models
    - features: array-like, MFCC features to classify
    - labels: array-like, true emotion labels

    Returns:
    - None, prints classification report and accuracy
    """
    predictions = []

    for feature in features:
        # Calculate the log likelihood for each class and select the class with the highest likelihood
        log_likelihoods = {label: gmm.score([feature]) for label, gmm in gmm_models.items()}
        predicted_label = max(log_likelihoods, key=log_likelihoods.get)
        predictions.append(predicted_label)

    # Evaluate the performance
    print("Classification Report:")
    print(classification_report(labels, predictions))
    print(f"Accuracy: {accuracy_score(labels, predictions)}")

# Train GMM models for each emotion class
gmm_models = train_gmm_classifiers(X_train, y_train)

# Classify and evaluate using the GMM models
classify_with_gmm(gmm_models, X_test, y_test)
'''

# DCNN

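Rebuild `X` so that it holds only the full MFCC matrix of each recording (no summary statistics or other spectral features).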

In [13]:
import os
import librosa
import numpy as np
import pandas as pd

# Lookup table from the two-digit emotion code found in a RAVDESS file name
# to the corresponding emotion label.
EMOTION_MAPPING = dict(zip(
    ('01', '02', '03', '04', '05', '06', '07', '08'),
    ('neutral', 'calm', 'happy', 'sad', 'angry', 'fearful', 'disgust', 'surprised'),
))

def load_ravdess_full_mfcc(data_path, n_mfcc=40):
    """
    Load the RAVDESS dataset, extracting full MFCC features and labels.

    Every ``.wav`` file found under ``data_path`` whose third dash-separated
    file-name field is a known emotion code is loaded with ``sr=None`` and
    converted to its complete MFCC matrix (no averaging over time).

    NOTE(review): the recorded outputs in this notebook report 1440 samples
    for the summary-feature loader and 2880 for this one on the same path.
    Every matching file under the folder is loaded, so duplicate copies of
    the dataset would all be included — confirm the folder contents.

    Args:
        data_path (str): Path to the RAVDESS dataset folder.
        n_mfcc (int): Number of MFCC coefficients to extract.

    Returns:
        X (list): One MFCC matrix per recording, shaped (n_mfcc, time_frames).
        y (list): Emotion label of each recording, aligned with X.
    """
    X = []
    y = []

    # Walk through the dataset folder
    for root, dirs, files in os.walk(data_path):
        # Sorting (in place for the directories) makes the sample order, and
        # therefore the index-based image file names derived from it,
        # independent of the filesystem.
        dirs.sort()
        for file in sorted(files):
            if not file.endswith(".wav"):
                continue

            # The emotion code is the third dash-separated field; names that
            # do not follow that pattern are skipped instead of raising.
            parts = file.split('-')
            if len(parts) < 3:
                continue

            emotion = EMOTION_MAPPING.get(parts[2])
            if not emotion:
                continue

            # Load the audio file and keep the whole MFCC matrix
            audio, sr = librosa.load(os.path.join(root, file), sr=None)
            X.append(librosa.feature.mfcc(y=audio, sr=sr, n_mfcc=n_mfcc))
            y.append(emotion)

    return X, y

# Set the path to your RAVDESS dataset. The RAVDESS_DATA_DIR environment
# variable, when set, overrides the default location so the notebook can run
# on a machine other than the one it was written on.
data_path = os.environ.get(
    'RAVDESS_DATA_DIR', '/Users/harshith/noCloud/mlsp_project/RAVDESS')

# Load data with full MFCCs
X, y = load_ravdess_full_mfcc(data_path)

# X[0] below would raise a bare IndexError if nothing was found, so report
# the actual problem instead.
if not X:
    raise FileNotFoundError(f"No RAVDESS .wav files found under {data_path!r}")

# Display dataset information
print(f"Number of samples: {len(X)}")
print(f"Shape of an example MFCC matrix: {X[0].shape}")
print(f"Labels: {np.unique(y)}")


Number of samples: 2880
Shape of an example MFCC matrix: (40, 366)
Labels: ['angry' 'calm' 'disgust' 'fearful' 'happy' 'neutral' 'sad' 'surprised']


# Create Image Directory

In [15]:
import os
import librosa.display
import matplotlib.pyplot as plt
import numpy as np

def save_mfcc_images(X, y, output_dir, emotions, figsize=(5, 4), dpi=300):
    """
    Save MFCC features as images for use in a DCNN.

    One PNG per sample is written to ``output_dir/<label>/<index>.png``. All
    images share a single colour scale spanning the minimum and maximum MFCC
    value found anywhere in ``X``.

    Args:
        X (sequence): MFCC matrices, one 2-D array per sample.
        y (sequence): Label of each sample, aligned with X.
        output_dir (str): Directory to save the images.
        emotions (iterable): Emotion labels whose sub-folders are created up
            front (even if no sample carries that label).
        figsize (tuple): Figure size in inches passed to matplotlib.
        dpi (int): Resolution used when saving each figure.

    Raises:
        ValueError: If X and y do not have the same length.
    """
    if len(X) != len(y):
        raise ValueError(
            f"X and y must have the same length, got {len(X)} and {len(y)}")

    # Ensure output directory exists
    os.makedirs(output_dir, exist_ok=True)

    for emotion in emotions:
        os.makedirs(os.path.join(output_dir, emotion), exist_ok=True)

    # Nothing to draw; also avoids calling min()/max() on an empty sequence.
    if len(X) == 0:
        return

    # Determine global min and max across all MFCCs
    global_min = min(mfcc.min() for mfcc in X)
    global_max = max(mfcc.max() for mfcc in X)

    # Iterate over the dataset
    for i, (mfcc, label) in enumerate(zip(X, y)):
        fig = plt.figure(figsize=figsize)
        try:
            librosa.display.specshow(mfcc, cmap='viridis', vmin=global_min, vmax=global_max)
            plt.tight_layout()

            # A label missing from `emotions` still needs its folder.
            class_dir = os.path.join(output_dir, label)
            os.makedirs(class_dir, exist_ok=True)

            # Save the plot as an image
            plt.savefig(os.path.join(class_dir, f"{i}.png"), dpi=dpi)
        finally:
            # Always release the figure, even when plotting or saving fails,
            # so a long run does not accumulate open figures.
            plt.close(fig)

# Example usage
# `X` is the list of per-recording MFCC matrices returned by
# load_ravdess_full_mfcc (each shaped [n_mfcc, time_frames]) and `y` is the
# matching list of labels.

# Directory to save images. The RAVDESS_MFCC_IMAGE_DIR environment variable,
# when set, overrides the default location.
output_dir = os.environ.get(
    'RAVDESS_MFCC_IMAGE_DIR',
    '/Users/harshith/noCloud/mlsp_project/save_images_samCode')
emotions = np.unique(y)  # Unique emotion labels in the dataset
save_mfcc_images(X, y, output_dir, emotions)


# Load and Preprocess MFCC Images

In [None]:
import os

import tensorflow as tf
from tensorflow.keras.preprocessing import image_dataset_from_directory

# Define the path to the MFCC images directory. The RAVDESS_MFCC_IMAGE_DIR
# environment variable, when set, overrides the default location.
image_dir = os.environ.get(
    'RAVDESS_MFCC_IMAGE_DIR',
    '/Users/sambentlin/Documents/JHU/MLSP/Project/Code/RAVDESS_MFCC_plots_minmax_13')

# Load the dataset, splitting into training and test sets
batch_size = 32
img_size = (128, 128)  # Resize images to a fixed size for the CNN

# Both subsets are drawn from the same directory and must use the same split
# fraction and seed; sharing one options dict keeps the two calls in sync.
split_options = dict(
    validation_split=0.2,
    seed=42,
    image_size=img_size,
    batch_size=batch_size,
)

train_dataset = image_dataset_from_directory(
    image_dir, subset="training", **split_options)

test_dataset = image_dataset_from_directory(
    image_dir, subset="validation", **split_options)

# Display class names (emotions). Read them before .map(), which returns a
# plain dataset without the class_names attribute.
class_names = train_dataset.class_names
print(f"Classes: {class_names}")

# NORMALIZE THE DATA
# Rescale pixel values from [0, 255] to [0, 1]
normalization_layer = tf.keras.layers.Rescaling(1.0 / 255)

train_dataset = train_dataset.map(
    lambda images, labels: (normalization_layer(images), labels))
test_dataset = test_dataset.map(
    lambda images, labels: (normalization_layer(images), labels))

# Enable prefetching for performance
train_dataset = train_dataset.prefetch(buffer_size=tf.data.AUTOTUNE)
test_dataset = test_dataset.prefetch(buffer_size=tf.data.AUTOTUNE)

# Define the DCNN Model

In [None]:
from tensorflow.keras import layers, models

# Three convolution/pooling stages followed by a small dense classifier.
# The input shape is declared with an explicit Input object rather than an
# `input_shape` argument on the first layer, which current Keras versions
# warn against for Sequential models.
model = models.Sequential([
    layers.Input(shape=(img_size[0], img_size[1], 3)),  # RGB images
    layers.Conv2D(32, (3, 3), activation='relu'),
    layers.MaxPooling2D((2, 2)),
    layers.Conv2D(64, (3, 3), activation='relu'),
    layers.MaxPooling2D((2, 2)),
    layers.Conv2D(128, (3, 3), activation='relu'),
    layers.MaxPooling2D((2, 2)),
    layers.Flatten(),
    layers.Dense(128, activation='relu'),
    layers.Dropout(0.5),
    layers.Dense(len(class_names), activation='softmax')  # One output per class
])

# Compile the model. The sparse loss matches the integer class indices that
# the test cell reads back from the dataset labels.
model.compile(optimizer='adam',
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])

model.summary()

# Train the Model

In [None]:
# Fit on the training split only; the held-out split is deliberately not
# passed as validation data here.
epochs = 100
history = model.fit(train_dataset, epochs=epochs)

# Evaluate the Model on the Test Set

In [None]:
# Score the trained network on the held-out split and report both metrics
test_loss, test_accuracy = model.evaluate(test_dataset)
print(
    f"Test Accuracy: {test_accuracy:.4f}",
    f"Test Loss: {test_loss:.4f}",
    sep="\n",
)

# 40 epochs, yes min/max, no colorbar, 40 mfccs
# Test Accuracy: 0.6979

# 40 epochs, no min/max, yes colorbar, 40 mfccs
# Test Accuracy: 0.6076

# 40 epochs, yes min/max, no colorbar, 13 mfccs
# Test Accuracy: 0.6215
# Test Loss: 1.0762

# 40 epochs, yes min/max, no colorbar, 13 mfccs, 2x2 window
# Test Accuracy: 0.6736
# Test Loss: 1.1122

# 100 epochs, yes min/max, no colorbar, 40 mfccs
# Test Accuracy: 0.7639

# 100 epochs, yes min/max, no colorbar, 13 mfccs, 2x2 window
# Test Accuracy: 0.6875
# Test Loss: 1.5162


# Generate a Classification Report

In [None]:
import numpy as np
from sklearn.metrics import classification_report, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns

# Predict labels for the test dataset
y_pred = []
y_true = []

# Predictions and labels are collected from the same batch in the same
# iteration, so the two lists stay aligned.
for images, labels in test_dataset:
    # verbose=0 keeps Keras from printing one progress bar per batch
    preds = model.predict(images, verbose=0)
    y_pred.extend(np.argmax(preds, axis=1))
    y_true.extend(labels.numpy())

# Pin the label set to every known class. Without it, a class absent from
# both y_true and y_pred would leave fewer rows than names, breaking the
# target_names lookup and misaligning the heatmap tick labels.
label_ids = list(range(len(class_names)))

# Generate classification report
report = classification_report(
    y_true, y_pred, labels=label_ids, target_names=class_names
)
print("\nClassification Report:")
print(report)

# Generate confusion matrix
cm = confusion_matrix(y_true, y_pred, labels=label_ids)

# Plot confusion matrix
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(
    cm, annot=True, fmt='d', cmap='Blues',
    xticklabels=class_names, yticklabels=class_names,
    ax=ax
)
ax.set_xlabel('Predicted Label')
ax.set_ylabel('True Label')
ax.set_title('Confusion Matrix')
plt.show()
