Feature Extraction

In [None]:
# Number of Mel bands extracted per clip (the height of every spectrogram).
n_mels = 128

# Decode every training .wav exactly once, keeping its Mel-spectrogram and the
# emotion label taken from its filename. The spectrograms are cached so the
# audio does not have to be loaded and transformed a second time for padding.
spectrograms = []
y_train = []
for file in sorted(os.listdir(data_path)):
    if not file.endswith(".wav"):
        continue

    # Load the waveform and compute its Mel-spectrogram (n_mels x frames).
    audio_data, sample_rate = librosa.load(os.path.join(data_path, file))
    spectrograms.append(
        librosa.feature.melspectrogram(y=audio_data, sr=sample_rate, n_mels=n_mels)
    )

    # The label is the run of characters before the first digit in the
    # filename. If the name contains no digit at all, fall back to the whole
    # stem instead of indexing past the end of the string.
    stem = os.path.splitext(file)[0]
    first_digit = next((i for i, ch in enumerate(stem) if ch.isdigit()), len(stem))
    y_train.append(stem[:first_digit])

# Fail with a clear message rather than an obscure indexing error below.
if not spectrograms:
    raise ValueError(f"No .wav files found in {data_path!r}")

# Every clip is zero-padded on the time axis up to the longest clip so that
# the feature matrices can be stacked into one array.
max_time_length = max(spec.shape[1] for spec in spectrograms)
X_train = [
    np.pad(spec, pad_width=((0, 0), (0, max_time_length - spec.shape[1])), mode='constant')
    for spec in spectrograms
]
del spectrograms  # the unpadded copies are no longer needed

# Stack into (samples, n_mels, max_time_length, 1); the trailing axis is the
# single channel expected by the Conv2D input layer.
X_train = np.array(X_train)[:, :, :, np.newaxis]
y_train = np.array(y_train)

# Turn the emotion strings into integer class ids. The fitted encoder is kept
# around so that predicted ids can be mapped back to emotion names later.
encoder = LabelEncoder()
encoder.fit(y_train)
y_train_encoded = encoder.transform(y_train)

# Hold out 20% of the samples for validation. The split is stratified on the
# encoded labels and seeded so that it is reproducible.
X_train, X_val, y_train, y_val = train_test_split(
    X_train,
    y_train_encoded,
    test_size=0.2,
    random_state=42,
    stratify=y_train_encoded,
)

# Size the output layer from the fitted label encoder rather than from the
# labels that happen to be in the training split: the predicted class indices
# are later decoded with this same encoder, so the two must always agree.
num_classes = len(encoder.classes_)

# Two Conv2D/MaxPooling2D/Dropout stages followed by a small dense classifier.
# The input is a single-channel Mel-spectrogram of shape
# (n_mels, max_time_length, 1).
model = Sequential([
    Conv2D(32, (3, 3), activation='relu', input_shape=(n_mels, max_time_length, 1)),
    MaxPooling2D((2, 2)),
    Dropout(0.25),
    Conv2D(64, (3, 3), activation='relu'),
    MaxPooling2D((2, 2)),
    Dropout(0.25),
    Flatten(),
    Dense(64, activation='relu'),
    Dropout(0.5),
    Dense(num_classes, activation='softmax')
])

# Labels are integer class ids (not one-hot), hence the sparse loss.
model.compile(optimizer=Adam(), loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Callbacks: write a checkpoint to best_model.h5 (best-only), and stop early
# with a patience of 5 epochs. Neither callback names a metric to monitor, so
# both use the Keras default.
checkpoint = ModelCheckpoint("best_model.h5", save_best_only=True, verbose=1)
early_stopping = EarlyStopping(patience=5, restore_best_weights=True, verbose=1)

history = model.fit(
    X_train, y_train,
    validation_data=(X_val, y_val),
    epochs=40,
    batch_size=32,
    callbacks=[checkpoint, early_stopping]
)

# Load the checkpointed weights back into the in-memory model.
model.load_weights("best_model.h5")

Test: Predictions

In [None]:
# Load the model saved by the checkpoint callback during training.
model = tf.keras.models.load_model("best_model.h5")

# Build the test feature matrix with the same preprocessing as training.
file_names = []
X_test = []

for file in sorted(os.listdir(test_path)):
    if not file.endswith(".wav"):
        continue

    # Strip only the final extension so names containing extra dots stay
    # intact (and therefore unique) in the output.
    file_names.append(os.path.splitext(file)[0])

    audio_data, sample_rate = librosa.load(os.path.join(test_path, file))
    spectrogram = librosa.feature.melspectrogram(y=audio_data, sr=sample_rate, n_mels=n_mels)

    # The network input width is fixed at max_time_length frames. A test clip
    # longer than the longest training clip is cropped first; otherwise the
    # pad width would be negative and np.pad would raise.
    spectrogram = spectrogram[:, :max_time_length]
    pad_width = max_time_length - spectrogram.shape[1]
    spectrogram_padded = np.pad(spectrogram, ((0, 0), (0, pad_width)), mode='constant')
    X_test.append(spectrogram_padded)

# Stack into (samples, n_mels, max_time_length, 1) to match the model input.
X_test = np.array(X_test)[:, :, :, np.newaxis]

# Pick the most probable class index for every test clip.
y_pred = np.argmax(model.predict(X_test), axis=1)

# Convert the integer class indices back into emotion strings.
y_pred_emotions = encoder.inverse_transform(y_pred)

# Pair every file name with its predicted emotion: one row per clip.
predictions = np.column_stack((file_names, y_pred_emotions))

# Order the rows by file name (first column).
predictions = predictions[np.lexsort((predictions[:, 0],))]

# Write the header and the rows in a single pass. comments="" stops savetxt
# from prefixing the header line with "# ". Writing once also avoids
# re-opening the UTF-8 file with the platform default encoding just to
# prepend the header.
np.savetxt(
    "predictions.csv",
    predictions,
    fmt='%s',
    delimiter=",",
    header="filename,Label",
    comments="",
    encoding='utf-8',
)