# Prediction of the Best Neural Network (Continuation of the Project, refer to "Genre Classification of Audio Signal Using Neural Networks")

### Libraries

In [48]:
# Data Manipulation
import os
import numpy as np
import pandas as pd

# Audio Processing
import librosa
import librosa.display
from pydub import AudioSegment
import IPython.display as ipd  # For displaying audio in Jupyter Notebooks

# Data Visualization
import matplotlib.pyplot as plt

# Machine Learning & Neural Networks
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Activation, BatchNormalization, Flatten, Conv1D, MaxPooling1D
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau
from tensorflow.keras.utils import to_categorical
from tensorflow import keras
from tensorflow.keras import layers
from keras import regularizers
from keras import models

# Preprocessing & Model Evaluation
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import precision_score, recall_score, f1_score

### Data Import

In [2]:
# Path to the audio folders
base_folder_path = r'C:\Users\manue\Desktop\DataScience\Audio\genres_original'
folders = ['Blues', 'classical', 'country', 'disco', 'hiphop', 'jazz', 'metal', 'pop', 'reggae', 'rock']

In [3]:
# Empty list to hold audio data information
audio_data = []

In [4]:
# Loop through each folder and load audio files
for folder in folders:
    folder_path = os.path.join(base_folder_path, folder)
    
    # Get all audio files in the folder
    audio_files = [f for f in os.listdir(folder_path) if f.endswith(('.wav', '.mp3'))]
    
    for audio_file in audio_files:
        audio_path = os.path.join(folder_path, audio_file)
        
        try:
            # Load the audio file
            y, sr = librosa.load(audio_path, sr=None)
            
            # Append relevant information to the list
            audio_data.append({
                'file_name': audio_file,
                'folder': folder
            })
        
        except Exception as e:
            print(f"Error loading {audio_path}: {e}")

Error loading C:\Users\manue\Desktop\DataScience\Audio\genres_original\jazz\jazz.00054.wav: 


In [5]:
# The df is correctly showing all the audio files as well as the category to which they belong
df = pd.DataFrame(audio_data)
df.head()

Unnamed: 0,file_name,folder
0,blues.00000.wav,Blues
1,blues.00001.wav,Blues
2,blues.00002.wav,Blues
3,blues.00003.wav,Blues
4,blues.00004.wav,Blues


### Data Preparation

In [6]:
# Function to extract features (e.g., MFCC) from the audio file
def features_extractor(file_path):
    y, sr = librosa.load(file_path, sr=None)  # audio file
    mfccs = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=40)  # MFCC features
    return np.mean(mfccs.T, axis=0)  # Return the mean of the MFCC features

# empty list to store the extracted features
extracted_features = []

# Iterate through each row in df
for index_num, row in tqdm(df.iterrows()):
    # file path
    file_name = os.path.join(os.path.abspath(base_folder_path), row["folder"], row["file_name"])
    
    # folder as the label 
    final_class_labels = row["folder"]
    
    # features extraction from the audio file
    data = features_extractor(file_name)
    
    # Append the extracted features and corresponding label to the list
    extracted_features.append([data, final_class_labels])

# list of extracted features is converted to a DataFrame for easier analysis
extracted_features_df = pd.DataFrame(extracted_features, columns=['features', 'label'])

extracted_features_df.head()

999it [01:14, 13.45it/s]


Unnamed: 0,features,label
0,"[-113.59882, 121.57067, -19.162262, 42.36394, ...",Blues
1,"[-207.52383, 123.98514, 8.947019, 35.86715, 2....",Blues
2,"[-90.757164, 140.44087, -29.084547, 31.686693,...",Blues
3,"[-199.57513, 150.0861, 5.663404, 26.855278, 1....",Blues
4,"[-160.35417, 126.20948, -35.581394, 22.139256,...",Blues


In [7]:
# Randomized rows
randomized_df = extracted_features_df.sample(frac=1, random_state=42).reset_index(drop=True)
randomized_df.head()

Unnamed: 0,features,label
0,"[-144.9216, 66.675064, 44.269436, 22.51216, 6....",hiphop
1,"[-42.241444, 82.16851, -2.2273612, 19.91206, 1...",pop
2,"[12.145776, 58.997505, 0.9395953, 26.743252, 9...",country
3,"[-68.92688, 63.117622, -15.855327, 28.002327, ...",disco
4,"[-204.71895, 101.62684, 26.381647, 9.190344, 1...",pop


In [8]:
### dataset split into X and Y
X=np.array(randomized_df['features'].tolist())
y=np.array(randomized_df['label'].tolist())

In [9]:
y = np.array(pd.get_dummies(y))

In [10]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.25,random_state=98)

###  Best Model

In [17]:
epochs1=100

#  model building
model = Sequential()

# First layer
model.add(Dense(128, input_shape=(40,)))  # Increased units
model.add(BatchNormalization())  # Batch normalization
model.add(Activation('relu'))
model.add(Dropout(0.5))

# Second layer
model.add(Dense(256))  # Increased units
model.add(BatchNormalization())  # Batch normalization
model.add(Activation('relu'))
model.add(Dropout(0.5))

# Third layer
model.add(Dense(128))  # Increased units
model.add(BatchNormalization())  # Batch normalization
model.add(Activation('relu'))
model.add(Dropout(0.5))

# Final layer
model.add(Dense(10))  # Adjusted to match the output classes
model.add(Activation('softmax'))  # softmax for multi-class classification

# Compile the model
model.compile(optimizer='adam',
              loss='categorical_crossentropy',  # categorical crossentropy for one-hot encoded labels
              metrics=['accuracy'])

# callbacks
early_stopping = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)

# model training
history = model.fit(X_train, y_train,
                    batch_size=32,
                    epochs=epochs1,
                    validation_data=(X_test, y_test),
                    callbacks=[early_stopping],
                    verbose=1)

Epoch 1/100
[1m24/24[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 23ms/step - accuracy: 0.1362 - loss: 2.7482 - val_accuracy: 0.0800 - val_loss: 5.2836
Epoch 2/100
[1m24/24[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - accuracy: 0.2311 - loss: 2.2089 - val_accuracy: 0.1320 - val_loss: 3.8112
Epoch 3/100
[1m24/24[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - accuracy: 0.2716 - loss: 2.1227 - val_accuracy: 0.1840 - val_loss: 3.0135
Epoch 4/100
[1m24/24[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - accuracy: 0.3204 - loss: 2.0142 - val_accuracy: 0.2320 - val_loss: 2.4134
Epoch 5/100
[1m24/24[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - accuracy: 0.3558 - loss: 1.9997 - val_accuracy: 0.2520 - val_loss: 2.0126
Epoch 6/100
[1m24/24[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - accuracy: 0.3134 - loss: 1.9915 - val_accuracy: 0.2840 - val_loss: 1.8668
Epoch 7/100
[1m24/24[0m [32m━━

In [18]:
model.summary()

In [19]:
test_accuracy2=model.evaluate(X_test,y_test,verbose=0)
print(test_accuracy2[1])

0.6439999938011169


### Predictions

In [38]:
# class-to-label mapping
class_labels = {0: 'Blues', 1: 'Classical', 2: 'Country', 3: 'Disco', 4: 'Hip Hop', 
                5: 'Jazz', 6: 'Metal', 7: 'Pop', 8: 'Reggae', 9: 'Rock'}

In [39]:
# predictions
predictions = model.predict(X_test)

[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step 


In [40]:
# predicted classes (indices of max probability in each prediction)
predicted_classes = predictions.argmax(axis=1)

In [41]:
# Get actual classes (if y_test is one-hot encoded, convert it)
actual_classes = y_test.argmax(axis=1) if y_test.ndim > 1 else y_test

In [42]:
comparison_df = pd.DataFrame({
    'Actual Class': actual_classes,
    'Predicted Class': predicted_classes,
    'Highest Prediction': predictions.max(axis=1)  # Get highest prediction probability
})

In [43]:
comparison_df['Label'] = comparison_df['Actual Class'].map(class_labels)

In [44]:
print("Actual vs Predicted Classes with Highest Prediction Probabilities:")
comparison_df.head(40)

Actual vs Predicted Classes with Highest Prediction Probabilities:


Unnamed: 0,Actual Class,Predicted Class,Highest Prediction,Label
0,8,2,0.519263,Reggae
1,9,3,0.736338,Rock
2,1,1,0.92312,Classical
3,9,2,0.477087,Rock
4,4,4,0.883821,Hip Hop
5,5,5,0.898182,Jazz
6,8,8,0.791531,Reggae
7,1,1,0.99196,Classical
8,8,4,0.332139,Reggae
9,3,3,0.496104,Disco


### Accuracy by class

In [49]:
# accuracy by class
accuracy_by_class = comparison_df.groupby('Actual Class').apply(
    lambda x: (x['Actual Class'] == x['Predicted Class']).sum() / len(x)
)

# Convert to a DataFrame and add labels
accuracy_df = accuracy_by_class.reset_index(name='Accuracy')
accuracy_df['Label'] = accuracy_df['Actual Class'].map(class_labels)
accuracy_df['Accuracy'] = (accuracy_df['Accuracy'] * 100).round(2)  # Convert to %
accuracy_df = accuracy_df[['Actual Class', 'Label', 'Accuracy']]

print("Accuracy by class with labels (in %):")
accuracy_df.head(10)

Accuracy by class with labels (in %):


Unnamed: 0,Actual Class,Label,Accuracy
0,0,Blues,61.9
1,1,Classical,100.0
2,2,Country,50.0
3,3,Disco,56.52
4,4,Hip Hop,68.0
5,5,Jazz,66.67
6,6,Metal,83.33
7,7,Pop,84.21
8,8,Reggae,61.76
9,9,Rock,29.63


### Conclusion

It is really interesting to see how much the accuracy varies among all the diferent genders, specially considering that the data is totally balanced. (there are 100 audio samples for each genre).

It catches my attention that classical reaches a 100% of accuracy, which might be attributed to its distinctive melodies. This uniqueness could also explain the high accuracy observed for metal. . I would have the accuracy for Pop to be lower, and for Hip Hop or Reggae to be higher. 

It should be mention that I have no background knowledge of music, as I do not play any instruments, and I listen to it just as a hobby. 

The accuracy in Rock is so low, that if it was left out, the overall accuracy would increased by 6%.