In [3]:
import os
import librosa
import numpy as np
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.utils import to_categorical

# Set path to your audio dataset
data_path = '/content/drive/MyDrive/speech_data/speech_data/'

# Parameters for loading and processing audio
SAMPLE_RATE = 22050
DURATION = 2  # seconds
N_MFCC = 13

# Helper function to extract features from audio files
def extract_features(file_path, sample_rate=SAMPLE_RATE, duration=DURATION, n_mfcc=N_MFCC):
    audio, _ = librosa.load(file_path, sr=sample_rate, duration=duration)
    mfccs = librosa.feature.mfcc(y=audio, sr=sample_rate, n_mfcc=n_mfcc)
    return np.mean(mfccs.T, axis=0)

# Prepare dataset and labels
X = []
y = []

# Loop over all classes (folders)
for label in os.listdir(data_path):
    class_dir = os.path.join(data_path, label)
    if os.path.isdir(class_dir):
        # Loop over all audio files in class folder
        for file in os.listdir(class_dir):
            if file.endswith('.wav'):
                file_path = os.path.join(class_dir, file)
                features = extract_features(file_path)
                X.append(features)
                y.append(label)

# Convert lists to numpy arrays
X = np.array(X)
y = np.array(y)

# Encode labels to numeric values
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)
y_categorical = to_categorical(y_encoded)


In [5]:
# Corrected number of output neurons to match the number of classes
num_classes = len(np.unique(y_encoded))  # Should be 35 in your case

# Define the model architecture
model = Sequential([
    Dense(256, activation='relu', input_shape=(N_MFCC,)),
    Dropout(0.3),
    Dense(128, activation='relu'),
    Dropout(0.3),
    Dense(num_classes, activation='softmax')  # Set output to 35 classes
])

# Compile the model
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Train the model
history = model.fit(X, y_categorical, epochs=30, batch_size=32, validation_split=0.2)


Epoch 1/30
[1m26/26[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 13ms/step - accuracy: 0.0393 - loss: 74.6269 - val_accuracy: 0.0000e+00 - val_loss: 5.1390
Epoch 2/30
[1m26/26[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.0302 - loss: 13.9594 - val_accuracy: 0.0000e+00 - val_loss: 3.5808
Epoch 3/30
[1m26/26[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - accuracy: 0.0345 - loss: 4.2178 - val_accuracy: 0.0000e+00 - val_loss: 3.6073
Epoch 4/30
[1m26/26[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - accuracy: 0.0332 - loss: 3.7113 - val_accuracy: 0.0000e+00 - val_loss: 3.6338
Epoch 5/30
[1m26/26[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - accuracy: 0.0390 - loss: 3.5811 - val_accuracy: 0.0000e+00 - val_loss: 3.6597
Epoch 6/30
[1m26/26[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.0400 - loss: 3.7454 - val_accuracy: 0.0000e+00 - val_loss: 3.6843
Epoch 7/30
[

In [6]:
# Evaluate the model
evaluation = model.evaluate(X, y_categorical)
print(f"Test Loss: {evaluation[0]}, Test Accuracy: {evaluation[1]}")

# Save the model for future use
model.save('fine_tuned_model.h5')


[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.0873 - loss: 3.4353     




Test Loss: 3.5790865421295166, Test Accuracy: 0.03465346619486809
