# In [None]:
import numpy as np
import os
import tensorflow as tf
from tensorflow.keras import layers, models
from sklearn.model_selection import train_test_split
from scipy.io import wavfile
import matplotlib.pyplot as plt

# Dataset location and the spoken-command classes the model distinguishes.
DATA_DIR = 'path_to_dataset'  # Replace with your dataset path
LABELS = [
    'yes', 'no', 'up', 'down', 'left',
    'right', 'on', 'off', 'stop', 'go',
]

# Class name -> integer id; the id is the label's position in LABELS.
label_to_index = dict(zip(LABELS, range(len(LABELS))))

def load_data(data_dir, labels):
    """Load every ``.wav`` clip under ``data_dir/<label>/`` as normalised audio.

    Args:
        data_dir: Root directory containing one sub-folder per label.
        labels: Ordered sequence of label names. A clip's integer class id is
            the position of its label in this sequence.

    Returns:
        A tuple ``(X, y)``. ``y`` is an integer array of class ids. ``X`` holds
        the waveforms: a regular array when every clip has the same shape
        (including the empty case), otherwise a 1-D object array with one
        waveform per element.

    Raises:
        FileNotFoundError: If a label folder does not exist (raised by
            ``os.listdir``).
    """
    X = []
    y = []
    for index, label in enumerate(labels):
        folder = os.path.join(data_dir, label)
        # os.listdir returns entries in arbitrary order; sorting keeps the
        # sample order (and any seeded split made from it) reproducible.
        for file_name in sorted(os.listdir(folder)):
            if not file_name.endswith('.wav'):
                continue
            _, samples = wavfile.read(os.path.join(folder, file_name))
            dtype = samples.dtype
            if dtype == np.uint8:
                # 8-bit PCM is stored offset-binary: 128 represents silence.
                samples = (samples.astype(np.float64) - 128.0) / 128.0
            elif np.issubdtype(dtype, np.integer):
                # Signed PCM: scale by the magnitude of the most negative
                # value (32768 for 16-bit audio).
                samples = samples / float(-int(np.iinfo(dtype).min))
            # Floating-point WAV data is already in [-1.0, 1.0]; keep as is.
            X.append(samples)
            y.append(index)

    if len({clip.shape for clip in X}) <= 1:
        features = np.array(X)
    else:
        # Clips differ in length. np.array() on ragged input raises
        # ValueError on NumPy >= 1.24, so fill an object array explicitly.
        features = np.empty(len(X), dtype=object)
        for i, clip in enumerate(X):
            features[i] = clip
    return features, np.array(y)

# Read every labelled clip from disk together with its integer class id.
X, y = load_data(data_dir=DATA_DIR, labels=LABELS)

# Fixed clip length that all audio is padded or trimmed to:
# one second of audio at a 16 kHz sample rate.
MAX_LEN = 16_000

def pad_or_trim(data, max_len):
    """Force every sequence in ``data`` to exactly ``max_len`` entries.

    Sequences longer than ``max_len`` keep only their first ``max_len``
    entries; all others are extended at the end with zeros.

    Args:
        data: Iterable of sequences (arrays or lists).
        max_len: Target length of each sequence.

    Returns:
        A NumPy array built from the adjusted sequences, one per row.
    """
    def _fit(clip):
        shortfall = max_len - len(clip)
        if shortfall < 0:
            return clip[:max_len]
        return np.pad(clip, (0, shortfall), mode='constant')

    return np.array([_fit(clip) for clip in data])

# Bring every clip to exactly MAX_LEN samples.
X_padded = pad_or_trim(X, MAX_LEN)

# Two-stage split: hold out 20% of the data for testing, then hold out 20%
# of what remains for validation. Both stages use the same seed.
split_options = {'test_size': 0.2, 'random_state': 42}
X_train, X_test, y_train, y_test = train_test_split(X_padded, y, **split_options)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, **split_options)

# Add a trailing channel axis for the CNN input: (samples, 16000, 1)
cnn_shape = (-1, MAX_LEN, 1)
X_train, X_val, X_test = [
    split.reshape(cnn_shape) for split in (X_train, X_val, X_test)
]

# Define the CNN model: four 1-D convolutions with growing filter counts and
# shrinking kernels, max-pooling between them, then global average pooling
# and a dense classifier head with one softmax output per label.
model = models.Sequential([
    # Declare the input with an explicit Input layer; passing `input_shape=`
    # to the first layer is deprecated in current Keras and emits a warning.
    layers.Input(shape=(MAX_LEN, 1)),
    layers.Conv1D(16, kernel_size=13, activation='relu'),
    layers.MaxPooling1D(pool_size=2),
    layers.Conv1D(32, kernel_size=11, activation='relu'),
    layers.MaxPooling1D(pool_size=2),
    layers.Conv1D(64, kernel_size=9, activation='relu'),
    layers.MaxPooling1D(pool_size=2),
    layers.Conv1D(128, kernel_size=7, activation='relu'),
    layers.GlobalAveragePooling1D(),
    layers.Dense(128, activation='relu'),
    layers.Dense(len(LABELS), activation='softmax')
])

# Labels are integer class ids, hence the sparse cross-entropy loss.
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
model.summary()

# Fit on the training split and track the validation split after each epoch.
history = model.fit(
    x=X_train,
    y=y_train,
    batch_size=32,
    epochs=10,
    validation_data=(X_val, y_val),
)

# Measure loss and accuracy on the held-out test split.
test_loss, test_acc = model.evaluate(x=X_test, y=y_test, verbose=2)
print(f'Test Accuracy: {test_acc:.2f}')

# Plot training and validation curves side by side: accuracy on the left,
# loss on the right.
plt.figure(figsize=(12, 4))

# One row per panel: (train key, validation key, train legend,
# validation legend, y-axis label).
panels = (
    ('accuracy', 'val_accuracy', 'Training Accuracy', 'Validation Accuracy', 'Accuracy'),
    ('loss', 'val_loss', 'Training Loss', 'Validation Loss', 'Loss'),
)
for position, (train_key, val_key, train_legend, val_legend, y_label) in enumerate(panels, start=1):
    plt.subplot(1, 2, position)
    plt.plot(history.history[train_key], label=train_legend)
    plt.plot(history.history[val_key], label=val_legend)
    plt.xlabel('Epochs')
    plt.ylabel(y_label)
    plt.legend()

plt.show()

# Testing the model with a single audio sample (from test set)
def predict_audio(model, audio_sample):
    """Return the predicted label name for a single audio clip.

    Args:
        model: Object exposing ``predict``; it is called with one batch of
            shape ``(1, MAX_LEN, 1)``.
        audio_sample: One mono waveform, either 1-D or carrying singleton
            extra axes such as ``(n, 1)``.

    Returns:
        The entry of ``LABELS`` at the position of the highest prediction
        score.
    """
    # Flatten first: pad_or_trim pads with a single (before, after) pair,
    # which NumPy applies to every axis, so a short (n, 1) clip would grow
    # along both axes and the reshape below would fail.
    waveform = np.asarray(audio_sample).reshape(-1)
    batch = pad_or_trim([waveform], MAX_LEN).reshape(1, MAX_LEN, 1)
    prediction = model.predict(batch)
    return LABELS[np.argmax(prediction)]

# Spot-check the model: pick one test clip at random and compare the
# predicted label with the true one.
random_index = np.random.randint(len(X_test))
sample, actual_label = X_test[random_index], LABELS[y_test[random_index]]

predicted_label = predict_audio(model, sample)
print(f"Actual: {actual_label}, Predicted: {predicted_label}")