In [2]:
# Install required libraries
!pip install -U tensorflow tensorflow_datasets librosa seaborn

# Import necessary modules
import os
import pathlib
import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt
import seaborn as sns
import librosa.display
from tensorflow.keras import layers, models
from IPython.display import Audio

# Set random seed for reproducibility
seed = 42
tf.random.set_seed(seed)
np.random.seed(seed)

# Ensure the dataset path is correct
DATASET_PATH = pathlib.Path("data/mini_speech_commands")

# Check and download dataset if not available
if not DATASET_PATH.exists():
    zip_path = tf.keras.utils.get_file(
        "mini_speech_commands.zip",
        origin="http://storage.googleapis.com/download.tensorflow.org/data/mini_speech_commands.zip",
        extract=True,
        cache_dir=".",  # Extract inside ./data/
        cache_subdir="data"
    )

# Set correct extracted dataset directory
data_dir = pathlib.Path("data/mini_speech_commands")

# Confirm dataset extraction
if not data_dir.exists():
    raise FileNotFoundError("Dataset extraction failed. Please check manually.")
else:
    print("Dataset successfully extracted.")

# List available commands
commands = np.array(tf.io.gfile.listdir(str(data_dir)))
commands = commands[(commands != 'README.md') & (commands != '.DS_Store')]
print("Commands:", commands)

# Load dataset using audio_dataset_from_directory
BATCH_SIZE = 32
AUTOTUNE = tf.data.AUTOTUNE

train_ds, val_ds = tf.keras.utils.audio_dataset_from_directory(
    directory=data_dir,
    batch_size=BATCH_SIZE,
    validation_split=0.2,
    seed=0,
    output_sequence_length=16000,
    subset="both"
)

# Check dataset shape
for audio, labels in train_ds.take(1):
    print(f"Audio shape: {audio.shape}, Labels shape: {labels.shape}")

# Convert waveforms into spectrograms
def get_spectrogram(waveform):
    spectrogram = tf.signal.stft(waveform, frame_length=255, frame_step=128)
    spectrogram = tf.abs(spectrogram)
    return spectrogram

def preprocess_dataset(dataset):
    return dataset.map(lambda x, y: (get_spectrogram(x), y), num_parallel_calls=AUTOTUNE)

train_ds = preprocess_dataset(train_ds)
val_ds = preprocess_dataset(val_ds)

# Build CNN model for speech recognition
model = models.Sequential([
    layers.Input(shape=(None, 129)),
    layers.Reshape((-1, 129, 1)),
    layers.Conv2D(32, (3, 3), activation='relu'),
    layers.MaxPooling2D((2, 2)),
    layers.Conv2D(64, (3, 3), activation='relu'),
    layers.MaxPooling2D((2, 2)),
    layers.Flatten(),
    layers.Dense(128, activation='relu'),
    layers.Dropout(0.3),
    layers.Dense(len(commands), activation='softmax')
])

model.compile(optimizer="adam",
              loss="sparse_categorical_crossentropy",
              metrics=["accuracy"])

model.summary()

# Train the model
EPOCHS = 10
history = model.fit(train_ds, validation_data=val_ds, epochs=EPOCHS)

# Plot training accuracy and loss
plt.figure(figsize=(12, 4))
plt.subplot(1, 2, 1)
plt.plot(history.history['accuracy'], label="Train Accuracy")
plt.plot(history.history['val_accuracy'], label="Val Accuracy")
plt.xlabel("Epochs")
plt.ylabel("Accuracy")
plt.legend()

plt.subplot(1, 2, 2)
plt.plot(history.history['loss'], label="Train Loss")
plt.plot(history.history['val_loss'], label="Val Loss")
plt.xlabel("Epochs")
plt.ylabel("Loss")
plt.legend()
plt.show()

# Function to test the model on a sample audio
def predict_sample(index=0):
    sample = next(iter(val_ds))
    audio_sample = sample[0][index]
    label = sample[1][index].numpy()

    prediction = model.predict(tf.expand_dims(audio_sample, axis=0))
    predicted_label = commands[np.argmax(prediction)]

    print(f"Actual: {commands[label]}")
    print(f"Predicted: {predicted_label}")

    # Play the audio sample
    return Audio(tf.squeeze(audio_sample), rate=16000)

# Test the model with a random sample
predict_sample()

# Save the trained model
model.save("speech_command_model.h5")

# Load the model later if needed
# model = tf.keras.models.load_model("speech_command_model.h5")




FileNotFoundError: Dataset extraction failed. Please check manually.

In [3]:
import os

# Check dataset storage location
!ls -lh ./data/
!ls -lh ~/.keras/datasets/


total 174M
drwxr-xr-x 4 root root 4.0K Feb 19 08:03 mini_speech_commands_extracted
-rw-r--r-- 1 root root 174M Feb 19 08:03 mini_speech_commands.zip
ls: cannot access '/root/.keras/datasets/': No such file or directory


In [4]:
import tensorflow as tf
import pathlib
import numpy as np
import os

# Correct dataset path
DATASET_PATH = "data/mini_speech_commands_extracted"

# Verify dataset extraction
data_dir = pathlib.Path(DATASET_PATH)

if not data_dir.exists():
    raise FileNotFoundError(f"Dataset extraction failed. Expected at: {DATASET_PATH}")
else:
    print("✅ Dataset successfully found!")

# List available commands
commands = np.array(tf.io.gfile.listdir(str(data_dir)))
commands = commands[(commands != 'README.md') & (commands != '.DS_Store')]
print("Commands:", commands)


✅ Dataset successfully found!
Commands: ['__MACOSX' 'mini_speech_commands']


In [5]:
# Correct dataset path
DATASET_PATH = "data/mini_speech_commands_extracted/mini_speech_commands"

# Verify dataset extraction
data_dir = pathlib.Path(DATASET_PATH)

if not data_dir.exists():
    raise FileNotFoundError(f"Dataset extraction failed. Expected at: {DATASET_PATH}")
else:
    print("✅ Final dataset path set correctly!")

# List available commands
commands = np.array(tf.io.gfile.listdir(str(data_dir)))
commands = commands[(commands != 'README.md') & (commands != '.DS_Store')]
print("Commands:", commands)


✅ Final dataset path set correctly!
Commands: ['right' 'down' 'yes' 'stop' 'no' 'up' 'go' 'left']
