In [9]:
import os
import numpy as np
import pathlib
import tensorflow as tf
import random
from collections import defaultdict
import matplotlib.pyplot as plt
from sklearn.metrics import classification_report, confusion_matrix
import seaborn as sns

random.seed(42)
np.random.seed(42)
tf.random.set_seed(42)

In [10]:
# ==== CONFIGURATION ====
DATASET_PATH = pathlib.Path('/content/drive/MyDrive/speech_commands_subset')
COMMANDS = ['up', 'down', 'left', 'right']
BATCH_SIZE = 32
SHUFFLE_BUFFER_SIZE = 1000
EPOCHS = 15
MAX_FILES_PER_COMMAND = 2000  # Limit to prevent memory issues
SAMPLE_RATE = 16000
DURATION = 1.0  # 1 second audio clips

print("Configuration set up successfully!")
print(f"Commands: {COMMANDS}")

Configuration set up successfully!
Commands: ['up', 'down', 'left', 'right']


In [11]:
# ==== GPU CHECK ====
gpu_info = !nvidia-smi
gpu_info = '\n'.join(gpu_info)
if gpu_info.find('failed') >= 0:
    print('Not connected to a GPU')
else:
    print('GPU detected:')
    print(gpu_info)

GPU detected:
Wed Sep  3 00:46:43 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.54.15              Driver Version: 550.54.15      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  NVIDIA A100-SXM4-40GB          Off |   00000000:00:04.0 Off |                    0 |
| N/A   33C    P0             48W /  400W |       0MiB /  40960MiB |      0%      Default |
|                                         |                        |             Disabled |
+-----------------------------------------+------------------------+----------------------+
                                  

In [12]:
# ==== DATA LOADING AND PREPROCESSING ====
def load_and_prepare_data():
    """Load and prepare the speech command dataset"""

    # Find all WAV files
    all_audio_paths = list(DATASET_PATH.glob('*/*.wav'))
    print(f"Total .wav files found: {len(all_audio_paths)}")

    # Group files by command
    command_to_files = defaultdict(list)
    for p in all_audio_paths:
        label = p.parent.name
        if label in COMMANDS:
            command_to_files[label].append(p)

    # Limit files per command and split into train/val/test
    train_files, val_files, test_files = [], [], []

    for command in COMMANDS:
        files = command_to_files[command]
        if len(files) > MAX_FILES_PER_COMMAND:
            files = random.sample(files, MAX_FILES_PER_COMMAND)

        random.shuffle(files)
        train_split = int(0.7 * len(files))  # 70% train
        val_split = int(0.85 * len(files))   # 15% validation

        train_files.extend(files[:train_split])
        val_files.extend(files[train_split:val_split])
        test_files.extend(files[val_split:])

    print(f"Train: {len(train_files)}, Val: {len(val_files)}, Test: {len(test_files)}")

    return train_files, val_files, test_files

train_files, val_files, test_files = load_and_prepare_data()

Total .wav files found: 9479
Train: 5600, Val: 1200, Test: 1200


In [13]:
# ==== AUDIO PROCESSING FUNCTIONS ====
def decode_audio(audio_binary):
    """Decode WAV file to audio tensor"""
    audio, _ = tf.audio.decode_wav(audio_binary)
    return tf.squeeze(audio, axis=1)

def get_label(file_path):
    """Extract label from file path"""
    parts = tf.strings.split(file_path, os.path.sep)
    return parts[-2]

def get_waveform_and_label(file_path):
    """Load waveform and label from file path"""
    label = get_label(file_path)
    audio_binary = tf.io.read_file(file_path)
    waveform = decode_audio(audio_binary)
    return waveform, label

def preprocess_audio(waveform):
    """Preprocess audio: normalize, ensure correct length"""
    # Normalize audio
    waveform = waveform / tf.reduce_max(tf.abs(waveform))

    # Ensure exactly 16000 samples (1 second at 16kHz)
    waveform = waveform[:16000]
    padding = tf.maximum(16000 - tf.shape(waveform)[0], 0)
    waveform = tf.pad(waveform, paddings=[[0, padding]], mode='CONSTANT')

    return waveform

def get_spectrogram(waveform):
    """Convert waveform to spectrogram"""
    # Compute STFT
    spectrogram = tf.signal.stft(waveform, frame_length=255, frame_step=128)
    spectrogram = tf.abs(spectrogram)

    # Add channel dimension
    spectrogram = tf.expand_dims(spectrogram, -1)

    return spectrogram

def preprocess_dataset(waveform, label):
    """Full preprocessing pipeline"""
    waveform = preprocess_audio(waveform)
    spectrogram = get_spectrogram(waveform)
    label_id = tf.argmax(label == COMMANDS)
    return spectrogram, label_id


In [16]:
# ==== CREATE TENSORFLOW DATASETS ====
def create_dataset(file_paths, shuffle=False):
    """Create TensorFlow dataset from file paths"""
    ds = tf.data.Dataset.from_tensor_slices([str(p) for p in file_paths])
    ds = ds.map(get_waveform_and_label, num_parallel_calls=tf.data.AUTOTUNE)
    ds = ds.map(preprocess_dataset, num_parallel_calls=tf.data.AUTOTUNE)

    if shuffle:
        ds = ds.shuffle(SHUFFLE_BUFFER_SIZE)

    return ds

# Create datasets
train_ds = create_dataset(train_files, shuffle=True).batch(BATCH_SIZE).cache().prefetch(tf.data.AUTOTUNE)
val_ds = create_dataset(val_files).batch(BATCH_SIZE).cache().prefetch(tf.data.AUTOTUNE)
test_ds = create_dataset(test_files).batch(BATCH_SIZE).cache().prefetch(tf.data.AUTOTUNE)

# Get input shape from first batch
for spectrogram, label in train_ds.take(1):
    input_shape = spectrogram.shape[1:]
print(f"Input shape: {input_shape}")

# ==== BUILD MODEL ====
def create_model(input_shape, num_classes):
    """Create CNN model for speech recognition"""
    model = tf.keras.Sequential([
        tf.keras.layers.Input(shape=input_shape),

        # First conv block
        tf.keras.layers.Conv2D(32, (3, 3), activation='relu'),
        tf.keras.layers.BatchNormalization(),
        tf.keras.layers.MaxPooling2D((2, 2)),
        tf.keras.layers.Dropout(0.25),

        # Second conv block
        tf.keras.layers.Conv2D(64, (3, 3), activation='relu'),
        tf.keras.layers.BatchNormalization(),
        tf.keras.layers.MaxPooling2D((2, 2)),
        tf.keras.layers.Dropout(0.25),

        # Third conv block
        tf.keras.layers.Conv2D(128, (3, 3), activation='relu'),
        tf.keras.layers.BatchNormalization(),
        tf.keras.layers.MaxPooling2D((2, 2)),
        tf.keras.layers.Dropout(0.25),

        # Reshape before flattening
        tf.keras.layers.Reshape((-1, input_shape[0] // 8 * input_shape[1] // 8 * 128)), # Calculate flattened size based on conv/pool layers

        # Flatten and dense layers
        tf.keras.layers.Flatten(),
        tf.keras.layers.Dense(256, activation='relu'),
        tf.keras.layers.BatchNormalization(),
        tf.keras.layers.Dropout(0.5),

        tf.keras.layers.Dense(128, activation='relu'),
        tf.keras.layers.Dropout(0.3),

        tf.keras.layers.Dense(num_classes, activation='softmax')
    ])

    return model

Input shape: (124, 129, 1)


In [18]:
# Create and compile model
model = create_model(input_shape, len(COMMANDS))
model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy']
)

model.summary()

# ==== TRAINING CALLBACKS ====
callbacks = [
    tf.keras.callbacks.EarlyStopping(
        monitor='val_loss',
        patience=5,
        restore_best_weights=True
    ),
    tf.keras.callbacks.ReduceLROnPlateau(
        monitor='val_loss',
        factor=0.2,
        patience=3,
        min_lr=1e-7
    )
]

# ==== TRAIN MODEL ====
print("Starting training...")
history = model.fit(
    train_ds,
    validation_data=val_ds,
    epochs=EPOCHS,
    callbacks=callbacks,
    verbose=1
)

ValueError: The total size of the tensor must be unchanged, however, the input size cannot by divided by the specified dimensions in target_shape. Received: input_shape=(13, 14, 128), target_shape=(-1, 30848)

In [9]:
# ==== EVALUATE MODEL ====
print("Evaluating model...")
test_loss, test_accuracy = model.evaluate(test_ds)
print(f"Test Accuracy: {test_accuracy:.4f}")
print(f"Test Loss: {test_loss:.4f}")

# ==== PLOT TRAINING HISTORY ====
plt.figure(figsize=(12, 4))

plt.subplot(1, 2, 1)
plt.plot(history.history['accuracy'], label='Training Accuracy')
plt.plot(history.history['val_accuracy'], label='Validation Accuracy')
plt.title('Model Accuracy')
plt.xlabel('Epoch')
plt.ylabel('Accuracy')
plt.legend()

plt.subplot(1, 2, 2)
plt.plot(history.history['loss'], label='Training Loss')
plt.plot(history.history['val_loss'], label='Validation Loss')
plt.title('Model Loss')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.legend()

plt.tight_layout()
plt.show()

Evaluating model...


InvalidArgumentError: Graph execution error:

Detected at node sequential_1/flatten_1/Reshape defined at (most recent call last):
<stack traces unavailable>
only one input size may be -1, not both 0 and 1

Stack trace for op definition: 
File "<frozen runpy>", line 198, in _run_module_as_main
File "<frozen runpy>", line 88, in _run_code
File "/usr/local/lib/python3.12/dist-packages/colab_kernel_launcher.py", line 37, in <module>
File "/usr/local/lib/python3.12/dist-packages/traitlets/config/application.py", line 992, in launch_instance
File "/usr/local/lib/python3.12/dist-packages/ipykernel/kernelapp.py", line 712, in start
File "/usr/local/lib/python3.12/dist-packages/tornado/platform/asyncio.py", line 205, in start
File "/usr/lib/python3.12/asyncio/base_events.py", line 645, in run_forever
File "/usr/lib/python3.12/asyncio/base_events.py", line 1999, in _run_once
File "/usr/lib/python3.12/asyncio/events.py", line 88, in _run
File "/usr/local/lib/python3.12/dist-packages/ipykernel/kernelbase.py", line 510, in dispatch_queue
File "/usr/local/lib/python3.12/dist-packages/ipykernel/kernelbase.py", line 499, in process_one
File "/usr/local/lib/python3.12/dist-packages/ipykernel/kernelbase.py", line 406, in dispatch_shell
File "/usr/local/lib/python3.12/dist-packages/ipykernel/kernelbase.py", line 730, in execute_request
File "/usr/local/lib/python3.12/dist-packages/ipykernel/ipkernel.py", line 383, in do_execute
File "/usr/local/lib/python3.12/dist-packages/ipykernel/zmqshell.py", line 528, in run_cell
File "/usr/local/lib/python3.12/dist-packages/IPython/core/interactiveshell.py", line 2975, in run_cell
File "/usr/local/lib/python3.12/dist-packages/IPython/core/interactiveshell.py", line 3030, in _run_cell
File "/usr/local/lib/python3.12/dist-packages/IPython/core/async_helpers.py", line 78, in _pseudo_sync_runner
File "/usr/local/lib/python3.12/dist-packages/IPython/core/interactiveshell.py", line 3257, in run_cell_async
File "/usr/local/lib/python3.12/dist-packages/IPython/core/interactiveshell.py", line 3473, in run_ast_nodes
File "/usr/local/lib/python3.12/dist-packages/IPython/core/interactiveshell.py", line 3553, in run_code
File "/tmp/ipython-input-2240975926.py", line 3, in <cell line: 0>
File "/usr/local/lib/python3.12/dist-packages/keras/src/utils/traceback_utils.py", line 117, in error_handler
File "/usr/local/lib/python3.12/dist-packages/keras/src/backend/tensorflow/trainer.py", line 489, in evaluate
File "/usr/local/lib/python3.12/dist-packages/keras/src/backend/tensorflow/trainer.py", line 220, in function
File "/usr/local/lib/python3.12/dist-packages/keras/src/backend/tensorflow/trainer.py", line 133, in multi_step_on_iterator
File "/usr/local/lib/python3.12/dist-packages/keras/src/backend/tensorflow/trainer.py", line 114, in one_step_on_data
File "/usr/local/lib/python3.12/dist-packages/keras/src/backend/tensorflow/trainer.py", line 90, in test_step
File "/usr/local/lib/python3.12/dist-packages/keras/src/utils/traceback_utils.py", line 117, in error_handler
File "/usr/local/lib/python3.12/dist-packages/keras/src/layers/layer.py", line 936, in __call__
File "/usr/local/lib/python3.12/dist-packages/keras/src/utils/traceback_utils.py", line 117, in error_handler
File "/usr/local/lib/python3.12/dist-packages/keras/src/ops/operation.py", line 58, in __call__
File "/usr/local/lib/python3.12/dist-packages/keras/src/utils/traceback_utils.py", line 156, in error_handler
File "/usr/local/lib/python3.12/dist-packages/keras/src/models/sequential.py", line 220, in call
File "/usr/local/lib/python3.12/dist-packages/keras/src/models/functional.py", line 183, in call
File "/usr/local/lib/python3.12/dist-packages/keras/src/ops/function.py", line 177, in _run_through_graph
File "/usr/local/lib/python3.12/dist-packages/keras/src/models/functional.py", line 648, in call
File "/usr/local/lib/python3.12/dist-packages/keras/src/utils/traceback_utils.py", line 117, in error_handler
File "/usr/local/lib/python3.12/dist-packages/keras/src/layers/layer.py", line 936, in __call__
File "/usr/local/lib/python3.12/dist-packages/keras/src/utils/traceback_utils.py", line 117, in error_handler
File "/usr/local/lib/python3.12/dist-packages/keras/src/ops/operation.py", line 58, in __call__
File "/usr/local/lib/python3.12/dist-packages/keras/src/utils/traceback_utils.py", line 156, in error_handler
File "/usr/local/lib/python3.12/dist-packages/keras/src/layers/reshaping/flatten.py", line 54, in call
File "/usr/local/lib/python3.12/dist-packages/keras/src/ops/numpy.py", line 5074, in reshape
File "/usr/local/lib/python3.12/dist-packages/keras/src/backend/tensorflow/numpy.py", line 2068, in reshape

	 [[{{node sequential_1/flatten_1/Reshape}}]]
	tf2xla conversion failed while converting __inference_one_step_on_data_6073[]. Run with TF_DUMP_GRAPH_PREFIX=/path/to/dump/dir and --vmodule=xla_compiler=2 to obtain a dump of the compiled functions.
	 [[StatefulPartitionedCall]] [Op:__inference_multi_step_on_iterator_6142]

In [2]:
# ==== CONFUSION MATRIX ====
print("Generating confusion matrix...")
y_true = []
y_pred = []

for spectrograms, labels in test_ds:
    predictions = model.predict(spectrograms, verbose=0)
    y_true.extend(labels.numpy())
    y_pred.extend(np.argmax(predictions, axis=1))

# Confusion matrix
cm = confusion_matrix(y_true, y_pred)
plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
            xticklabels=COMMANDS, yticklabels=COMMANDS)
plt.title('Confusion Matrix')
plt.ylabel('True Label')
plt.xlabel('Predicted Label')
plt.show()
# Classification report
print("Classification Report:")
print(classification_report(y_true, y_pred, target_names=COMMANDS))

Generating confusion matrix...


NameError: name 'test_ds' is not defined

In [3]:
# ==== CONVERT TO TFLITE ====
print("Converting to TFLite format...")
converter = tf.lite.TFLiteConverter.from_keras_model(model)
converter.optimizations = [tf.lite.Optimize.DEFAULT]
tflite_model = converter.convert()

# Save TFLite model
tflite_model_path = '/content/keyword_spotting.tflite'
with open(tflite_model_path, 'wb') as f:
    f.write(tflite_model)

print(f"TFLite model saved to: {tflite_model_path}")
print(f"Model size: {len(tflite_model) / 1024:.2f} KB")

Converting to TFLite format...


NameError: name 'tf' is not defined

In [7]:
# ==== SAVE MODEL FOR JETSON NANO ====
# Save Keras model
model.save('/content/speech_command_model.h5')
print("Keras model saved as 'speech_command_model.h5'")

# Save class labels
with open('/content/class_labels.txt', 'w') as f:
    for label in COMMANDS:
        f.write(f"{label}\n")
print("Class labels saved as 'class_labels.txt'")


# ==== JETSON NANO DEPLOYMENT INSTRUCTIONS ====
print("\n" + "="*50)
print("DEPLOYMENT INSTRUCTIONS FOR JETSON NANO:")
print("="*50)
print("1. Transfer downloaded files to Jetson Nano")
print("2. Install required packages:")
print("   pip install tensorflow tensorflow-io pyaudio")
print("3. Use this inference code on Jetson Nano:")

NameError: name 'model' is not defined

In [8]:
# ==== DOWNLOAD FILES ====
from google.colab import files

files_to_download = [
    'keyword_spotting.tflite',
    'speech_command_model.h5',
    'class_labels.txt'
]

for file in files_to_download:
    if os.path.exists(f'/content/{file}'):
        files.download(f'/content/{file}')
        print(f"Downloaded: {file}")

print("Training completed successfully!")
print("Files ready for deployment on NVIDIA Jetson Nano:")
print("1. keyword_spotting.tflite - Optimized for edge devices")
print("2. speech_command_model.h5 - Full Keras model")
print("3. class_labels.txt - Command labels")

NameError: name 'os' is not defined