# Retrain YAMNet Model

This notebook retrains a transfer-learned YAMNet model using raw audio data from `../data/used/`. The model expects 5-second, 16kHz WAV files and outputs probabilities for 12 classes. Steps include loading data, splitting into train/val/test sets, fine-tuning the model (initially the head only), and evaluating performance with visualizations like confusion matrices and loss/accuracy plots.

- **Input**: Raw audio (80000 samples, 16kHz).
- **Output**: 12-class probabilities.
- **Goal**: Improve detection of `gunshots`, `glass_breaking`, etc.

imports and setup

In [1]:
import os
import tensorflow as tf
import pandas as pd
import numpy as np
import librosa
from glob import glob

os.environ["CUDA_VISIBLE_DEVICES"] = "-1"
os.environ["TF_CPP_MIN_LOG_LEVEL"] = "2"

tf.random.set_seed(50)
np.random.seed(50)

DATA_DIR = "data/used"  # Correct for audit_wavs location
FULL_MODEL_DIR = "full_model"
RETRAINED_MODELS_DIR = "retrained_models"

os.makedirs(RETRAINED_MODELS_DIR, exist_ok=True)

print("Setup complete!")

2025-03-28 10:07:32.664296: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1743152852.780065    1239 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1743152852.801729    1239 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-03-28 10:07:33.052688: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


Setup complete!


Define Utility Functions

In [2]:
import logging
import soundfile as sf
from sklearn.model_selection import train_test_split
import tensorflow as tf
import numpy as np
import librosa
import os

logging.basicConfig(level=logging.INFO, filename='audio_processing.log', filemode='w',
                    format='%(asctime)s - %(levelname)s - %(message)s')

def load_audio(file_path, sr=16000, min_duration=5.0, audit_dir=None, save_audit=False):
    """Load a WAV file, validate, and optionally save audit WAV without fixing length."""
    try:
        file_path = file_path.decode('utf-8') if isinstance(file_path, bytes) else file_path
        audio, orig_sr = librosa.load(file_path, sr=None, mono=True)
        if orig_sr != sr:
            logging.warning(f"File {file_path} has sample rate {orig_sr}Hz, resampling to {sr}Hz")
            audio = librosa.resample(audio, orig_sr=orig_sr, target_sr=sr)
        duration = librosa.get_duration(y=audio, sr=sr)
        if duration < min_duration:
            logging.warning(f"File {file_path} duration {duration}s is less than {min_duration}s")
            # Pad if too short, but don’t fix to 80000
            audio = librosa.util.pad_center(audio, size=int(min_duration * sr))
        # Removed: audio = librosa.util.fix_length(audio, size=80000)
        if audit_dir and save_audit:
            os.makedirs(audit_dir, exist_ok=True)
            output_path = os.path.join(audit_dir, f"load_audio_{os.path.basename(file_path)}")
            sf.write(output_path, audio, sr, subtype='PCM_16')
            logging.info(f"Saved audit WAV: {output_path}")
        return audio
    except Exception as e:
        logging.error(f"Failed to load {file_path}: {str(e)}")
        return np.zeros(int(sr * min_duration), dtype=np.float32)

def split_dataset(df, train_size=0.7, val_size=0.15, test_size=0.15, shuffle=True):
    assert train_size + val_size + test_size == 1.0, "Split sizes must sum to 1"
    train_df, temp_df = train_test_split(df, train_size=train_size, shuffle=shuffle, random_state=42)
    val_ratio = val_size / (val_size + test_size)
    val_df, test_df = train_test_split(temp_df, train_size=val_ratio, shuffle=shuffle, random_state=42)
    return train_df, val_df, test_df

def create_data_loader(df, batch_size=32, shuffle=True, audit_dir=None, audit=False):
    audio_paths = df['filename'].values
    labels = np.array([list(map(int, label)) for label in df['label'].values], dtype=np.float32)
    dataset = tf.data.Dataset.from_tensor_slices((audio_paths, labels))
    if shuffle:
        dataset = dataset.shuffle(buffer_size=len(df))
    
    def load_and_process(path, label):
        audio = tf.numpy_function(
            lambda p: load_audio(p, audit_dir=audit_dir, save_audit=audit),
            [path],
            tf.float32
        )
        audio.set_shape([None])  # Variable length
        return audio, label
    
    dataset = dataset.map(load_and_process, num_parallel_calls=tf.data.AUTOTUNE)
    dataset = dataset.batch(batch_size).prefetch(tf.data.AUTOTUNE)
    return dataset

print("Utility functions defined!")

Utility functions defined!


Load Metadata


In [3]:
import os
from glob import glob
import pandas as pd

metadata_file = os.path.join("data", "used_files.csv")
df = pd.read_csv(metadata_file, dtype={'label': str})

n_files = 2000
df = df[df['label'].apply(lambda x: any(int(c) for c in x))].sample(n=n_files, random_state=42)
df['filename'] = df['filename'].apply(lambda x: os.path.join("data", x) if not os.path.isabs(x) else x)

missing_files = df['filename'].apply(lambda x: not os.path.exists(x)).sum()
invalid_labels = df['label'].apply(lambda x: all(int(c) == 0 for c in x)).sum()
print(f"Loaded {metadata_file}")
print(f"Total samples (after sampling): {len(df)}")
print(f"Missing files: {missing_files}")
print(f"Invalid (all-zero) labels: {invalid_labels}")
print("Sample rows:")
print(df.head())

assert missing_files == 0, "Missing audio files detected!"
assert invalid_labels == 0, "Invalid labels detected!"
total_samples = len(df)

train_df, val_df, test_df = split_dataset(df)  # Add this line if not elsewhere
batch_size = 32

Loaded data/used_files.csv
Total samples (after sampling): 2000
Missing files: 0
Invalid (all-zero) labels: 0
Sample rows:
                                                filename         label
196    data/used/blended_gunshot_audio/4391_gunshot_g...  000000101000
6134   data/used/blended_gunshot_audio/3963_gunshot_g...  000000101000
20937  data/used/concatenated_glass_audio/39_concated...  000000100100
14941  data/used/blended_glass_audio/9465_glass_break...  000000100010
2201   data/used/blended_gunshot_audio/2053_gunshot_h...  000000001100


Data Processing

In [4]:
audio_data = []
labels = []

for idx, row in df.iterrows():
    audio = load_audio(row['filename'])
    if audio is not None:  # Only append if loading succeeded
        audio_data.append(audio)
        labels.append(list(map(int, row['label'])))
    else:
        print(f"Skipping file {row['filename']} due to loading error")
    if (idx + 1) % 1000 == 0:
        print(f"Processed {idx + 1}/{len(df)} files")

X = np.array(audio_data, dtype=np.float32)
y = np.array(labels, dtype=np.float32)

# Save for reuse
np.save(os.path.join(DATA_DIR, "X_raw.npy"), X)
np.save(os.path.join(DATA_DIR, "y_labels.npy"), y)

print(f"Processed {len(X)} audio samples, shape: {X.shape}")
print(f"Labels shape: {y.shape}")

Processed 15000/2000 files
Processed 2000 audio samples, shape: (2000, 80000)
Labels shape: (2000, 12)


Split Dataset

In [5]:
train_df, val_df, test_df = split_dataset(df, train_size=0.7, val_size=0.15, test_size=0.15)

print(f"Training samples: {len(train_df)} (~70%)")
print(f"Validation samples: {len(val_df)} (~15%)")
print(f"Test samples: {len(test_df)} (~15%)")
assert len(train_df) + len(val_df) + len(test_df) == total_samples, "Split sizes don’t match total!"

# Check class distribution
class_order = ["crying_baby", "sneezing", "clapping", "coughing", "footsteps", "laughing",
               "glass_breaking", "other", "gunshots", "human_speech", "bicycle", "bicycle_bell"]
train_dist = np.sum([list(map(int, label)) for label in train_df['label']], axis=0)
val_dist = np.sum([list(map(int, label)) for label in val_df['label']], axis=0)
test_dist = np.sum([list(map(int, label)) for label in test_df['label']], axis=0)

print("Class distribution (Train / Val / Test):")
for i, cls in enumerate(class_order):
    print(f"{cls}: {train_dist[i]} / {val_dist[i]} / {test_dist[i]}")

Training samples: 1400 (~70%)
Validation samples: 300 (~15%)
Test samples: 300 (~15%)
Class distribution (Train / Val / Test):
crying_baby: 0 / 0 / 0
sneezing: 0 / 0 / 0
clapping: 0 / 0 / 0
coughing: 0 / 0 / 0
footsteps: 0 / 0 / 0
laughing: 0 / 0 / 0
glass_breaking: 763 / 159 / 150
other: 238 / 43 / 51
gunshots: 582 / 137 / 135
human_speech: 483 / 116 / 106
bicycle: 484 / 101 / 104
bicycle_bell: 0 / 0 / 0


Create Data Loaders

In [7]:
import os
import shutil
try:
    from IPython.display import Audio, display
    ipython_available = True
except ImportError:
    ipython_available = False
    print("IPython not available; audio playback skipped. Check saved WAVs manually.")


audit_dir = os.path.join(DATA_DIR, "audit_wavs")
if os.path.exists(audit_dir):
    shutil.rmtree(audit_dir)

val_audit_dir = os.path.join(DATA_DIR, "val_audit_wavs")
if os.path.exists(val_audit_dir):
    shutil.rmtree(val_audit_dir)

train_loader_full = create_data_loader(train_df, batch_size=batch_size, shuffle=True)
audit_loader = create_data_loader(train_df, batch_size=5, shuffle=False, audit_dir=audit_dir, audit=True).take(1)
val_audit_loader = create_data_loader(val_df, batch_size=20, shuffle=False, audit_dir=val_audit_dir, audit=True).take(1)
val_loader = create_data_loader(val_df, batch_size=batch_size, shuffle=False)
test_loader = create_data_loader(test_df, batch_size=batch_size, shuffle=False)

train_batches = (len(train_df) + batch_size - 1) // batch_size
val_batches = (len(val_df) + batch_size - 1) // batch_size
print(f"Training samples: {len(train_df)}, Batches: {train_batches}")
print(f"Validation samples: {len(val_df)}, Batches: {val_batches}")
print(f"Audit WAVs (5 examples) saved to: {audit_dir}")
print("Data loaders created!")

for batch_idx, (audio, label) in enumerate(audit_loader):
    print(f"Audit batch {batch_idx + 1}: Audio shape: {audio.shape}, Label shape: {label.shape}")
    batch_filenames = train_df['filename'].values[:5]
    for i, (filename, audio_sample, lbl) in enumerate(zip(batch_filenames, audio, label)):
        processed_wav = os.path.join(audit_dir, f"load_audio_{os.path.basename(filename)}")
        print(f"Original file {i + 1}: {filename}")
        print(f"Processed WAV: {processed_wav}")
        print(f"Label: {lbl.numpy()}")
        if ipython_available:
            if os.path.exists(filename):
                print("Playing original:")
                display(Audio(filename))
            else:
                print(f"Original file not found: {filename}")
            print("Playing processed:")
            display(Audio(audio_sample.numpy(), rate=16000))
        else:
            print("Audio playback skipped; listen to files in audit_wavs manually.")

Training samples: 1400, Batches: 44
Validation samples: 300, Batches: 10
Audit WAVs (5 examples) saved to: data/used/audit_wavs
Data loaders created!
Audit batch 1: Audio shape: (5, 80000), Label shape: (5, 12)
Original file 1: data/used/blended_gunshot_audio/5573_gunshot_glass_breaking.wav
Processed WAV: data/used/audit_wavs/load_audio_5573_gunshot_glass_breaking.wav
Label: [0. 0. 0. 0. 0. 0. 1. 0. 1. 0. 0. 0.]
Playing original:


Playing processed:


Original file 2: data/used/blended_glass_audio/4424_glass_breaking_human_speech.wav
Processed WAV: data/used/audit_wavs/load_audio_4424_glass_breaking_human_speech.wav
Label: [0. 0. 0. 0. 0. 0. 1. 0. 0. 1. 0. 0.]
Playing original:


Playing processed:


Original file 3: data/used/blended_glass_audio/8517_glass_breaking_bicycle.wav
Processed WAV: data/used/audit_wavs/load_audio_8517_glass_breaking_bicycle.wav
Label: [0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 1. 0.]
Playing original:


Playing processed:


Original file 4: data/used/isolated_data/Other/586_fireworks_other_aug2.wav
Processed WAV: data/used/audit_wavs/load_audio_586_fireworks_other_aug2.wav
Label: [0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0.]
Playing original:


Playing processed:


Original file 5: data/used/blended_glass_audio/5571_glass_breaking_bicycle.wav
Processed WAV: data/used/audit_wavs/load_audio_5571_glass_breaking_bicycle.wav
Label: [0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 1. 0.]
Playing original:


Playing processed:


2025-03-28 10:09:16.720907: I tensorflow/core/framework/local_rendezvous.cc:405] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence


Load in Model

In [8]:
import tensorflow as tf

# Load SavedModel directly
loaded_model = tf.saved_model.load(FULL_MODEL_DIR)
base_model_fn = loaded_model.signatures['serving_default']

# Define a custom layer to wrap the SavedModel
class SavedModelLayer(tf.keras.layers.Layer):
    def __init__(self, model_fn, full_model_dir, **kwargs):
        super(SavedModelLayer, self).__init__(**kwargs)
        self.model_fn = model_fn
        self.full_model_dir = full_model_dir  # Store path for reconstruction

    def call(self, inputs, training=False):
        # inputs: (batch_size, 80000)
        def apply_model_fn(audio):
            outputs = self.model_fn(audio=audio)  # Pass (80000,) directly
            return outputs['classifier']  # (12,)
        outputs = tf.map_fn(apply_model_fn, inputs, dtype=tf.float32)  # (batch_size, 12)
        return outputs

    def compute_output_shape(self, input_shape):
        return (input_shape[0], 12)  # (batch_size, 12)

    def get_config(self):
        config = super(SavedModelLayer, self).get_config()
        config.update({"full_model_dir": self.full_model_dir})
        return config

    @classmethod
    def from_config(cls, config):
        full_model_dir = config.pop("full_model_dir")
        loaded_model = tf.saved_model.load(full_model_dir)
        model_fn = loaded_model.signatures['serving_default']
        return cls(model_fn, full_model_dir, **config)

# Define Keras model
inputs = tf.keras.Input(shape=(80000,), dtype=tf.float32, name='audio')  # Match runtime shape
base_output = SavedModelLayer(base_model_fn, FULL_MODEL_DIR)(inputs)
dropout = tf.keras.layers.Dropout(0.3)(base_output)
predictions = tf.keras.layers.Dense(12, activation='sigmoid')(dropout)
model = tf.keras.Model(inputs, predictions)

print("Model initialized from original SavedModel!")

Model initialized from original SavedModel!


In [9]:
# Training Setup
# Configure training parameters and callbacks for monitoring and saving.

epochs = 10  # Tweakable
batch_size = 32  # Tweakable
learning_rate = 0.0001  

callbacks = [
    tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True),
    tf.keras.callbacks.ModelCheckpoint(
        os.path.join(FULL_MODEL_DIR, "retrained_yamnet_best_v1.keras"),  # Unique name
        monitor='val_loss', save_best_only=True
    ),
    tf.keras.callbacks.ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=2)
]

print(f"Training setup: epochs={epochs}, batch_size={batch_size}, learning_rate={learning_rate}")

Training setup: epochs=10, batch_size=32, learning_rate=0.0001


In [10]:
import tensorflow as tf

FULL_MODEL_DIR = "full_model"

class SavedModelWrapper(tf.keras.layers.Layer):
    def __init__(self, saved_model_dir):
        super(SavedModelWrapper, self).__init__()
        self.saved_model = tf.saved_model.load(saved_model_dir)
        self.call_fn = self.saved_model.signatures['serving_default']

    def call(self, inputs):
        batch_size = tf.shape(inputs)[0]
        flat_inputs = tf.reshape(inputs, [batch_size, -1])
        outputs = tf.map_fn(
            lambda x: self.call_fn(audio=x)['classifier'],
            flat_inputs,
            fn_output_signature=tf.TensorSpec(shape=(12,), dtype=tf.float32)
        )  # (batch_size, 12)
        return tf.nn.sigmoid(outputs)  # Convert logits to probabilities

    def compute_output_shape(self, input_shape):
        return (input_shape[0], 12)

inputs = tf.keras.Input(shape=(None,), name="audio", dtype=tf.float32)
outputs = SavedModelWrapper(FULL_MODEL_DIR)(inputs)
model = tf.keras.Model(inputs, outputs)

model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=1e-4),
              loss='binary_crossentropy',
              metrics=['accuracy'])

print("Model loaded and compiled (inference-only with sigmoid)!")
model.summary()
epochs = 10
callbacks = []

Model loaded and compiled (inference-only with sigmoid)!


In [None]:
import tensorflow as tf

FULL_MODEL_DIR = "full_model"

class SavedModelHead(tf.keras.layers.Layer):
    def __init__(self, saved_model_dir):
        super(SavedModelHead, self).__init__()
        self.saved_model = tf.saved_model.load(saved_model_dir)
        self.call_fn = self.saved_model.signatures['serving_default']
        # Extract weights
        weights = {var.name: var.numpy() for var in self.saved_model.variables}
        print("Available weights:", list(weights.keys()))  # Debug
        self.dense_2 = tf.keras.layers.Dense(1024, activation='relu', name='dense_2',
                                             kernel_initializer=tf.constant_initializer(weights.get('dense_2/kernel:0')),
                                             bias_initializer=tf.constant_initializer(weights.get('dense_2/bias:0')))
        self.dense_3 = tf.keras.layers.Dense(12, activation='sigmoid', name='dense_3',
                                             kernel_initializer=tf.constant_initializer(weights.get('dense_3/kernel:0')),
                                             bias_initializer=tf.constant_initializer(weights.get('dense_3/bias:0')))

    def build(self, input_shape):
        self.dense_2.build((None, 12))  # Adjust if needed
        self.dense_3.build((None, 1024))

    def call(self, inputs):
        batch_size = tf.shape(inputs)[0]
        flat_inputs = tf.reshape(inputs, [batch_size, -1])
        x = tf.map_fn(
            lambda x: self.call_fn(audio=x)['classifier'],
            flat_inputs,
            fn_output_signature=tf.TensorSpec(shape=(12,), dtype=tf.float32)
        )
        x = self.dense_2(x)
        x = self.dense_3(x)
        return x

    def compute_output_shape(self, input_shape):
        return (input_shape[0], 12)

inputs = tf.keras.Input(shape=(None,), name="audio", dtype=tf.float32)
outputs = SavedModelHead(FULL_MODEL_DIR)(inputs)
model = tf.keras.Model(inputs, outputs)

model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=1e-4),
              loss='binary_crossentropy',
              metrics=['accuracy'])

print("Trainable model compiled!")
model.summary()
epochs = 10
callbacks = []

Train Model

In [None]:
from datetime import datetime

steps_per_epoch = (len(train_df) + batch_size - 1) // batch_size
validation_steps = (len(val_df) + batch_size - 1) // batch_size

# Evaluate before training
train_eval = model.evaluate(train_loader_full, steps=steps_per_epoch)
val_eval = model.evaluate(val_loader, steps=validation_steps)
print(f"Pre-training - Train Loss: {train_eval[0]}, Train Acc: {train_eval[1]}")
print(f"Pre-training - Val Loss: {val_eval[0]}, Val Acc: {val_eval[1]}")

history = model.fit(train_loader_full,
                    validation_data=val_loader,
                    epochs=epochs,
                    steps_per_epoch=steps_per_epoch,
                    validation_steps=validation_steps,
                    callbacks=callbacks,
                    verbose=1)

timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
model.save(os.path.join(RETRAINED_MODELS_DIR, f"retrained_yamnet_final_{timestamp}.keras"))
print("Training complete! Model saved.")

Plot Training Progress

In [None]:

# # Plot Training Progress
# Visualize training and validation loss/accuracy over epochs.

plt.figure(figsize=(12, 4))

# Loss
plt.subplot(1, 2, 1)
plt.plot(history.history['loss'], label='Train Loss')
plt.plot(history.history['val_loss'], label='Val Loss')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.title('Loss Over Time')
plt.legend()
plt.grid(True)

# Accuracy
plt.subplot(1, 2, 2)
plt.plot(history.history['accuracy'], label='Train Accuracy')
plt.plot(history.history['val_accuracy'], label='Val Accuracy')
plt.xlabel('Epoch')
plt.ylabel('Accuracy')
plt.title('Accuracy Over Time')
plt.legend()
plt.grid(True)

plt.tight_layout()
plt.show()

Evaluate on Test Set

In [None]:

y_true = []
y_pred = []

for audio, label in test_loader:
    pred = model.predict(audio, verbose=0)
    y_true.extend(label.numpy())
    y_pred.extend(pred)

y_true = np.array(y_true)
y_pred = np.array(y_pred)

# Metrics
print("Classification Report:")
print(classification_report(np.argmax(y_true, axis=1), np.argmax(y_pred, axis=1), target_names=class_order))

In [None]:
from sklearn.metrics import roc_curve, auc, precision_recall_curve, average_precision_score
import matplotlib.pyplot as plt

# Collect predictions and true labels
y_true = []
y_pred = []

for audio, label in test_loader:
    pred = model.predict(audio, verbose=0)
    y_true.extend(label.numpy())
    y_pred.extend(pred)

y_true = np.array(y_true)
y_pred = np.array(y_pred)

# Assuming multi-label classification
class_order = ["crying_baby", "sneezing", "clapping", "coughing", "footsteps", "laughing",
               "glass_breaking", "other", "gunshots", "human_speech", "bicycle", "bicycle_bell"]

# Plot ROC Curves
plt.figure(figsize=(12, 8))
for i, cls in enumerate(class_order):
    fpr, tpr, _ = roc_curve(y_true[:, i], y_pred[:, i])
    roc_auc = auc(fpr, tpr)
    plt.plot(fpr, tpr, label=f'{cls} (AUC = {roc_auc:.2f})')
plt.plot([0, 1], [0, 1], 'k--', lw=2)
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curves per Class')
plt.legend(loc='lower right')
plt.grid(True)
plt.show()

# Plot Precision-Recall Curves
plt.figure(figsize=(12, 8))
for i, cls in enumerate(class_order):
    precision, recall, _ = precision_recall_curve(y_true[:, i], y_pred[:, i])
    avg_precision = average_precision_score(y_true[:, i], y_pred[:, i])
    plt.plot(recall, precision, label=f'{cls} (AP = {avg_precision:.2f})')
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.title('Precision-Recall Curves per Class')
plt.legend(loc='lower left')
plt.grid(True)
plt.show()

# Updated classification report for multi-label
y_pred_binary = (y_pred > 0.5).astype(int)
print("Multi-label Classification Report:")
print(classification_report(y_true, y_pred_binary, target_names=class_order))

Confusion Matrix

In [None]:
plot_confusion_matrix(y_true, y_pred, class_order)