In [4]:
!pip install --quiet librosa soundfile
!git clone --quiet https://github.com/karoldvl/ESC-50.git

fatal: destination path 'ESC-50' already exists and is not an empty directory.


In [5]:
# We are using tensorflow because it fits with the selected model

import tensorflow as tf
import tensorflow_hub as hub

import librosa
import numpy as np
import soundfile as sf
import os
import pandas as pd

# schdeuled learning rate
optim = tf.keras.optimizers.Adam(.01, clipnorm=.9)



In [6]:
class ComplexDense(tf.keras.layers.Layer):
    def __init__(self, units, **kwargs):
        super().__init__(**kwargs)
        self.real__dense = tf.keras.layers.Dense(
            units,
            kernel_initializer="glorot_uniform",
            bias_initializer="zeros", **kwargs
        )
        self.imag__dense = tf.keras.layers.Dense(
            units,
            kernel_initializer=tf.keras.initializers.RandomNormal(stddev=1e-3),
            bias_initializer="zeros", **kwargs
        )
    def call(self, inputs):
        real = self.real__dense(inputs)
        imag = self.imag__dense(inputs)
        return tf.complex(real, imag)

meta = pd.read_csv("ESC-50/meta/esc50.csv")
file_paths = meta["filename"].apply(lambda f: os.path.join("ESC-50/audio", f)).tolist()
labels     = meta["category"].astype("category").cat.codes.values
num_classes = meta["category"].nunique()

print("min label:", labels.min(), " max label:", labels.max())
print("num_classes:", num_classes)

min label: 0  max label: 49
num_classes: 50


In [7]:
# ⚙️ Cell: Build tf.data pipeline (fixed path handling) ─────────────────────
SR = 16000
DURATION = 5.0
AUTOTUNE = tf.data.AUTOTUNE
BATCH_SIZE = 32

import numpy as np

def load_and_preprocess(path, label):
    # `path` here is a numpy bytes_ or str, not a tf.Tensor
    # Decode it to a Python string:
    if isinstance(path, bytes):
        audio_path = path.decode("utf-8")
    elif isinstance(path, np.bytes_):
        audio_path = path.tobytes().decode("utf-8")
    else:
        audio_path = str(path)
    # Load & resample
    wav, _ = librosa.load(audio_path, sr=SR, mono=True, duration=DURATION)
    target_len = int(SR * DURATION)
    if len(wav) < target_len:
        wav = np.pad(wav, (0, target_len - len(wav)))
    else:
        wav = wav[:target_len]
    # Normalize to [-1,1]
    wav = wav.astype(np.float32)
    wav = wav / (np.max(np.abs(wav)) + 1e-9)
    # Return waveform and label (as numpy types)
    return wav, np.int64(label)

def tf_preprocess(path, label):
    wav, lbl = tf.numpy_function(
        func=load_and_preprocess,
        inp=[path, label],
        Tout=[tf.float32, tf.int64]
    )
    # Tell TF the shapes
    wav.set_shape([int(SR * DURATION)])
    lbl.set_shape([])
    return wav, lbl

# Build the Dataset
ds = (
    tf.data.Dataset
      .from_tensor_slices((file_paths, labels))
      .shuffle(len(file_paths))
      .map(tf_preprocess, num_parallel_calls=AUTOTUNE)
      .batch(BATCH_SIZE)
      .prefetch(AUTOTUNE)
)

waveforms, lbls = next(iter(ds.take(1)))
print("Waveforms:", waveforms.shape,
      "min/max:", waveforms.numpy().min(), waveforms.numpy().max())
print("Labels:", lbls.shape, "unique:", np.unique(lbls.numpy()))


Waveforms: (32, 80000) min/max: -1.0 1.0
Labels: (32,) unique: [ 1  3  4  5  7 10 11 15 16 19 21 23 25 26 35 36 41 42 43 46 48 49]


In [8]:
yamnet_saved=hub.load("https://tfhub.dev/google/yamnet/1")


def extract_embeds(wavs):
    # wavs: [batch, 80000]
    def embed_one(wav):
        # yamnet_saved(wav) → (scores, embeddings [frames,1024], spec)
        _, emb, _ = yamnet_saved(wav)
        return emb
    # Map over the batch to get [batch, frames, 1024]
    embs = tf.map_fn(
        embed_one,
        wavs,
        fn_output_signature=tf.TensorSpec((None, 1024), tf.float32)
    )
    # Freeze backbone
    return tf.stop_gradient(embs)




In [31]:
class ModReLU(tf.keras.layers.Layer):
    def __init__(self, eps=1e-6, **kw):
        super().__init__(**kw); self.eps = eps
    def build(self, input_shape):
        d = int(input_shape[-1])
        self.beta = self.add_weight(shape=(d,), initializer="zeros", name="beta")
    def call(self, z):  # z: complex64
        mag  = tf.abs(z) + self.eps
        gate = tf.nn.relu(mag + self.beta) / mag
        return tf.complex(tf.math.real(z) * gate, tf.math.imag(z) * gate)


In [47]:
num_classes = 50
wav_in  = tf.keras.Input(shape=(int(SR*DURATION),), dtype=tf.float32, name="waveform")
emb_seq = tf.keras.layers.Lambda(
    extract_embeds,
    output_shape=(None, 1024),
    name="yamnet_embed"
)(wav_in)


pooled  = tf.keras.layers.GlobalAveragePooling1D(name="avg_pool")(emb_seq)  # [batch,1024]
normed  = tf.keras.layers.LayerNormalization(name="ln_embed")(pooled)       # [batch,1024]

F = tf.keras.layers.Dense(256, activation="relu", kernel_initializer='glorot_uniform',
                          bias_initializer='zeros', name="shared")(normed)

real_branch = tf.keras.layers.BatchNormalization(name="bn_real")(F)
real_branch = tf.keras.layers.Activation("relu", name="relu_real")(real_branch)


#imaginary branch
Z_0 = tf.keras.layers.Lambda(
    lambda y : tf.complex(tf.zeros_like(y), tf.zeros_like(y)),
    name="complex_0"
)(F)

theta = tf.keras.layers.Dense(256, name="theta")(normed)

gate  = tf.keras.layers.Lambda(
    lambda th: tf.complex(tf.cos(th), tf.sin(th)), name="unit_phase_gate"
)(theta)

Z1 = tf.keras.layers.Lambda(lambda xs: xs[0] * xs[1], name="phase_rotate")(
    [Z_0, gate]
)
Z1 = ModReLU(name="modrelu")(Z1)

# now I don't lose information
ReZ = tf.keras.layers.Lambda(tf.math.real, name="ReZ")(Z1)
ImZ = tf.keras.layers.Lambda(tf.math.imag, name="ImZ")(Z1)

ReZ = tf.keras.layers.BatchNormalization(name="bn_ReZ")(ReZ)
ImZ = tf.keras.layers.BatchNormalization(name="bn_ImZ")(ImZ)
ReZ = tf.keras.layers.Activation("relu", name="relu_ReZ")(ReZ)
ImZ = tf.keras.layers.Activation("relu", name="relu_ImZ")(ImZ)




feat = tf.keras.layers.Concatenate(name="fuse_real_ReZ_ImZ")([real_branch, ReZ, ImZ])

logits = tf.keras.layers.Dense(num_classes, name="classifier")(feat)
probs  = tf.keras.layers.Activation("softmax", name="probs")(logits)


complex_model = tf.keras.Model(inputs=wav_in, outputs=probs, name="complex_model")
complex_model.compile(optimizer=tf.keras.optimizers.Adam(1e-4, clipnorm=1.0),
              loss="sparse_categorical_crossentropy",
              metrics=["accuracy"])
complex_model.summary()









# with a split we can work in both real and complex planes seperatley

In [46]:
# ─── Cell: Real‑Valued Baseline Model ─────────────────────────────────────
import tensorflow as tf
import tensorflow_hub as hub

# 1) Load YAMNet
yamnet_saved = hub.load("https://tfhub.dev/google/yamnet/1")

# 2) Batch embedding extractor (same as before)
def extract_embeds_real(wavs):
    def one(wav):
        _, emb, _ = yamnet_saved(wav)
        return emb
    embs = tf.map_fn(
        one,
        wavs,
        fn_output_signature=tf.TensorSpec((None,1024), tf.float32)
    )
    return tf.stop_gradient(embs)

# 3) Build the real baseline
SR, DURATION = 16000, 5.0
num_classes = 50

wav_in_r = tf.keras.Input(shape=(int(SR*DURATION),), dtype=tf.float32, name="wav_real")
emb_seq_r = tf.keras.layers.Lambda(
    extract_embeds_real,
    output_shape=(None,1024),
    name="yamnet_embed_real"
)(wav_in_r)

# Pool & normalize
pooled_r = tf.keras.layers.GlobalAveragePooling1D(name="avg_pool_real")(emb_seq_r)
norm_r   = tf.keras.layers.LayerNormalization(name="ln_real")(pooled_r)

# A small real MLP head
x_r      = tf.keras.layers.Dense(512, activation="relu", name="dense_real1")(norm_r)
x_r      = tf.keras.layers.Dropout(0.5, name="dropout_real")(x_r)
x_r = tf.keras.layers.Dense(256, activation="relu", name="dense_real2")(x_r)
x_r = tf.keras.layers.Dropout(.5, name="dropout_real2")(x_r)
logits_r = tf.keras.layers.Dense(num_classes, name="logits_real")(x_r)
probs_r  = tf.keras.layers.Activation("softmax", name="probs_real")(logits_r)

real_model = tf.keras.Model(wav_in_r, probs_r, name="real_tl")
real_model.compile(
    optimizer=tf.keras.optimizers.Adam(1e-4, clipnorm=1.0),
    loss="sparse_categorical_crossentropy",
    metrics=["accuracy"],
)
real_model.summary()


In [48]:
history_complex = complex_model.fit(
    ds,
    epochs=10,
    validation_data=ds.take(10)
)

Epoch 1/100
[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m25s[0m 268ms/step - accuracy: 0.0528 - loss: 3.7796 - val_accuracy: 0.3250 - val_loss: 3.1378
Epoch 2/100
[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 143ms/step - accuracy: 0.4502 - loss: 2.7526 - val_accuracy: 0.6094 - val_loss: 2.3729
Epoch 3/100
[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 144ms/step - accuracy: 0.6466 - loss: 2.0738 - val_accuracy: 0.7344 - val_loss: 1.8664
Epoch 4/100
[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 146ms/step - accuracy: 0.7096 - loss: 1.6832 - val_accuracy: 0.8000 - val_loss: 1.5149
Epoch 5/100
[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 162ms/step - accuracy: 0.7526 - loss: 1.4325 - val_accuracy: 0.7906 - val_loss: 1.3187
Epoch 6/100
[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 136ms/step - accuracy: 0.8025 - loss: 1.2194 - val_accuracy: 0.8125 - val_loss: 1.1383
Epoch 7/100
[1m63

KeyboardInterrupt: 

In [None]:
history_real = real_model.fit(
    ds,
    epochs=10,
    validation_data=ds.take(10)
)

In [None]:
import matplotlib.pyplot as plt

plt.figure(figsize=(10,4))

# Val accuracy
plt.subplot(1,2,1)
plt.plot(history_complex.history["val_accuracy"], label="Complex TL")
plt.plot(history_real.history["val_accuracy"],    label="Real TL")
plt.title("Validation Accuracy")
plt.xlabel("Epoch")
plt.ylabel("Accuracy")
plt.legend()

# Val loss
plt.subplot(1,2,2)
plt.plot(history_complex.history["val_loss"], label="Complex TL")
plt.plot(history_real.history["val_loss"],    label="Real TL")
plt.title("Validation Loss")
plt.xlabel("Epoch")
plt.ylabel("Loss")
plt.legend()



plt.tight_layout()
plt.show()