In [1]:
!pip -q install tensorflow soundfile scipy scikit-learn

import os, glob, zipfile, random
import numpy as np
import tensorflow as tf
import soundfile as sf
import scipy.signal
from sklearn.model_selection import train_test_split
from google.colab import files

print("TF:", tf.__version__)


TF: 2.19.0


In [2]:
uploaded = files.upload()  # sube student_yes_no_samples.zip
zip_name = [k for k in uploaded.keys() if k.endswith(".zip")][0]

os.makedirs("student_samples", exist_ok=True)
with zipfile.ZipFile(zip_name, "r") as z:
    z.extractall("student_samples")

print("Student YES:", len(glob.glob("student_samples/student_data/yes/*.wav")))
print("Student  NO:", len(glob.glob("student_samples/student_data/no/*.wav")))


Saving student_yes_no_samples.zip to student_yes_no_samples.zip
Student YES: 10
Student  NO: 10


In [3]:
DATASET_URL = "http://download.tensorflow.org/data/speech_commands_v0.02.tar.gz"
!wget -q $DATASET_URL -O speech_commands.tar.gz
!tar -xf speech_commands.tar.gz
print("Speech Commands extracted.")


Speech Commands extracted.


In [5]:
# === Frontend params: deben calzar con tu sl_ml_audio_feature_generation_config.h ===
SR = 16000
SAMPLE_LEN_MS = 1000
WINDOW_MS = 25
STEP_MS = 10
FFT_LEN = 512
N_CHANNELS = 40
LOWER_HZ = 0.0
UPPER_HZ = 8000.0  # <= SR/2
LOG_SCALE = True

def load_audio_1s(path):
    x, sr = sf.read(path)
    if x.ndim > 1:
        x = x[:,0]
    if sr != SR:
        x = scipy.signal.resample_poly(x, SR, sr)
    n = int(SR * (SAMPLE_LEN_MS/1000.0))
    if len(x) < n:
        x = np.pad(x, (0, n-len(x)))
    else:
        x = x[:n]
    return x.astype(np.float32)

def hz_to_mel(hz): return 2595.0 * np.log10(1.0 + hz/700.0)
def mel_to_hz(m):  return 700.0 * (10**(m/2595.0) - 1.0)

def mel_filterbank(sr, nfft, n_mels, fmin, fmax):
    # nfft -> bins = nfft/2+1
    mels = np.linspace(hz_to_mel(fmin), hz_to_mel(fmax), n_mels + 2)
    hz = mel_to_hz(mels)
    bins = np.floor((nfft + 1) * hz / sr).astype(int)

    fb = np.zeros((n_mels, nfft//2 + 1), dtype=np.float32)
    for m in range(1, n_mels+1):
        f0, f1, f2 = bins[m-1], bins[m], bins[m+1]
        if f1 == f0: f1 += 1
        if f2 == f1: f2 += 1
        for k in range(f0, f1):
            fb[m-1, k] = (k - f0) / (f1 - f0)
        for k in range(f1, f2):
            fb[m-1, k] = (f2 - k) / (f2 - f1)
    return fb

FBANK = mel_filterbank(SR, FFT_LEN, N_CHANNELS, LOWER_HZ, UPPER_HZ)

def frontend_features(x):
    win = int(SR * WINDOW_MS/1000.0)   # 240
    step = int(SR * STEP_MS/1000.0)    # 160
    # framing
    frames = np.lib.stride_tricks.sliding_window_view(x, win)[::step].copy()
    frames *= np.hamming(win).astype(np.float32)

    # power spectrum
    spec = np.fft.rfft(frames, n=FFT_LEN)
    mag = np.abs(spec).astype(np.float32)
    pow_spec = (mag ** 2) / FFT_LEN

    # mel filterbank energies
    mel_e = np.dot(pow_spec, FBANK.T)
    mel_e = np.maximum(mel_e, 1e-10)

    if LOG_SCALE:
        mel_e = np.log(mel_e)

    feat = mel_e.flatten().astype(np.float32)  # 49*32 = 1568
    return feat

# quick sanity: length must be 1568
_test = frontend_features(np.zeros(int(SR*1.0), dtype=np.float32))
print("Feature length:", _test.shape[0])


Feature length: 3920


In [6]:
labels = ["yes", "no", "unknown", "background"]
lab2i = {l:i for i,l in enumerate(labels)}

student_yes = glob.glob("student_samples/student_data/yes/*.wav")
student_no  = glob.glob("student_samples/student_data/no/*.wav")

google_yes = glob.glob("yes/*.wav")
google_no  = glob.glob("no/*.wav")

# unknown: otras carpetas
exclude = set(["yes","no","_background_noise_"])
unknown_words = ["up","down","left","right","stop","go","one","two","three","four"]
unknown_pool = []
for w in unknown_words:
    unknown_pool += glob.glob(os.path.join(w, "*.wav"))
random.shuffle(unknown_pool)

# background noise wavs
noise_files = glob.glob("_background_noise_/*.wav")

def sample_bg_chunks(noise_paths, n):
    out = []
    n_samp = int(SR * 1.0)
    for _ in range(n):
        p = random.choice(noise_paths)
        a, sr = sf.read(p)
        if a.ndim > 1: a = a[:,0]
        if sr != SR:
            a = scipy.signal.resample_poly(a, SR, sr)
        if len(a) < n_samp:
            a = np.pad(a, (0, n_samp-len(a)))
            start = 0
        else:
            start = random.randint(0, len(a)-n_samp)
        out.append(a[start:start+n_samp].astype(np.float32))
    return out

# balance
MAX_GOOGLE = 1200
google_yes = google_yes[:MAX_GOOGLE]
google_no  = google_no[:MAX_GOOGLE]
unknown_files = unknown_pool[:MAX_GOOGLE]
bg_audios = sample_bg_chunks(noise_files, MAX_GOOGLE)

X, y = [], []

def add_files(files_list, lab):
    for f in files_list:
        a = load_audio_1s(f)
        X.append(frontend_features(a))
        y.append(lab2i[lab])

add_files(student_yes, "yes")
add_files(student_no, "no")
add_files(google_yes, "yes")
add_files(google_no, "no")
add_files(unknown_files, "unknown")

for a in bg_audios:
    X.append(frontend_features(a))
    y.append(lab2i["background"])

X = np.array(X, dtype=np.float32)
y = np.array(y, dtype=np.int32)

print("X:", X.shape, "y:", y.shape)
print("Counts:", {l:int(np.sum(y==lab2i[l])) for l in labels})


X: (4820, 3920) y: (4820,)
Counts: {'yes': 1210, 'no': 1210, 'unknown': 1200, 'background': 1200}


In [7]:
train_x, test_x, train_y, test_y = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

model = tf.keras.Sequential([
  tf.keras.layers.Input(shape=(train_x.shape[1],)),
  tf.keras.layers.Dense(128, activation="relu"),
  tf.keras.layers.Dense(64, activation="relu"),
  tf.keras.layers.Dense(4, activation="softmax"),
])

model.compile(optimizer="adam",
              loss="sparse_categorical_crossentropy",
              metrics=["accuracy"])

model.fit(train_x, train_y, epochs=15, batch_size=32, validation_data=(test_x, test_y))
print("Eval:", model.evaluate(test_x, test_y, verbose=0))

def rep_gen():
    for i in range(min(300, len(train_x))):
        yield [train_x[i:i+1].astype(np.float32)]

converter = tf.lite.TFLiteConverter.from_keras_model(model)
converter.optimizations = [tf.lite.Optimize.DEFAULT]
converter.representative_dataset = rep_gen
converter.target_spec.supported_ops = [tf.lite.OpsSet.TFLITE_BUILTINS_INT8]
converter.inference_input_type = tf.int8
converter.inference_output_type = tf.int8

tflite_int8 = converter.convert()

out_name = "keyword_spotting_on_off.tflite"
with open(out_name, "wb") as f:
    f.write(tflite_int8)

# sanity check
interp = tf.lite.Interpreter(model_path=out_name)
interp.allocate_tensors()
out = interp.get_output_details()[0]
print("Output dtype/shape:", out["dtype"], out["shape"])

files.download(out_name)


Epoch 1/15
[1m121/121[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 11ms/step - accuracy: 0.3641 - loss: 14.8983 - val_accuracy: 0.6162 - val_loss: 1.2828
Epoch 2/15
[1m121/121[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 9ms/step - accuracy: 0.5501 - loss: 1.7334 - val_accuracy: 0.6556 - val_loss: 0.8885
Epoch 3/15
[1m121/121[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 9ms/step - accuracy: 0.5730 - loss: 1.5528 - val_accuracy: 0.6722 - val_loss: 2.6747
Epoch 4/15
[1m121/121[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 9ms/step - accuracy: 0.6396 - loss: 1.3021 - val_accuracy: 0.7344 - val_loss: 0.6682
Epoch 5/15
[1m121/121[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 9ms/step - accuracy: 0.6717 - loss: 1.1337 - val_accuracy: 0.6753 - val_loss: 1.2179
Epoch 6/15
[1m121/121[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 9ms/step - accuracy: 0.6851 - loss: 0.8751 - val_accuracy: 0.7448 - val_loss: 0.6435
Epoch 7/15
[1m121/121[0m



Output dtype/shape: <class 'numpy.int8'> [1 4]


    TF 2.20. Please use the LiteRT interpreter from the ai_edge_litert package.
    See the [migration guide](https://ai.google.dev/edge/litert/migration)
    for details.
    


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>