In [8]:
!pip install pvporcupine
!pip install pyaudio
!pip install sounddevice
!pip install datasets

Collecting datasets
  Downloading datasets-3.6.0-py3-none-any.whl.metadata (19 kB)
Collecting pyarrow>=15.0.0 (from datasets)
  Downloading pyarrow-20.0.0-cp311-cp311-win_amd64.whl.metadata (3.4 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Using cached dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Using cached xxhash-3.5.0-cp311-cp311-win_amd64.whl.metadata (13 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Using cached multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Downloading datasets-3.6.0-py3-none-any.whl (491 kB)
Using cached dill-0.3.8-py3-none-any.whl (116 kB)
Using cached multiprocess-0.70.16-py311-none-any.whl (143 kB)
Downloading pyarrow-20.0.0-cp311-cp311-win_amd64.whl (25.8 MB)
   ---------------------------------------- 0.0/25.8 MB ? eta -:--:--
    --------------------------------------- 0.5/25.8 MB 4.2 MB/s eta 0:00:07
   -- ------------------------------------- 1.3/25.8 MB 3.5 MB/s eta 0:00:07
   --- ----------

In [47]:
import pvporcupine
import pyaudio
import struct
from dotenv import load_dotenv
import os
import sounddevice as sd
import soundfile as sf
from datasets import load_dataset
import librosa
from librosa.feature import melspectrogram
from tqdm import tqdm
import pandas as pd
from sklearn.model_selection import train_test_split
import numpy as np
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Dropout, Flatten, Dense, BatchNormalization
from sklearn.metrics import accuracy_score, classification_report



In [45]:
# Load variables from a local .env file before the access key is looked up.
load_dotenv()
access_key = os.environ.get("PORCUPINE_ACCESS_KEY")

# Keyword file handed to Porcupine when the engine is created.
keyword_path = "../models/Mamba_en_windows.ppn"

## 1. Porcupine

In [3]:
# Build the Porcupine engine for the single custom keyword file.
porcupine = pvporcupine.create(
    access_key=access_key,
    keyword_paths=[keyword_path],
)


In [17]:
pa = pyaudio.PyAudio()

# Mono 16-bit input stream, configured from the engine's own sample rate and frame length.
stream_settings = dict(
    input=True,
    channels=1,
    format=pyaudio.paInt16,
    rate=porcupine.sample_rate,
    frames_per_buffer=porcupine.frame_length,
)
stream = pa.open(**stream_settings)
print("Nasłuchiwanie... Powiedz słowo-kluczowe.")


Nasłuchiwanie... Powiedz słowo-kluczowe.


In [18]:
# Listen until interrupted: read one frame, decode it to int16 samples, feed the engine.
try:
    while True:
        samples_per_frame = porcupine.frame_length
        raw_frame = stream.read(samples_per_frame, exception_on_overflow=False)
        samples = struct.unpack_from(f"{samples_per_frame}h", raw_frame)
        keyword_index = porcupine.process(samples)
        if keyword_index < 0:
            # Negative result: nothing detected in this frame.
            continue
        print("🎉 Wykryto słowo-kluczowe!")
except KeyboardInterrupt:
    print("Zamykam...")
finally:
    # Release the audio stream, the PyAudio instance and the engine, in that order.
    for release in (stream.stop_stream, stream.close, pa.terminate, porcupine.delete):
        release()

🎉 Wykryto słowo-kluczowe!
Zamykam...


## 2. My own model

### 2.1 Generating wake-word audio

In [5]:
# Recording settings shared by the sample-collection cells below.
duration = 1.0        # length of every recorded clip, in seconds
samplerate = 16_000   # samples per second passed to sounddevice / soundfile

In [None]:
# Record 500 one-second wake-word clips, one per Enter press.
# sf.write does not create missing folders, so make sure the target exists first.
os.makedirs("positive", exist_ok=True)
for i in range(500):
    input("Naciśnij Enter by nagrać próbkę...")
    audio = sd.rec(int(duration * samplerate), samplerate=samplerate, channels=1)
    sd.wait()  # block until the recording has finished
    sf.write(f"positive/wake_word_{i:03}.wav", audio, samplerate)

In [None]:
# Record 300 one-second negative clips, one per Enter press.
# sf.write does not create missing folders, so make sure the target exists first.
os.makedirs("negative", exist_ok=True)
for i in range(300):
    input("Naciśnij Enter by nagrać próbkę...")
    audio = sd.rec(int(duration * samplerate), samplerate=samplerate, channels=1)
    sd.wait()  # block until the recording has finished
    sf.write(f"negative/wake_word_{i:03}.wav", audio, samplerate)

In [11]:
# Fetch the train split of the nEMO speech corpus from the Hugging Face hub.
# NOTE: `load_dataset` here is the `datasets` import; a local function with the same
# name is defined further down, so this cell must run before that definition.
nemo = load_dataset(
    path="amu-cai/nEMO",
    split="train",
)


README.md:   0%|          | 0.00/6.08k [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


train-00000-of-00002.parquet:   0%|          | 0.00/267M [00:00<?, ?B/s]

train-00001-of-00002.parquet:   0%|          | 0.00/265M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/4481 [00:00<?, ? examples/s]

In [13]:
# Settings for slicing the corpus into fixed-length background clips.
output_dir = "nemo_chunks"   # folder receiving the chunk files
target_chunks = 2_000        # stop once this many chunks have been written
sr = 16_000                  # sample rate written into every chunk file
chunk_duration = 1           # seconds per chunk
chunk_size = chunk_duration * sr   # samples per chunk

os.makedirs(output_dir, exist_ok=True)

In [15]:

# Cut the corpus recordings into non-overlapping chunks of `chunk_size` samples and
# write them as WAV files until `target_chunks` files exist.
saved = 0

for example in tqdm(nemo):
    audio_entry = example["audio"]
    audio_array = audio_entry["array"]
    source_sr = audio_entry["sampling_rate"]

    # The files are stamped with `sr`, so the samples have to actually be at that rate;
    # slicing audio recorded at another rate would change clip duration and pitch.
    if source_sr != sr:
        audio_array = librosa.resample(audio_array, orig_sr=source_sr, target_sr=sr)

    # Recordings shorter than one chunk cannot contribute a full clip.
    if len(audio_array) < chunk_size:
        continue

    # Step in whole chunks; a trailing remainder shorter than chunk_size is dropped.
    for i in range(0, len(audio_array) - chunk_size + 1, chunk_size):
        chunk = audio_array[i:i + chunk_size]
        filename = f"chunk_{saved:04}.wav"
        filepath = os.path.join(output_dir, filename)
        sf.write(filepath, chunk, sr)
        saved += 1
        if saved >= target_chunks:
            break
    # Propagate the inner break so no further recordings are decoded.
    if saved >= target_chunks:
        break

print(f"Zapisano {saved} jednosekundowych plików do folderu '{output_dir}'")

 14%|█▎        | 612/4481 [00:02<00:18, 207.70it/s]

Zapisano 2000 jednosekundowych plików do folderu 'nemo_chunks'





### 2.2 Building the model

In [6]:
def extract_features(file_path, sr=16000, n_mels=40):
    """Turn one audio file into a flat log-mel feature vector.

    The file is loaded at `sr`, padded or trimmed to exactly `sr` samples, converted
    to a mel spectrogram with `n_mels` bands, mapped to decibels and flattened.

    Args:
        file_path: Path of the audio file to read.
        sr: Sample rate used for loading; also the fixed length in samples.
        n_mels: Number of mel bands.

    Returns:
        One-dimensional array holding the flattened log-mel spectrogram.
    """
    waveform = librosa.load(file_path, sr=sr)[0]
    fixed_waveform = librosa.util.fix_length(waveform, size=sr)
    mel_power = librosa.feature.melspectrogram(y=fixed_waveform, sr=sr, n_mels=n_mels)
    return librosa.power_to_db(mel_power).flatten()

In [48]:
def load_dataset(directories=("negative", "nemo_chunks", "positive"), positive_dir="positive"):
    """Build the feature table from the audio clips stored in `directories`.

    NOTE: this name shadows `datasets.load_dataset` imported at the top of the
    notebook; once this cell has run, the hub loader is no longer reachable under
    that name.

    Args:
        directories: Folders scanned for audio clips, processed in the given order.
        positive_dir: Entry of `directories` whose clips are labelled positive;
            every other folder is labelled negative.

    Returns:
        DataFrame with columns "features" (output of extract_features),
        "positive" (bool label) and "filepath". The columns are present even
        when no clip could be loaded.
    """
    data = []

    for directory in directories:
        is_positive = directory == positive_dir

        # os.listdir returns entries in arbitrary order; sorting keeps the row order,
        # and therefore any seeded train/test split, reproducible across machines.
        for filename in sorted(os.listdir(directory)):
            path = os.path.join(directory, filename)

            try:
                features = extract_features(path)
            except Exception as e:
                # Best effort: report the unreadable file and keep going.
                print(f"Error with file {path}: {e}")
                continue

            data.append({
                "features": features,
                "positive": is_positive,
                "filepath": path
            })

    return pd.DataFrame(data, columns=["features", "positive", "filepath"])


In [49]:
# Calls the notebook's own load_dataset() (defined above), not the `datasets` loader.
dataframe = load_dataset()

In [50]:
# Quick look at the first rows: features, label and source path.
print(dataframe.head())

                                            features  positive  \
0  [-78.55861, -77.87644, -80.720764, -78.85032, ...     False   
1  [-62.77656, -62.77656, -62.77656, -62.77656, -...     False   
2  [-59.723557, -59.723557, -59.723557, -59.72355...     False   
3  [-67.97879, -67.97879, -67.97879, -67.97879, -...     False   
4  [-60.303894, -60.303894, -60.303894, -60.30389...     False   

                     filepath  
0  negative\wake_word_000.wav  
1  negative\wake_word_001.wav  
2  negative\wake_word_002.wav  
3  negative\wake_word_003.wav  
4  negative\wake_word_004.wav  


In [14]:
# Stack the per-clip feature vectors into one 2-D array (one row per clip).
feature_vectors = dataframe["features"].to_numpy()
X = np.stack(feature_vectors)

In [18]:
# Label column: True for clips that came from the "positive" folder.
y = dataframe["positive"]

In [51]:
# Source paths are split together with X and y so test clips can be re-read from disk later.
filepaths = dataframe["filepath"].values

In [25]:
# Restore each flat vector to a (40, 32, 1) image, matching the model's input_shape.
X = X.reshape(-1, 40, 32, 1)

In [52]:
# Stratified 80/20 split with a fixed seed; features, labels and paths stay aligned.
split_parts = train_test_split(
    X,
    y,
    filepaths,
    test_size=0.2,
    stratify=y,
    random_state=42,
)
X_train, X_test, y_train, y_test, filepaths_train, filepaths_test = split_parts

In [27]:
# Sanity check: (number of clips, 40, 32, 1) after the reshape.
X.shape

(2602, 40, 32, 1)

In [24]:
# Sanity check: one label per clip.
y.shape

(2602,)

In [55]:
# Sanity check: one path per clip.
filepaths.shape

(2602,)

In [56]:
# Small CNN: three conv stages followed by a dense head with a single sigmoid output.
# Each stage is Conv2D -> BatchNormalization -> MaxPooling2D -> Dropout.
conv_stages = [(16, 0.2), (32, 0.2), (64, 0.3)]  # (filters, dropout rate)

model = Sequential()

for stage_index, (n_filters, drop_rate) in enumerate(conv_stages):
    # Only the very first layer declares the input shape.
    first_layer_kwargs = {"input_shape": (40, 32, 1)} if stage_index == 0 else {}
    model.add(Conv2D(n_filters, (3, 3), activation='relu', padding='same', **first_layer_kwargs))
    model.add(BatchNormalization())
    model.add(MaxPooling2D((2, 2)))
    model.add(Dropout(drop_rate))

# Classification head.
model.add(Flatten())
model.add(Dense(64, activation='relu'))
model.add(Dropout(0.3))
model.add(Dense(1, activation='sigmoid'))

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [57]:
# Binary classification setup: Adam optimizer, cross-entropy loss, accuracy metric.
model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

In [58]:
# Train for 20 epochs; note that the held-out test split doubles as validation data here.
model.fit(
    X_train,
    y_train,
    epochs=20,
    batch_size=32,
    validation_data=(X_test, y_test),
)


Epoch 1/20
[1m66/66[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 19ms/step - accuracy: 0.8703 - loss: 0.4681 - val_accuracy: 0.9367 - val_loss: 0.1651
Epoch 2/20
[1m66/66[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 16ms/step - accuracy: 0.9346 - loss: 0.1388 - val_accuracy: 0.9655 - val_loss: 0.1127
Epoch 3/20
[1m66/66[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 16ms/step - accuracy: 0.9520 - loss: 0.1024 - val_accuracy: 0.9693 - val_loss: 0.0814
Epoch 4/20
[1m66/66[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 16ms/step - accuracy: 0.9641 - loss: 0.0997 - val_accuracy: 0.9712 - val_loss: 0.0634
Epoch 5/20
[1m66/66[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 15ms/step - accuracy: 0.9681 - loss: 0.0724 - val_accuracy: 0.9750 - val_loss: 0.0657
Epoch 6/20
[1m66/66[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 16ms/step - accuracy: 0.9697 - loss: 0.0822 - val_accuracy: 0.9731 - val_loss: 0.0724
Epoch 7/20
[1m66/66[0m [32m━━━━

<keras.src.callbacks.history.History at 0x22c84902010>

In [59]:
# Persist the trained network next to the notebook.
model.save("cnn_model.keras")

In [60]:
# Sigmoid scores of the trained model for the test split.
y_pred = model.predict(X_test)

[1m17/17[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 11ms/step


array([3.7331724e-20], dtype=float32)

In [40]:
# Turn the sigmoid scores into 0/1 decisions at a 0.5 cut-off and score them.
above_cutoff = y_pred > 0.5
y_bin = above_cutoff.astype(int)
test_accuracy = accuracy_score(y_test, y_bin)
print("Accuracy:", test_accuracy)


Accuracy: 0.9750479846449136


In [41]:
def preprocess_audio(file_path):
    """Load one audio file and shape it as a single-item batch for the CNN.

    The features come from extract_features(), the same routine that produced the
    training data, so training and inference cannot drift apart through a
    duplicated copy of the pipeline.

    Args:
        file_path: Path of the audio file to read.

    Returns:
        Log-mel array of shape (1, 40, 32, 1).
    """
    log_mel = extract_features(file_path, sr=16000, n_mels=40)
    # Same (40, 32, 1) layout the training matrix was reshaped to, plus a batch axis.
    return log_mel.reshape(1, 40, 32, 1)

In [43]:
# Spot check: score one recorded wake-word clip with the trained model.
features = preprocess_audio("positive/wake_word_100.wav")
prediction = model.predict(features)
score = prediction[0][0]
print(score)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 30ms/step
0.99998355


In [44]:
def predict_custom_model(file_path, model, threshold=0.8):
    """Classify one audio file with the custom CNN.

    Args:
        file_path: Path of the audio file to classify.
        model: Model exposing a Keras-style `predict` that returns a (1, 1) score array.
        threshold: Score above which the clip counts as a detection.

    Returns:
        1 if the predicted score exceeds `threshold`, otherwise 0.
    """
    features = preprocess_audio(file_path)
    # verbose=0: this runs once per file, and the default progress bar would print
    # one line per call.
    prob = model.predict(features, verbose=0)[0][0]
    return int(prob > threshold)


In [46]:
def predict_porcupine(file_path, keyword_path, access_key):
    """Run Porcupine over one audio file and report whether the keyword fired.

    Args:
        file_path: Path of the audio file; its sample rate must match the engine's.
        keyword_path: Path of the Porcupine keyword file.
        access_key: Picovoice access key.

    Returns:
        1 if any complete frame triggered a detection, otherwise 0.

    Raises:
        AssertionError: If the file's sample rate differs from the engine's.
    """
    porcupine = pvporcupine.create(access_key=access_key, keyword_paths=[keyword_path])
    try:
        audio, sr = sf.read(file_path)
        assert sr == porcupine.sample_rate, "Wrong sample rate"

        # sf.read yields floats in [-1.0, 1.0); scale by 2**15, round and clip so that
        # 16-bit PCM sources are recovered exactly and out-of-range floats cannot wrap.
        pcm = np.clip(np.round(audio * 32768.0), -32768, 32767).astype(np.int16)

        frame_length = porcupine.frame_length
        # Visit whole frames only; a shorter trailing remainder is ignored.
        # NOTE(review): the engine may need some audio after the keyword ends, so very
        # short clips might never trigger - TODO confirm against the Porcupine docs.
        for start in range(0, len(pcm) - frame_length + 1, frame_length):
            frame = pcm[start:start + frame_length]
            if porcupine.process(frame.tolist()) >= 0:
                return 1
        return 0
    finally:
        # Always release the native engine, including when the assert or process() raises.
        porcupine.delete()

In [67]:
# Replace the index carried over from the full frame with 0..n-1 so that positional
# lookups line up with filepaths_test.
y_test = y_test.reset_index(drop=True)


In [68]:
# Label of the first test clip.
y_test[0]

False

In [65]:
# Path of the first test clip, to compare against the label above.
filepaths_test[0]

'nemo_chunks\\chunk_1749.wav'

In [69]:
# Score every test clip with both detectors, reading each file from disk again.
y_true, y_pred_custom, y_pred_porcupine = [], [], []

for clip_path, label in zip(filepaths_test, y_test):
    y_true.append(label)
    y_pred_custom.append(predict_custom_model(clip_path, model))
    y_pred_porcupine.append(predict_porcupine(clip_path, keyword_path, access_key))



[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 34ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 33ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 29ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 37ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 28ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 27ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 27ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 33ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 29ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 27ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 26ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 26ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 25ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 35

In [70]:
# Per-class precision / recall for both detectors on the same test clips.
report_inputs = (
    ("My model: ", y_pred_custom),
    ("Porcupine:", y_pred_porcupine),
)
for heading, predictions in report_inputs:
    print(heading)
    print(classification_report(y_true, predictions))

My model: 
              precision    recall  f1-score   support

       False       1.00      0.99      0.99       461
        True       0.92      0.97      0.94        60

    accuracy                           0.99       521
   macro avg       0.96      0.98      0.97       521
weighted avg       0.99      0.99      0.99       521

Porcupine:
              precision    recall  f1-score   support

       False       0.88      1.00      0.94       461
        True       0.00      0.00      0.00        60

    accuracy                           0.88       521
   macro avg       0.44      0.50      0.47       521
weighted avg       0.78      0.88      0.83       521



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
